In [1]:
# %conda install pandas -y

In [2]:
# %conda install prettytable

In [1]:
from prettytable import PrettyTable
import pandas as pd
import numpy as np

In [4]:
import pandas as pd
from prettytable import PrettyTable

def table(input_dataframe):
    """
    This function takes a Pandas DataFrame or Series and displays it as a PrettyTable.
    It also supports multi-index DataFrames and groupby operations with aggregated columns.
    """
    # make a deep copy 
    df_or_series = input_dataframe.copy()
    # Handle if input is a Pandas Series
    if isinstance(df_or_series, pd.Series):
        series_name = df_or_series.index.name
        df_or_series = df_or_series.to_frame().reset_index()
        # Use the Series name as the column name for the values column
        df_or_series.columns = [series_name, 'value']  # Dynamic column name

    # Handle multi-index DataFrame from groupby aggregation
    elif isinstance(df_or_series, pd.DataFrame):
        if isinstance(df_or_series.columns, pd.MultiIndex):
            # Flatten multi-level columns
            df_or_series.columns = ['_'.join(col) if isinstance(col, tuple) else col for col in df_or_series.columns]
            df_or_series.reset_index(inplace=True)

    # Create PrettyTable instance
    table = PrettyTable()

    # Set the column names (field names) to match the DataFrame columns
    table.field_names = df_or_series.columns.tolist()

    # Add rows from the DataFrame or Series to the PrettyTable
    for row in df_or_series.itertuples(index=False):
        table.add_row(row)

    # Print the table
    print(table)


In [5]:
df = pd.read_csv('sample_datasets/blackfriday.csv')

In [6]:
df.shape

(537577, 12)

In [7]:
print(df.columns)

Index(['User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Product_Category_3', 'Purchase'],
      dtype='object')


In [8]:
df.columns = [i.lower().replace(' ','_') for i in df.columns]
print(df.columns)

Index(['user_id', 'product_id', 'gender', 'age', 'occupation', 'city_category',
       'stay_in_current_city_years', 'marital_status', 'product_category_1',
       'product_category_2', 'product_category_3', 'purchase'],
      dtype='object')


In [26]:
table(df[['user_id','product_id','purchase','occupation','product_category_1','product_category_2','product_category_3']].head())

+---------+------------+----------+------------+--------------------+--------------------+--------------------+
| user_id | product_id | purchase | occupation | product_category_1 | product_category_2 | product_category_3 |
+---------+------------+----------+------------+--------------------+--------------------+--------------------+
| 1000001 | P00069042  |   8370   |     10     |         3          |        nan         |        nan         |
| 1000001 | P00248942  |  15200   |     10     |         1          |        6.0         |        14.0        |
| 1000001 | P00087842  |   1422   |     10     |         12         |        nan         |        nan         |
| 1000001 | P00085442  |   1057   |     10     |         12         |        14.0        |        nan         |
| 1000002 | P00285442  |   7969   |     16     |         8          |        nan         |        nan         |
+---------+------------+----------+------------+--------------------+--------------------+--------------

In [14]:
df.city_category.value_counts().index.name

'city_category'

In [25]:
table(df.city_category.value_counts())

+---------------+--------+
| city_category | value  |
+---------------+--------+
|       B       | 226493 |
|       C       | 166446 |
|       A       | 144638 |
+---------------+--------+


In [24]:
table(df.occupation.value_counts())

+------------+-------+
| occupation | value |
+------------+-------+
|     4      | 70862 |
|     0      | 68120 |
|     7      | 57806 |
|     1      | 45971 |
|     17     | 39090 |
|     20     | 32910 |
|     12     | 30423 |
|     14     | 26712 |
|     2      | 25845 |
|     16     | 24790 |
|     6      | 19822 |
|     3      | 17366 |
|     10     | 12623 |
|     5      | 11985 |
|     15     | 11812 |
|     11     | 11338 |
|     19     |  8352 |
|     13     |  7548 |
|     18     |  6525 |
|     9      |  6153 |
|     8      |  1524 |
+------------+-------+


In [23]:
group_df = df.groupby(['city_category','occupation']).agg({'purchase':['sum','mean','count']})
table(group_df)

+---------------+------------+--------------+--------------------+----------------+
| city_category | occupation | purchase_sum |   purchase_mean    | purchase_count |
+---------------+------------+--------------+--------------------+----------------+
|       A       |     0      |  164080740   |  8919.37051532942  |     18396      |
|       A       |     1      |  108803572   | 8778.021137555466  |     12395      |
|       A       |     2      |   76427056   |  8588.27463759973  |      8899      |
|       A       |     3      |   49011902   | 8821.436645068394  |      5556      |
|       A       |     4      |  214506091   | 8989.443089430893  |     23862      |
|       A       |     5      |   20966920   | 8937.306052855925  |      2346      |
|       A       |     6      |   34156542   | 9301.890522875818  |      3672      |
|       A       |     7      |  140383356   | 8883.897987596507  |     15802      |
|       A       |     8      |   1144331    |      11443.31      |      100 

In [20]:
# for each city category : sort the purchase value in descending order 
sorted_grouped = group_df.sort_values(['city_category',('purchase','sum')], ascending=[True, False])
table(sorted_grouped)

+---------------+------------+--------------+--------------------+----------------+
| city_category | occupation | purchase_sum |   purchase_mean    | purchase_count |
+---------------+------------+--------------+--------------------+----------------+
|       A       |     4      |  214506091   | 8989.443089430893  |     23862      |
|       A       |     0      |  164080740   |  8919.37051532942  |     18396      |
|       A       |     7      |  140383356   | 8883.897987596507  |     15802      |
|       A       |     1      |  108803572   | 8778.021137555466  |     12395      |
|       A       |     20     |  105500948   | 8361.146615945474  |     12618      |
|       A       |     2      |   76427056   |  8588.27463759973  |      8899      |
|       A       |     17     |   73597023   | 9140.216467958271  |      8052      |
|       A       |     14     |   72267923   | 9482.734942920877  |      7621      |
|       A       |     12     |   65733668   |  9496.34036405663  |      6922

In [22]:
# Step 3: Extract the second largest purchase for each City_Category
result = (
    sorted_grouped.groupby('city_category')
    .nth(1)  # Get the second row (index 1) in each group
    .reset_index()
)
table(result)

+-------+----------------+-------------+--------------+-------------------+----------------+
| index | city_category_ | occupation_ | purchase_sum |   purchase_mean   | purchase_count |
+-------+----------------+-------------+--------------+-------------------+----------------+
|   0   |       A        |      0      |  164080740   |  8919.37051532942 |     18396      |
|   1   |       B        |      0      |  266483416   | 9077.647363401009 |     29356      |
|   2   |       C        |      0      |  195250655   | 9586.147633542812 |     20368      |
+-------+----------------+-------------+--------------+-------------------+----------------+


In [27]:
import pandas as pd

# Expanded Sample Dataset
data = {
    'City': [
        'California', 'California', 'California', 'California', 'New York', 'New York',
        'Texas', 'Texas', 'Texas', 'Texas', 'California', 'California', 'New York',
        'New York', 'Texas', 'Texas'
    ],
    'Employee': [
        'E1', 'E2', 'E3', 'E4', 'E5', 'E6', 
        'E7', 'E8', 'E9', 'E10', 'E11', 'E12', 'E13',
        'E14', 'E15', 'E16'
    ],
    'Job_Title': [
        'Data Scientist', 'Data Scientist', 'Data Scientist', 'Data Scientist', 
        'Data Scientist', 'Data Scientist', 'Data Scientist', 'Data Scientist', 
        'Software Engineer', 'Software Engineer', 'Software Engineer', 
        'Software Engineer', 'Manager', 'Manager', 'Manager', 'Manager'
    ],
    'Salary': [
        120000, 120000, 110000, 100000, 130000, 125000, 
        115000, 115000, 140000, 135000, 150000, 145000, 160000, 155000, 90000, 85000
    ]
}

df = pd.DataFrame(data)

# Assign ranks within each City and Job_Title based on Salary in descending order
df['Rank'] = (
    df.groupby(['City', 'Job_Title'])['Salary']
    .rank(ascending=False, method='dense')  # Dense ranking: same salary gets same rank
)

table(df)

+------------+----------+-------------------+--------+------+
|    City    | Employee |     Job_Title     | Salary | Rank |
+------------+----------+-------------------+--------+------+
| California |    E1    |   Data Scientist  | 120000 | 1.0  |
| California |    E2    |   Data Scientist  | 120000 | 1.0  |
| California |    E3    |   Data Scientist  | 110000 | 2.0  |
| California |    E4    |   Data Scientist  | 100000 | 3.0  |
|  New York  |    E5    |   Data Scientist  | 130000 | 1.0  |
|  New York  |    E6    |   Data Scientist  | 125000 | 2.0  |
|   Texas    |    E7    |   Data Scientist  | 115000 | 1.0  |
|   Texas    |    E8    |   Data Scientist  | 115000 | 1.0  |
|   Texas    |    E9    | Software Engineer | 140000 | 1.0  |
|   Texas    |   E10    | Software Engineer | 135000 | 2.0  |
| California |   E11    | Software Engineer | 150000 | 1.0  |
| California |   E12    | Software Engineer | 145000 | 2.0  |
|  New York  |   E13    |      Manager      | 160000 | 1.0  |
|  New Y