In [3]:
# %conda install pandas -y

In [4]:
# %conda install prettytable

In [3]:
from prettytable import PrettyTable
import pandas as pd
import numpy as np

## Table Function

In [4]:
import pandas as pd
from prettytable import PrettyTable

def table(input_dataframe):
    """
    This function takes a Pandas DataFrame or Series and displays it as a PrettyTable.
    It also supports multi-index DataFrames and groupby operations with aggregated columns.
    Index values are retained and displayed in the table.
    """
    from prettytable import PrettyTable
    import pandas as pd

    # Make a deep copy of the input DataFrame or Series
    df_or_series = input_dataframe.copy()

    # If the input is a Pandas Series
    if isinstance(df_or_series, pd.Series):
        index_name = df_or_series.index.name or 'index'  # Default to 'index' if index has no name
        series_name = df_or_series.name or 'value'  # Default to 'value' if Series has no name
        df_or_series = df_or_series.reset_index()  # Reset the index
        df_or_series.columns = [index_name, series_name]  # Use dynamic column names

    # If the input is a DataFrame
    elif isinstance(df_or_series, pd.DataFrame):
        # If the DataFrame has a multi-index
        if isinstance(df_or_series.index, pd.MultiIndex):
            df_or_series.reset_index(inplace=True)
        else:
            # Convert the index to a column if not already part of the DataFrame
            df_or_series.reset_index(inplace=True)

    # Create a PrettyTable instance
    table = PrettyTable()

    # Set the column names (field names) to match the DataFrame columns
    table.field_names = df_or_series.columns.tolist()

    # Add rows from the DataFrame to the PrettyTable
    for row in df_or_series.itertuples(index=False):
        table.add_row(row)

    # Print the PrettyTable
    print(table)

In [30]:
df = pd.read_csv('sample_datasets/blackfriday.csv')

In [31]:
df.shape

(537577, 12)

In [32]:
print(df.columns)

Index(['User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Product_Category_3', 'Purchase'],
      dtype='object')


In [33]:
df.columns = [i.lower().replace(' ','_') for i in df.columns]
print(df.columns)

Index(['user_id', 'product_id', 'gender', 'age', 'occupation', 'city_category',
       'stay_in_current_city_years', 'marital_status', 'product_category_1',
       'product_category_2', 'product_category_3', 'purchase'],
      dtype='object')


In [34]:
table(df[['user_id','product_id','purchase','occupation','product_category_1','product_category_2','product_category_3']].head())

+-------+---------+------------+----------+------------+--------------------+--------------------+--------------------+
| index | user_id | product_id | purchase | occupation | product_category_1 | product_category_2 | product_category_3 |
+-------+---------+------------+----------+------------+--------------------+--------------------+--------------------+
|   0   | 1000001 | P00069042  |   8370   |     10     |         3          |        nan         |        nan         |
|   1   | 1000001 | P00248942  |  15200   |     10     |         1          |        6.0         |        14.0        |
|   2   | 1000001 | P00087842  |   1422   |     10     |         12         |        nan         |        nan         |
|   3   | 1000001 | P00085442  |   1057   |     10     |         12         |        14.0        |        nan         |
|   4   | 1000002 | P00285442  |   7969   |     16     |         8          |        nan         |        nan         |
+-------+---------+------------+--------

In [35]:
df.city_category.value_counts().index.name

'city_category'

In [36]:
table(df.city_category.value_counts())

+---------------+--------+
| city_category | count  |
+---------------+--------+
|       B       | 226493 |
|       C       | 166446 |
|       A       | 144638 |
+---------------+--------+


In [37]:
table(df.occupation.value_counts())

+------------+-------+
| occupation | count |
+------------+-------+
|     4      | 70862 |
|     0      | 68120 |
|     7      | 57806 |
|     1      | 45971 |
|     17     | 39090 |
|     20     | 32910 |
|     12     | 30423 |
|     14     | 26712 |
|     2      | 25845 |
|     16     | 24790 |
|     6      | 19822 |
|     3      | 17366 |
|     10     | 12623 |
|     5      | 11985 |
|     15     | 11812 |
|     11     | 11338 |
|     19     |  8352 |
|     13     |  7548 |
|     18     |  6525 |
|     9      |  6153 |
|     8      |  1524 |
+------------+-------+


## Multi Grouping - aggregation/sorting

In [38]:
group_df = df.groupby(['city_category','occupation']).agg({'purchase':['sum','mean','count']})
table(group_df)

+-----------------------+--------------------+---------------------+----------------------+-----------------------+
| ('city_category', '') | ('occupation', '') | ('purchase', 'sum') | ('purchase', 'mean') | ('purchase', 'count') |
+-----------------------+--------------------+---------------------+----------------------+-----------------------+
|           A           |         0          |      164080740      |   8919.37051532942   |         18396         |
|           A           |         1          |      108803572      |  8778.021137555466   |         12395         |
|           A           |         2          |       76427056      |   8588.27463759973   |          8899         |
|           A           |         3          |       49011902      |  8821.436645068394   |          5556         |
|           A           |         4          |      214506091      |  8989.443089430893   |         23862         |
|           A           |         5          |       20966920      |  89

In [39]:
# for each city category : sort the purchase value in descending order 
sorted_grouped = group_df.sort_values(['city_category',('purchase','sum')], ascending=[True, False])
table(sorted_grouped)

+-----------------------+--------------------+---------------------+----------------------+-----------------------+
| ('city_category', '') | ('occupation', '') | ('purchase', 'sum') | ('purchase', 'mean') | ('purchase', 'count') |
+-----------------------+--------------------+---------------------+----------------------+-----------------------+
|           A           |         4          |      214506091      |  8989.443089430893   |         23862         |
|           A           |         0          |      164080740      |   8919.37051532942   |         18396         |
|           A           |         7          |      140383356      |  8883.897987596507   |         15802         |
|           A           |         1          |      108803572      |  8778.021137555466   |         12395         |
|           A           |         20         |      105500948      |  8361.146615945474   |         12618         |
|           A           |         2          |       76427056      |   8

## Fetching the nth item after groupby/orderby 

In [46]:
# Step 3: Extract the second largest purchase for each City_Category
result = (
    sorted_grouped.groupby('city_category')
    .nth(1)  # Get the second row (index 1) in each group
    .reset_index()
)
table(result)

+---------------+-----------------------+--------------------+---------------------+----------------------+-----------------------+
| ('index', '') | ('city_category', '') | ('occupation', '') | ('purchase', 'sum') | ('purchase', 'mean') | ('purchase', 'count') |
+---------------+-----------------------+--------------------+---------------------+----------------------+-----------------------+
|       0       |           A           |         0          |      164080740      |   8919.37051532942   |         18396         |
|       1       |           B           |         0          |      266483416      |  9077.647363401009   |         29356         |
|       2       |           C           |         0          |      195250655      |  9586.147633542812   |         20368         |
+---------------+-----------------------+--------------------+---------------------+----------------------+-----------------------+


## Groupby orderby - assigning ranks

In [41]:
import pandas as pd

# Expanded Sample Dataset
data = {
    'City': [
        'California', 'California', 'California', 'California', 'New York', 'New York',
        'Texas', 'Texas', 'Texas', 'Texas', 'California', 'California', 'New York',
        'New York', 'Texas', 'Texas'
    ],
    'Employee': [
        'E1', 'E2', 'E3', 'E4', 'E5', 'E6', 
        'E7', 'E8', 'E9', 'E10', 'E11', 'E12', 'E13',
        'E14', 'E15', 'E16'
    ],
    'Job_Title': [
        'Data Scientist', 'Data Scientist', 'Data Scientist', 'Data Scientist', 
        'Data Scientist', 'Data Scientist', 'Data Scientist', 'Data Scientist', 
        'Software Engineer', 'Software Engineer', 'Software Engineer', 
        'Software Engineer', 'Manager', 'Manager', 'Manager', 'Manager'
    ],
    'Salary': [
        120000, 120000, 110000, 100000, 130000, 125000, 
        115000, 115000, 140000, 135000, 150000, 145000, 160000, 155000, 90000, 85000
    ]
}

df = pd.DataFrame(data)

# Assign ranks within each City and Job_Title based on Salary in descending order
df['Rank'] = (
    df.groupby(['City', 'Job_Title'])['Salary']
    .rank(ascending=False, method='dense')  # Dense ranking: same salary gets same rank
)

table(df)

+-------+------------+----------+-------------------+--------+------+
| index |    City    | Employee |     Job_Title     | Salary | Rank |
+-------+------------+----------+-------------------+--------+------+
|   0   | California |    E1    |   Data Scientist  | 120000 | 1.0  |
|   1   | California |    E2    |   Data Scientist  | 120000 | 1.0  |
|   2   | California |    E3    |   Data Scientist  | 110000 | 2.0  |
|   3   | California |    E4    |   Data Scientist  | 100000 | 3.0  |
|   4   |  New York  |    E5    |   Data Scientist  | 130000 | 1.0  |
|   5   |  New York  |    E6    |   Data Scientist  | 125000 | 2.0  |
|   6   |   Texas    |    E7    |   Data Scientist  | 115000 | 1.0  |
|   7   |   Texas    |    E8    |   Data Scientist  | 115000 | 1.0  |
|   8   |   Texas    |    E9    | Software Engineer | 140000 | 1.0  |
|   9   |   Texas    |   E10    | Software Engineer | 135000 | 2.0  |
|   10  | California |   E11    | Software Engineer | 150000 | 1.0  |
|   11  | California

In [42]:
# data types 
df.dtypes

City          object
Employee      object
Job_Title     object
Salary         int64
Rank         float64
dtype: object

## Using groupby and apply together

In [44]:
# Sample DataFrame
data = {
    'customer_id': [101, 101, 101, 102, 102, 103, 103, 103, 103],
    'order_id': [1, 2, 3, 4, 5, 6, 7, 8, 9],
    'order_timestamp': [
        '2023-01-01', '2023-01-15', '2023-02-01',
        '2023-01-10', '2023-01-25',
        '2023-01-05', '2023-01-20', '2023-02-01', '2023-02-15'
    ],
    'order_value': [100, 200, 150, 250, 300, 400, 100, 50, 75],
    'product_category': [
        'Electronics', 'Electronics', 'Furniture',
        'Furniture', 'Furniture',
        'Clothing', 'Clothing', 'Accessories', 'Clothing'
    ]
}

df = pd.DataFrame(data)
# Convert 'order_timestamp' to datetime
df['order_timestamp'] = pd.to_datetime(df['order_timestamp'])

# Define a function for group-level calculations
def customer_summary(group):
    total_orders = len(group)
    avg_order_value = group['order_value'].mean()
    most_common_category = group['product_category'].mode().iloc[0]  # Mode can have ties, take the first one
    first_order = group.sort_values('order_timestamp').iloc[0]['order_value']
    last_order = group.sort_values('order_timestamp').iloc[-1]['order_value']
    pct_change = (last_order - first_order) / first_order if first_order != 0 else np.nan

    return pd.Series({
        'total_orders': total_orders,
        'avg_order_value': avg_order_value,
        'most_common_category': most_common_category,
        'pct_change_order_value': pct_change
    })

# Apply the custom function to each group
customer_stats = df.groupby('customer_id').apply(customer_summary,include_groups=False)

# Normalize the numeric columns to a [0, 1] range
def normalize(series):
    return (series - series.min()) / (series.max() - series.min())

numeric_cols = ['total_orders', 'avg_order_value', 'pct_change_order_value']
customer_stats[numeric_cols] = customer_stats[numeric_cols].apply(normalize)
table(customer_stats)

+-------------+--------------+-----------------+----------------------+------------------------+
| customer_id | total_orders | avg_order_value | most_common_category | pct_change_order_value |
+-------------+--------------+-----------------+----------------------+------------------------+
|     101     |     0.5      |       0.0       |     Electronics      |          1.0           |
|     102     |     0.0      |       1.0       |      Furniture       |   0.7714285714285714   |
|     103     |     1.0      |       0.05      |       Clothing       |          0.0           |
+-------------+--------------+-----------------+----------------------+------------------------+


## Pivot Table Usage

In [47]:
# Sample dataset
data = {
    'Region': ['North', 'South', 'North', 'East', 'West', 'East', 'South', 'West', 'North', 'South'],
    'Product': ['A', 'B', 'C', 'A', 'C', 'B', 'C', 'A', 'B', 'C'],
    'Date': [
        '2023-01-15', '2023-02-20', '2023-03-10', '2023-04-05', '2023-05-25',
        '2023-06-15', '2023-07-10', '2023-08-20', '2023-09-15', '2023-10-10'
    ],
    'Sales': [500, 700, 200, 900, 300, 400, 600, 800, 1000, 750],
    'Profit': [50, 100, 30, 120, 40, 60, 90, 110, 150, 100]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Convert 'Date' to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Add a new column for quarter and year
df['Year'] = df['Date'].dt.year
df['Quarter'] = df['Date'].dt.to_period('Q')  # Example: '2023Q1'

In [48]:
table(df)

+-------+--------+---------+---------------------+-------+--------+------+---------+
| index | Region | Product |         Date        | Sales | Profit | Year | Quarter |
+-------+--------+---------+---------------------+-------+--------+------+---------+
|   0   | North  |    A    | 2023-01-15 00:00:00 |  500  |   50   | 2023 |  2023Q1 |
|   1   | South  |    B    | 2023-02-20 00:00:00 |  700  |  100   | 2023 |  2023Q1 |
|   2   | North  |    C    | 2023-03-10 00:00:00 |  200  |   30   | 2023 |  2023Q1 |
|   3   |  East  |    A    | 2023-04-05 00:00:00 |  900  |  120   | 2023 |  2023Q2 |
|   4   |  West  |    C    | 2023-05-25 00:00:00 |  300  |   40   | 2023 |  2023Q2 |
|   5   |  East  |    B    | 2023-06-15 00:00:00 |  400  |   60   | 2023 |  2023Q2 |
|   6   | South  |    C    | 2023-07-10 00:00:00 |  600  |   90   | 2023 |  2023Q3 |
|   7   |  West  |    A    | 2023-08-20 00:00:00 |  800  |  110   | 2023 |  2023Q3 |
|   8   | North  |    B    | 2023-09-15 00:00:00 |  1000 |  150  

In [70]:
# Group by Region and Product, aggregate Sales and Profit
grouped = (
    df.groupby(['Region', 'Product'])
    .agg(
        Sales_Sum=('Sales', 'sum'),
        Profit_Mean=('Profit', 'mean')
    )
    .reset_index()
)

# Pivot the grouped DataFrame
pivot_table = grouped.pivot(
    index=['Product'],
    columns=['Region'],
    values=['Sales_Sum', 'Profit_Mean']
)

# Flatten the multi-level columns
table(pivot_table)

+-----------------+-----------------------+------------------------+------------------------+-----------------------+-------------------------+--------------------------+--------------------------+-------------------------+
| ('Product', '') | ('Sales_Sum', 'East') | ('Sales_Sum', 'North') | ('Sales_Sum', 'South') | ('Sales_Sum', 'West') | ('Profit_Mean', 'East') | ('Profit_Mean', 'North') | ('Profit_Mean', 'South') | ('Profit_Mean', 'West') |
+-----------------+-----------------------+------------------------+------------------------+-----------------------+-------------------------+--------------------------+--------------------------+-------------------------+
|        A        |         900.0         |         500.0          |          nan           |         800.0         |          120.0          |           50.0           |           nan            |          110.0          |
|        B        |         400.0         |         1000.0         |         700.0          |          n

In [13]:
import pandas as pd

data = {
    'Region': ['North', 'North', 'South', 'South', 'East', 'East'],
    'Product': ['A', 'B', 'A', 'B', 'A', 'B'],
    'Sales': [100, 200, 150, 250, 300, 400],
    'Profit': [10, 20, 15, 25, 30, 40],
    'Quantity': [5, 10, 8, 12, 15, 20]
}

df = pd.DataFrame(data)
table(df)

+-------+--------+---------+-------+--------+----------+
| index | Region | Product | Sales | Profit | Quantity |
+-------+--------+---------+-------+--------+----------+
|   0   | North  |    A    |  100  |   10   |    5     |
|   1   | North  |    B    |  200  |   20   |    10    |
|   2   | South  |    A    |  150  |   15   |    8     |
|   3   | South  |    B    |  250  |   25   |    12    |
|   4   |  East  |    A    |  300  |   30   |    15    |
|   5   |  East  |    B    |  400  |   40   |    20    |
+-------+--------+---------+-------+--------+----------+


In [25]:
pivot_table

Unnamed: 0_level_0,Region,Profit,Profit,Profit,Quantity,Quantity,Quantity,Sales,Sales,Sales
Product,Unnamed: 1_level_1,A,B,Total,A,B,Total,A,B,Total
0,East,30.0,40.0,35.0,15.0,20.0,20,300.0,400.0,700
1,North,10.0,20.0,15.0,5.0,10.0,10,100.0,200.0,300
2,South,15.0,25.0,20.0,8.0,12.0,12,150.0,250.0,400
3,Total,18.333333,28.333333,23.333333,15.0,20.0,20,550.0,850.0,1400


In [27]:
df = pd.DataFrame({
      "A": ["foo", "foo", "foo", "foo", "foo","bar", "bar", "bar", "bar"],
      "B": ["one", "one", "one", "two", "two","one", "one", "two", "two"],
      "C": ["small", "large", "large", "small","small", "large", "small", "small","large"],
      "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
      "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]
})
table(df)

+-------+-----+-----+-------+---+---+
| index |  A  |  B  |   C   | D | E |
+-------+-----+-----+-------+---+---+
|   0   | foo | one | small | 1 | 2 |
|   1   | foo | one | large | 2 | 4 |
|   2   | foo | one | large | 2 | 5 |
|   3   | foo | two | small | 3 | 5 |
|   4   | foo | two | small | 3 | 6 |
|   5   | bar | one | large | 4 | 6 |
|   6   | bar | one | small | 5 | 8 |
|   7   | bar | two | small | 6 | 9 |
|   8   | bar | two | large | 7 | 9 |
+-------+-----+-----+-------+---+---+


In [26]:
table_pivot = pd.pivot_table(
    df, 
    values=['D', 'E'], 
    index=['A', 'C'],
    aggfunc={'D': "mean",'E': ["min", "max", "mean"]}
)
table_pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,D,E,E,E
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,max,mean,min
A,C,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
bar,large,5.5,9,7.5,6
bar,small,5.5,9,8.5,8
foo,large,2.0,5,4.5,4
foo,small,2.333333,6,4.333333,2


## apply,map

In [5]:
# Sample dataset
data = {
    'Product': ['A', 'B', 'C', 'D', 'E'],
    'Category': ['Electronics', 'Groceries', 'Clothing', 'Electronics', 'Groceries'],
    'Sales': [500, 150, 200, 800, 100],
    'Profit': [50, 20, 30, 80, 10]
}

df = pd.DataFrame(data)
table(df)

# External mapping for discounts
discounts = {'Electronics': 0.1, 'Groceries': 0.05, 'Clothing': 0.2}

# Apply a complex function row-wise to calculate adjusted sales
df['Adjusted_Sales'] = df.apply(
    lambda row: row['Sales'] * (1 - discounts.get(row['Category'], 0)) + (row['Profit'] * 0.1 if row['Profit'] > 20 else 0),
    axis=1
)

table(df)


+-------+---------+-------------+-------+--------+
| index | Product |   Category  | Sales | Profit |
+-------+---------+-------------+-------+--------+
|   0   |    A    | Electronics |  500  |   50   |
|   1   |    B    |  Groceries  |  150  |   20   |
|   2   |    C    |   Clothing  |  200  |   30   |
|   3   |    D    | Electronics |  800  |   80   |
|   4   |    E    |  Groceries  |  100  |   10   |
+-------+---------+-------------+-------+--------+
+-------+---------+-------------+-------+--------+----------------+
| index | Product |   Category  | Sales | Profit | Adjusted_Sales |
+-------+---------+-------------+-------+--------+----------------+
|   0   |    A    | Electronics |  500  |   50   |     455.0      |
|   1   |    B    |  Groceries  |  150  |   20   |     142.5      |
|   2   |    C    |   Clothing  |  200  |   30   |     163.0      |
|   3   |    D    | Electronics |  800  |   80   |     728.0      |
|   4   |    E    |  Groceries  |  100  |   10   |      95.0     

In [8]:
# Sample DataFrame
data = {
    'Math': [95, 85, 75, 65, 87],
    'Science': [90, 80, 70, 60, np.nan],
    'History': [88, 78, 68, 58, np.nan]
}

df = pd.DataFrame(data)
table(df)

# Apply a grading scale
def grade(value):
    if pd.isna(value):
        return 'N/A'
    elif value >= 90:
        return 'A'
    elif value >= 80:
        return 'B'
    elif value >= 70:
        return 'C'
    else:
        return 'D'

# Use applymap for element-wise grading
graded_df = df.map(grade)

table(graded_df)


+-------+------+---------+---------+
| index | Math | Science | History |
+-------+------+---------+---------+
|   0   |  95  |   90.0  |   88.0  |
|   1   |  85  |   80.0  |   78.0  |
|   2   |  75  |   70.0  |   68.0  |
|   3   |  65  |   60.0  |   58.0  |
|   4   |  87  |   nan   |   nan   |
+-------+------+---------+---------+
+-------+------+---------+---------+
| index | Math | Science | History |
+-------+------+---------+---------+
|   0   |  A   |    A    |    B    |
|   1   |  B   |    B    |    C    |
|   2   |  C   |    C    |    D    |
|   3   |  D   |    D    |    D    |
|   4   |  B   |   N/A   |   N/A   |
+-------+------+---------+---------+


In [11]:
# Sample dataset
data = {
    'Product': ['A', 'B', 'C', 'D'],
    'Region': ['North', 'South', 'East', 'West'],
    'Sales': [500, 300, 400, 250],
    'Profit': [50, 30, 60, 40]
}

df = pd.DataFrame(data)

# Use map to map regions to sales tax
region_tax = {'North': 0.1, 'South': 0.08, 'East': 0.05, 'West': 0.07}
df['Tax'] = df['Region'].map(region_tax)

# Use apply to calculate net sales after tax and profit adjustment
df['Net_Sales'] = df.apply(
    lambda row: row['Sales'] * (1 - row['Tax']) + row['Profit'] * 0.1,
    axis=1
)

# Use applymap to categorize all numerical values in the DataFrame
def categorize(value):
    if isinstance(value, (int, float)):
        if value > 300:
            return 'High'
        elif value > 100:
            return 'Medium'
        else:
            return 'Low'
    return value

categorized_df = df.map(categorize)

print("Original DataFrame:")
table(df)
print("\nCategorized DataFrame:")
table(categorized_df)


Original DataFrame:
+-------+---------+--------+-------+--------+------+--------------------+
| index | Product | Region | Sales | Profit | Tax  |     Net_Sales      |
+-------+---------+--------+-------+--------+------+--------------------+
|   0   |    A    | North  |  500  |   50   | 0.1  |       455.0        |
|   1   |    B    | South  |  300  |   30   | 0.08 |       279.0        |
|   2   |    C    |  East  |  400  |   60   | 0.05 |       386.0        |
|   3   |    D    |  West  |  250  |   40   | 0.07 | 236.49999999999997 |
+-------+---------+--------+-------+--------+------+--------------------+

Categorized DataFrame:
+-------+---------+--------+--------+--------+-----+-----------+
| index | Product | Region | Sales  | Profit | Tax | Net_Sales |
+-------+---------+--------+--------+--------+-----+-----------+
|   0   |    A    | North  |  High  |  Low   | Low |    High   |
|   1   |    B    | South  | Medium |  Low   | Low |   Medium  |
|   2   |    C    |  East  |  High  |  

## Grouping row values into a list 

In [12]:
import pandas as pd

# Sample DataFrame
data = {
    'Category': ['A', 'A', 'B', 'B', 'C'],
    'Values': [1, 2, 3, 4, 5]
}
df = pd.DataFrame(data)
table(df)
# Group rows by 'Category' and collect 'Values' into a list
grouped = df.groupby('Category')['Values'].agg(list).reset_index()

table(grouped)


+-------+----------+--------+
| index | Category | Values |
+-------+----------+--------+
|   0   |    A     |   1    |
|   1   |    A     |   2    |
|   2   |    B     |   3    |
|   3   |    B     |   4    |
|   4   |    C     |   5    |
+-------+----------+--------+
+-------+----------+--------+
| index | Category | Values |
+-------+----------+--------+
|   0   |    A     | [1, 2] |
|   1   |    B     | [3, 4] |
|   2   |    C     |  [5]   |
+-------+----------+--------+


## melt function 

In [28]:
import pandas as pd

# Original wide-format DataFrame
data = {
    'Product': ['A', 'B', 'C'],
    'January': [100, 150, 200],
    'February': [110, 160, 210],
    'March': [120, 170, 220],
}

df = pd.DataFrame(data)

print("Original DataFrame:")
table(df)


Original DataFrame:
+-------+---------+---------+----------+-------+
| index | Product | January | February | March |
+-------+---------+---------+----------+-------+
|   0   |    A    |   100   |   110    |  120  |
|   1   |    B    |   150   |   160    |  170  |
|   2   |    C    |   200   |   210    |  220  |
+-------+---------+---------+----------+-------+


In [29]:
# Use melt to reshape the data
melted_df = pd.melt(df, id_vars=['Product'], var_name='Month', value_name='Sales')

print("\nMelted DataFrame:")
table(melted_df)



Melted DataFrame:
+-------+---------+----------+-------+
| index | Product |  Month   | Sales |
+-------+---------+----------+-------+
|   0   |    A    | January  |  100  |
|   1   |    B    | January  |  150  |
|   2   |    C    | January  |  200  |
|   3   |    A    | February |  110  |
|   4   |    B    | February |  160  |
|   5   |    C    | February |  210  |
|   6   |    A    |  March   |  120  |
|   7   |    B    |  March   |  170  |
|   8   |    C    |  March   |  220  |
+-------+---------+----------+-------+
