In [1]:
# GroupBy

import pandas as pd

# Sample sales records: one row per sale, labelled with its product category
data = {
    'Category': ['Electronics', 'Clothing', 'Electronics', 'Clothing'],
    'Sales': [1000, 500, 800, 500],
}
df = pd.DataFrame(data)

# Split the rows into one group per distinct Category value
grouped_data = df.groupby('Category')

# Select the Sales column within each group and add its values up;
# the result is a Series indexed by Category
total_sales = grouped_data['Sales'].agg('sum')

print(total_sales)

Category
Clothing       1000
Electronics    1800
Name: Sales, dtype: int64


In [2]:
# Sample DataFrame: six students with a class, a gender and two exam scores
data = {
    'Class': ['A', 'B', 'A', 'B', 'A', 'B'],
    'Gender': ['Male', 'Male', 'Female', 'Female', 'Male', 'Female'],
    'Math_Score': [85, 92, 78, 89, 90, 86],
    'English_Score': [88, 94, 80, 92, 92, 88],
}
df = pd.DataFrame(data)

# Group on the (Class, Gender) pair; passing a list of keys produces a
# two-level index on the aggregated result
grouped_data = df.groupby(['Class', 'Gender'])

# Average Math score inside each (Class, Gender) group
agg_results = grouped_data['Math_Score'].agg('mean')

print(agg_results)

Class  Gender
A      Female    78.0
       Male      87.5
B      Female    87.5
       Male      92.0
Name: Math_Score, dtype: float64


In [3]:
# Average every remaining column per (Class, Gender) group

# Re-create the grouping from the current `df`
grouped_data = df.groupby(['Class', 'Gender'])

# With no column selected, the mean is computed for each non-key column,
# giving a DataFrame with one column per score
aggregated_data = grouped_data.agg('mean')

print(aggregated_data)

              Math_Score  English_Score
Class Gender                           
A     Female        78.0           80.0
      Male          87.5           90.0
B     Female        87.5           90.0
      Male          92.0           94.0


In [4]:
# Multiple aggregations on a single column

data = {
    'Class': ['A', 'B', 'A', 'B', 'A', 'B'],
    'Gender': ['Male', 'Male', 'Female', 'Female', 'Male', 'Female'],
    'Math_Score': [85, 92, 78, 89, 90, 86],
    'English_Score': [88, 94, 80, 92, 92, 88],
    'Physics_Score': [78, 90, 85, 92, 88, 84],
}
df = pd.DataFrame(data)

grouped_data = df.groupby(['Class', 'Gender'])

# A list of function names yields one output column per function
math_by_group = grouped_data['Math_Score']
agg_results = math_by_group.agg(['mean', 'min', 'max'])

print(agg_results)

              mean  min  max
Class Gender                
A     Female  78.0   78   78
      Male    87.5   85   90
B     Female  87.5   86   89
      Male    92.0   92   92


In [9]:
# Several aggregations for several columns: the dict maps each column name
# to the list of statistics to compute for it, and the result carries
# two-level column labels (column name, statistic)
aggregated_data = grouped_data.agg({
    score_column: ['mean', 'min', 'max']
    for score_column in ('Math_Score', 'Physics_Score')
})

print(aggregated_data)

             Math_Score         Physics_Score        
                   mean min max          mean min max
Class Gender                                         
A     Female       78.0  78  78          85.0  85  85
      Male         87.5  85  90          83.0  78  88
B     Female       87.5  86  89          88.0  84  92
      Male         92.0  92  92          90.0  90  90


In [16]:
# Sample DataFrame: sales and profit per (Category, Region) record
data = {
    'Category': ['Electronics', 'Clothing', 'Electronics', 'Clothing'],
    'Region': ['North', 'South', 'North', 'South'],
    'Sales': [1000, 500, 800, 750],
    'Profit': [150, 50, 120, 100],
}
df = pd.DataFrame(data)

# Pivot table: one row per Category, one column per Region, each cell the
# summed Sales; combinations that never occur in the data come out as NaN
pivot_table = df.pivot_table(
    index='Category',
    columns='Region',
    values='Sales',
    aggfunc='sum',
)

# Print a heading followed by the table itself
print("Pivot Table:")
print(pivot_table)

Pivot Table:
Region        North   South
Category                   
Clothing        NaN  1250.0
Electronics  1800.0     NaN


In [12]:
# Cross-tabulation: how many rows fall into each (Category, Region) pair;
# pairs that never occur are reported as 0
cross_tab = pd.crosstab(index=df['Category'], columns=df['Region'])

print("\nCross-Tabulation:")
print(cross_tab)


Cross-Tabulation:
Region       North  South
Category                 
Clothing         0      2
Electronics      2      0


In [13]:
# DateTime conversion

# Sample DataFrame whose 'DateTime' column starts out as plain strings
data = {'DateTime': ['2023-01-01 08:30:00', '2023-02-01 14:45:00', '2023-03-01 20:15:00']}
df = pd.DataFrame(data)

# Parse the strings into datetime values so the .dt accessor can be used
df['DateTime'] = pd.to_datetime(df['DateTime'])

# Add one column per component of the timestamp: year, month, day, hour
timestamps = df['DateTime']
df = df.assign(
    Year=timestamps.dt.year,
    Month=timestamps.dt.month,
    Day=timestamps.dt.day,
    Hour=timestamps.dt.hour,
)

print(df)

             DateTime  Year  Month  Day  Hour
0 2023-01-01 08:30:00  2023      1    1     8
1 2023-02-01 14:45:00  2023      2    1    14
2 2023-03-01 20:15:00  2023      3    1    20


In [17]:
# Sample DataFrame with daily sales data: 40 consecutive days starting
# 2023-01-01, with sales of 0, 1, ..., 39
data = {'Date': pd.date_range(start='2023-01-01', periods=40, freq='D'),
        'Sales': list(range(40))}
df = pd.DataFrame(data)

# Resample to month-end frequency and sum the sales within each month.
# A MonthEnd offset object is passed instead of a string alias: the 'M'
# alias is deprecated in recent pandas releases in favour of 'ME', while
# older releases do not know 'ME'. The offset object is accepted by both
# and labels each bin with the last day of its month.
monthly_sales = df.resample(pd.offsets.MonthEnd(), on='Date').sum()

print(monthly_sales)

            Sales
Date             
2023-01-31    465
2023-02-28    315


In [18]:
# Shifting

# Sample DataFrame with five consecutive daily stock prices
data = {
    'Date': pd.date_range(start='2023-01-01', periods=5, freq='D'),
    'Price': [100, 105, 110, 108, 112],
}
df = pd.DataFrame(data)

# Day-over-day price change: each price minus the price one row earlier.
# The first row has no predecessor, so its change is NaN.
df['Price_Change'] = df['Price'].diff()
df

Unnamed: 0,Date,Price,Price_Change
0,2023-01-01,100,
1,2023-01-02,105,5.0
2,2023-01-03,110,5.0
3,2023-01-04,108,-2.0
4,2023-01-05,112,4.0


In [20]:
# Ordinal sorting

# Sample DataFrame whose 'Size' column holds labels with a natural order
data = {'Product': ['Product A', 'Product B', 'Product C', 'Product D'],
        'Size': ['Medium', 'Small', 'Large', 'Medium']}
df = pd.DataFrame(data)

# The intended order of the size labels, smallest first
ordinal_order = ['Small', 'Medium', 'Large']

# Plain string column: sort_values compares the labels alphabetically,
# so 'Large' comes before 'Medium' and 'Small'
print('Normal Sorting:')
print(df.sort_values(by='Size'))

# Ordered categorical column: sort_values follows the position of each
# label in ordinal_order instead of its spelling
df['Size'] = pd.Categorical(df['Size'], categories=ordinal_order, ordered=True)
print('Ordinal Sorting:')
print(df.sort_values(by='Size'))

Noraml Sorting:
     Product    Size
2  Product C   Large
0  Product A  Medium
3  Product D  Medium
1  Product B   Small
Ordinal Sorting:
     Product    Size
1  Product B   Small
0  Product A  Medium
3  Product D  Medium
2  Product C   Large


In [23]:
# Eval

# Example DataFrame with two numeric columns
data = {
    'A': [1, 2, 3, 4],
    'B': [10, 20, 30, 40],
}
df = pd.DataFrame(data)

# Each eval expression of the form "NAME = <expr>" adds a column NAME
# computed from the existing columns; the calls are chained so that
# C is added first, then D
df = (
    df
    .eval('C = A + B')
    .eval('D = (A * 2) + (B / 3)')
)

print(df.C)
print(df.D)

0    11
1    22
2    33
3    44
Name: C, dtype: int64
0     5.333333
1    10.666667
2    16.000000
3    21.333333
Name: D, dtype: float64


In [25]:
# Query

data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],
        'Age': [25, 30, 35, 40],
        'Salary': [50000, 60000, 70000, 80000]}
df = pd.DataFrame(data)

# Single condition: rows whose Age is above 30. Kept under its own name so
# the result is not overwritten by the combined query below.
age_filtered_df = df.query('Age > 30')

# Combined conditions: rows whose Age is above 30 and whose Salary is
# above 60000
filtered_df = df.query('(Age > 30) and (Salary > 60000)')

print(filtered_df)

      Name  Age  Salary
2  Charlie   35   70000
3    David   40   80000


In [27]:
# Sample DataFrame with a two-level hierarchy: Department, then Employee
data = {
    'Department': ['HR', 'HR', 'Engineering', 'Engineering'],
    'Employee': ['Alice', 'Bob', 'Charlie', 'David'],
    'Salary': [60000, 65000, 80000, 75000],
}
df = pd.DataFrame(data)

# Move both label columns into the index so each row is addressed by a
# (Department, Employee) pair
index_levels = ['Department', 'Employee']
hierarchical_df = df.set_index(index_levels)

# A full (Department, Employee) tuple selects a single row
bob_row = hierarchical_df.loc[('HR', 'Bob')]
print(bob_row)

Salary    65000
Name: (HR, Bob), dtype: int64
