In [2]:
import pandas as pd

In [3]:
# Sample DataFrame
df = pd.DataFrame({
    'department': ['IT', 'HR', 'IT', 'Finance', 'HR', 'IT'],
    'salary': [60000, 45000, 65000, 70000, 50000, 62000],
    'years': [3, 2, 5, 4, 3, 2]
})

# 1. Basic grouping and aggregation
dept_stats = df.groupby('department')['salary'].mean()

dept_stats

department
Finance    70000.000000
HR         47500.000000
IT         62333.333333
Name: salary, dtype: float64

In [4]:
# 2. Multiple aggregations
dept_summary = df.groupby('department').agg({
    'salary': ['mean', 'std', 'count'],
    'years': ['mean', 'min', 'max']
})

# 3. Include/exclude NA values
dept_stats_with_na = df.groupby('department', dropna=False)['salary'].mean()
dept_stats_with_na

department
Finance    70000.000000
HR         47500.000000
IT         62333.333333
Name: salary, dtype: float64

In [5]:
# 1. Custom aggregation functions
def salary_range(x):
    return x.max() - x.min()

dept_custom = df.groupby('department').agg({
    'salary': [salary_range, 'mean'],
    'years': 'sum'
})
dept_custom

Unnamed: 0_level_0,salary,salary,years
Unnamed: 0_level_1,salary_range,mean,sum
department,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Finance,0,70000.0,4
HR,5000,47500.0,5
IT,5000,62333.333333,10


In [6]:
# 2. Multiple grouping variables
multi_group = df.groupby(['department', 'years']).agg({
    'salary': ['mean', 'count']
})
multi_group

Unnamed: 0_level_0,Unnamed: 1_level_0,salary,salary
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count
department,years,Unnamed: 2_level_2,Unnamed: 3_level_2
Finance,4,70000.0,1
HR,2,45000.0,1
HR,3,50000.0,1
IT,2,62000.0,1
IT,3,60000.0,1
IT,5,65000.0,1


In [7]:
# 3. Transformation (applies function to each group)
df['salary_rank'] = df.groupby('department')['salary'].rank()
df['dept_avg_salary'] = df.groupby('department')['salary'].transform('mean')
df

Unnamed: 0,department,salary,years,salary_rank,dept_avg_salary
0,IT,60000,3,1.0,62333.333333
1,HR,45000,2,1.0,47500.0
2,IT,65000,5,3.0,62333.333333
3,Finance,70000,4,1.0,70000.0
4,HR,50000,3,2.0,47500.0
5,IT,62000,2,2.0,62333.333333


# Titanic Dataset Exploration

In [8]:
import pandas as pd
import numpy as np
import seaborn as sns

In [9]:
titanic = sns.load_dataset('titanic')

In [22]:
# 1. Initial exploration
print(titanic.info())
print(">>>>>>>>>> Basic Statistics:")
print(titanic.describe())

print(">>>>>>>>>>>> Dataset Shape:", titanic.shape)
print("Data Types and Missing Values:")
print(titanic.info())

missing_stats = (titanic.isna().sum() / len(titanic) * 100).round(2)
print(">>>>>>>>>>>> Missing Valuesn (%):")
print(missing_stats[missing_stats > 0])


titanic.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB
None
>>>>>>>>>> Basic Statistics:
         survived      pclass         age    

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [23]:
categorical_cols = ['pclass', 'sex', 'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town', 'alive']
for col in categorical_cols:
    print(f"\nDistribution of{col}:")
    print(titanic[col].value_counts(normalize=True).round(3) * 100, "% ")


Distribution ofpclass:
pclass
3    55.1
1    24.2
2    20.7
Name: proportion, dtype: float64 % 

Distribution ofsex:
sex
male      64.8
female    35.2
Name: proportion, dtype: float64 % 

Distribution ofembarked:
embarked
S    72.4
C    18.9
Q     8.7
Name: proportion, dtype: float64 % 

Distribution ofclass:
class
Third     55.1
First     24.2
Second    20.7
Name: proportion, dtype: float64 % 

Distribution ofwho:
who
man      60.3
woman    30.4
child     9.3
Name: proportion, dtype: float64 % 

Distribution ofadult_male:
adult_male
True     60.3
False    39.7
Name: proportion, dtype: float64 % 

Distribution ofdeck:
deck
C    29.1
B    23.2
D    16.3
E    15.8
A     7.4
F     6.4
G     2.0
Name: proportion, dtype: float64 % 

Distribution ofembark_town:
embark_town
Southampton    72.4
Cherbourg      18.9
Queenstown      8.7
Name: proportion, dtype: float64 % 

Distribution ofalive:
alive
no     61.6
yes    38.4
Name: proportion, dtype: float64 % 


In [1]:
df = pd.DataFrame({
    'A': [1, 2, 3, 4],
    'B': [10, 20, 30, 40],
    'C': [100, 200, 300, 400],
    'D': [1000, 2000, 3000, 4000]
})

df.loc[df.A > 2, :]

NameError: name 'pd' is not defined