<h1 style="margin-bottom: 25px;font-size:3.5rem;color:#4c76ce;text-align:center;">
    Getting Started with Pandas</h1>

<h2 style="margin-bottom: 25px;font-size:2.5rem;text-align:center;">
    Part III - Pandas: Data Aggregation</h2>
    
<img src="https://raw.githubusercontent.com/lajmcourses/Images/master/pandas.png"
     style="position:absolute;top:5px;left:25px;height:150px;width:auto;margin-bottom:25px;">

In [29]:
import numpy as np
import pandas as pd

# Render floating-point values in pandas objects with two decimal places.
# NOTE(review): float_format probably makes the precision option redundant
# for floats -- both are kept so display behavior stays exactly as before.
pd.set_option("display.precision", 2)
pd.set_option("display.float_format", "{:.2f}".format)

In [30]:
# Load the Titanic passenger data and preview its first five rows

df_data = pd.read_csv(filepath_or_buffer="data/titanic.csv")
df_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
# `count` only includes non-missing values, which is why Age shows 714 of 891 rows.
df_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.38,2.31,29.7,0.52,0.38,32.2
std,257.35,0.49,0.84,14.53,1.1,0.81,49.69
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.12,0.0,0.0,7.91
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.45
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.33


In [20]:
# Summary of the object (text) columns: non-missing count, number of unique
# values, most frequent value (`top`) and how often it occurs (`freq`).
df_data.describe(include="object")

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


In [21]:
# Data type of every column (int64 / float64 for numbers, object for text)
df_data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

## 1. Data Grouping

In [31]:
# Split the passengers into one group per passenger class
group_by_class = df_data.groupby(by="Pclass")

# Iterating over a GroupBy yields (group key, sub-DataFrame) pairs
for cls_name, cls_df in group_by_class:
    # Label the group, then preview its first five rows
    print(f"Group: {cls_name}")
    display(cls_df.head(n=5))




Group: 1


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.86,E46,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S
23,24,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5,A6,S


Group: 2


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.07,,C
15,16,1,2,"Hewlett, Mrs. (Mary D Kingcome)",female,55.0,0,0,248706,16.0,,S
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0,,S
20,21,0,2,"Fynney, Mr. Joseph J",male,35.0,0,0,239865,26.0,,S
21,22,1,2,"Beesley, Mr. Lawrence",male,34.0,0,0,248698,13.0,D56,S


Group: 3


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.46,,Q
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.07,,S


## 2. Group Aggregation Functions

In [32]:
# Group Mean: average survival rate, age and fare within each class
group_by_class[["Survived", "Age", "Fare"]].agg("mean")

Unnamed: 0_level_0,Survived,Age,Fare
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.63,38.23,84.15
2,0.47,29.88,20.66
3,0.24,25.14,13.68


In [33]:
# Group Count: number of passengers in each class
# (counts the non-missing PassengerId values per group)
display(group_by_class[["PassengerId"]].agg("count"))


Unnamed: 0_level_0,PassengerId
Pclass,Unnamed: 1_level_1
1,216
2,184
3,491


In [34]:
# Group Sum: survivors per class (Survived is 0/1, so its sum counts survivors)
display(group_by_class[["Survived"]].agg("sum"))

# Passing a list applies several aggregation functions at once;
# the result gets one sub-column per function.
group_by_class[["Survived"]].aggregate(["sum", "count"])

Unnamed: 0_level_0,Survived
Pclass,Unnamed: 1_level_1
1,136
2,87
3,119


Unnamed: 0_level_0,Survived,Survived
Unnamed: 0_level_1,sum,count
Pclass,Unnamed: 1_level_2,Unnamed: 2_level_2
1,136,216
2,87,184
3,119,491


In [35]:
# Multiple Aggregation Functions on a Single Column
# (mean, standard deviation and median of Age, per class)
group_by_class[["Age"]].aggregate(
    ["mean", "std", "median"]
)

Unnamed: 0_level_0,Age,Age,Age
Unnamed: 0_level_1,mean,std,median
Pclass,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,38.23,14.8,37.0
2,29.88,14.0,29.0
3,25.14,12.5,24.0


In [36]:
# Multiple Aggregation Functions on Multiple Columns (named aggregation):
# each keyword is the name of an output column and maps to a pair
# (input column, aggregation function).
# The built-in "mean" is used instead of `lambda x: x.mean()`: it yields the
# same values without calling a Python function once per group.
group_by_class.agg(
    mean_age=("Age", "mean"),
    std_age=("Age", "std"),
    mean_fare=("Fare", "mean"),
    std_fare=("Fare", "std"),
)

Unnamed: 0_level_0,mean_age,std_age,mean_fare,std_fare
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,38.23,14.8,84.15,78.38
2,29.88,14.0,20.66,13.42
3,25.14,12.5,13.68,11.78


## 3. Grouping by Multiple Variables

In [37]:
# Group by two keys: passenger class first, then sex.
# The result is indexed by a (Pclass, Sex) MultiIndex.
group_by_class_by_sex = df_data.groupby(["Pclass", "Sex"])

# Backward-compatible alias: the original name said "by_age" although the
# second grouping key is Sex. Kept so any cell using the old name still runs.
group_by_class_by_age = group_by_class_by_sex

group_by_class_by_sex[["Age", "Fare"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,Fare
Pclass,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1
1,female,34.61,106.13
1,male,41.28,67.23
2,female,28.72,21.97
2,male,30.74,19.74
3,female,21.75,16.12
3,male,26.51,12.66


## 4. User-Defined Aggregation Functions

In [38]:
# Example 1: apply a custom function to each group's sub-DataFrame.
# Here the function returns the sub-frame's shape as a labelled Series,
# so the result has one row per class with columns "nrow" and "ncol".
# NOTE(review): ncol includes the grouping column (12 in the output shown);
# newer pandas versions may warn about or change this -- confirm against the
# installed version.

group_by_class.apply(
    lambda cls_df: pd.Series(cls_df.shape, index=["nrow", "ncol"])
)

Unnamed: 0_level_0,nrow,ncol
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,216,12
2,184,12
3,491,12
