# Pandas Exercise

In [161]:
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(0)
import pandas as pd

In [162]:
def df_info(df: pd.DataFrame) -> None:
    return df.head(n=20).style

In [163]:
cols = [
    'Survived', 'Pclass', 'Sex', 'Age', 'SibSp',
    'Parch', 'Fare', 'Cabin', 'Embarked'
]

df = pd.read_csv("../data/titanic/dataset.csv")
print(df.columns)

df = pd.DataFrame(df[cols], index=df["PassengerId"])
df.index.name = 'ID'

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [164]:
df_info(df)

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1.0,1.0,female,38.0,1.0,0.0,71.2833,C85,C
2,1.0,3.0,female,26.0,0.0,0.0,7.925,,S
3,1.0,1.0,female,35.0,1.0,0.0,53.1,C123,S
4,0.0,3.0,male,35.0,0.0,0.0,8.05,,S
5,0.0,3.0,male,,0.0,0.0,8.4583,,Q
6,0.0,1.0,male,54.0,0.0,0.0,51.8625,E46,S
7,0.0,3.0,male,2.0,3.0,1.0,21.075,,S
8,1.0,3.0,female,27.0,0.0,2.0,11.1333,,S
9,1.0,2.0,female,14.0,1.0,0.0,30.0708,,C
10,1.0,3.0,female,4.0,1.0,1.0,16.7,G6,S


# Exercise 1:

- Replace nan values at age, pclass and cabin column by -1 for numeric and "None" for str values
- Change the dtype to int8 for age and pclass

In [165]:
df["Age"].fillna(value=-1, inplace=True)
df["Pclass"].fillna(value=-1, inplace=True)
df["Cabin"].fillna(value="None", inplace=True)

df[["Age", "Pclass"]] = df[["Age", "Pclass"]].astype(np.int8)

In [166]:
df_info(df)

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1.0,1,female,38,1.0,0.0,71.2833,C85,C
2,1.0,3,female,26,0.0,0.0,7.925,,S
3,1.0,1,female,35,1.0,0.0,53.1,C123,S
4,0.0,3,male,35,0.0,0.0,8.05,,S
5,0.0,3,male,-1,0.0,0.0,8.4583,,Q
6,0.0,1,male,54,0.0,0.0,51.8625,E46,S
7,0.0,3,male,2,3.0,1.0,21.075,,S
8,1.0,3,female,27,0.0,2.0,11.1333,,S
9,1.0,2,female,14,1.0,0.0,30.0708,,C
10,1.0,3,female,4,1.0,1.0,16.7,G6,S


In [167]:
df_info(df[df.isnull().any(axis=1)])

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
61,1.0,1,female,38,0.0,0.0,80.0,B28,
829,1.0,1,female,62,0.0,0.0,80.0,B28,
891,,-1,,-1,,,,,


# Exercise 2:

- Drop all rows that contains a nan value

In [168]:
df.dropna(axis=0, inplace=True)

df_info(df[df.isnull().any(axis=1)])

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


In [169]:
df_info(df)

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1.0,1,female,38,1.0,0.0,71.2833,C85,C
2,1.0,3,female,26,0.0,0.0,7.925,,S
3,1.0,1,female,35,1.0,0.0,53.1,C123,S
4,0.0,3,male,35,0.0,0.0,8.05,,S
5,0.0,3,male,-1,0.0,0.0,8.4583,,Q
6,0.0,1,male,54,0.0,0.0,51.8625,E46,S
7,0.0,3,male,2,3.0,1.0,21.075,,S
8,1.0,3,female,27,0.0,2.0,11.1333,,S
9,1.0,2,female,14,1.0,0.0,30.0708,,C
10,1.0,3,female,4,1.0,1.0,16.7,G6,S


# Exercise 3

- Compute the min, max, median and mean of the age for the groups:
    - Survived and Male
    - Survived and Female
    - Not survived and Male
    - Not survived and Female

In [170]:
grouped = df.groupby(["Survived","Sex"])

In [171]:
grouped.Age.agg(["min", "max", "median", "mean"])

Unnamed: 0_level_0,Unnamed: 1_level_0,min,max,median,mean
Survived,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,female,-1,57,20,19.567901
0.0,male,-1,74,24,24.079229
1.0,female,-1,63,24,24.004329
1.0,male,-1,80,26,23.091743


# Exercise 4

- Compute the most likely age to survive (except the -1 fillvalue)

In [172]:
survived = (df["Survived"] == 1)
has_age = (df["Age"] > 0)
print(np.count_nonzero(survived))
print(np.count_nonzero(has_age))

idxs = (survived & has_age)
print(np.count_nonzero(idxs))

df_sliced = df[idxs]
print(df_sliced.head(n=10))

survived_age_counts = df_sliced.Age.value_counts()

print(F"Result: {survived_age_counts.iloc[0]}")

340
704
281
    Survived  Pclass     Sex  Age  SibSp  Parch     Fare Cabin Embarked
ID                                                                     
1        1.0       1  female   38    1.0    0.0  71.2833   C85        C
2        1.0       3  female   26    0.0    0.0   7.9250  None        S
3        1.0       1  female   35    1.0    0.0  53.1000  C123        S
8        1.0       3  female   27    0.0    2.0  11.1333  None        S
9        1.0       2  female   14    1.0    0.0  30.0708  None        C
10       1.0       3  female    4    1.0    1.0  16.7000    G6        S
11       1.0       1  female   58    0.0    0.0  26.5500  C103        S
15       1.0       2  female   55    0.0    0.0  16.0000  None        S
21       1.0       2    male   34    0.0    0.0  13.0000   D56        S
22       1.0       3  female   15    0.0    0.0   8.0292  None        Q
Result: 15


In [174]:
df_sliced = df[(df["Survived"] == 1) & (df["Age"] > 0)]

survived_age_counts = df_sliced.Age.value_counts()

print(F"Result: {survived_age_counts.iloc[0]}")

Result: 15
