In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

# Read the train/test splits and lowercase every column header in the same
# expression, so each frame is bound once with its final column names.
train = pd.read_csv('data_titanic/titanic_train_master.csv').rename(columns=str.lower)
test = pd.read_csv('data_titanic/titanic_test_master.csv').rename(columns=str.lower)

In [2]:
# columns with missing values: info() prints each column's non-null count and dtype,
# so any column whose count is below the total number of entries has missing values
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  891 non-null    int64  
 1   survived     891 non-null    int64  
 2   pclass       891 non-null    int64  
 3   name         891 non-null    object 
 4   sex          891 non-null    object 
 5   age          714 non-null    float64
 6   sibsp        891 non-null    int64  
 7   parch        891 non-null    int64  
 8   ticket       891 non-null    object 
 9   fare         891 non-null    float64
 10  cabin        204 non-null    object 
 11  embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [3]:

# Bind the columns used below to same-named module-level Series.
# NOTE(review): no alias is created for 'cabin' — confirm that is intentional.
(passengerid, survived, pclass, name, sex, age,
 sibsp, parch, ticket, fare, embarked) = (
    train[column]
    for column in (
        'passengerid', 'survived', 'pclass', 'name', 'sex', 'age',
        'sibsp', 'parch', 'ticket', 'fare', 'embarked',
    )
)

In [4]:
# create dummies
def create_dummies(df, column_names):
    """Return a copy of ``df`` with dummy/indicator columns appended.

    For every requested column, ``pd.get_dummies`` is applied with the column
    name as prefix, and the resulting indicator columns are appended to the
    right of the existing ones. The source columns are kept and the input
    frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode.
    column_names : str or iterable of str
        Column(s) to encode. A single name may be given as a plain string.

    Returns
    -------
    pandas.DataFrame
        ``df``'s columns followed by the indicator columns, in the order the
        column names were given.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(column_names, str):
        column_names = [column_names]

    # Build every indicator block first, then concatenate once instead of
    # re-concatenating the growing frame on each loop iteration.
    indicator_blocks = [pd.get_dummies(df[col], prefix=col) for col in column_names]
    return pd.concat([df, *indicator_blocks], axis=1)

In [5]:
# pclass: number of rows per pclass value, read straight from the frame
train['pclass'].value_counts()

3    491
1    216
2    184
Name: pclass, dtype: int64

In [5]:
# sex: number of rows per sex value, read straight from the frame
train['sex'].value_counts()

male      577
female    314
Name: sex, dtype: int64

In [27]:
# age: how many rows have a missing age (True) versus a recorded one (False)
train['age'].isna().value_counts()

False    714
True     177
Name: age, dtype: int64

In [22]:
# age has nulls. Select the name/age columns of those rows in a single .loc
# lookup and write them to null_ages.csv (without the index) for viewing.
train.loc[train['age'].isna(), ['name', 'age']].to_csv('null_ages.csv', index=False)

In [36]:
# let's work on name and then come back to age. extract titles from name
# The pattern captures the first run of letters that follows a space and is
# immediately followed by a period. It is written as a raw string so `\.`
# reaches the regex engine as an escaped dot without Python's
# invalid-escape-sequence warning.
extracted_titles = name.str.extract(r' ([A-Za-z]+)\.', expand=False)
extracted_titles.value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Col           2
Mlle          2
Major         2
Don           1
Jonkheer      1
Mme           1
Capt          1
Sir           1
Lady          1
Ms            1
Countess      1
Name: name, dtype: int64

In [38]:
# map the titles
# Raw title -> coarse lowercase group. The mapping has its own name (title_map)
# so it is not overwritten when `titles` is bound to the resulting column below.
title_map = {
    "Mr" :         "mr",
    "Mme":         "mrs",
    "Ms":          "mrs",
    "Mrs" :        "mrs",
    "Master" :     "master",
    "Mlle":        "miss",
    "Miss" :       "miss",
    "Capt":        "officer",
    "Col":         "officer",
    "Major":       "officer",
    "Dr":          "officer",
    "Rev":         "officer",
    "Jonkheer":    "royalty",
    "Don":         "royalty",
    "Sir" :        "royalty",
    "Countess":    "royalty",
    "Dona":        "royalty",
    "Lady" :       "royalty"
}
# Series.map() yields NaN for any extracted title that is not a key of title_map.
train['title'] = extracted_titles.map(title_map)
titles = train['title']

In [None]:
# TODO: if title_master is true and age is null, set age to 1; then group the ages.



In [5]:
# sibsp: number of rows per distinct sibsp value (value_counts() lists the most frequent first)
train['sibsp'].value_counts()

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: sibsp, dtype: int64

In [6]:
# parch: number of rows per distinct parch value (value_counts() lists the most frequent first)
train['parch'].value_counts()

0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: parch, dtype: int64

In [7]:
# embarked. It has empty values: train.info() reports 889 non-null out of 891 entries.
# value_counts() leaves NaN out by default, so the missing rows are not listed below.
# TODO: fill the missing values with 'U' for unknown (this cell only inspects the counts).
train['embarked'].value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64