# Prepare a ready-to-go Titanic dataset

Building on earlier insights, process the Titanic dataset to make it ready for ML.

#### Load & definitions

In [1]:
import pandas as pd
import numpy as np

import sklearn.preprocessing as sk_pp
import sklearn.model_selection as sk_ms

import os

import collections as coll

import pickle as pkl

In [2]:
# The raw semicolon-separated file is expected in the current working directory.
RAW_CSV_PATH = os.path.join(os.getcwd(), 'titanic_dataset.csv')
RAW_DF = pd.read_csv(RAW_CSV_PATH, sep=';')

# List every available column, one tab-indented name per line.
print('Columns available:', *(f'\t{column}' for column in RAW_DF.columns), sep='\n')

Columns available:
	PassengerId
	Survived
	Pclass
	Name
	Sex
	Age
	SibSp
	Parch
	Ticket
	Fare
	Cabin
	Embarked


## Build processed dataframe

In [3]:
# Start from an empty frame; the cells below add one processed column at a time.
PROC_DF = pd.DataFrame(data=None)

### Label to binary

In [4]:
LABEL_NAME = 'Survived'

# The raw label is textual: 'Yes' becomes 1, every other value becomes 0.
survived_mask = RAW_DF[LABEL_NAME].eq('Yes')
PROC_DF[LABEL_NAME] = survived_mask.to_numpy().astype(int)

### Pclass

No missing values and nothing special, so the column could go in as is, but it makes sense to treat the classes as categorical and one-hot encode them.

In [5]:
PCLASS_NAME = 'Pclass'

# One-hot encode the passenger class into a dense matrix.
# The keyword requesting dense output was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old spelling was removed in 1.4,
# so try the current name first and fall back for older installations.
try:
    pclass_enc = sk_pp.OneHotEncoder(sparse_output=False)
except TypeError:
    pclass_enc = sk_pp.OneHotEncoder(sparse=False)
pclass_mat = pclass_enc.fit_transform(RAW_DF[PCLASS_NAME].values[:, None])

# Add one indicator column per class found by the encoder (e.g. 'Pclass_1').
for i_cat, pclass_cat in enumerate(pclass_enc.categories_[0]):
    PROC_DF[f'{PCLASS_NAME}_{pclass_cat}'] = pclass_mat[:, i_cat]

### Sex

Male/female, no missing values. Female passengers have a much better chance of surviving.

In [6]:
# Binary flag: 1 where the raw 'Sex' value is exactly 'female', 0 otherwise.
PROC_DF['is_female'] = RAW_DF['Sex'].eq('female').to_numpy().astype(int)

### Age

A significant number of passengers are missing an age, and their survival odds are low. Set the missing ages to -1, so that the classifier has a good chance of picking up this trend.

In [7]:
# Passengers without a recorded age get the sentinel -1; everyone else keeps
# the recorded value.
il_age_present = RAW_DF.Age.notnull()
full_age = np.where(il_age_present, RAW_DF.Age, -1.0)

PROC_DF['age'] = full_age

### SibSp

No missing values. If SibSp is 1, survival odds are about 1; if it is lower or higher, survival odds are lower. Also, there are few passengers with more than 1 sibling/spouse, so to avoid over-fitting all values above 1 are aggregated into a single value (2).

In [8]:
# Keep 0 and 1 as they are; fold every larger sibling/spouse count into 2.
sibsp_raw = RAW_DF.SibSp
PROC_DF['sibsp_simp'] = np.where(sibsp_raw <= 1, sibsp_raw, 2)

### Parch
No missing values for the number of parents and children. The values 0, 1 and 2 are left as they are; higher values are aggregated into a single value (3) to avoid over-fitting.

In [9]:
# Keep 0, 1 and 2 as they are; fold every larger parent/child count into 3.
parch_raw = RAW_DF.Parch
PROC_DF['parch_simp'] = np.where(parch_raw <= 2, parch_raw, 3)

### Fare

In [10]:
# The fare is copied over unchanged.
PROC_DF['fare'] = RAW_DF.loc[:, 'Fare']

### Cabin
Two things can be extracted from the cabin: the deck (including a MISSING deck for passengers without a cabin entry, which gave significantly lower survival odds) and the number of passengers sharing a cabin with the given passenger. In some cases multiple cabins appear in the same entry, e.g. `C22 C26`. In all but one case, these are on the same deck.

In [11]:
CABIN_NAME = 'Cabin'

# The deck is the first character of the cabin string; passengers whose cabin
# entry is not a non-empty string are put on the pseudo-deck 'MISSING'.
deck_list = [
    cab[0] if (isinstance(cab, str) and len(cab) > 0) else 'MISSING'
    for cab in RAW_DF[CABIN_NAME]
]

# Dense one-hot encoding of the deck. The keyword requesting dense output was
# renamed from `sparse` to `sparse_output` in scikit-learn 1.2 and the old
# spelling was removed in 1.4, so try the current name first and fall back.
try:
    deck_enc = sk_pp.OneHotEncoder(sparse_output=False)
except TypeError:
    deck_enc = sk_pp.OneHotEncoder(sparse=False)
deck_mat = deck_enc.fit_transform(np.array(deck_list)[:, None])

# Add one indicator column per deck found by the encoder (e.g. 'deck_A').
for i_deck, deck_cat in enumerate(deck_enc.categories_[0]):
    PROC_DF[f'deck_{deck_cat}'] = deck_mat[:, i_deck]

# Despite its name, 'ticket_counter' counts cabin entries: for each passenger
# it holds how many passengers have exactly the same raw cabin string.
# Missing cabins are replaced by an explicit sentinel before counting, so that
# they are grouped together by value instead of relying on every NaN being
# the same Python object.
cabin_keys = RAW_DF[CABIN_NAME].fillna('MISSING')
ticket_counter = coll.Counter(cabin_keys)
PROC_DF['ticket_counter'] = [ticket_counter[cabin] for cabin in cabin_keys]

In [12]:
# Display the processed dataframe built so far.
PROC_DF

Unnamed: 0,Survived,Pclass_1,Pclass_2,Pclass_3,is_female,age,sibsp_simp,parch_simp,fare,deck_A,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,deck_MISSING,deck_T,ticket_counter
0,0,0.0,1.0,0.0,0,28.0,0,0,13.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,687
1,0,0.0,0.0,1.0,0,25.0,0,0,7.6500,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2
2,0,0.0,0.0,1.0,0,20.0,0,0,7.8542,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,687
3,0,0.0,0.0,1.0,1,29.0,0,3,21.0750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,687
4,0,1.0,0.0,0.0,0,31.0,1,0,52.0000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,0.0,0.0,1.0,0,-1.0,0,0,7.7375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,687
887,0,1.0,0.0,0.0,0,50.0,1,0,55.9000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2
888,1,1.0,0.0,0.0,1,24.0,2,2,263.0000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4
889,1,1.0,0.0,0.0,0,32.0,0,0,30.5000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


### Embarked

Passengers could have embarked from C = Cherbourg, Q = Queenstown or S = Southampton; two values are missing.

In [13]:
EMBARKED_NAME = 'Embarked'

# Get the list of embarkation ports, with missing entries replaced by the
# explicit category 'MISSING'. pandas stores missing values as NaN rather
# than None, so an `is not None` test never fires and the missing ports would
# end up as the string category 'nan'.
# NOTE(review): this renames the resulting column from 'embarked_nan' to
# 'embarked_MISSING' - update any downstream code that used the old name.
emb_list = RAW_DF[EMBARKED_NAME].fillna('MISSING').tolist()

# Dense one-hot encoding of the port. The keyword requesting dense output was
# renamed from `sparse` to `sparse_output` in scikit-learn 1.2 and the old
# spelling was removed in 1.4, so try the current name first and fall back.
try:
    emb_enc = sk_pp.OneHotEncoder(sparse_output=False)
except TypeError:
    emb_enc = sk_pp.OneHotEncoder(sparse=False)
emb_mat = emb_enc.fit_transform(np.array(emb_list)[:, None])

# Add one indicator column per port found by the encoder (e.g. 'embarked_C').
for i_emb, emb_cat in enumerate(emb_enc.categories_[0]):
    PROC_DF[f'embarked_{emb_cat}'] = emb_mat[:, i_emb]

### Name -> Title -> is_miss

The presence of 'Miss.' in the name allows separating married and unmarried women. It is encoded as a binary feature.

In [14]:
IS_MISS_NAME = 'is_miss'

# Flag names containing the literal title 'Miss.'.
# regex=False makes the trailing dot a literal character; by default the
# pattern is a regular expression in which '.' matches any character.
# na=False maps a missing name to 0 instead of a NaN that cannot be cast to int.
PROC_DF[IS_MISS_NAME] = np.array(
    RAW_DF.Name.str.contains('Miss.', regex=False, na=False),
    dtype=int,
)

## Ready to store

In [15]:
# Separate the label column from the features; PROC_DF itself is left untouched.
LABEL_SRS = PROC_DF['Survived'].copy()
FEATURES_DF = PROC_DF.drop(columns='Survived')

In [16]:
# Display the label series that was split off from the features.
LABEL_SRS

0      0
1      0
2      0
3      0
4      0
      ..
886    0
887    0
888    1
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [17]:
# Stratified train/test split with a fixed seed, so the class balance is kept
# in both parts and the split is reproducible.
split_parts = sk_ms.train_test_split(
    FEATURES_DF,
    LABEL_SRS,
    stratify=LABEL_SRS,
    random_state=42,
)
TRAIN_DF, TEST_DF, TRAIN_LABEL, TEST_LABEL = split_parts

# Bundle the four parts under their variable names and pickle them into the
# current working directory.
split_payload = {
    'TRAIN_DF': TRAIN_DF,
    'TEST_DF': TEST_DF,
    'TRAIN_LABEL': TRAIN_LABEL,
    'TEST_LABEL': TEST_LABEL,
}
with open('train_test_data.pkl', 'wb') as fh:
    pkl.dump(split_payload, fh)