In [1]:
import warnings
# NOTE(review): this silences every warning for the whole session, including
# pandas deprecation and chained-assignment warnings that later cells may
# trigger; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
import os

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, KBinsDiscretizer
from collections import Counter

# Seed NumPy's global random number generator so that any NumPy-based
# randomness in this session is repeatable.
SEED = 5678
np.random.seed(SEED)

In [3]:
# Folder holding the Titanic CSV files. It defaults to the original local
# folder, but can be redirected by setting the TITANIC_DATA_DIR environment
# variable, so the notebook is not tied to one machine's directory layout.
DATA_DIR = os.environ.get('TITANIC_DATA_DIR', r'C:\Users\Me\Kaggle\Titanic_revisited\data')

# Load the training set and report its (rows, columns) shape.
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
print(train.shape)

(891, 12)


# New Data

In [4]:
# Separate the target (Survived) from the feature columns.
y = train['Survived'].copy()
X = train.drop(columns=['Survived'])
print(y.shape, X.shape)

(891,) (891, 11)


In [5]:
# Deck: first character of the Cabin string; 'M' is used when Cabin is missing.
X['Deck'] = X['Cabin'].map(lambda cabin: 'M' if pd.isnull(cabin) else cabin[0])
X.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,M
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,M
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,C
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,M


In [6]:
# List the distinct Deck values ('M' is the placeholder used for a missing Cabin).
X['Deck'].unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [7]:
# Remove PassengerId and the raw Cabin column (Deck was derived from Cabin earlier).
X = X.drop(columns=["PassengerId", "Cabin"])

In [8]:
# Ticket frequency: for every row, how many rows share the same Ticket value.
ticket_counts = X['Ticket'].groupby(X['Ticket']).transform('count')
X['Ticket_Frequency'] = ticket_counts

# The raw ticket string is no longer needed once the count is stored.
X = X.drop(columns=["Ticket"])

In [9]:
# Preview the frame now that Ticket has been replaced by Ticket_Frequency.
X.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Ticket_Frequency
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,M,1
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,C,1
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,M,1
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,C,2
4,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,M,1


In [10]:
#Extract title before dropping name
# Take the text between the first comma and the following period of each Name
# (e.g. "Braund, Mr. Owen Harris" -> "Mr"). Using .apply keeps X's own index on
# the result, so the assignment lines up row-for-row even when X does not have
# a default 0..n-1 index (assigning a bare pd.Series(list) aligns on labels and
# can silently misplace or drop values in that case).
X["Title"] = X["Name"].apply(lambda full_name: full_name.split(",")[1].split(".")[0].strip())

# Collapse uncommon honorifics into a single 'Rare' bucket.
X["Title"] = X["Title"].replace(['Lady', 'the Countess','Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir',
                                 'Jonkheer', 'Dona'], 'Rare')

# Convert the title groups to integer codes.
title_codes = {"Master":0, "Miss":1, "Ms" : 1 , "Mme":1, "Mlle":1, "Mrs":1, "Mr":2, "Rare":3}
# A title absent from the mapping would map to NaN and make astype(int) raise;
# treat any such title as 'Rare' instead.
X["Title"] = X["Title"].map(title_codes).fillna(title_codes["Rare"]).astype(int)

X.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Ticket_Frequency,Title
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,M,1,2
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,C,1,1
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,M,1,1
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,C,2,1
4,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,M,1,2


In [11]:
# Label-encode the categorical columns: each column is replaced by integer
# codes (this is label encoding, not one-hot dummy columns).
# Missing Embarked values are filled with 'S' before encoding.
X['Embarked'] = X['Embarked'].fillna('S')

dummies = ['Pclass', 'Sex', 'Embarked', 'Title', 'Deck']

# Keep one fitted encoder per column. A single shared encoder is overwritten by
# every fit, leaving no way to decode the integer codes or to apply the same
# mapping to another dataset later.
label_encoders = {}
for dum in dummies:
    le = LabelEncoder()
    X[dum] = le.fit_transform(X[dum])
    label_encoders[dum] = le

In [12]:
# Preview the frame now that the categorical columns hold integer codes.
X.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Ticket_Frequency,Title
0,2,"Braund, Mr. Owen Harris",1,22.0,1,0,7.25,2,7,1,2
1,0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,71.2833,0,2,1,1
2,2,"Heikkinen, Miss. Laina",0,26.0,0,0,7.925,2,7,1,1
3,0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,53.1,2,2,2,1
4,2,"Allen, Mr. William Henry",1,35.0,0,0,8.05,2,7,1,2


In [13]:
# Look up the names of passengers whose encoded Pclass is 0 and whose encoded
# Embarked is 1 (codes are the integers assigned by the label-encoding step).
pclass_is_zero = X['Pclass'] == 0
embarked_is_one = X['Embarked'] == 1
names = X.loc[pclass_is_zero & embarked_is_one, 'Name']
print(names)

245    Minahan, Dr. William Edward
412         Minahan, Miss. Daisy E
Name: Name, dtype: object


In [14]:
# Name has served its purpose (Title was extracted from it), so remove it.
X = X.drop(columns=['Name'])

In [15]:
# Preview the frame after dropping Name.
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Ticket_Frequency,Title
0,2,1,22.0,1,0,7.25,2,7,1,2
1,0,0,38.0,1,0,71.2833,0,2,1,1
2,2,0,26.0,0,0,7.925,2,7,1,1
3,0,0,35.0,1,0,53.1,2,2,2,1
4,2,1,35.0,0,0,8.05,2,7,1,2


# Impute Age (Fare has no missing values in the training data)

In [16]:
# Per column: does it still contain any missing value?
X.isna().any()

Pclass              False
Sex                 False
Age                  True
SibSp               False
Parch               False
Fare                False
Embarked            False
Deck                False
Ticket_Frequency    False
Title               False
dtype: bool

In [17]:
# Fill missing ages with the most frequent age (the mode).
# value_counts() is indexed by age, so idxmax() returns the age *label* that
# has the highest count. argmax() must not be used here: on current pandas it
# returns the integer *position* of the maximum, which is always 0 because
# value_counts() sorts by descending count, so every missing age would be
# silently imputed as 0.
most_common_age = X['Age'].value_counts().idxmax()
X['Age'] = X['Age'].fillna(most_common_age)

In [18]:
# Re-check for missing values after imputing Age.
X.isna().any()

Pclass              False
Sex                 False
Age                 False
SibSp               False
Parch               False
Fare                False
Embarked            False
Deck                False
Ticket_Frequency    False
Title               False
dtype: bool

In [19]:
# Preview the frame after the Age imputation.
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Ticket_Frequency,Title
0,2,1,22.0,1,0,7.25,2,7,1,2
1,0,0,38.0,1,0,71.2833,0,2,1,1
2,2,0,26.0,0,0,7.925,2,7,1,1
3,0,0,35.0,1,0,53.1,2,2,2,1
4,2,1,35.0,0,0,8.05,2,7,1,2


In [20]:
# Count how many rows fall into each Survived value (class balance of the target).
Counter(y)

Counter({0: 549, 1: 342})

# Feature Engineering

In [21]:
# Family total: siblings/spouses plus parents/children travelling with the passenger.
X = X.assign(tot_fam=X['Parch'] + X['SibSp'])
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Ticket_Frequency,Title,tot_fam
0,2,1,22.0,1,0,7.25,2,7,1,2,1
1,0,0,38.0,1,0,71.2833,0,2,1,1,1
2,2,0,26.0,0,0,7.925,2,7,1,1,0
3,0,0,35.0,1,0,53.1,2,2,2,1,1
4,2,1,35.0,0,0,8.05,2,7,1,2,0


In [22]:
# Solo flag: 1 when the passenger has no family aboard (tot_fam == 0), else 0.
travels_alone = X['tot_fam'].eq(0)
X['solo'] = travels_alone.astype('int64')
X.head(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Ticket_Frequency,Title,tot_fam,solo
0,2,1,22.0,1,0,7.25,2,7,1,2,1,0
1,0,0,38.0,1,0,71.2833,0,2,1,1,1,0
2,2,0,26.0,0,0,7.925,2,7,1,1,0,1
3,0,0,35.0,1,0,53.1,2,2,2,1,1,0
4,2,1,35.0,0,0,8.05,2,7,1,2,0,1
5,2,1,24.0,0,0,8.4583,1,7,1,2,0,1
6,0,1,54.0,0,0,51.8625,2,4,1,2,0,1
7,2,1,2.0,3,1,21.075,2,7,4,0,4,0
8,2,0,27.0,0,2,11.1333,2,7,3,1,2,0
9,1,0,14.0,1,0,30.0708,0,7,2,1,1,0


In [23]:
#Child
# Ages strictly below this threshold are flagged as children.
age_var = 10

# Start from NaN so a row whose Age is missing would stay NaN rather than be
# forced to 0 or 1 (both comparisons below are False for NaN).
X["Child"] = float('NaN')

# Assign through .loc on the frame itself. The chained form
# X["Child"][mask] = value writes to an intermediate object: it raises
# SettingWithCopyWarning and, with pandas Copy-on-Write enabled, does not
# modify X at all, leaving the whole column as NaN.
X.loc[X["Age"] < age_var, "Child"] = 1
X.loc[X["Age"] >= age_var, "Child"] = 0

In [24]:
# Preview the frame with the new Child flag.
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Ticket_Frequency,Title,tot_fam,solo,Child
0,2,1,22.0,1,0,7.25,2,7,1,2,1,0,0.0
1,0,0,38.0,1,0,71.2833,0,2,1,1,1,0,0.0
2,2,0,26.0,0,0,7.925,2,7,1,1,0,1,0.0
3,0,0,35.0,1,0,53.1,2,2,2,1,1,0,0.0
4,2,1,35.0,0,0,8.05,2,7,1,2,0,1,0.0


In [25]:
# Put the target back as the first column so the saved file carries both the
# label and the engineered features.
X = pd.concat((y, X), axis="columns")
X.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Ticket_Frequency,Title,tot_fam,solo,Child
0,0,2,1,22.0,1,0,7.25,2,7,1,2,1,0,0.0
1,1,0,0,38.0,1,0,71.2833,0,2,1,1,1,0,0.0
2,1,2,0,26.0,0,0,7.925,2,7,1,1,0,1,0.0
3,1,0,0,35.0,1,0,53.1,2,2,2,1,1,0,0.0
4,0,2,1,35.0,0,0,8.05,2,7,1,2,0,1,0.0


In [26]:
# Final (rows, columns) of the frame, counting Survived plus the feature columns.
X.shape

(891, 14)

In [27]:
# Write the tidy frame to a CSV file at a path relative to the current working directory.
# NOTE(review): index=False is not passed, so the row index is written as an
# unnamed first column; readers need index_col=0 (or must drop that column).
# Confirm this is intended before changing it.
X.to_csv("tidy_Titanic.csv")