In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [22]:
from sklearn.model_selection import GridSearchCV

Declare all library imports above this point.

In [2]:
# Folder holding the Titanic train/test CSVs. It defaults to the folder the
# notebook was originally run from; set the TITANIC_DATA_DIR environment
# variable to point elsewhere without editing this cell.
DATA_DIR = Path(os.environ.get("TITANIC_DATA_DIR", r"C:\Users\varun\Desktop\Udacity\titanic"))

train_df = pd.read_csv(DATA_DIR / "train.csv")
test_df = pd.read_csv(DATA_DIR / "test.csv")

# Quick sanity check on the row/column counts before looking at the data.
print(train_df.shape)
print(test_df.shape)
train_df.head()

(891, 12)
(418, 11)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Data exploration — no data manipulation should be done in this section.

In [4]:
# Mean survival rate for each passenger class, best-surviving class first.
(
    train_df
    .groupby('Pclass')[['Survived']]
    .mean()
    .sort_values('Survived', ascending=False)
)

Unnamed: 0_level_0,Survived
Pclass,Unnamed: 1_level_1
1,0.62963
2,0.472826
3,0.242363


In [5]:
# Mean survival rate for each sex, best-surviving group first.
(
    train_df
    .groupby('Sex')[['Survived']]
    .mean()
    .sort_values('Survived', ascending=False)
)

Unnamed: 0_level_0,Survived
Sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


In [6]:
# Mean survival rate by number of siblings/spouses aboard, highest first.
(
    train_df
    .groupby('SibSp')[['Survived']]
    .mean()
    .sort_values('Survived', ascending=False)
)

Unnamed: 0_level_0,Survived
SibSp,Unnamed: 1_level_1
1,0.535885
2,0.464286
0,0.345395
3,0.25
4,0.166667
5,0.0
8,0.0


In [7]:
# Mean survival rate by number of parents/children aboard, highest first.
(
    train_df
    .groupby('Parch')[['Survived']]
    .mean()
    .sort_values('Survived', ascending=False)
)

Unnamed: 0_level_0,Survived
Parch,Unnamed: 1_level_1
3,0.6
1,0.550847
2,0.5
0,0.343658
5,0.2
4,0.0
6,0.0


Data manipulation

In [8]:
# Convert the categorical 'Sex' column into integer codes.
# The encoder is fitted on the training frame only and the learned mapping is
# then applied to the test frame. Re-fitting on the test frame (as before)
# builds a second, independent mapping, so the two frames would only agree if
# they happened to contain exactly the same set of values.
labelencoder = LabelEncoder()
train_df['Sex'] = labelencoder.fit_transform(train_df['Sex'])
test_df['Sex'] = labelencoder.transform(test_df['Sex'])
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,,S


In [9]:
# Bucket ages into five ordinal bands. Each band is labelled with its position
# (0..4) in the list of bin edges; rows with a missing age stay missing.
bins = [0, 16, 32, 48, 64, 200]
labels = list(range(len(bins) - 1))
for frame in (train_df, test_df):
    frame['Age Bin'] = pd.cut(frame['Age'], bins=bins, labels=labels)
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Bin
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C,2
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S,2


In [10]:
# Family size = siblings/spouses + parents/children + the passenger themselves.
for frame in (train_df, test_df):
    frame['Family size'] = frame['SibSp'].add(frame['Parch']).add(1)

# Mean survival rate for each family size, highest first.
(
    train_df
    .groupby('Family size')[['Survived']]
    .mean()
    .sort_values('Survived', ascending=False)
)

Unnamed: 0_level_0,Survived
Family size,Unnamed: 1_level_1
4,0.724138
3,0.578431
2,0.552795
7,0.333333
1,0.303538
5,0.2
6,0.136364
8,0.0
11,0.0


In [11]:
# Group family sizes into four named bands: (0,1], (1,4], (4,7], (7,11].
fam_edges = [0, 1, 4, 7, 11]
fam_names = ['Solo', 'Small', 'Big', 'Very big']
for frame in (train_df, test_df):
    frame['Fam_type'] = pd.cut(frame['Family size'], fam_edges, labels=fam_names)

In [12]:
# Pull the honorific (e.g. "Mr", "Mrs", "Miss") out of each passenger name:
# the first run of letters that is immediately followed by a period.
combine = [train_df, test_df]
for dataset in combine:
    # Raw string so that `\.` reaches the regex engine as an escaped dot
    # instead of being parsed as an (invalid) Python string escape sequence.
    dataset['Title'] = dataset['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# Cross-check the extracted titles against the encoded sex column.
pd.crosstab(train_df['Title'], train_df['Sex'])

Sex,0,1
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [13]:
# Collapse variant spellings and rare honorifics into a few broader groups.
title_groups = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
title_groups.update(dict.fromkeys(
    ['Lady', 'Countess', 'Don', 'Sir', 'Jonkheer', 'Dona'], 'Royalty'))
title_groups.update(dict.fromkeys(
    ['Capt', 'Col', 'Dr', 'Major', 'Rev'], 'Special'))

for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(title_groups)

# Mean survival rate for each grouped title.
train_df[['Title', 'Survived']].groupby('Title', as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.156673
3,Mrs,0.793651
4,Royalty,0.6
5,Special,0.277778


In [14]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Bin,Family size,Fam_type,Title
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S,1,2,Small,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C,2,2,Small,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S,1,1,Solo,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S,2,2,Small,Mrs
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S,2,1,Solo,Mr


In [15]:
# First-pass feature selection: the model's input columns and the target.
features = ['Pclass', 'Sex', 'Fam_type', 'Fare', 'Age Bin', 'Embarked']
X = train_df.loc[:, features]
y = train_df.loc[:, 'Survived']
X.head()

Unnamed: 0,Pclass,Sex,Fam_type,Fare,Age Bin,Embarked
0,3,1,Small,7.25,1,S
1,1,0,Small,71.2833,2,C
2,3,0,Solo,7.925,1,S
3,1,0,Small,53.1,2,S
4,3,1,Solo,8.05,2,S


Any feature generation must be done above this point in the notebook.

In [31]:
# Hyper-parameter search for the random-forest pipeline.
numerical_col = ['Fare']
categorical_col = ['Pclass', 'Sex', 'Fam_type', 'Age Bin', 'Embarked']

# Numeric features: fill missing values with the column median.
num_trans = SimpleImputer(strategy='median')

# Categorical features: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' makes the encoder emit an all-zero
# row for a category it did not see while fitting instead of raising; without
# it, a rare level (e.g. 'Very big') missing from one CV training fold would
# make that fold fail.
cat_trans = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_trans, numerical_col),
        ('cat', cat_trans, categorical_col),
    ])

titanic_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=0)),
])

# The 'model__' prefix routes each parameter to the pipeline's 'model' step.
param_grid = {
    'model__max_depth': [2, 3, 4, 5],
    'model__min_samples_leaf': [3, 4, 5],
    'model__min_samples_split': [6, 8, 10, 12],
    'model__n_estimators': [100, 200, 300, 500]
}
search = GridSearchCV(titanic_pipeline, param_grid, n_jobs=-1)
search.fit(X, y)
print(search.best_params_)

{'model__max_depth': 5, 'model__min_samples_leaf': 5, 'model__min_samples_split': 12, 'model__n_estimators': 300}


In [33]:
# Final random-forest pipeline: preprocessing + model, fitted on all training
# rows and scored with 10-fold cross-validation.
numerical_col = ['Fare']
categorical_col = ['Pclass', 'Sex', 'Fam_type', 'Age Bin', 'Embarked']

# Numeric features: fill missing values with the column median.
num_trans = SimpleImputer(strategy='median')

# Categorical features: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' makes the encoder emit an all-zero
# row for a category it did not see while fitting instead of raising, so a
# level that only appears in a validation fold or in the test set cannot make
# scoring or prediction fail.
cat_trans = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_trans, numerical_col),
        ('cat', cat_trans, categorical_col),
    ])

# NOTE(review): the grid search earlier in the notebook reported
# n_estimators=300, min_samples_leaf=5 and min_samples_split=12 as best; this
# cell uses n_estimators=500 and leaves the other two at their defaults.
# Confirm that this is intentional.
titanic_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=0,
                                     max_depth=5,
                                     n_estimators=500)),
])

# Fit on the full training set; this fitted pipeline is the one used for the
# test-set predictions later in the notebook.
titanic_pipeline.fit(X, y)
print('Cross validation score: {:.3f}'.format(cross_val_score(titanic_pipeline, X, y, cv=10).mean()))

Cross validation score: 0.822


In [20]:
# Take the same feature columns from the test frame that the model was trained on.
X_test = test_df.loc[:, features]
X_test.head()

Unnamed: 0,Pclass,Sex,Fam_type,Fare,Age Bin,Embarked
0,3,1,Solo,7.8292,2,Q
1,3,0,Small,7.0,2,S
2,2,1,Solo,9.6875,3,Q
3,3,1,Solo,8.6625,1,S
4,3,0,Small,12.2875,1,S


In [21]:
# Predict survival for the test passengers and write the two-column
# (PassengerId, Survived) submission file.
predictions = titanic_pipeline.predict(X_test)
output = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': predictions,
})
output.to_csv('my_submission2.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
