In [61]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_validate

import warnings
warnings.filterwarnings('ignore')

In [62]:
def scores(y_true, y_pred):
    results = [accuracy_score(y_true, y_pred), precision_score(y_true, y_pred), recall_score(y_true, y_pred), f1_score(y_true, y_pred), roc_auc_score(y_true, y_pred)]
    return [round(x, 2) for x in results]

In [63]:
def create_models(X_train, X_test, y_train, y_test):
    print('Logistic Regression')
    model = LogisticRegressionCV()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results = scores(y_test, y_pred)
    print('Accuracy: {}, Precision: {}, Recall: {}, F1:{}, ROC_AUC:{}'.format(results[0], results[1], results[2], results[3], results[4]))
    
    print('\n')
    
    print('Decision Tree')
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results = scores(y_test, y_pred)
    print('Accuracy: {}, Precision: {}, Recall: {}, F1:{}, ROC_AUC:{}'.format(results[0], results[1], results[2], results[3], results[4]))
    
    print('\n')
    
    print('SVM')
    model = SVC()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results = scores(y_test, y_pred)
    print('Accuracy: {}, Precision: {}, Recall: {}, F1:{}, ROC_AUC:{}'.format(results[0], results[1], results[2], results[3], results[4]))
    
    print('\n')
    
    print('KNN')
    model = KNeighborsClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results = scores(y_test, y_pred)
    print('Accuracy: {}, Precision: {}, Recall: {}, F1:{}, ROC_AUC:{}'.format(results[0], results[1], results[2], results[3], results[4]))
    
    print('\n')

    print('Random Forest')
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results = scores(y_test, y_pred)
    print('Accuracy: {}, Precision: {}, Recall: {}, F1:{}, ROC_AUC:{}'.format(results[0], results[1], results[2], results[3], results[4]))

In [64]:
data=pd.read_csv('../titanic/train.csv')

In [65]:
data['Initial']=0
for i in data:
    data['Initial']=data.Name.str.extract('([A-Za-z]+)\.') #lets extract the Salutations

In [66]:
data['Initial'].replace(
    ['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don'],
    ['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr'],inplace=True)

In [67]:
## Assigning the NaN Values with the Ceil values of the mean ages
data.loc[(data.Age.isnull())&(data.Initial=='Mr'),'Age']=33
data.loc[(data.Age.isnull())&(data.Initial=='Mrs'),'Age']=36
data.loc[(data.Age.isnull())&(data.Initial=='Master'),'Age']=5
data.loc[(data.Age.isnull())&(data.Initial=='Miss'),'Age']=22
data.loc[(data.Age.isnull())&(data.Initial=='Other'),'Age']=46

In [68]:
data['Cabin'] = data['Cabin'].apply(lambda x:not isinstance(x, float))

In [69]:
data.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)

In [70]:
data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Initial
0,0,3,male,22.0,1,0,7.2500,False,S,Mr
1,1,1,female,38.0,1,0,71.2833,True,C,Mrs
2,1,3,female,26.0,0,0,7.9250,False,S,Miss
3,1,1,female,35.0,1,0,53.1000,True,S,Mrs
4,0,3,male,35.0,0,0,8.0500,False,S,Mr
...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,False,S,Other
887,1,1,female,19.0,0,0,30.0000,True,S,Miss
888,0,3,female,22.0,1,2,23.4500,False,S,Miss
889,1,1,male,26.0,0,0,30.0000,True,C,Mr


In [45]:
label_encoder = {}

for x in ['Sex', 'Embarked', 'Initial']:
    label_encoder[x] = LabelEncoder()
    data[x] = label_encoder[x].fit_transform(data[x])
    pd.to_pickle(label_encoder[x], x+'.pkl')

In [46]:
X = data[data.columns[1:]].values
y = data['Survived']

In [60]:
X[0]

array([3, 1, 22.0, 1, 0, 7.25, False, 2, 2], dtype=object)

In [58]:
data[data.columns[1:]]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Initial
0,3,1,22.0,1,0,7.2500,False,2,2
1,1,0,38.0,1,0,71.2833,True,0,3
2,3,0,26.0,0,0,7.9250,False,2,1
3,1,0,35.0,1,0,53.1000,True,2,3
4,3,1,35.0,0,0,8.0500,False,2,2
...,...,...,...,...,...,...,...,...,...
886,2,1,27.0,0,0,13.0000,False,2,4
887,1,0,19.0,0,0,30.0000,True,2,1
888,3,0,22.0,1,2,23.4500,False,2,1
889,1,1,26.0,0,0,30.0000,True,0,2


In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [49]:
create_models(X_train, X_test, y_train, y_test)

Logistic Regression
Accuracy: 0.82, Precision: 0.8, Recall: 0.76, F1:0.78, ROC_AUC:0.81


Decision Tree
Accuracy: 0.8, Precision: 0.77, Recall: 0.74, F1:0.76, ROC_AUC:0.8


SVM
Accuracy: 0.65, Precision: 0.75, Recall: 0.24, F1:0.37, ROC_AUC:0.59


KNN
Accuracy: 0.74, Precision: 0.74, Recall: 0.58, F1:0.65, ROC_AUC:0.72


Random Forest
Accuracy: 0.83, Precision: 0.79, Recall: 0.78, F1:0.79, ROC_AUC:0.82


In [50]:
model = RandomForestClassifier(random_state=42)
score = cross_validate(model, X, y, cv=10, n_jobs=-1, verbose=1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.3s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.4s finished


In [51]:
score['test_score'].mean()

0.8159675405742822

In [53]:
model = RandomForestClassifier(random_state=42)

In [54]:
model.fit(X, y)

In [55]:
pd.to_pickle(model, 'RF_.pkl')