In [27]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_validate

import warnings
warnings.filterwarnings('ignore')

In [2]:
def scores(y_true, y_pred):
    """Compute five classification metrics, each rounded to two decimals.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, as returned by an estimator's ``predict``.

    Returns
    -------
    list
        ``[accuracy, precision, recall, f1, roc_auc]`` in that order.

    Notes
    -----
    ROC AUC is evaluated on the same ``y_pred`` values as the other metrics,
    so it reflects whatever is passed in (hard labels when called with
    ``predict`` output), not probability scores.
    """
    metric_functions = (
        accuracy_score,
        precision_score,
        recall_score,
        f1_score,
        roc_auc_score,
    )
    return [round(metric(y_true, y_pred), 2) for metric in metric_functions]

In [3]:
def create_models(X_train, X_test, y_train, y_test, random_state=None):
    """Fit five baseline classifiers and print their hold-out metrics.

    Each classifier is fitted on the training split, used to predict the test
    split, and summarised via ``scores``. The printed layout is: the model
    name, one metrics line, and a blank separator between consecutive models.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for the training and test splits.
    y_train, y_test : array-like
        Target labels for the training and test splits.
    random_state : int or None, optional
        Seed forwarded to the decision tree and random forest so their
        results can be reproduced. The default ``None`` leaves them unseeded.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    classifiers = [
        ('Logistic Regression', LogisticRegressionCV()),
        ('Decision Tree', DecisionTreeClassifier(random_state=random_state)),
        ('SVM', SVC()),
        ('KNN', KNeighborsClassifier()),
        ('Random Forest', RandomForestClassifier(random_state=random_state)),
    ]
    report = 'Accuracy: {}, Precision: {}, Recall: {}, F1:{}, ROC_AUC:{}'

    for position, (label, model) in enumerate(classifiers):
        if position:
            # Separator goes between reports only, never after the last one.
            print('\n')

        print(label)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(report.format(*scores(y_test, y_pred)))

In [4]:
# Load the training table; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='titanic/train.csv')

In [5]:
# Extract each passenger's salutation from Name: the first run of letters that
# is immediately followed by a period. One vectorised call covers the whole
# column; expand=False returns a Series, and the raw string keeps `\.` a valid
# regex escape.
data['Initial'] = data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

In [6]:
# Collapse rare salutations into the broader groups used downstream.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a column selection may act on a temporary copy and leave `data` unchanged.
title_groups = {
    'Mlle': 'Miss',
    'Mme': 'Miss',
    'Ms': 'Miss',
    'Dr': 'Mr',
    'Major': 'Mr',
    'Lady': 'Mrs',
    'Countess': 'Mrs',
    'Jonkheer': 'Other',
    'Col': 'Other',
    'Rev': 'Other',
    'Capt': 'Mr',
    'Sir': 'Mr',
    'Don': 'Mr',
}
data['Initial'] = data['Initial'].replace(title_groups)

In [7]:
## Fill missing ages with a fixed value per salutation group.
## (Per the original note, these are the rounded-up mean ages of each group.)
for title, fill_age in (('Mr', 33), ('Mrs', 36), ('Master', 5), ('Miss', 22), ('Other', 46)):
    # Recompute the mask on every pass so only still-missing ages are touched.
    missing_for_title = data['Age'].isnull() & (data['Initial'] == title)
    data.loc[missing_for_title, 'Age'] = fill_age

In [8]:
# Turn Cabin into a boolean "cabin recorded" flag: True where a value is
# present, False where it is missing. notna() is vectorised and does not depend
# on the missing-value sentinel being a Python float.
data['Cabin'] = data['Cabin'].notna()

In [9]:
# Remove the identifier and free-text columns that are not used as features.
data = data.drop(columns=['PassengerId', 'Name', 'Ticket'])

In [10]:
# Integer-encode the categorical columns, keeping one fitted encoder per column
# in `label_encoder` so the codes can be mapped back to labels later.
categorical_columns = ('Sex', 'Embarked', 'Initial')
label_encoder = {column: LabelEncoder() for column in categorical_columns}

for column, encoder in label_encoder.items():
    data[column] = encoder.fit_transform(data[column])

In [37]:
# Target vector, and a feature matrix made of every other column. The target is
# selected by name so this does not rely on 'Survived' being the first column.
y = data['Survived']
X = data.drop(columns='Survived').values

In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [39]:
# Fit every baseline classifier and print its metrics on the held-out split.
create_models(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Logistic Regression
Accuracy: 0.82, Precision: 0.8, Recall: 0.76, F1:0.78, ROC_AUC:0.81


Decision Tree
Accuracy: 0.8, Precision: 0.77, Recall: 0.76, F1:0.76, ROC_AUC:0.8


SVM
Accuracy: 0.65, Precision: 0.75, Recall: 0.24, F1:0.37, ROC_AUC:0.59


KNN
Accuracy: 0.74, Precision: 0.74, Recall: 0.58, F1:0.65, ROC_AUC:0.72


Random Forest
Accuracy: 0.84, Precision: 0.81, Recall: 0.81, F1:0.81, ROC_AUC:0.84


---

In [53]:
# 10-fold cross-validation of a seeded random forest over the full X / y,
# using all available cores (n_jobs=-1) and printing progress (verbose=1).
model = RandomForestClassifier(random_state=42)
score = cross_validate(
    estimator=model,
    X=X,
    y=y,
    cv=10,
    n_jobs=-1,
    verbose=1,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.3s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.5s finished


In [54]:
# Average the per-fold test scores returned by cross_validate.
np.mean(score['test_score'])

0.8159675405742822