In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
df = pd.read_csv("/kaggle/input/titanic/train.csv")
test = pd.read_csv("/kaggle/input/titanic/test.csv")
gender_submission = pd.read_csv("/kaggle/input/titanic/gender_submission.csv")

In [3]:
test1 = pd.read_csv("/kaggle/input/titanic/test.csv")

In [4]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [5]:
def age_class(df):
    if df['Age'] < 20:
        val = '<20'
    elif (df['Age'] >= 20) & (df['Age']< 30):
        val = '20-30'
    elif (df['Age'] >=30) & (df['Age'] < 40):
        val = '30-40'
    elif (df['Age'] >=40) & (df['Age'] < 50):
        val = '40-50'
    elif (df['Age'] >=50) & (df['Age'] < 60):
        val = '50-60'
    elif df['Age'] > 60:
        val = '>60'
    else:
        val = 999  # to denote iregular value like NaN
    return val

def fare_seg(df):
    if df['Fare'] < 100:
        val = '<100'
    elif (df['Fare'] >= 100) & (df['Fare']< 200):
        val = '100-200'
    elif (df['Fare'] >=200) & (df['Fare'] < 300):
        val = '200-300'
    else:
        val = '>300'
    return val

def missing_df(train):
    missing_df = pd.DataFrame(columns = ['Feature', 'dtype', 'Missing Values', '%age missing'])
    print('missing values in these columns:')
    for i in train.columns:
        if train[i].isnull().sum() != 0:
            temp_df = pd.DataFrame([[i,
                                     train[i].dtype,
                                     train[i].isnull().sum(), 
                                     round(train[i].isnull().sum()/len(train), 2)
                                    ]], 
                                   columns = ['Feature', 'dtype', 'Missing Values', '%age missing'])
            missing_df = missing_df.append(temp_df, ignore_index = True)
    return missing_df

def embarked_missing(x_train, x_test = None):

    x_train['age_grp'] = x_train.apply(age_class, axis = 1)
    x_train['fare_grp'] = x_train.apply(fare_seg, axis = 1)

    try:
        x_test['age_grp'] = x_test.apply(age_class, axis = 1)
        x_test['fare_grp'] = x_test.apply(fare_seg, axis = 1)

        miss_val = x_test[x_test.Embarked.isnull()]  #now filling values in test dataset

        for i in miss_val.index:
            x = x_train[(x_train.Sex == miss_val.loc[i,'Sex']) &  #sex
                    (x_train.Pclass == miss_val.loc[i,'Pclass']) &  #plcass
                    (x_train.fare_grp == miss_val.loc[i,'fare_grp'])].Embarked.mode()
            x_test.loc[i,'Embarked'] = x[0]
        
        x_test.drop(['age_grp', 'fare_grp'], axis = 1, inplace = True)
        
    except:
        pass
        
    miss_val = x_train[x_train.Embarked.isnull()]  #assigning x_train missing indices

    for i in miss_val.index:
        x = x_train[(x_train.Sex == miss_val.loc[i,'Sex']) &  #sex
                (x_train.Pclass == miss_val.loc[i,'Pclass']) &  #plcass
                (x_train.fare_grp == miss_val.loc[i,'fare_grp'])].Embarked.mode() #same fare segment
        x_train.loc[i,'Embarked'] = x[0]
        
    x_train.drop(['age_grp', 'fare_grp'], axis = 1, inplace = True)

    return x_train, x_test

def age_missing(x_train, x_test = None):
    
    rf = RandomForestRegressor()
    imputer = IterativeImputer(estimator= rf)
    
    x_train_dummy = pd.get_dummies(x_train, columns=['Embarked', 'Sex'])
    x_train_dummy.drop(['Embarked_C', 'Sex_female'], 1, inplace = True)

    x_train_np = imputer.fit_transform(x_train_dummy)
    x_train['Age'] = x_train_np[:,1]
    
    try:
        x_test_dummy = pd.get_dummies(x_test, columns=['Embarked', 'Sex'])
        x_test_dummy.drop(['Embarked_C', 'Sex_female'], 1, inplace = True)
        x_test_np = imputer.transform(x_test_dummy)
        x_test['Age'] = x_test_np[:,1]

    except:
        pass

    return x_train, x_test


In [6]:
unwanted_cols = ['PassengerId', 'Name', 'Ticket']
df.drop(unwanted_cols, axis = 1, inplace= True)
test.drop(unwanted_cols, axis = 1, inplace= True)

df.drop(['Cabin'], axis = 1, inplace = True)
test.drop(['Cabin'], axis = 1, inplace = True)



In [7]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(['Survived'], 1), df['Survived'], test_size = 0.33, stratify=df.Survived, random_state = 42)
X_train.reset_index(inplace= True, drop = True)
X_test.reset_index(inplace= True, drop = True)
y_train.reset_index(inplace = True, drop = True)
y_test.reset_index(inplace = True, drop = True)

index_x_trains = X_train.Fare < 300
X_train = X_train.loc[index_x_trains]
y_train = y_train[index_x_trains]
X_train.reset_index(drop = True, inplace = True)
y_train.reset_index(drop = True, inplace = True)

X_train_hyper = X_train.copy(deep = True)
X_test_hyper = X_test.copy(deep = True)

  """Entry point for launching an IPython kernel.


In [8]:
index_fare_le_300 = df.Fare < 300
df = df[index_fare_le_300]
df_x = df.drop('Survived', 1)
df_y = df.Survived

  This is separate from the ipykernel package so we can avoid doing imports until


In [9]:
skf = StratifiedKFold(n_splits = 5)
from sklearn.preprocessing import StandardScaler
ss= StandardScaler()

mla_cols = ['MLA Name', 'MLA Accuracy1', 'MLA Accuracy2', 'MLA Accuracy3', 'MLA Accuracy4', 'MLA Accuracy5']
mla_compare = pd.DataFrame(columns=mla_cols)

lr = LogisticRegression()
rf = RandomForestClassifier()
dt = DecisionTreeClassifier()
svc = SVC()
gnb = GaussianNB()

In [10]:
import warnings
warnings.filterwarnings('ignore')

In [11]:
mla_cols = ['MLA Name', 'MLA Accuracy1', 'MLA Accuracy2', 'MLA Accuracy3', 'MLA Accuracy4', 'MLA Accuracy5']
mla_compare = pd.DataFrame(columns=mla_cols)
i = 1

lr = LogisticRegression()
rf = RandomForestClassifier()
dt = DecisionTreeClassifier()
svc = SVC()
gnb = GaussianNB()
ss= StandardScaler()
skf = StratifiedKFold(n_splits = 5)

for train_index, test_index in skf.split(X_train, y_train):
    X_train_cv = X_train.loc[train_index,]
    y_train_cv = y_train[train_index]
    X_test_cv = X_train.loc[test_index,]
    y_test_cv = y_train[test_index]

    X_train_cv, X_test_cv = embarked_missing(X_train_cv, x_test = X_test_cv)
    X_train_cv, X_test_cv = age_missing(X_train_cv, x_test= X_test_cv)

    x_train_num = pd.get_dummies(X_train_cv, columns=['Embarked', 'Sex'], drop_first = True)
    #x_train_num.drop(['Embarked_C', 'Sex_female'], axis = 1, inplace = True)

    x_test_num = pd.get_dummies(X_test_cv, columns=['Embarked', 'Sex'], drop_first= True)
    #x_test_num.drop(['Embarked_C', 'Sex_female'], axis = 1, inplace = True)

    feat = ['Age', 'Fare', 'SibSp', 'Parch']
    x_train_scaled = x_train_num.copy(deep = True)
    x_train_scaled[feat] = ss.fit_transform(x_train_scaled[feat])

    x_test_scaled = x_test_num.copy(deep = True)
    x_test_scaled[feat] = ss.fit_transform(x_test_scaled[feat])

    lr.fit(x_train_scaled, y_train_cv)
    lr_score = lr.score(x_test_scaled, y_test_cv)
    mla_compare.loc[0,'MLA Name'] = 'Logistic Regression'
    mla_compare.loc[0,f'MLA Accuracy{i}'] = lr_score

    rf.fit(x_train_num, y_train_cv)
    score_rf = rf.score(x_test_num, y_test_cv)
    mla_compare.loc[1,'MLA Name'] = 'Random Forest'
    mla_compare.loc[1,f'MLA Accuracy{i}'] = score_rf

    dt.fit(x_train_num, y_train_cv)
    score_dt = dt.score(x_test_num, y_test_cv)
    mla_compare.loc[2,'MLA Name'] = 'Decision Tree'
    mla_compare.loc[2,f'MLA Accuracy{i}'] = score_dt

    svc.fit(x_train_scaled, y_train_cv)
    score_svm= lr.score(x_test_scaled, y_test_cv)
    mla_compare.loc[3,'MLA Name'] = 'Support Vector Machine Classifier'
    mla_compare.loc[3,f'MLA Accuracy{i}'] = score_svm


    gnb.fit(x_train_num, y_train_cv)
    score_gnb = gnb.score(x_test_num, y_test_cv)
    mla_compare.loc[4,'MLA Name'] = 'Gaussian Naives Bayes'
    mla_compare.loc[4,f'MLA Accuracy{i}'] = score_gnb

    i += 1


In [12]:
mla_compare

Unnamed: 0,MLA Name,MLA Accuracy1,MLA Accuracy2,MLA Accuracy3,MLA Accuracy4,MLA Accuracy5
0,Logistic Regression,0.781513,0.798319,0.840336,0.789916,0.779661
1,Random Forest,0.823529,0.815126,0.840336,0.781513,0.79661
2,Decision Tree,0.773109,0.831933,0.764706,0.714286,0.754237
3,Support Vector Machine Classifier,0.781513,0.798319,0.840336,0.789916,0.779661
4,Gaussian Naives Bayes,0.773109,0.756303,0.764706,0.781513,0.762712


In [13]:
X_train_hyper, X_test_hyper = embarked_missing(X_train_hyper, x_test = X_test_hyper)
X_train_hyper, X_test_hyper = age_missing(X_train_hyper, x_test = X_test_hyper)
x_train_num = pd.get_dummies(X_train_hyper, columns=['Embarked', 'Sex'], drop_first = True)
x_test_num = pd.get_dummies(X_test_hyper, columns=['Embarked', 'Sex'], drop_first= True)

In [14]:
# hyper parameter tuning
params = [{'n_estimators': [10, 50, 100, 300, 500, 600, 700, 800],
               'criterion': ['gini', 'entropy'],
               'max_depth': [2, 4, 6, 8, 10, 15, 20, None],
               #'oob_score': [True]
              }]
gs_rf = GridSearchCV(estimator = RandomForestClassifier(n_jobs=-1),
                     param_grid=params,
                     scoring='accuracy',
                     cv=5)
gs_rf.fit(x_train_num, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1),
             param_grid=[{'criterion': ['gini', 'entropy'],
                          'max_depth': [2, 4, 6, 8, 10, 15, 20, None],
                          'n_estimators': [10, 50, 100, 300, 500, 600, 700,
                                           800]}],
             scoring='accuracy')

In [15]:
params = [{'C': [0.1,1, 10, 100],
              'gamma': [1,0.1,0.01,0.001],
              'kernel': ['rbf', 'poly', 'sigmoid']
             }]
gs_svc = GridSearchCV(estimator = SVC(),
                      param_grid=params,
                      scoring='accuracy',
                      cv=5)
#gs_svc.fit(x_train_num, y_train)

In [16]:
params = [{'metric' : ['minkowski','euclidean','manhattan'],
               'weights' : ['uniform','distance'],
               'n_neighbors'  : np.arange(5,15)}]
gs_knn = GridSearchCV(estimator = KNeighborsClassifier(n_jobs=-1),
                          param_grid=params,
                          scoring='accuracy',
                          cv=5)
gs_knn.fit(x_train_num, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(n_jobs=-1),
             param_grid=[{'metric': ['minkowski', 'euclidean', 'manhattan'],
                          'n_neighbors': array([ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
                          'weights': ['uniform', 'distance']}],
             scoring='accuracy')

In [17]:
gs_rf.best_params_

{'criterion': 'entropy', 'max_depth': 10, 'n_estimators': 100}

In [18]:
gs_knn.best_params_

{'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}

In [19]:
rf = RandomForestClassifier(criterion= 'gini', max_depth= 4, n_estimators= 300)
rf.fit(x_train_num, y_train)

RandomForestClassifier(max_depth=4, n_estimators=300)

In [20]:
rf = RandomForestClassifier(criterion= 'gini', max_depth= 8, n_estimators= 300)
knn = KNeighborsClassifier(metric = 'manhattan', n_neighbors= 12, weights = 'distance')

In [21]:
i = 1


for train_index, test_index in skf.split(X_train, y_train):
    X_train_cv = X_train.loc[train_index,]
    y_train_cv = y_train[train_index]
    X_test_cv = X_train.loc[test_index,]
    y_test_cv = y_train[test_index]

    X_train_cv, X_test_cv = embarked_missing(X_train_cv, x_test = X_test_cv)
    X_train_cv, X_test_cv = age_missing(X_train_cv, x_test= X_test_cv)

    x_train_num = pd.get_dummies(X_train_cv, columns=['Embarked', 'Sex'], drop_first = True)
    #x_train_num.drop(['Embarked_C', 'Sex_female'], axis = 1, inplace = True)

    x_test_num = pd.get_dummies(X_test_cv, columns=['Embarked', 'Sex'], drop_first= True)
    #x_test_num.drop(['Embarked_C', 'Sex_female'], axis = 1, inplace = True)

    feat = ['Age', 'Fare', 'SibSp', 'Parch']
    x_train_scaled = x_train_num.copy(deep = True)
    x_train_scaled[feat] = ss.fit_transform(x_train_scaled[feat])

    x_test_scaled = x_test_num.copy(deep = True)
    x_test_scaled[feat] = ss.fit_transform(x_test_scaled[feat])

    rf.fit(x_train_num, y_train_cv)
    score_rf = rf.score(x_test_num, y_test_cv)
    mla_compare.loc[5,'MLA Name'] = 'Random Forest GS'
    mla_compare.loc[5,f'MLA Accuracy{i}'] = score_rf

    knn.fit(x_train_scaled, y_train_cv)
    score_knn = knn.score(x_test_num, y_test_cv)
    mla_compare.loc[6,'MLA Name'] = 'KNN'
    mla_compare.loc[6,f'MLA Accuracy{i}'] = score_knn

    i += 1

In [22]:
mla_compare

Unnamed: 0,MLA Name,MLA Accuracy1,MLA Accuracy2,MLA Accuracy3,MLA Accuracy4,MLA Accuracy5
0,Logistic Regression,0.781513,0.798319,0.840336,0.789916,0.779661
1,Random Forest,0.823529,0.815126,0.840336,0.781513,0.79661
2,Decision Tree,0.773109,0.831933,0.764706,0.714286,0.754237
3,Support Vector Machine Classifier,0.781513,0.798319,0.840336,0.789916,0.779661
4,Gaussian Naives Bayes,0.773109,0.756303,0.764706,0.781513,0.762712
5,Random Forest GS,0.823529,0.798319,0.882353,0.798319,0.788136
6,KNN,0.613445,0.680672,0.747899,0.638655,0.728814


In [23]:
#random forest seems to be working best.
#testing random forest on test data.
rf = RandomForestClassifier(criterion= 'gini', max_depth= 4, n_estimators= 300)
#X_train, X_test, y_train, y_test
X_train, X_test = embarked_missing(X_train, x_test = X_test)
X_train, X_test = age_missing(X_train, x_test = X_test)
x_train_num = pd.get_dummies(X_train, columns=['Embarked', 'Sex'], drop_first = True)
x_test_num = pd.get_dummies(X_test, columns=['Embarked', 'Sex'], drop_first = True)
rf.fit(x_train_num, y_train)
rf.score(x_test_num, y_test)

0.8338983050847457

In [24]:
#checking out the actual test data
missing_df(test)

missing values in these columns:


Unnamed: 0,Feature,dtype,Missing Values,%age missing
0,Age,float64,86,0.21
1,Fare,float64,1,0.0


In [25]:
missing_df(df_x)

missing values in these columns:


Unnamed: 0,Feature,dtype,Missing Values,%age missing
0,Age,float64,177,0.2
1,Embarked,object,2,0.0


In [26]:
def fare_missing(x_train, x_test):

    x_train['age_grp'] = x_train.apply(age_class, axis = 1)
    #x_train['fare_grp'] = x_train.apply(fare_seg, axis = 1)

    try:
        x_test['age_grp'] = x_test.apply(age_class, axis = 1)
        #x_test['fare_grp'] = x_test.apply(fare_seg, axis = 1)

        miss_val = x_test[x_test.Fare.isnull()]  #now filling values in test dataset

        for i in miss_val.index:
            x = x_train[(x_train.Sex == miss_val.loc[i,'Sex']) &  #sex
                    (x_train.Pclass == miss_val.loc[i,'Pclass']) &  #plcass
                    (x_train.age_grp == miss_val.loc[i,'age_grp']) &
                    (x_train.Embarked == miss_val.loc[i,'Embarked'])].Fare.mean()
            x_test.loc[i,'Fare'] = x
        
        x_test.drop(['age_grp'], axis = 1, inplace = True)
        
    except:
        pass
    '''    
    miss_val = x_train[x_train.Embarked.isnull()]  #assigning x_train missing indices

    for i in miss_val.index:
        x = x_train[(x_train.Sex == miss_val.loc[i,'Sex']) &  #sex
                (x_train.Pclass == miss_val.loc[i,'Pclass']) &  #plcass
                (x_train.fare_grp == miss_val.loc[i,'fare_grp'])].Embarked.mode() #same fare segment
        x_train.loc[i,'Embarked'] = x[0]
        
    x_train.drop(['age_grp', 'fare_grp'], axis = 1, inplace = True)
    '''
    x_train.drop(['age_grp'], axis = 1, inplace = True)
    return x_test

In [27]:
df_x,_ = embarked_missing(x_train = df_x)
test = fare_missing(df_x, test)
df_x, test = age_missing(df_x, test)

print(missing_df(test))

print("***"*40)
print(missing_df(df_x))

missing values in these columns:
Empty DataFrame
Columns: [Feature, dtype, Missing Values, %age missing]
Index: []
************************************************************************************************************************
missing values in these columns:
Empty DataFrame
Columns: [Feature, dtype, Missing Values, %age missing]
Index: []


In [28]:
df_x_num = pd.get_dummies(df_x, columns=['Embarked', 'Sex'], drop_first = True)
test_num = pd.get_dummies(test, columns=['Embarked', 'Sex'], drop_first = True)
rf.fit(df_x_num, df_y)
predictions = rf.predict(test_num)


In [29]:
test.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')

In [30]:
submission = pd.DataFrame()
submission['PassengerId'] = test1['PassengerId']
submission['Survived'] = predictions # our model predictions on the test dataset
submission.head()


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [31]:
submission.to_csv('../submission.csv', index=False)