In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.model_selection import StratifiedKFold, cross_val_score

In [2]:
# Paths of the train/test CSV files that every loading cell in this notebook reads.
train_filepath = '/kaggle/input/titanic/train.csv'
test_filepath = '/kaggle/input/titanic/test.csv'

In [3]:
# Load both files with the default integer index. In the visible cells these two frames
# are only used to derive the Fare/Age fill statistics; each experiment re-reads the CSVs
# with PassengerId as the index.
df_train = pd.read_csv(train_filepath)
df_test = pd.read_csv(test_filepath)

In [4]:
# Candidate fill values for missing 'Fare' entries, both computed from the test file.
# NOTE(review): these test-derived values are passed as fill_fare for the training frames
# as well as the test frames in the experiment cells below.
fill_fare_mean = df_test['Fare'].mean()
fill_fare_med = df_test['Fare'].median()

In [5]:
# Candidate fill values for missing 'Age' entries, computed separately per file.
# Series.mode() returns a Series (it can hold several values), not a scalar.
# NOTE(review): the age_mode_* variables are not referenced in the visible cells; the
# mode experiment passes the literal 24 instead.
age_mean_train = df_train['Age'].mean()
age_med_train = df_train['Age'].median()
age_mode_train = df_train['Age'].mode()

# Same three statistics for the test file.
age_mean_test = df_test['Age'].mean()
age_med_test = df_test['Age'].median()
age_mode_test = df_test['Age'].mode()

In [6]:
# Columns removed from every experiment's dataframe by prepare_dataframe.
drop_feature_names = ['Name', 'Cabin', 'Ticket']

In [7]:
# Categorical columns and, position by position, the value -> integer code mapping that
# map_cat applies to each of them (map_dicts[i] belongs to map_features[i]).
map_features = ['Sex', 'Embarked']
map_dicts = [{'male':0, 'female':1}, {'Q':0, 'C':1, 'S':2}]

In [8]:
# (name, estimator) pairs evaluated by perform_cross_validation; every estimator is
# created with its default hyperparameters.
models = [
    ('SVM', SVC()),
    ('KNN', KNeighborsClassifier()),
    ('LR', LogisticRegression()),
]

In [9]:
def drop_features(df, d_features):
    '''
    Drop a list of features (columns) from a dataframe
    
    Input:
        - Dataframe to remove the columns from
        - List of feature names to drop (each name is converted with str() before lookup)
    
    Output: A new dataframe without the given features; the input dataframe is not modified
    '''
    # One drop call removes every column at once instead of building an intermediate
    # frame per feature. It also returns a new object when d_features is empty, so the
    # caller's frame is never handed back for later in-place edits.
    return df.drop(columns=[str(feature) for feature in d_features])

In [10]:
def map_cat(df, features, dictionaries):
    '''
    Map categorical data to be numeric
    
    Input:
        - Dataframe for the mapping to take place in
        - List of features that contain categorical data
        - List of dictionaries to govern how to map the feature
          (dictionaries[i] is applied to features[i])
    
    Output: A new dataframe with the given features mapped according to the dictionaries.
            The input dataframe is left unchanged. Values that have no key in the
            matching dictionary become NaN (behaviour of Series.map).
        
    '''
    # Work on a copy so the caller's dataframe keeps its original categorical values.
    df_mapped = df.copy()
    
    for i, feature in enumerate(features):
        df_mapped[feature] = df[feature].map(dictionaries[i])
        
    return df_mapped

In [11]:
def normalize(feature):
    '''
    Standardize a Series by subtracting its mean and dividing by its standard deviation
    
    Input: A feature of type Series
    
    Output: The normalized feature of type Series
    '''
    center = feature.mean()
    spread = feature.std()
    return (feature - center) / spread

In [12]:
def normalize_features(df):
    '''
    Normalize every column of a dataframe with normalize
    
    Input: A pandas dataframe
    
    Output: The same dataframe object that was passed in, with each column overwritten
            by its normalized values
    '''
    for name in df.columns:
        scaled = normalize(df[name])
        # The assignment writes into the caller's dataframe (no copy is made).
        df[name] = scaled
    return df

In [13]:
def perform_cross_validation(X_train, y_train):
    '''
    Score every (name, model) pair in the global models list with 10-fold
    Stratified K-fold cross validation, using accuracy as the metric.
    
    Input: 
        - A dataframe containing the features used to build the model
        - A Series of the true values associated with the feature list
    
    Output: Nothing is returned. A header line is printed, followed by a dict that maps
            each model name to (mean accuracy, standard deviation of accuracy)
    '''
    summary = {}

    for label, estimator in models:
        splitter = StratifiedKFold(n_splits=10)
        fold_scores = cross_val_score(estimator, X_train, y_train, cv=splitter, scoring='accuracy')
        summary[label] = (fold_scores.mean(), fold_scores.std())

    print('Model\t\tCV Mean\t\tCV std')
    print(summary)

In [14]:
def fill_na(df, features, vals):
    '''
    Fill the NaN values of the features with the fill values
    
    Input:
        - Dataframe to fill the missing values in
        - A list of features to look for missing values
        - A list of values to fill the missing values of the feature with
          (vals[i] is used for features[i])
    
    Output: A new dataframe with the found missing values filled; the input dataframe
            is left unchanged
    '''
    # Work on a copy so the caller's dataframe keeps its original missing values.
    df_filled = df.copy()
    
    for i, feature in enumerate(features):
        df_filled[feature] = df_filled[feature].fillna(vals[i])
        
    return df_filled

In [15]:
def prepare_dataframe(df, d_features, fill_fare, fill_age=-1):
    '''
    Run the preprocessing steps on a dataframe: fill missing values, drop features,
    then map the categorical features to numbers
    
    Input:
        - Dataframe to prepare
        - List of features to drop
        - Value used to fill missing 'Fare' entries
        - Value used to fill missing 'Age' entries (defaults to -1)
    
    Output: The dataframe returned by map_cat after the fill and drop steps
    '''
    # Missing 'Embarked' entries are always filled with 'S'. Filling runs before
    # dropping, so 'Age' is filled even when it is one of the dropped features.
    filled = fill_na(df, ['Fare', 'Embarked', 'Age'], [fill_fare, 'S', fill_age])
    trimmed = drop_features(filled, d_features)
    return map_cat(trimmed, map_features, map_dicts)

In [16]:
# Experiment 1 (no Age, mean Fare): fresh copies of the data indexed by PassengerId.
df_train_noAge = pd.read_csv(train_filepath, index_col='PassengerId')
df_test_noAge = pd.read_csv(test_filepath, index_col='PassengerId')

In [17]:
# Drop list for the no-Age experiments: the common drop list plus 'Age'.
drop_feature_names_w_age = drop_feature_names + ['Age']

In [18]:
# Preprocess both frames, filling missing Fare with the mean. fill_age is left at its
# default because 'Age' is in the drop list.
df_train_noAge = prepare_dataframe(df_train_noAge, drop_feature_names_w_age, fill_fare_mean)
df_test_noAge = prepare_dataframe(df_test_noAge, drop_feature_names_w_age, fill_fare_mean)

In [19]:
# Split the training frame into the feature matrix and the 'Survived' target.
X_noAge = df_train_noAge.drop('Survived', axis=1)
y_noAge = df_train_noAge['Survived']

In [20]:
# Normalize every feature column.
# NOTE(review): each frame is scaled with its own mean/std, so the test features are not
# scaled with the training statistics.
X_noAge = normalize_features(X_noAge)
df_test_noAge = normalize_features(df_test_noAge)

In [21]:
# Print the 10-fold stratified CV accuracy (mean, std) of SVM, KNN and LR on these features.
perform_cross_validation(X_noAge, y_noAge)

Model		CV Mean		CV std
{'SVM': (0.8058052434456927, 0.02817776516195646), 'KNN': (0.8092384519350813, 0.04749914265913915), 'LR': (0.7934706616729088, 0.024861946815253916)}


In [22]:
# Refit KNN on the full training set; presumably chosen because it has the highest CV
# mean in the printed results above.
clf_noAge = KNeighborsClassifier().fit(X_noAge,y_noAge)

predictions_noAge = clf_noAge.predict(df_test_noAge)

# One 'Survived' prediction per test row, indexed like the test frame (PassengerId).
submission1_noAge = pd.DataFrame(data={'Survived':predictions_noAge}, index=df_test_noAge.index)

submission1_noAge.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1


In [23]:
# Write the no-Age / mean-Fare predictions to a CSV in the working directory.
submission1_noAge.to_csv('Titanic_Submission-No_Age-Mean_Fare-2023_6_14.csv')

In [24]:
# Experiment 2 (no Age, median Fare): re-read the raw data, replacing the frames of the
# previous experiment that used the same variable names.
df_train_noAge = pd.read_csv(train_filepath, index_col='PassengerId')
df_test_noAge = pd.read_csv(test_filepath, index_col='PassengerId')

In [25]:
# Same drop list as in the previous no-Age experiment (rebuilt here with the same value).
drop_feature_names_w_age = drop_feature_names + ['Age']

In [26]:
# Preprocess both frames, this time filling missing Fare with the median.
df_train_noAge = prepare_dataframe(df_train_noAge, drop_feature_names_w_age, fill_fare_med)
df_test_noAge = prepare_dataframe(df_test_noAge, drop_feature_names_w_age, fill_fare_med)

In [27]:
# Split the training frame into the feature matrix and the 'Survived' target.
X_noAge = df_train_noAge.drop('Survived', axis=1)
y_noAge = df_train_noAge['Survived']

In [28]:
# Normalize every feature column; each frame is scaled with its own mean/std.
X_noAge = normalize_features(X_noAge)
df_test_noAge = normalize_features(df_test_noAge)

In [29]:
# Print the 10-fold stratified CV accuracy (mean, std) of SVM, KNN and LR on these features.
perform_cross_validation(X_noAge, y_noAge)

Model		CV Mean		CV std
{'SVM': (0.8058052434456927, 0.02817776516195646), 'KNN': (0.8092384519350813, 0.04749914265913915), 'LR': (0.7934706616729088, 0.024861946815253916)}


In [30]:
# Refit KNN on the full training set; presumably chosen because it has the highest CV
# mean in the printed results above.
clf_noAge = KNeighborsClassifier().fit(X_noAge,y_noAge)

predictions_noAge = clf_noAge.predict(df_test_noAge)

# One 'Survived' prediction per test row, indexed like the test frame (PassengerId).
submission2_noAge = pd.DataFrame(data={'Survived':predictions_noAge}, index=df_test_noAge.index)

submission2_noAge.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1


In [31]:
# Write the no-Age / median-Fare predictions to a CSV in the working directory.
submission2_noAge.to_csv('Titanic_Submission-No_Age-Med_Fare-2023_6_14.csv')

In [32]:
# Experiment 3 (Age kept, missing Age filled with the mean): fresh copies of the data
# indexed by PassengerId.
df_train_meanAge = pd.read_csv(train_filepath, index_col='PassengerId')
df_test_meanAge = pd.read_csv(test_filepath, index_col='PassengerId')

In [33]:
# Preprocess both frames: mean Fare fill, and each frame's missing Age filled with the
# mean Age of its own file (train mean for train, test mean for test).
df_train_meanAge = prepare_dataframe(df_train_meanAge, drop_feature_names, fill_fare_mean, age_mean_train)
df_test_meanAge = prepare_dataframe(df_test_meanAge, drop_feature_names, fill_fare_mean, age_mean_test)

In [34]:
# Show the first rows of the prepared training frame as a check on the preprocessing.
df_train_meanAge.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,0,22.0,1,0,7.25,2
2,1,1,1,38.0,1,0,71.2833,1
3,1,3,1,26.0,0,0,7.925,2
4,1,1,1,35.0,1,0,53.1,2
5,0,3,0,35.0,0,0,8.05,2


In [35]:
# Split the training frame into the feature matrix and the 'Survived' target.
X_meanAge = df_train_meanAge.drop('Survived', axis=1)
y_meanAge = df_train_meanAge['Survived']

In [36]:
# Normalize every feature column; each frame is scaled with its own mean/std.
X_meanAge = normalize_features(X_meanAge)
df_test_meanAge = normalize_features(df_test_meanAge)

In [37]:
# Print the 10-fold stratified CV accuracy (mean, std) of SVM, KNN and LR on these features.
perform_cross_validation(X_meanAge, y_meanAge)

Model		CV Mean		CV std
{'SVM': (0.8249313358302122, 0.03690856840480252), 'KNN': (0.809250936329588, 0.044119166613680365), 'LR': (0.7946192259675404, 0.02242820343899094)}


In [38]:
# Refit an SVC on the full training set; presumably chosen because SVM has the highest
# CV mean in the printed results above.
clf_meanAge = SVC().fit(X_meanAge, y_meanAge)

predictions_meanAge = clf_meanAge.predict(df_test_meanAge)

# One 'Survived' prediction per test row, indexed like the test frame (PassengerId).
submission1_meanAge = pd.DataFrame(data={'Survived':predictions_meanAge}, index=df_test_meanAge.index)

submission1_meanAge.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,0


In [39]:
# Write the mean-Age predictions to a CSV in the working directory.
submission1_meanAge.to_csv('Titanic_Submission-Mean_Age-2023_6_14.csv')

In [40]:
# Experiment 4 (Age kept, missing Age filled with the median): fresh copies of the data
# indexed by PassengerId.
df_train_medAge = pd.read_csv(train_filepath, index_col='PassengerId')
df_test_medAge = pd.read_csv(test_filepath, index_col='PassengerId')

In [41]:
# Preprocess both frames: mean Fare fill, and each frame's missing Age filled with the
# median Age of its own file (train median for train, test median for test).
df_train_medAge = prepare_dataframe(df_train_medAge, drop_feature_names, fill_fare_mean, age_med_train)
df_test_medAge = prepare_dataframe(df_test_medAge, drop_feature_names, fill_fare_mean, age_med_test)

In [42]:
# Split the training frame into the feature matrix and the 'Survived' target.
X_medAge = df_train_medAge.drop('Survived', axis=1)
y_medAge = df_train_medAge['Survived']

In [43]:
# Normalize every feature column; each frame is scaled with its own mean/std.
X_medAge = normalize_features(X_medAge)
# Bind the result to df_test_medAge, the frame the median-age classifier predicts on.
# It must not be bound to df_test_meanAge, which belongs to the mean-age experiment.
df_test_medAge = normalize_features(df_test_medAge)

In [44]:
# Print the 10-fold stratified CV accuracy (mean, std) of SVM, KNN and LR on these features.
perform_cross_validation(X_medAge, y_medAge)

Model		CV Mean		CV std
{'SVM': (0.8249313358302122, 0.03690856840480252), 'KNN': (0.8114731585518102, 0.04093208033293136), 'LR': (0.7946192259675405, 0.02702347765082892)}


In [45]:
# Refit an SVC on the full training set; presumably chosen because SVM has the highest
# CV mean in the printed results above.
clf_medAge = SVC().fit(X_medAge, y_medAge)

predictions_medAge = clf_medAge.predict(df_test_medAge)

# One 'Survived' prediction per test row, indexed like the test frame (PassengerId).
submission1_medAge = pd.DataFrame(data={'Survived':predictions_medAge}, index=df_test_medAge.index)

submission1_medAge.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,0


In [46]:
# Write the median-Age predictions to a CSV in the working directory.
submission1_medAge.to_csv('Titanic_Submission-Median_Age-2023_6_14.csv')

In [47]:
# Experiment 5 (Age kept, missing Age filled with a fixed value): fresh copies of the
# data indexed by PassengerId.
df_train_modeAge = pd.read_csv(train_filepath, index_col='PassengerId')
df_test_modeAge = pd.read_csv(test_filepath, index_col='PassengerId')

In [48]:
# Preprocess both frames: mean Fare fill, and missing Age filled with the literal 24.
# NOTE(review): 24 is hard-coded; presumably it is the Age mode. Confirm against
# age_mode_train / age_mode_test, which are computed earlier but not used here.
df_train_modeAge = prepare_dataframe(df_train_modeAge, drop_feature_names, fill_fare_mean, 24)
df_test_modeAge = prepare_dataframe(df_test_modeAge, drop_feature_names, fill_fare_mean, 24)

In [49]:
# Split the training frame into the feature matrix and the 'Survived' target.
X_modeAge = df_train_modeAge.drop('Survived', axis=1)
y_modeAge = df_train_modeAge['Survived']

In [50]:
# Normalize every feature column; each frame is scaled with its own mean/std.
X_modeAge = normalize_features(X_modeAge)
df_test_modeAge = normalize_features(df_test_modeAge)

In [51]:
# Print the 10-fold stratified CV accuracy (mean, std) of SVM, KNN and LR on these features.
perform_cross_validation(X_modeAge, y_modeAge)

Model		CV Mean		CV std
{'SVM': (0.8260549313358302, 0.03793120045875092), 'KNN': (0.8070037453183521, 0.03824776974876488), 'LR': (0.7923720349563046, 0.028051434354094185)}


In [52]:
# Refit an SVC on the full training set; presumably chosen because SVM has the highest
# CV mean in the printed results above.
clf_modeAge = SVC().fit(X_modeAge, y_modeAge)

predictions_modeAge = clf_modeAge.predict(df_test_modeAge)

# Build the submission from this experiment's own predictions (predictions_modeAge),
# not from the median-age experiment's predictions_medAge.
submission1_modeAge = pd.DataFrame(data={'Survived':predictions_modeAge}, index=df_test_modeAge.index)

submission1_modeAge.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,0


In [53]:
# Write the mode-Age predictions to a CSV in the working directory.
submission1_modeAge.to_csv('Titanic_Submission-Mode_Age-2023_6_14.csv')