In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.model_selection import StratifiedKFold, cross_val_score

In [2]:
# Locations of the Titanic train and test CSVs under the Kaggle input directory.
train_filepath = '/kaggle/input/titanic/train.csv'
test_filepath = '/kaggle/input/titanic/test.csv'

In [3]:
# Load both CSVs without an index column. In the cells shown here, only df_test
# is used afterwards (to compute the Fare fill values).
df_train = pd.read_csv(train_filepath)
df_test = pd.read_csv(test_filepath)

In [4]:
# Candidate replacement values for missing 'Fare' entries: the mean and the
# median of the test-set fares.
# NOTE(review): both statistics come from df_test, not df_train - confirm that
# this is intended.
fill_fare_mean = df_test['Fare'].mean()
fill_fare_med = df_test['Fare'].median()

In [5]:
# Columns that are removed from the datasets before modelling.
drop_feature_names = ['Name', 'Cabin', 'Ticket']

In [6]:
# Categorical columns to encode and, at the same position in the second list,
# the value -> integer code lookup used for each of them.
map_features = ['Sex', 'Embarked']
map_dicts = [{'male':0, 'female':1}, {'Q':0, 'C':1, 'S':2}]

In [7]:
# (label, estimator) pairs compared during cross validation; every estimator
# is created with its default settings.
models = [
    ('SVM', SVC()),
    ('KNN', KNeighborsClassifier()),
    ('LR', LogisticRegression()),
]

In [8]:
def drop_features(df, d_features):
    '''
    Remove the given columns from a dataframe
    
    Input:
        - Dataframe to remove the columns from (left unmodified)
        - Iterable of column names to drop; each name is converted with str()
    
    Output: A new dataframe without the given columns
    '''
    # One drop call replaces an intermediate copy per feature. It also returns
    # a new frame when d_features is empty, so callers that go on to modify the
    # result never write into the dataframe they passed in.
    return df.drop(columns=[str(feature) for feature in d_features])

In [9]:
def map_cat(df, features, dictionaries):
    '''
    Map categorical data to be numeric
    
    Input:
        - Dataframe for the mapping to take place in (left unmodified)
        - List of features that contain categorical data
        - List of dictionaries to govern how to map the feature;
          dictionaries[i] is applied to features[i]
    
    Output: A new dataframe with the given features mapped according to the dictionaries.
            Values that have no key in their dictionary become NaN.
    
    Raises: IndexError if there are fewer dictionaries than features
    '''
    # Fail before doing any work instead of part-way through the loop.
    if len(dictionaries) < len(features):
        raise IndexError('map_cat needs one dictionary per feature')

    # Work on a copy: writing into the caller's frame would make a second call
    # map the already-encoded values and turn them into NaN.
    df_mapped = df.copy()

    for feature, dictionary in zip(features, dictionaries):
        df_mapped[feature] = df_mapped[feature].map(dictionary)

    return df_mapped

In [10]:
def normalize(feature):
    '''
    Standardize a Series
    
    Input: A feature of type Series
    
    Output: The feature with its mean subtracted and the result divided by
            its standard deviation, as a Series
    '''
    centered = feature - feature.mean()
    spread = feature.std()
    return centered / spread

In [11]:
def normalize_features(df):
    '''
    This function normalizes all features in a dataframe
    
    Input: A pandas dataframe (left unmodified)
    
    Output: A new dataframe in which every column has been passed through normalize()
    '''
    # Copy first so the caller's frame keeps its original values; the result is
    # only available through the return value.
    df_normalized = df.copy()

    for column in df.columns:
        df_normalized[column] = normalize(df[column])

    return df_normalized

In [12]:
def perform_cross_validation(X_train, y_train):
    '''
    Score every (name, model) pair in the module-level `models` list with
    10-split Stratified K-fold cross validation, using accuracy as the metric.
    
    Input: 
        - A dataframe containing the features used to build the model
        - A Series of the true values associated with the feature list
    
    Output: Nothing is returned. A header line is printed, followed by a dict that
            maps each model name to (mean accuracy, standard deviation of accuracy)
    '''
    def score(model):
        # A fresh splitter per model, exactly ten stratified folds.
        folds = StratifiedKFold(n_splits=10)
        accuracies = cross_val_score(model, X_train, y_train, cv=folds, scoring='accuracy')
        return (accuracies.mean(), accuracies.std())

    results = {name: score(model) for name, model in models}

    print('Model\t\tCV Mean\t\tCV std')
    print(results)

In [13]:
def prepare_dataframe(df, d_features, fill_fare, fill_embarked='S'):
    '''
    Run the shared preprocessing steps on a raw dataframe
    
    Input:
        - Raw dataframe; it must still contain 'Fare' and 'Embarked' after the drop (left unmodified)
        - List of column names to drop
        - Value that replaces missing 'Fare' entries
        - Value that replaces missing 'Embarked' entries (defaults to 'S')
    
    Output: A new dataframe with the columns dropped, the missing values filled and the
            module-level map_features encoded according to map_dicts
    '''
    # assign() builds a new frame, so the fills never write into the caller's
    # dataframe, whatever drop_features hands back.
    # Embarked is filled before encoding because a missing value has no entry
    # in the mapping dictionary and would otherwise stay missing.
    df_filled = drop_features(df, d_features).assign(
        Fare=lambda frame: frame['Fare'].fillna(fill_fare),
        Embarked=lambda frame: frame['Embarked'].fillna(fill_embarked),
    )
    return map_cat(df_filled, map_features, map_dicts)

In [14]:
# Reload the raw CSVs, this time indexed by PassengerId. The test frame's index
# is reused further down as the index of the submission frame.
df_train_noAge = pd.read_csv(train_filepath, index_col='PassengerId')
df_test_noAge = pd.read_csv(test_filepath, index_col='PassengerId')

In [15]:
# The standard drop list with 'Age' appended, for the runs that ignore age.
drop_feature_names_w_age = [*drop_feature_names, 'Age']

In [16]:
# Preprocess both frames without 'Age', using the mean fare as the Fare fill value.
# Both names are rebound to the processed frames, so the raw data has to be
# reloaded before this cell can be run again.
df_train_noAge = prepare_dataframe(df_train_noAge, drop_feature_names_w_age, fill_fare_mean)
df_test_noAge = prepare_dataframe(df_test_noAge, drop_feature_names_w_age, fill_fare_mean)

In [17]:
# Separate the target column from the feature columns.
y_noAge = df_train_noAge['Survived']
X_noAge = df_train_noAge.drop(columns='Survived')

In [18]:
# Capture the training statistics before X_noAge is rescaled, so the test set
# goes through exactly the transformation the classifier is trained on.
# Standardizing the test set with its own mean/std would put the two sets on
# different scales.
train_mean_noAge = X_noAge.mean()
train_std_noAge = X_noAge.std()

X_noAge = normalize_features(X_noAge)
df_test_noAge = (df_test_noAge - train_mean_noAge) / train_std_noAge

In [19]:
# Compare the candidate models on the normalized training features (mean-fare run).
perform_cross_validation(X_noAge, y_noAge)

Model		CV Mean		CV std
{'SVM': (0.8058052434456927, 0.02817776516195646), 'KNN': (0.8092384519350813, 0.04749914265913915), 'LR': (0.7934706616729088, 0.024861946815253916)}


In [20]:
# Fit a default k-nearest-neighbours classifier on the full training set.
clf_noAge = KNeighborsClassifier()
clf_noAge.fit(X_noAge, y_noAge)

# Predict the test passengers and index the predictions by the test frame's index.
predictions_noAge = clf_noAge.predict(df_test_noAge)
submission1_noAge = pd.DataFrame(
    {'Survived': predictions_noAge},
    index=df_test_noAge.index,
)

submission1_noAge.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1


In [21]:
# Write the mean-fare predictions to a CSV file in the working directory.
submission1_noAge.to_csv('Titanic_Submission-No_Age-Mean_Fare-2023_6_14.csv')

In [22]:
# Second run (median fare fill): reload the raw CSVs indexed by PassengerId,
# rebinding the names used by the mean-fare run.
df_train_noAge = pd.read_csv(train_filepath, index_col='PassengerId')
df_test_noAge = pd.read_csv(test_filepath, index_col='PassengerId')

In [23]:
# The standard drop list with 'Age' appended (same list as in the mean-fare run).
drop_feature_names_w_age = [*drop_feature_names, 'Age']

In [24]:
# Preprocess both frames without 'Age', using the median fare as the Fare fill value.
# Both names are rebound to the processed frames, so the raw data has to be
# reloaded before this cell can be run again.
df_train_noAge = prepare_dataframe(df_train_noAge, drop_feature_names_w_age, fill_fare_med)
df_test_noAge = prepare_dataframe(df_test_noAge, drop_feature_names_w_age, fill_fare_med)

In [25]:
# Separate the target column from the feature columns.
y_noAge = df_train_noAge['Survived']
X_noAge = df_train_noAge.drop(columns='Survived')

In [26]:
# Capture the training statistics before X_noAge is rescaled, so the test set
# goes through exactly the transformation the classifier is trained on.
# Standardizing the test set with its own mean/std would put the two sets on
# different scales.
train_mean_noAge = X_noAge.mean()
train_std_noAge = X_noAge.std()

X_noAge = normalize_features(X_noAge)
df_test_noAge = (df_test_noAge - train_mean_noAge) / train_std_noAge

In [27]:
# Compare the candidate models on the normalized training features (median-fare run).
# NOTE(review): the recorded output is identical to the mean-fare run; presumably
# the Fare fill never touches a training row - verify.
perform_cross_validation(X_noAge, y_noAge)

Model		CV Mean		CV std
{'SVM': (0.8058052434456927, 0.02817776516195646), 'KNN': (0.8092384519350813, 0.04749914265913915), 'LR': (0.7934706616729088, 0.024861946815253916)}


In [28]:
# Fit a default k-nearest-neighbours classifier on the full training set.
clf_noAge = KNeighborsClassifier()
clf_noAge.fit(X_noAge, y_noAge)

# Predict the test passengers and index the predictions by the test frame's index.
predictions_noAge = clf_noAge.predict(df_test_noAge)
submission1_noAge = pd.DataFrame(
    {'Survived': predictions_noAge},
    index=df_test_noAge.index,
)

submission1_noAge.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1


In [29]:
# Write the median-fare predictions to a CSV file in the working directory.
submission1_noAge.to_csv('Titanic_Submission-No_Age-Med_Fare-2023_6_14.csv')