In [120]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [121]:
# Read the three CSV files under dataset/ in a fixed order and bind each
# resulting DataFrame to its own module-level name.
df_train, df_test, df_targets = (
    pd.read_csv(csv_path)
    for csv_path in (
        'dataset/train.csv',
        'dataset/test.csv',
        'dataset/gender_submission.csv',
    )
)

In [122]:
# Predict the cabin letter for the rows whose 'Cabin' value is NaN. Pclass, Fare, Relatives and NumTicket
# are the independent variables (X) used to estimate the cabin letter as the dependent variable (y) with Logistic Regression.
def predict_nan_cabin(df, model, test):
    """Predict a 'Cabin' value for every row of ``df`` whose 'Cabin' is null.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Cabin', 'Pclass', 'Fare', 'Relatives' and 'NumTicket'.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    test : bool
        When False, ``model`` is first fitted on the rows whose 'Cabin' is
        known. When True, ``model`` is used as-is.

    Returns
    -------
    array-like
        One prediction per null-'Cabin' row, in row order. An empty array is
        returned when no row is missing its cabin.
    """
    feature_cols = ['Pclass', 'Fare', 'Relatives', 'NumTicket']
    has_cabin = df['Cabin'].notnull()

    if not test:
        model.fit(df.loc[has_cabin, feature_cols], df.loc[has_cabin, 'Cabin'])

    nan_inputs = df.loc[~has_cabin, feature_cols]

    # Predicting on zero rows is rejected by scikit-learn estimators, so a
    # frame with nothing to impute short-circuits here instead of raising.
    if nan_inputs.empty:
        return np.array([], dtype=object)

    return model.predict(nan_inputs)

In [123]:

def preprocess_inputs(df, standardScaler, minMaxScaler, model, data, test = False):
    """Build the model-ready feature frame from a raw passenger frame.

    The training pass (``test=False``) learns every piece of state (imputation
    means, the cabin model, both scalers and the final column layout) and the
    test pass (``test=True``) reuses it.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with 'PassengerId', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
        'Ticket', 'Fare', 'Cabin', 'Embarked', 'Pclass' and, on the training
        pass, 'Survived'. The frame is NOT modified; a copy is processed.
    standardScaler, minMaxScaler : objects with fit_transform / transform
        Applied to ['Age', 'Fare'] and ['Pclass', 'Relatives'] respectively.
    model : object
        Estimator forwarded to ``predict_nan_cabin`` to impute missing cabins.
    data : dict
        Carries state between passes. The training pass writes
        'mean_male_age', 'mean_female_age', 'mean_fare' and 'feature_columns';
        the test pass reads them.
    test : bool, default False
        Selects the test pass.

    Returns
    -------
    pandas.DataFrame
        Feature frame without 'PassengerId', 'Name', 'Ticket' (and without
        'Survived' on the training pass).

    Raises
    ------
    KeyError
        If ``test=True`` and ``data`` lacks the imputation means.
    """
    # Work on a copy: assigning through .loc on the argument would otherwise
    # overwrite the caller's frame, making a second run see altered input.
    df = df.copy()

    # Masks are taken before 'Sex' is re-encoded below.
    is_male = df['Sex'] == 'male'
    is_female = df['Sex'] == 'female'
    age_missing = df['Age'].isnull()

    # Imputation statistics come from the training pass only.
    if not test:
        data['mean_male_age'] = df.loc[is_male, 'Age'].mean()
        data['mean_female_age'] = df.loc[is_female, 'Age'].mean()
        data['mean_fare'] = df['Fare'].mean()

    # Replace NaN values with the stored mean in 'Fare'
    df.loc[df['Fare'].isnull(), 'Fare'] = data['mean_fare']

    # Replace NaN values with the stored per-sex mean in 'Age'
    df.loc[is_male & age_missing, 'Age'] = data['mean_male_age']
    df.loc[is_female & age_missing, 'Age'] = data['mean_female_age']

    # Codify categorical 'Sex' feature
    df.loc[is_male, 'Sex'] = 0
    df.loc[is_female, 'Sex'] = 1

    # Parse the ticket number: last whitespace-separated token of 'Ticket'.
    # Tokens that are not numeric (e.g. 'LINE') become -1, and the column is
    # stored as integers rather than a mix of strings and ints.
    ticket_tail = df['Ticket'].str.split().str[-1]
    df['NumTicket'] = pd.to_numeric(ticket_tail, errors='coerce').fillna(-1).astype('int64')

    # Generate new feature 'Relatives' which is the sum of 'SibSp' and 'Parch'
    df['Relatives'] = df['SibSp'] + df['Parch']

    # Codify categorical 'Embarked' feature with one-hot encoding
    df = pd.get_dummies(df, columns=['Embarked'])

    # Keep only the cabin letter, then impute the missing ones with the model
    df['Cabin'] = df['Cabin'].str[0]
    cabin_missing = df['Cabin'].isnull()
    df.loc[cabin_missing, 'Cabin'] = predict_nan_cabin(df, model, test)

    df = pd.get_dummies(df, columns=['Cabin'])

    # Scale with the shared scalers: fit on the training pass, reuse on test
    if test:
        df[['Age', 'Fare']] = standardScaler.transform(df[['Age', 'Fare']])
        df[['Pclass', 'Relatives']] = minMaxScaler.transform(df[['Pclass', 'Relatives']])
    else:
        df[['Age', 'Fare']] = standardScaler.fit_transform(df[['Age', 'Fare']])
        df[['Pclass', 'Relatives']] = minMaxScaler.fit_transform(df[['Pclass', 'Relatives']])

    dropped = ['PassengerId', 'Name', 'Ticket']
    if not test:
        dropped.append('Survived')
    df = df.drop(dropped, axis=1)

    if test:
        train_columns = data.get('feature_columns')
        if train_columns is not None:
            # Match the training layout exactly: dummy columns absent here are
            # added as all-False, unseen ones are dropped, order is preserved.
            df = df.reindex(columns=train_columns, fill_value=False)
        elif 'Cabin_T' not in df.columns:
            # Fallback for a `data` dict without a recorded layout: add the
            # 'Cabin_T' dummy, but never overwrite a real one.
            df['Cabin_T'] = False
    else:
        data['feature_columns'] = list(df.columns)

    return df


In [124]:
def extract_targets(df):
    """Return the 'Survived' column of ``df``, used as the target labels."""
    target_column = 'Survived'
    return df[target_column]

In [125]:
# Stateful helpers shared by both preprocessing passes: the same objects are
# handed to the training call and then to the test call below.
data = {}
lr = LogisticRegression(multi_class='ovr', max_iter=1000)
minMaxScaler = MinMaxScaler()
standardScaler = StandardScaler()

# Training pass (test keeps its default of False) plus the labels.
train_targets = extract_targets(df_train)
train_inputs = preprocess_inputs(df_train, standardScaler, minMaxScaler, lr, data)

# Test pass with the same helper objects.
test_inputs = preprocess_inputs(df_test, standardScaler, minMaxScaler, lr, data, test=True)

test_inputs



Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,NumTicket,Relatives,Embarked_C,Embarked_Q,Embarked_S,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T
0,1.0,0,0.366245,0,0,-0.490783,330911,0.0,False,True,False,False,False,False,False,False,True,False,False
1,1.0,1,1.327222,1,0,-0.507479,363272,0.1,False,False,True,False,False,False,False,False,True,False,False
2,0.5,0,2.480395,0,0,-0.453367,240276,0.0,False,True,False,False,False,False,False,True,False,False,False
3,1.0,0,-0.210341,0,0,-0.474005,315154,0.0,False,False,True,False,False,False,False,False,True,False,False
4,1.0,1,-0.594732,1,1,-0.401017,3101298,0.2,False,False,True,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1.0,0,0.076156,0,0,-0.486337,3236,0.0,False,False,True,False,False,False,True,False,False,False,False
414,0.0,1,0.712197,0,0,1.544246,17758,0.0,True,False,False,False,False,True,False,False,False,False,False
415,1.0,0,0.673758,0,0,-0.502445,3101262,0.0,False,False,True,False,False,False,False,False,True,False,False
416,1.0,0,0.076156,0,0,-0.486337,359309,0.0,False,False,True,False,False,False,False,False,True,False,False


In [126]:
# Seed the forest so repeated runs of the notebook give the same predictions
# and therefore the same submission file.
model = RandomForestClassifier(random_state=42)

model.fit(train_inputs, train_targets)

predictions = model.predict(test_inputs)
pred_df = pd.DataFrame(predictions, columns=['Survived'])

pass_df = df_test['PassengerId']
# concat(axis=1) aligns on index labels; pred_df has a fresh 0..n-1 index, so
# the ids are re-indexed the same way to pair rows strictly by position.
submission_df = pd.concat([pass_df.reset_index(drop=True), pred_df], axis = 1)
submission_df.to_csv('submission.csv', index=False)

In [127]:
# NOTE(review): this repeats the to_csv call that already ends the previous
# cell; with submission_df unchanged it only rewrites the same file.
submission_df.to_csv('submission.csv', index=False)