In [253]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

In [254]:
pd.options.mode.chained_assignment = None
import warnings
warnings.filterwarnings('ignore')

In [255]:
train = pd.read_csv('train.csv')
train, test = train_test_split(train, test_size = 0.2)
train_ = train.copy()

X = train_[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare','Name', 'Embarked']]
y = train_[['Survived']]

In [256]:
#Fill in NA values for Age Column, based on Suffix
def fillin_age(df, dic):
    df.Age = df.Age.fillna(df.Suffix.map(dic))

In [257]:
#Data preparation Function
def prepare_data(X):
    #Fill in NA values for Age, based on Suffix
    X['Suffix']= X['Name'].str.findall(r'Miss\.|Mr\.|Mrs\.|Master\.|')
    X['Suffix'] = X['Suffix'].apply(lambda x: x[0] if len(x)!=0 else np.NaN)
    suffix_age_dict = X.groupby(['Suffix'])['Age'].median().to_dict()
    suffix_age_dict['Normal'] = X['Age'].median()
    X['Age_null'] = X['Age'].isnull()
    X['Suffix'] = X['Suffix'].fillna('Normal')
    fillin_age(X, suffix_age_dict)
    X = X.drop(['Name', 'Suffix', 'Age_null'], axis = 1)
    
    #Fill in NA values for Embarked
    X['Embarked'] = X['Embarked'].fillna('S')
    #Add a column indicate whether Embarked == 'S'
    X['S'] = (X['Embarked'] == 'S').astype(np.int)
    X = X.drop('Embarked', axis = 1)
    
    #Add a feature for family size
    X['FamilySize'] = X['SibSp'] + X['Parch'] + 1
    
    #Add a feature for whether the person is alone on the Titanic or not 
    X['IsAlone'] = 1 
    X['IsAlone'].loc[X['FamilySize'] > 1] = 0 
    
    #Add a feature for class 1 female
    X.loc[:, '1female'] = ((X['Sex'] == 'female') & (X['Pclass'] == 1)).astype(np.int)
    
    #Add a feature for class 3 male
    X.loc[:, '3male'] = ((X['Sex'] == 'male') & (X['Pclass'] == 3)).astype(np.int)
    
    #Add a feature for class1,2 mother
    X['mother'] = ((X['Parch'] >= 1) & (X['Sex'] == 'female') & (X['Age'] >= 18) & (X['Pclass'] != 3)).astype(np.int)
    
    #Create Column Transformer Pipeline
    pl = ColumnTransformer([('1hot', OneHotEncoder(), ['Sex']),('scaler', StandardScaler(), ['Fare'])], 
                           remainder ='passthrough')
    X_prepared = pl.fit_transform(X)
    return X_prepared

In [258]:
X_prepared = prepare_data(X)

In [259]:
# log_reg = LogisticRegression(C = 0.1,tol = 1e-6)
log_reg = XGBClassifier()
y = y['Survived'].tolist()

log_reg.fit(X_prepared, y)

XGBClassifier()

In [260]:
X_test = test[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Name', 'Embarked']]
y_test = test[['Survived']]['Survived'].tolist()

test_prepared = prepare_data(X_test)

In [261]:
predictions = log_reg.predict(test_prepared)

In [262]:
accuracy_score(predictions, y_test)

0.8156424581005587

In [263]:
confusion_matrix(predictions, y_test)

array([[101,  25],
       [  8,  45]])

In [264]:
recall_score(predictions, y_test)

0.8490566037735849

In [265]:
precision_score(predictions, y_test)

0.6428571428571429

In [266]:
f1_score(predictions, y_test)

0.7317073170731708

## Predictions

In [267]:
test_data = pd.read_csv('test.csv')
test_data['Fare'][152] = test_data['Fare'].median()

In [268]:
X_test_ = test_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch','Fare', 'Name', 'Embarked']]

In [269]:
X_test_prepared_ = prepare_data(X_test_)

In [270]:
predictions = log_reg.predict(X_test_prepared_)
result = pd.DataFrame({'PassengerId':test_data.PassengerId.tolist(), 'Survived':log_reg.predict(X_test_prepared_).tolist()})

In [271]:
result.to_csv('result8.csv',index = False)