In [250]:
import pandas as pd
import numpy as np

In [251]:
# Load the training set; the preview below shows it carries the Survived label column.
train_data = pd.read_csv('./data/train.csv')

In [252]:
# Preview the first rows of the training data.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [253]:
# Load the test set; per the preview below it has the same columns as train except Survived.
test_data = pd.read_csv('./data/test.csv')

In [254]:
# Preview the first rows of the test data.
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [255]:
# List the test rows whose Fare is missing.
test_data.loc[test_data.Fare.isna()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S


In [256]:
# Narrow the test set to the columns needed for the grouped fare medians computed below.
test_part = test_data[['Pclass', 'Sex', 'Fare', 'Embarked']]

In [257]:
# Preview the reduced frame.
test_part.head()

Unnamed: 0,Pclass,Sex,Fare,Embarked
0,3,male,7.8292,Q
1,3,female,7.0,S
2,2,male,9.6875,Q
3,3,male,8.6625,S
4,3,female,12.2875,S


In [258]:
# Median Fare for every (Sex, Pclass, Embarked) combination present in the test set.
grp = test_part.groupby(['Sex', 'Pclass', 'Embarked']).median()

In [259]:
# Display the grouped medians.
grp

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Fare
Sex,Pclass,Embarked,Unnamed: 3_level_1
female,1,C,81.17915
female,1,Q,90.0
female,1,S,61.175
female,2,C,32.3625
female,2,S,24.5
female,3,C,11.48545
female,3,Q,7.75
female,3,S,12.2875
male,1,C,73.2625
male,1,S,42.4


In [260]:
# Fill the missing test Fare (row 152 in the listing above) with a fixed value.
# NOTE(review): 7.9875 is presumably the median fare of the matching (male, 3, S) group
# in `grp`; the printed table is truncated before that row, so confirm.
# Assign the column back instead of calling fillna(inplace=True) on `test_data.Fare`:
# the chained in-place form acts on a column selection and does not update the frame
# under pandas Copy-on-Write.
test_data['Fare'] = test_data['Fare'].fillna(7.98750)

In [261]:
# Inspect row 152 (the passenger whose Fare was missing) after the fill.
test_data.iloc[152]

PassengerId                  1044
Pclass                          3
Name           Storey, Mr. Thomas
Sex                          male
Age                          60.5
SibSp                           0
Parch                           0
Ticket                       3701
Fare                       7.9875
Cabin                         NaN
Embarked                        S
Name: 152, dtype: object

In [262]:
def prepare_data(data):
    """Turn a raw Titanic passenger frame into a numeric feature frame.

    The input frame is not modified; a new frame sharing its index is returned.
    All encodings are fixed rather than learned from ``data``, so a training
    frame and a test frame always receive the same codes and the same columns,
    even when one of them is missing a category.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``PassengerId``, ``Name``, ``Sex``, ``Age``, ``SibSp``,
        ``Parch``, ``Ticket``, ``Fare``, ``Cabin`` and ``Embarked``. Every other
        column (e.g. ``Pclass``, ``Survived``) is passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        The pass-through columns followed by:

        * ``Sex`` - 0.0 for ``'female'``, 1.0 for ``'male'`` (replaces the text column).
        * ``Family_Count`` - ``SibSp + Parch``.
        * ``Fare_Category`` - 0.0..3.0 for the fare bins low / mid_low / mid_high / high.
        * ``Embarked_C``, ``Embarked_Q``, ``Embarked_S`` - 0/1 indicators; a missing
          port is treated as ``'S'``.
        * ``Age_Category`` - 0.0..4.0 for baby / child / young_adult /
          middle-age_adult / senior; missing ages are replaced by the mean age
          of ``data`` before binning.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    # Right-closed bin edges, e.g. a fare of exactly 7.9104 still falls in the lowest bin.
    fare_bins = [-np.inf, 7.9104, 14.454200, 31, np.inf]
    age_bins = [-np.inf, 3, 16, 30, 45, np.inf]

    # drop() returns a new frame, so every assignment below leaves `data` untouched.
    features = data.drop(
        labels=['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin',
                'SibSp', 'Parch', 'Age', 'Embarked'],
        axis=1,
    )

    # Overwriting 'Sex' keeps the column in its original position.
    features['Sex'] = data['Sex'].map({'female': 0.0, 'male': 1.0})
    features['Family_Count'] = data['SibSp'] + data['Parch']

    # labels=False yields the bin index directly, which is the ordinal code we want.
    features['Fare_Category'] = pd.cut(data['Fare'], bins=fare_bins, labels=False).astype(float)

    # Always emit all three port indicators, whichever ports occur in this frame.
    embarked = data['Embarked'].fillna('S')
    for port in ('C', 'Q', 'S'):
        features['Embarked_' + port] = (embarked == port).astype(int)

    ages = data['Age'].fillna(data['Age'].mean())
    features['Age_Category'] = pd.cut(ages, bins=age_bins, labels=False).astype(float)

    return features
    

In [263]:
def prepare_train_data(data):
    """Split a raw training frame into scaled features and the ``Survived`` target.

    The frame is first run through ``prepare_data``. The ``Pclass``,
    ``Family_Count``, ``Fare_Category`` and ``Age_Category`` columns are then
    min-max scaled to the range [0, 1] using their own minimum and maximum.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw training frame, including the ``Survived`` column.

    Returns
    -------
    tuple
        ``(features, labels)``: the prepared frame without ``Survived``, and
        the ``Survived`` column taken from ``data``.
    """
    features = prepare_data(data)

    from sklearn.preprocessing import MinMaxScaler
    ordinal_columns = ['Pclass', 'Family_Count', 'Fare_Category', 'Age_Category']
    min_max = MinMaxScaler(feature_range=(0, 1))
    features[ordinal_columns] = min_max.fit_transform(features[ordinal_columns])

    # The label must not remain among the model inputs.
    features = features.drop(columns=['Survived'])
    labels = data['Survived']

    return features, labels

In [264]:
def prepare_test_data(train_data, test_data):
    """Build the test feature frame, scaled with ranges learned from the training set.

    Both frames are run through ``prepare_data``. A ``MinMaxScaler`` is fitted on
    the prepared training features only and then applied to the test features,
    so both sets are scaled with the same per-column minimum and maximum.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Raw training frame; used only to fit the scaler.
    test_data : pandas.DataFrame
        Raw test frame to prepare and scale.

    Returns
    -------
    pandas.DataFrame
        The prepared test features with ``Pclass``, ``Family_Count``,
        ``Fare_Category`` and ``Age_Category`` scaled.
    """
    from sklearn.preprocessing import MinMaxScaler

    scaled_columns = ['Pclass', 'Family_Count', 'Fare_Category', 'Age_Category']

    train = prepare_data(train_data)
    x_test = prepare_data(test_data)

    scaler = MinMaxScaler(feature_range=(0, 1))
    # fit() returns the scaler itself, not scaled data, so its result must not be
    # written back into the training columns.
    scaler.fit(train[scaled_columns])
    x_test[scaled_columns] = scaler.transform(x_test[scaled_columns])

    return x_test

In [265]:
# Build the scaled training features and the Survived target.
x_train, y_train = prepare_train_data(train_data)

In [266]:
# Preview the prepared training features.
x_train.head()

Unnamed: 0,Pclass,Sex,Family_Count,Fare_Category,Embarked_C,Embarked_Q,Embarked_S,Age_Category
0,1.0,1.0,0.1,0.0,0,0,1,0.5
1,0.0,0.0,0.1,1.0,1,0,0,0.75
2,1.0,0.0,0.0,0.333333,0,0,1,0.5
3,0.0,0.0,0.1,1.0,0,0,1,0.75
4,1.0,1.0,0.0,0.333333,0,0,1,0.75


In [267]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyperparameters and a fixed seed.
rfc = RandomForestClassifier(random_state=42)

In [268]:
# Fit the random forest on the full training set.
rfc.fit(x_train, y_train)

In [269]:
from sklearn.model_selection import cross_val_score

# Mean accuracy of the random forest over 10-fold cross-validation.
cross_val_score(rfc, x_train, y_train, cv = 10, scoring="accuracy").mean()

0.8115230961298376

In [270]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, fitted on the full training set.
gnb = GaussianNB()
gnb.fit(x_train, y_train)

In [271]:
# Mean 10-fold cross-validated accuracy for Gaussian naive Bayes.
cross_val_score(gnb, x_train, y_train, cv = 10, scoring="accuracy").mean()

0.768876404494382

In [272]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes with default settings (note: stored under the name `cnb`).
cnb = BernoulliNB()
cnb.fit(x_train, y_train)

In [273]:
# Mean 10-fold cross-validated accuracy for Bernoulli naive Bayes.
cross_val_score(cnb, x_train, y_train, cv = 10, scoring="accuracy").mean()

0.753183520599251

In [274]:
from sklearn.naive_bayes import ComplementNB

# Complement naive Bayes with default settings, fitted on the full training set.
compnb = ComplementNB()
compnb.fit(x_train, y_train)

In [275]:
# Mean 10-fold cross-validated accuracy for complement naive Bayes.
cross_val_score(compnb, x_train, y_train, cv = 10, scoring="accuracy").mean()

0.7531710362047441

In [276]:
from sklearn.naive_bayes import CategoricalNB

# Categorical naive Bayes with default settings, fitted on the full training set.
# NOTE(review): CategoricalNB documents its inputs as integer category codes 0..n-1,
# while x_train holds min-max scaled floats (e.g. 0.333, 0.5) - confirm the model
# sees the intended categories.
catnb = CategoricalNB()
catnb.fit(x_train, y_train)

In [277]:
# Mean 10-fold cross-validated accuracy for categorical naive Bayes.
cross_val_score(catnb, x_train, y_train, cv = 10, scoring="accuracy").mean()

0.7856429463171037

In [278]:
from sklearn.svm import SVC

# Support vector classifier with default hyperparameters and a fixed seed.
svc = SVC(random_state=42)
svc.fit(x_train, y_train)

In [279]:
# Keep the ten per-fold accuracies so their spread can be inspected below.
svc_score = cross_val_score(svc, x_train, y_train, cv = 10, scoring="accuracy")
svc_score

array([0.82222222, 0.78651685, 0.78651685, 0.85393258, 0.86516854,
       0.79775281, 0.80898876, 0.76404494, 0.83146067, 0.79775281])

In [280]:
# Summary statistics (mean, std, quartiles) of the SVC fold accuracies.
pd.DataFrame(svc_score).describe()

Unnamed: 0,0
count,10.0
mean,0.811436
std,0.031761
min,0.764045
25%,0.789326
50%,0.803371
75%,0.829151
max,0.865169


In [281]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with default settings.
knc = KNeighborsClassifier()
knc.fit(x_train, y_train)

In [282]:
# Per-fold accuracies of the k-NN classifier over 10-fold cross-validation.
knc_score = cross_val_score(knc, x_train, y_train, cv = 10, scoring="accuracy")
knc_score

array([0.75555556, 0.82022472, 0.76404494, 0.83146067, 0.83146067,
       0.78651685, 0.80898876, 0.79775281, 0.83146067, 0.78651685])

In [283]:
# Summary statistics of the k-NN fold accuracies.
pd.DataFrame(knc_score).describe()

Unnamed: 0,0
count,10.0
mean,0.801398
std,0.02804
min,0.755556
25%,0.786517
50%,0.803371
75%,0.828652
max,0.831461


In [284]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [285]:
# Exhaustive search for the RBF-kernel SVC: C over the integers 1..49 combined with
# both gamma settings, each candidate scored by 10-fold cross-validated accuracy.
param_grid = {
    'C' : np.arange(1, 50),
    'kernel' : ['rbf'],
    'gamma' : ['scale', 'auto']
}

grid_search = GridSearchCV(svc, param_grid=param_grid, cv = 10, scoring='accuracy')
grid_search.fit(x_train, y_train)

In [286]:
# Best mean cross-validated accuracy found by the grid search.
grid_search.best_score_

0.8193133583021224

In [287]:
# Hyperparameter combination that achieved the best score.
grid_search.best_params_

{'C': 8, 'gamma': 'scale', 'kernel': 'rbf'}

In [288]:
from scipy.stats import reciprocal, expon

# Sampling distributions for a randomized SVC search: C from a reciprocal
# distribution on [20, 200000], gamma from an exponential distribution with scale 1.
# NOTE(review): neither param_distribs nor the RandomizedSearchCV import is used by
# any cell visible here - wire it into a search or remove it.
param_distribs = {
        'kernel': ['rbf'],
        'C': reciprocal(20, 200_000),
        'gamma': expon(scale=1.0),
    }

In [289]:
# Use the SVC with the best parameters found by the grid search for the final predictions.
svc_estimator = grid_search.best_estimator_

In [290]:
# Prepare the test features; the training frame is passed so the scaler is fitted on it.
x_test = prepare_test_data(train_data, test_data)

In [294]:
# Predict survival for every test passenger.
pred = svc_estimator.predict(x_test)

In [300]:
# Pair each test PassengerId with its predicted Survived value.
predictions = pd.DataFrame({
    'PassengerId' : test_data['PassengerId'],
    'Survived' : pred
})

In [303]:
# Preview the predictions table.
predictions.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [307]:
# Write the predictions to CSV without the index column.
# NOTE(review): assumes the ./predictions directory already exists - confirm or create it first.
predictions.to_csv('./predictions/pred_1.csv', index = False)