In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd /content/gdrive/MyDrive/AI/Projects/Titanic

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/AI/Projects/Titanic


# I. Load Data

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pickle

In [3]:
# Train/test tables that feed the modelling pipelines.
train_data = pd.read_csv('datasets_for_preprocessing/train_for_pipeline.csv')
X_test = pd.read_csv('datasets_for_preprocessing/test_for_pipeline.csv')

# Separate the "Survived" target column from the training features.
y_train = train_data["Survived"]
X_train = train_data.drop(columns=["Survived"])

# Second test file; its "PassengerId" column labels the rows of the submission.
df_test = pd.read_csv('datasets/test.csv')

# II. Our two models using Grid Search

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.pipeline import Pipeline

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection  import StratifiedKFold

# One unshuffled stratified splitter, shared by both grid searches.
N_CV_SPLITS = 3
cv = StratifiedKFold(n_splits=N_CV_SPLITS, shuffle=False)

## 1. KNN

In [5]:
# KNN pipeline: mean-impute missing values, standardise, then classify.
imp = SimpleImputer(strategy='mean')
scaler = StandardScaler()
model = KNeighborsClassifier()

steps = [
    ('imp', imp),
    ('scaler', scaler),
    ('model', model),
]
pipe = Pipeline(steps)

# Candidate neighbourhood sizes: every k from 2 to 10 inclusive.
param_grid = {"model__n_neighbors": list(range(2, 11))}

model_gridcv = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="accuracy",
    cv=cv,
)

model_gridcv.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('imp',
                                        SimpleImputer(add_indicator=False,
                                                      copy=True,
                                                      fill_value=None,
                                                      missing_values=nan,
                                                      strategy='mean',
                                                      verbose=0)),
                                       ('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('model',
                                        KNeighborsClassifier(algorit

In [6]:
# Winning KNN hyper-parameters, displayed as a one-element tuple.
(model_gridcv.best_params_,)

({'model__n_neighbors': 4},)

In [7]:
# Best mean cross-validated accuracy next to the accuracy on the training set.
knn_cv_accuracy = model_gridcv.best_score_
knn_train_accuracy = model_gridcv.score(X_train, y_train)
knn_cv_accuracy, knn_train_accuracy

(0.8103254769921436, 0.8698092031425365)

In [9]:
# Every KNN candidate ranked by mean cross-validated accuracy, best first.
knn_cv_table = pd.DataFrame(model_gridcv.cv_results_)
knn_cv_table[['params', 'mean_test_score']].sort_values(by="mean_test_score", ascending=False)

Unnamed: 0,params,mean_test_score
2,{'model__n_neighbors': 4},0.810325
7,{'model__n_neighbors': 9},0.806958
1,{'model__n_neighbors': 3},0.803591
0,{'model__n_neighbors': 2},0.802469
3,{'model__n_neighbors': 5},0.802469
4,{'model__n_neighbors': 6},0.802469
6,{'model__n_neighbors': 8},0.802469
5,{'model__n_neighbors': 7},0.801347
8,{'model__n_neighbors': 10},0.801347


In [21]:
def for_kaggle_and_save_model(model, model_gridcv, X=None, passenger_ids=None,
                              results_dir="datasets_results",
                              models_dir="saved_models"):
    """Write a Kaggle submission CSV and pickle the fitted search object.

    Both output files are named after the class of ``model``
    (``<ClassName>.csv`` and ``<ClassName>.pkl``). The output folders are
    created when they do not exist yet, so a fresh checkout does not fail
    with ``FileNotFoundError``.

    Parameters
    ----------
    model : object
        Only its class name is used, as the stem of both file names.
    model_gridcv : object
        Fitted object exposing ``predict``; it is the object that gets pickled.
    X : optional
        Features to predict on. When omitted, the notebook-level ``X_test``
        is used (the original behaviour).
    passenger_ids : optional
        Values for the ``PassengerId`` column. When omitted, the notebook-level
        ``df_test["PassengerId"]`` is used (the original behaviour).
    results_dir : str or path-like, default "datasets_results"
        Folder receiving the submission CSV.
    models_dir : str or path-like, default "saved_models"
        Folder receiving the pickle file.

    Returns
    -------
    None
    """
    # Local import keeps this cell runnable without touching the import cell.
    from pathlib import Path

    # Fall back to the notebook globals only when the caller gave nothing.
    features = X_test if X is None else X
    ids = df_test["PassengerId"] if passenger_ids is None else passenger_ids
    file_stem = model.__class__.__name__

    # a. For Kaggle
    y_pred = model_gridcv.predict(features)
    df_result = pd.DataFrame({"PassengerId": ids, "Survived": y_pred})
    results_path = Path(results_dir)
    results_path.mkdir(parents=True, exist_ok=True)
    df_result.to_csv(results_path / (file_stem + ".csv"), index=False)

    # b. Save the model
    models_path = Path(models_dir)
    models_path.mkdir(parents=True, exist_ok=True)
    with open(models_path / (file_stem + ".pkl"), "wb") as file:
        pickle.dump(model_gridcv, file)

In [22]:
# Export the KNN test predictions as a CSV and pickle the fitted search.
for_kaggle_and_save_model(model=model, model_gridcv=model_gridcv)

## 2. Kernel SVM

In [14]:
# Kernel-SVM pipeline: mean-impute missing values, min-max scale, then SVC.
# The imputer is created in this cell so the pipeline does not depend on a
# variable left over from another section of the notebook.
imp = SimpleImputer(strategy='mean')
scaler = MinMaxScaler()
svc = SVC()

steps = [('imp', imp), ('scaler', scaler), ('svc', svc)]
pipe = Pipeline(steps)

# Search two kernels against a hand-picked list of C values.
param_grid = {
    'svc__kernel': ['rbf', "sigmoid"],
    'svc__C': [0.1, 0.15, 0.2, 0.3, 0.5, 1, 2, 2.5, 3, 3.5, 4, 5, 6, 7, 8, 9, 10],
}
svc_gridcv = GridSearchCV(pipe, param_grid=param_grid, cv=cv, scoring="accuracy")

svc_gridcv.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('imp',
                                        SimpleImputer(add_indicator=False,
                                                      copy=True,
                                                      fill_value=None,
                                                      missing_values=nan,
                                                      strategy='mean',
                                                      verbose=0)),
                                       ('scaler',
                                        MinMaxScaler(copy=True,
                                                     feature_range=(0, 1))),
                                       ('svc',
                                        SVC(C=1.0, break_ties=False,
                                            cache_size=200, cl...
   

In [15]:
# Winning SVC hyper-parameters, displayed as a one-element tuple.
(svc_gridcv.best_params_,)

({'svc__C': 3, 'svc__kernel': 'rbf'},)

In [17]:
# Best mean cross-validated accuracy next to the accuracy on the training set.
svc_cv_accuracy = svc_gridcv.best_score_
svc_train_accuracy = svc_gridcv.score(X_train, y_train)
svc_cv_accuracy, svc_train_accuracy

(0.8204264870931537, 0.8260381593714927)

In [19]:
# Five best SVC candidates by mean cross-validated accuracy.
svc_cv_table = pd.DataFrame(svc_gridcv.cv_results_)
svc_cv_table[['params', 'mean_test_score']].sort_values(by="mean_test_score", ascending=False).head()

Unnamed: 0,params,mean_test_score
20,"{'svc__C': 4, 'svc__kernel': 'rbf'}",0.820426
18,"{'svc__C': 3.5, 'svc__kernel': 'rbf'}",0.820426
16,"{'svc__C': 3, 'svc__kernel': 'rbf'}",0.820426
14,"{'svc__C': 2.5, 'svc__kernel': 'rbf'}",0.819304
22,"{'svc__C': 5, 'svc__kernel': 'rbf'}",0.819304


In [23]:
# Export the SVC test predictions as a CSV and pickle the fitted search.
for_kaggle_and_save_model(model=svc, model_gridcv=svc_gridcv)