In [2]:
# Mount Google Drive inside the Colab VM, then make the project folder the
# working directory so the relative data/model paths used in later cells
# resolve against it.
from google.colab import drive
drive.mount('/content/gdrive')
%cd /content/gdrive/MyDrive/AI/Projects/Titanic

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/AI/Projects/Titanic


## 1. Load Data

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pickle

In [4]:
# Training table: the "Survived" column is the target, everything else is a feature.
train_data = pd.read_csv('datasets_for_preprocessing/train_for_pipeline.csv')
y_train = train_data["Survived"]
X_train = train_data.drop(columns="Survived")

# Feature table to predict on for the submission.
X_test = pd.read_csv('datasets_for_preprocessing/test_for_pipeline.csv')

# Separate test file; later cells read its "PassengerId" column for the submission.
df_test = pd.read_csv('datasets/test.csv')

## 2. Models



In [14]:
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Splitter passed to the grid searches below: 3 stratified folds, taken in
# row order (shuffle=False), so the fold assignment is deterministic.
cv = StratifiedKFold(n_splits=3, shuffle=False)

In [15]:
def for_kaggle_and_save_model(model, model_gridcv, X=None, passenger_ids=None):
  """Write a Kaggle submission CSV and pickle the fitted search object.

  Parameters
  ----------
  model : object
      Only its class name is used, to build the output file names
      ``datasets_results/<ClassName>.csv`` and ``saved_models/<ClassName>.pkl``.
  model_gridcv : object
      Fitted object exposing ``predict``. It produces the predictions and is
      the object that gets pickled.
  X : optional
      Features to predict on. Defaults to the notebook-level ``X_test``.
  passenger_ids : optional
      Values for the ``PassengerId`` column. Defaults to the notebook-level
      ``df_test["PassengerId"]``.

  Returns
  -------
  None
      The function is called for its side effects (two files written).
  """
  # Local import: the notebook's import cell does not bring `os` into scope.
  import os

  # Fall back to the notebook globals so existing two-argument calls keep working.
  if X is None:
    X = X_test
  if passenger_ids is None:
    passenger_ids = df_test["PassengerId"]

  model_name = model.__class__.__name__

  # a. For Kaggle
  y_pred = model_gridcv.predict(X)
  df_result = pd.DataFrame({
        "PassengerId": passenger_ids,
        "Survived": y_pred})
  # Create the output folder on first use instead of failing with FileNotFoundError.
  os.makedirs("datasets_results", exist_ok=True)
  df_result.to_csv("datasets_results/" + model_name + ".csv", index = False)

  # b. Save the model
  os.makedirs("saved_models", exist_ok=True)
  pkl_filename = "saved_models/" + model_name + ".pkl"
  with open(pkl_filename, 'wb') as file:
    pickle.dump(model_gridcv, file)

## a. Forest

In [16]:
# Two-step pipeline: fill missing values with the column mean, then fit a
# random forest with a fixed seed.
imp = SimpleImputer(strategy='mean')
forest = RandomForestClassifier(random_state=123)
steps = [('imp', imp), ('forest', forest)]
pipe = Pipeline(steps=steps)

# 4 x 5 = 20 candidates; keys address the 'forest' step of the pipeline.
param_grid = {
    'forest__n_estimators': [100, 200, 300, 400],
    'forest__max_depth': [2, 3, 4, 5, 6],
}
forest_gridcv = GridSearchCV(
    pipe,
    param_grid=param_grid,
    cv=cv,
    scoring="accuracy",
)

forest_gridcv.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('imp',
                                        SimpleImputer(add_indicator=False,
                                                      copy=True,
                                                      fill_value=None,
                                                      missing_values=nan,
                                                      strategy='mean',
                                                      verbose=0)),
                                       ('forest',
                                        RandomForestClassifier(bootstrap=True,
                                                               ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criterion='gini',
      

In [17]:
# Hyper-parameters of the best-scoring candidate. Written as a bare expression
# (no trailing comma) so the cell displays the dict itself, not a 1-tuple.
forest_gridcv.best_params_

{'forest__max_depth': 5, 'forest__n_estimators': 100}

In [18]:
 # Left: mean cross-validated accuracy of the best candidate.
 # Right: accuracy on the same data the search was fitted on, so it is an
 # optimistic figure rather than a generalisation estimate.
 forest_gridcv.best_score_ , forest_gridcv.score(X_train, y_train)

(0.8181818181818182, 0.8496071829405163)

In [19]:
# `forest` only supplies the class name used in the output file names; the
# fitted `forest_gridcv` makes the predictions and is what gets pickled.
for_kaggle_and_save_model(forest, forest_gridcv)

## b. Gradient Boosting

In [22]:
# Fixed seed (same value as the random forest) so the boosting fits, and hence
# the selected hyper-parameters, are repeatable on a fresh kernel.
gb = GradientBoostingClassifier(random_state=123)

# Build the imputer here instead of reusing `imp` from the forest cell, so this
# cell does not depend on state left behind by another cell.
steps = [('imp', SimpleImputer(strategy='mean')), ('gb', gb)]
pipe = Pipeline(steps)

# 4 x 5 x 5 = 100 candidates, each evaluated on the folds produced by `cv`.
param_grid = {
    'gb__n_estimators': [100, 200, 300, 400],
    'gb__max_depth': [2, 3, 4, 5, 6],
    "gb__learning_rate": [0.001, 0.01, 0.1, 0.2, 0.3],
}
gb_gridcv = GridSearchCV(pipe, param_grid = param_grid, cv=cv, scoring="accuracy")

gb_gridcv.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('imp',
                                        SimpleImputer(add_indicator=False,
                                                      copy=True,
                                                      fill_value=None,
                                                      missing_values=nan,
                                                      strategy='mean',
                                                      verbose=0)),
                                       ('gb',
                                        GradientBoostingClassifier(ccp_alpha=0.0,
                                                                   criterion='friedman_mse',
                                                                   init=None,
                                                                   learning_r

In [23]:
# Hyper-parameters of the best-scoring gradient boosting candidate.
gb_gridcv.best_params_

{'gb__learning_rate': 0.1, 'gb__max_depth': 2, 'gb__n_estimators': 300}

In [24]:
# Left: mean cross-validated accuracy of the best candidate.
# Right: accuracy on the same data the search was fitted on, so it is an
# optimistic figure rather than a generalisation estimate.
gb_gridcv.best_score_ , gb_gridcv.score(X_train, y_train)

(0.8215488215488217, 0.8945005611672279)

In [25]:
# `gb` only supplies the class name used in the output file names; the fitted
# `gb_gridcv` makes the predictions and is what gets pickled.
for_kaggle_and_save_model(gb, gb_gridcv)