In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

# Load the three competition CSVs: labelled training rows, unlabelled test rows,
# and the sample submission (read here for its PassengerId column / row order).
train = pd.read_csv("/kaggle/input/titanic/train.csv")
test = pd.read_csv("/kaggle/input/titanic/test.csv")
gender_submission = pd.read_csv("/kaggle/input/titanic/gender_submission.csv")

In [2]:
# Preview the first rows of the sample submission frame.
gender_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [3]:
import pickle
import os
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.model_selection import GridSearchCV

print("Fill na and set index: FillSet")
def FillNA(X, ind, cat, num, col='Cabin'):
    """Index the frame, impute missing values and shorten ``col`` to one character.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame. All work happens on the frame returned by ``set_index``.
    ind : label or list of labels
        Column(s) that become the index.
    cat : list of labels
        Columns whose missing entries are replaced by the string ``'Missing'``.
    num : list of labels
        Columns whose missing entries are replaced by that column's median.
    col : label, default 'Cabin'
        String column that is reduced to its first character.

    Returns
    -------
    pandas.DataFrame
        The indexed, imputed frame.
    """
    X = X.set_index(ind)
    X[cat] = X[cat].fillna('Missing')
    X[num] = X[num].fillna(X[num].median())
    # .str[0] keeps missing/empty entries as NaN; subscripting each value by
    # hand raises on NaN whenever `col` is not also listed in `cat`.
    X[col] = X[col].str[0]

    return X

print("Title function: Title")
def Title(df, col):
    """Replace every entry of ``df[col]`` with a coarse title label, in place.

    Each value is searched for the markers below in order; the first marker
    found decides the label, and values containing none of them become
    ``'No'``. The column is overwritten on the frame that was passed in, and
    that same frame is returned.
    """
    # (marker searched for, label written). Order is significant: first hit wins.
    markers = (
        ('Miss.', 'Miss.'),
        ('Mrs.', 'Mrs.'),
        ('Mr.', 'Mr.'),
        ('Master.', 'Master.'),
        ('Major.', 'Military.'),
        ('Col.', 'Military.'),
    )

    def label_for(name):
        for marker, label in markers:
            if marker in name:
                return label
        return 'No'

    df[col] = [label_for(name) for name in df[col]]

    return df

print("MinMaxScaler: Stand")

def Stand(df, col):
    """Min-max scale the given column(s) of ``df`` in place and return ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten with their scaled values.
    col : label or list of labels
        Column(s) to scale. A single string is accepted and treated as a
        one-element list, because the scaler needs 2-D input and ``df['x']``
        would be a 1-D Series.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in.
    """
    cols = [col] if isinstance(col, str) else col
    minmax = MinMaxScaler()
    # The scaler is fitted on exactly the rows of `df` that are passed in.
    df[cols] = minmax.fit_transform(df[cols])

    return df

print("One hot decoder imported: Decoder")
class Decoder(TransformerMixin, BaseEstimator):
    """One-hot encode selected DataFrame columns and return a DataFrame.

    Despite the name this is an encoder: ``columns`` are passed through
    ``OneHotEncoder(drop='first')`` and the resulting indicator columns are
    appended to the remaining, untouched columns of the input frame.

    Parameters
    ----------
    columns : list of labels
        Columns to encode; they are removed from the output and replaced by
        their indicator columns.

    Notes
    -----
    The generated column names come from the wrapped encoder and therefore
    depend on the installed scikit-learn version.
    """

    def __init__(self, columns):
        super().__init__()

        self.columns = columns
        self.onehot = OneHotEncoder(drop='first')

    def fit(self, X, y=None):
        """Learn the categories of ``X[self.columns]``; ``y`` is ignored."""
        self.onehot.fit(X[self.columns])
        return self

    def transform(self, X):
        """Encode ``X`` with the already fitted categories."""
        one = self.onehot.transform(X[self.columns]).toarray()
        encoded = pd.DataFrame(one, index=X.index, columns=self._feature_names())

        return pd.concat([X.drop(self.columns, axis=1), encoded], axis=1)

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on ``X`` and return its encoded form.

        ``y`` and ``fit_params`` are accepted (and ignored) so the transformer
        can be called the way scikit-learn pipelines call ``fit_transform``.
        """
        return self.fit(X, y).transform(X)

    def _feature_names(self):
        # get_feature_names() no longer exists in current scikit-learn; prefer
        # its replacement and fall back to the old name on older installs.
        if hasattr(self.onehot, 'get_feature_names_out'):
            return self.onehot.get_feature_names_out()
        return self.onehot.get_feature_names()


print("Select imported: SelectCol")
def SelectCol(X, drop_cols, target, col_str):
    """Drop columns and split a combined frame into train features, labels and test features.

    Parameters
    ----------
    X : pandas.DataFrame
        Combined frame; rows whose ``target`` is missing are treated as test
        rows, all others as training rows. The caller's frame is not modified.
    drop_cols : list of labels
        Columns removed before splitting.
    target : label
        Name of the label column.
    col_str : list of labels or None
        Columns to convert to strings before splitting; ``None`` skips the
        conversion.

    Returns
    -------
    tuple
        ``(train_features, train_target, test_features)``.
    """
    if col_str is not None:
        # astype returns a new frame, so the conversion takes effect without
        # touching the frame that was passed in.
        X = X.astype({name: str for name in col_str})

    X = X.drop(drop_cols, axis=1)

    # Split on the column named by `target` rather than a hard-coded name.
    labelled = X[target].notna()
    train = X[labelled]
    test = X[~labelled]

    return train.drop(target, axis=1), train[target], test.drop(target, axis=1)



print("Grid Search train: GSCV")
def GSCV(pipe, params, X, y, test, submission, m, scoring, cv=5):
    """Grid-search ``pipe``, write test predictions to CSV and return the results.

    Parameters
    ----------
    pipe : estimator
        Estimator or pipeline to tune.
    params : dict
        Parameter grid passed to ``GridSearchCV``.
    X, y : array-like
        Training features and labels.
    test : array-like
        Features to predict with the refitted best estimator.
    submission : object with an ``index`` attribute
        Its index labels the rows of the written CSV.
    m : str
        Prefix of the output file name (``<m>_submission.csv``).
    scoring : str or callable
        Scoring passed to ``GridSearchCV``.
    cv : int, default 5
        Cross-validation setting passed to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(best_score, best_params, test_predictions)``.
    """
    # `iid` is no longer accepted by current scikit-learn (passing it raises a
    # TypeError); the remaining default matches the old iid=False averaging.
    grid = GridSearchCV(estimator=pipe,
                        param_grid=params,
                        cv=cv,
                        return_train_score=False,
                        refit=True,
                        scoring=scoring
                       )
    grid.fit(X, y)
    # With refit=True, grid.predict uses the refitted best estimator, so one
    # prediction serves both the CSV and the return value.
    test_pred = grid.predict(test)
    # NOTE(review): the CSV is labelled by submission.index and has no explicit
    # PassengerId column - confirm this is the intended submission layout.
    pd.DataFrame(test_pred, index=submission.index, columns=['Survived']).to_csv("/kaggle/working/" + m + "_submission.csv")
    return grid.best_score_, grid.best_params_, test_pred



print("Test set concat training: TrainGridCV")

def TrainGridCV(pipe, params, X, y, test, d, m, submission, cv=5):
    """Grid-search ``pipe``, predict ``test`` and persist predictions plus the best model.

    Parameters
    ----------
    pipe : estimator
        Estimator or pipeline to tune; it must support ``predict_proba``.
    params : dict
        Parameter grid passed to ``GridSearchCV``.
    X, y : array-like
        Training features and labels.
    test : array-like
        Features to predict.
    d, m : str
        Concatenated as ``d + m + "_pred"`` to form the output path stem;
        ``.csv`` and ``.pickle`` files are written with that stem.
    submission : pandas.DataFrame
        Must contain a ``PassengerId`` column, which is copied into the
        written submission.
    cv : int, default 5
        Cross-validation setting passed to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(best_score, best_params, test_predictions, test_probabilities,
        submission_frame)``. ``best_score`` is the best mean cross-validated
        score under the estimator's default scorer (no ``scoring`` is passed).
    """
    # `iid` is no longer accepted by current scikit-learn (passing it raises a
    # TypeError); the remaining default matches the old iid=False averaging.
    grid = GridSearchCV(estimator=pipe,
                        param_grid=params,
                        cv=cv,
                        return_train_score=False,
                        refit=True
                        )
    grid.fit(X, y)
    name = d + m + "_pred"
    test_pred = grid.best_estimator_.predict(test)
    test_prob = grid.predict_proba(test)

    # Build the output under a new name instead of overwriting the parameter.
    result = pd.DataFrame({'PassengerId': submission['PassengerId'],
                           'Survived': test_pred})
    result['Survived'] = result['Survived'].astype('int64')
    result.to_csv(name+'.csv', index=False)

    # Persist the refitted best estimator next to the CSV.
    with open(name + ".pickle", 'wb') as handle:
        pickle.dump(grid.best_estimator_, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return grid.best_score_, grid.best_params_, test_pred, test_prob, result


Fill na and set index: FillSet
Title function: Title
MinMaxScaler: Stand
One hot decoder imported: Decoder
Select imported: SelectCol
Grid Search train: GSCV
Test set concat training: TrainGridCV


In [4]:
# Give the test frame an all-NaN "Survived" column so both frames share one
# schema. assign() appends the column like insert() at the last position did,
# but does not raise when the cell is re-run and the column already exists.
test = test.assign(Survived=np.nan)
print(f"Train dim: {train.shape}, Test dim: {test.shape}")

# Stack train and test so every preprocessing step sees the same columns.
df = pd.concat([train, test], sort=True)
print(f"DF dimension: {df.shape}")

Train dim: (891, 12), Test dim: (418, 12)
DF dimension: (1309, 12)


In [5]:
# Report per-column missing counts, impute with FillNA, then report again.
print("<<<<< Original dataset >>>>>>")
print(f"Missing: {df.isna().sum()}")
# PassengerId becomes the index; gaps in Cabin/Embarked become 'Missing', gaps
# in Age/Fare become the column median, and Cabin (FillNA's default `col`) is
# reduced to its first character.
# NOTE: this rebinds `df`; after it has run once PassengerId is no longer a
# column, so re-running needs the concat cell to be run again first.
df = FillNA(df, 'PassengerId',['Cabin', 'Embarked'], ['Age','Fare'])
print("<<<<< Filled NA >>>>>>")
print(f"Missing: {df.isna().sum()}")

<<<<< Original dataset >>>>>>
Missing: Age             263
Cabin          1014
Embarked          2
Fare              1
Name              0
Parch             0
PassengerId       0
Pclass            0
Sex               0
SibSp             0
Survived        418
Ticket            0
dtype: int64
<<<<< Filled NA >>>>>>
Missing: Age           0
Cabin         0
Embarked      0
Fare          0
Name          0
Parch         0
Pclass        0
Sex           0
SibSp         0
Survived    418
Ticket        0
dtype: int64


In [6]:
# Collapse each passenger name to a coarse title label
# (Miss./Mrs./Mr./Master./Military./No); the Name column is overwritten.
df = Title(df, 'Name')
df.head()

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,22.0,M,S,7.25,Mr.,0,3,male,1,0.0,A/5 21171
2,38.0,C,C,71.2833,Mrs.,0,1,female,1,1.0,PC 17599
3,26.0,M,S,7.925,Miss.,0,3,female,0,1.0,STON/O2. 3101282
4,35.0,C,S,53.1,Mrs.,0,1,female,1,1.0,113803
5,35.0,M,S,8.05,Mr.,0,3,male,0,0.0,373450


In [7]:
# Min-max scale Fare; the scaler is fitted on the combined train+test frame.
df = Stand(df, ["Fare"])
df.head()

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,22.0,M,S,0.014151,Mr.,0,3,male,1,0.0,A/5 21171
2,38.0,C,C,0.139136,Mrs.,0,1,female,1,1.0,PC 17599
3,26.0,M,S,0.015469,Miss.,0,3,female,0,1.0,STON/O2. 3101282
4,35.0,C,S,0.103644,Mrs.,0,1,female,1,1.0,113803
5,35.0,M,S,0.015713,Mr.,0,3,male,0,0.0,373450


In [8]:
# One-hot encode the categorical columns (first level of each is dropped) and
# keep the remaining columns as they are.
df = Decoder(['Cabin', 'Embarked', 'Pclass', 'Sex', 'Name']).fit_transform(df)
print(f"New dimension: {df.shape}")

New dimension: (1309, 25)


In [9]:
# Drop Ticket and split on the target: rows with a Survived value become X / y,
# rows where Survived is missing become test_data. col_str=None skips the
# optional string conversion.
X, y, test_data = SelectCol(df, ['Ticket'], 'Survived', None)
X.shape, y.shape, test_data.shape

((891, 23), (891,), (418, 23))

In [10]:
# Per-round bookkeeping filled in by the training loop.
model = []
accuracy = []
best_params = []
runtime = []
step = 0
# np.array(...) makes an independent ndarray copy whether its input is still a
# DataFrame/Series or already an ndarray, so re-running this cell does not fail
# on the rebound `test_data`.
X_new, y_new = np.array(X), np.array(y)
test_data = np.array(test_data)
directory = "/kaggle/working/"
# One flag per test row; nothing is selected before the first round.
indx = np.array([0]*test_data.shape[0])
# Initial probability threshold, and the factor it is multiplied by after
# every round of the training loop.
cut_off = 0.9
lr = 0.95

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest search space: 3-19 trees, both split criteria, depth 2-9.
param_rf = {
    'n_estimators': list(range(3, 20)),
    'criterion': ['gini', 'entropy'],
    'max_depth': list(range(2, 10)),
}

# Fixed seed so the forest itself is reproducible from run to run.
est_rf = RandomForestClassifier(random_state=123)

In [12]:
# Self-training loop: tune and fit on the current training set, add test rows
# whose second predict_proba column exceeds the threshold (labelled with the
# model's own predictions), lower the threshold, and repeat until it reaches 0.6.
# NOTE(review): rows are selected only on that one probability column, so
# confident predictions of the other class are never added - confirm intended.
while cut_off > 0.6:
    step += 1
    start = datetime.now()
    print(f"**** Epoch: {step} started ****")
    print(f"New X: {X_new.shape}, New y: {y_new.shape}")
    m = str(step) + "_rf_"
    # TrainGridCV returns GridSearchCV.best_score_ first: the best mean
    # cross-validated score under the estimator's default scorer, not an F1 score.
    cv_score, params, estimation, prob, sub = TrainGridCV(pipe=est_rf,
                                              params=param_rf,
                                              X=X_new,
                                              y=y_new,
                                              test=test_data,
                                              d=directory,
                                              m=m,
                                              submission=gender_submission)
    end = datetime.now()

    # Cumulative selection mask: a row stays selected once it has crossed the
    # threshold in any round. Kept as one boolean array throughout, instead of
    # flipping between a list and an array on every pass.
    indx = np.asarray(indx, dtype=bool) | (prob[:, 1] > cut_off)

    # Rebuild the training set from the original labelled rows plus every
    # selected test row, labelled with this round's predictions.
    X_new = np.vstack([X.copy().values, test_data[indx]])
    y_new = np.hstack([y.copy().values, estimation[indx]])

    cut_off = cut_off * lr

    model.append(str(step) + "_rf")
    accuracy.append(cv_score)
    best_params.append(params)
    runtime.append(end-start)

    print("Train score: {:.4f}, Train dim: {}, Added: {}".format(cv_score, X_new.shape, int(indx.sum())))
    print("Best parameters: {}, Cut off: {:.4f}, Train Runtime: {}".format(params, cut_off, end-start))

**** Epoch: 1 started ****
New X: (891, 23), New y: (891,)
Train score: 0.8362, Train dim: (939, 23), Added: 48
Best parameters: {'criterion': 'entropy', 'max_depth': 7, 'n_estimators': 8}, Cut off: 0.8550, Train Runtime: 0:00:41.657833
**** Epoch: 2 started ****
New X: (939, 23), New y: (939,)
Train score: 0.8457, Train dim: (972, 23), Added: 81
Best parameters: {'criterion': 'gini', 'max_depth': 9, 'n_estimators': 17}, Cut off: 0.8122, Train Runtime: 0:00:41.094104
**** Epoch: 3 started ****
New X: (972, 23), New y: (972,)
Train score: 0.8438, Train dim: (995, 23), Added: 104
Best parameters: {'criterion': 'entropy', 'max_depth': 8, 'n_estimators': 14}, Cut off: 0.7716, Train Runtime: 0:00:40.663316
**** Epoch: 4 started ****
New X: (995, 23), New y: (995,)
Train score: 0.8524, Train dim: (1005, 23), Added: 114
Best parameters: {'criterion': 'gini', 'max_depth': 7, 'n_estimators': 11}, Cut off: 0.7331, Train Runtime: 0:00:40.675464
**** Epoch: 5 started ****
New X: (1005, 23), New y:

The end of the notebook