In [5]:
import pandas as pd

# Read the training split from disk; the bare name on the last line makes the
# notebook render the resulting DataFrame.
titanic = pd.read_csv(filepath_or_buffer="dataset/train.csv")
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [None]:
# The label is kept as a plain NumPy array; the feature frame is every other
# column of the training data.
y_train = titanic['Survived'].values
X_train = titanic.drop(columns=['Survived'])

# Passengers to predict for, read from the separate test file.
X_test = pd.read_csv(filepath_or_buffer="dataset/test.csv")

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

# Median age per passenger class, using only rows where both Age and Pclass
# are present.
group1 = (
    titanic.loc[:, ['Age', 'Pclass']]
    .dropna()
    .groupby(by=['Pclass'])
    .median()
)
group1

Unnamed: 0_level_0,Age
Pclass,Unnamed: 1_level_1
1,37.0
2,29.0
3,24.0


In [4]:

class AgeImputer(BaseEstimator, TransformerMixin):
    """Fill missing ``Age`` values with the median age of the row's ``Pclass``.

    The per-class medians are learned in ``fit`` and stored on the estimator
    as ``age_by_pclass_``, so a fitted (or pickled) instance carries everything
    it needs and does not depend on any notebook-level variable.
    """
    def fit(self, X, y=None):
        """Learn the median ``Age`` for every ``Pclass`` present in ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the ``Age`` and ``Pclass`` columns.
        y : ignored
            Accepted for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # groupby().median() skips missing ages, so no explicit dropna is needed.
        self.age_by_pclass_ = X.groupby('Pclass')['Age'].median()
        return self
    def transform(self, X):
        """Return a copy of ``X`` whose missing ages are replaced.

        Rows whose ``Pclass`` was not seen during ``fit`` keep a missing
        ``Age`` instead of raising; ``X`` itself is never modified.
        """
        ret = X.copy()
        # Map every row's class to its learned median, then use it only where
        # the age is actually missing.
        ret['Age'] = ret['Age'].fillna(ret['Pclass'].map(self.age_by_pclass_))
        return ret

titanic_age_imputer = AgeImputer()
# fit_transform (rather than a bare transform) because the medians are now
# learned by the estimator itself.
titanic_age_imputed = titanic_age_imputer.fit_transform(titanic)
titanic_age_imputed

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,24.0,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [7]:
# Split the age-imputed frame by column type: categorical columns go to `cat`,
# numeric columns go to `num`.
cat = titanic_age_imputed[['Sex', 'Embarked']]
num = titanic_age_imputed[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]

In [8]:
from sklearn.impute import SimpleImputer

# Median-impute any remaining gaps in the numeric columns; the result is a
# NumPy array with the same column order as `num`.
num_imputer = SimpleImputer(strategy='median')
num_imputed = num_imputer.fit(num).transform(num)
num_imputed

array([[ 3.    , 22.    ,  1.    ,  0.    ,  7.25  ],
       [ 1.    , 38.    ,  1.    ,  0.    , 71.2833],
       [ 3.    , 26.    ,  0.    ,  0.    ,  7.925 ],
       ...,
       [ 3.    , 24.    ,  1.    ,  2.    , 23.45  ],
       [ 1.    , 26.    ,  0.    ,  0.    , 30.    ],
       [ 3.    , 32.    ,  0.    ,  0.    ,  7.75  ]])

In [9]:
# Survival rate per 15-year age bucket.
# A throw-away imputer is used here on purpose: re-fitting the shared
# `num_imputer` on ['Survived', 'Age'] would overwrite the statistics it
# learned from `num` in the previous cell.
titanic_age_bucket = pd.DataFrame(
    SimpleImputer(strategy='median').fit_transform(titanic[['Survived', 'Age']]),
    columns=['Survived', 'Age'])
# Floor every age to the lower edge of its bucket (0, 15, 30, ...).
titanic_age_bucket['AgeBucket'] = titanic_age_bucket['Age'] // 15 * 15
age_bucket = titanic_age_bucket[
    ['Survived', 'AgeBucket']].groupby(['AgeBucket']).mean()
age_bucket

Unnamed: 0_level_0,Survived
AgeBucket,Unnamed: 1_level_1
0.0,0.576923
15.0,0.337474
30.0,0.423256
45.0,0.404494
60.0,0.24
75.0,1.0


In [10]:

import numpy as np

class AttributeAdder(BaseEstimator, TransformerMixin):
    """Derive two features from the numeric block and drop their inputs.

    The input is expected to have its columns at these positions:
    column 1 is the age, columns 2 and 3 are the two family counts.
    The output keeps the remaining columns and appends

    * the sum of columns 2 and 3, and
    * the survival rate of the row's 15-year age bucket, looked up in the
      notebook-level ``age_bucket`` table.
    """
    def fit(self, X, y=None):
        """Snapshot the ``age_bucket`` lookup table onto the estimator.

        Keeping a copy as ``survival_by_bucket_`` means a fitted or pickled
        instance no longer depends on the notebook global at transform time.
        """
        self.survival_by_bucket_ = age_bucket['Survived'].copy()
        return self
    def transform(self, X):
        """Return ``X`` without columns 1-3, plus the two derived columns.

        Age buckets that are missing from the lookup table receive the
        unweighted mean of the known bucket rates instead of raising
        ``KeyError``.
        """
        # Fall back to the global table when transform is called on an
        # instance that was never fitted, as the original code allowed.
        table = getattr(self, 'survival_by_bucket_', None)
        if table is None:
            table = age_bucket['Survived']
        X = np.asarray(X)
        sibsp_parch = X[:, 2] + X[:, 3]
        # Same bucketing rule that built the table: floor to a multiple of 15.
        buckets = X[:, 1] // 15 * 15
        # One vectorised lookup instead of a Python loop over rows.
        age_surv_rate = table.reindex(buckets).fillna(table.mean()).to_numpy()
        return np.c_[np.delete(X, [1, 2, 3], axis=1), sibsp_parch, age_surv_rate]

# Run the feature adder on the imputed numeric array and show the result.
num_attribs_adder = AttributeAdder()
num_attribs_added = num_attribs_adder.fit(num_imputed).transform(num_imputed)
num_attribs_added

array([[ 3.        ,  7.25      ,  1.        ,  0.33747412],
       [ 1.        , 71.2833    ,  1.        ,  0.42325581],
       [ 3.        ,  7.925     ,  0.        ,  0.33747412],
       ...,
       [ 3.        , 23.45      ,  3.        ,  0.33747412],
       [ 1.        , 30.        ,  0.        ,  0.33747412],
       [ 3.        ,  7.75      ,  0.        ,  0.42325581]])

In [11]:
from sklearn.preprocessing import StandardScaler

# Standardise every numeric feature column using statistics from this data.
num_scaler = StandardScaler()
num_scaled = num_scaler.fit(num_attribs_added).transform(num_attribs_added)
num_scaled

array([[ 0.82737724, -0.50244517,  0.05915988, -0.64534159],
       [-1.56610693,  0.78684529,  0.05915988,  0.47863088],
       [ 0.82737724, -0.48885426, -0.56097483, -0.64534159],
       ...,
       [ 0.82737724, -0.17626324,  1.29942929, -0.64534159],
       [-1.56610693, -0.04438104, -0.56097483, -0.64534159],
       [ 0.82737724, -0.49237783, -0.56097483,  0.47863088]])

In [12]:
from sklearn.pipeline import Pipeline

# The three manual numeric steps above, chained in the same order.
num_pipeline = Pipeline(steps=[
    ('num_imputer', SimpleImputer(strategy='median')),
    ('num_attribs_adder', AttributeAdder()),
    ('num_scaler', StandardScaler()),
])

num_preprocessed = num_pipeline.fit_transform(num)
num_preprocessed

array([[ 0.82737724, -0.50244517,  0.05915988, -0.64534159],
       [-1.56610693,  0.78684529,  0.05915988,  0.47863088],
       [ 0.82737724, -0.48885426, -0.56097483, -0.64534159],
       ...,
       [ 0.82737724, -0.17626324,  1.29942929, -0.64534159],
       [-1.56610693, -0.04438104, -0.56097483, -0.64534159],
       [ 0.82737724, -0.49237783, -0.56097483,  0.47863088]])

In [13]:
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Replace missing entries in each column with that column's top value."""
    def fit(self, X, y=None):
        """Store, per column of ``X``, the first label of ``value_counts()``."""
        top_values = []
        for column in X.columns:
            counts = X[column].value_counts()
            top_values.append(counts.index[0])
        self.most_frequent_ = pd.Series(top_values, index=X.columns)
        return self
    def transform(self, X, y=None):
        """Return ``X`` with missing values filled from the stored values."""
        filled = X.fillna(self.most_frequent_)
        return filled

# Fill gaps in the categorical columns and show the result.
cat_imputer = MostFrequentImputer()
cat_imputed = cat_imputer.fit(cat).transform(cat)
cat_imputed

Unnamed: 0,Sex,Embarked
0,male,S
1,female,C
2,female,S
3,female,S
4,male,S
...,...,...
886,male,S
887,female,S
888,female,S
889,male,C


In [14]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns into a dense array.
# `sparse_output` replaces the `sparse` keyword, which scikit-learn deprecated
# in 1.2 and removed in 1.4.
cat_encoder = OneHotEncoder(sparse_output=False)
cat_encoded = cat_encoder.fit_transform(cat_imputed)
cat_encoded



array([[0., 1., 0., 0., 1.],
       [1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0.],
       [0., 1., 0., 1., 0.]])

In [15]:
# The two manual categorical steps above, chained in the same order.
# `sparse_output` replaces the `sparse` keyword, which scikit-learn deprecated
# in 1.2 and removed in 1.4.
cat_pipeline = Pipeline([
    ('cat_imputer', MostFrequentImputer()),
    ('cat_encoder', OneHotEncoder(sparse_output=False)),
    ])

cat_preprocess = cat_pipeline.fit_transform(cat)
cat_preprocess



array([[0., 1., 0., 0., 1.],
       [1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0.],
       [0., 1., 0., 1., 0.]])

In [16]:
from sklearn.compose import ColumnTransformer

# Column names routed to each sub-pipeline.
cat_attribs = ['Sex', 'Embarked']
num_attribs = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

# Numeric output comes first, categorical second, matching the manual stack
# built below.
num_cat_pipeline = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, num_attribs),
    ('cat_pipeline', cat_pipeline, cat_attribs),
])

# Hand-assembled reference result, used later to validate the full pipeline.
manual_preprocessed = np.hstack([num_scaled, cat_encoded])
manual_preprocessed

array([[ 0.82737724, -0.50244517,  0.05915988, ...,  0.        ,
         0.        ,  1.        ],
       [-1.56610693,  0.78684529,  0.05915988, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.82737724, -0.48885426, -0.56097483, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.82737724, -0.17626324,  1.29942929, ...,  0.        ,
         0.        ,  1.        ],
       [-1.56610693, -0.04438104, -0.56097483, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.82737724, -0.49237783, -0.56097483, ...,  0.        ,
         1.        ,  0.        ]])

In [17]:
import numpy as np

# End-to-end preprocessing: class-aware age imputation first, then the
# numeric/categorical column transformer.
preprocess_pipeline = Pipeline(steps=[
    ('data_age_imputer', AgeImputer()),
    ('num_cat_pipeline', num_cat_pipeline),
])

X_train_preprocessed = preprocess_pipeline.fit_transform(X_train)

# The pipeline must reproduce the step-by-step result built by hand above.
assert np.allclose(manual_preprocessed, X_train_preprocessed)



In [None]:
from sklearnex import patch_sklearn

# Invoked before the SVC / RandomForestClassifier imports in the cells below.
# NOTE(review): presumably the patch only affects estimators imported after
# this call - confirm against the sklearnex documentation.
patch_sklearn()

In [18]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Baseline: 10-fold cross-validated accuracy of an SVC on the preprocessed
# training matrix.
svm_clf = SVC(gamma='auto')
svm_scores = cross_val_score(
    estimator=svm_clf,
    X=X_train_preprocessed,
    y=y_train,
    scoring='accuracy',
    cv=10,
)
print("svm_score: ", svm_scores.mean())

svm_score:  0.8339076154806492


In [19]:
from sklearn.ensemble import RandomForestClassifier

# Baseline: 10-fold cross-validated accuracy of a random forest.
# A fixed random_state makes the reported score reproducible across kernel
# restarts; without it every run grows different trees.
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_scores = cross_val_score(forest_clf, X_train_preprocessed, y_train,
                                cv=10, scoring='accuracy')
print("forest_score: ", forest_scores.mean())

forest_score:  0.8125967540574284


In [20]:
from sklearn.model_selection import GridSearchCV

# Two SVC families are searched: a linear kernel over C only, and an RBF
# kernel over every (C, gamma) combination.
param_grid = [
    dict(kernel=['linear'],
         C=[10., 30., 100., 300.]),
    dict(kernel=['rbf'],
         C=[1.0, 3.0, 10., 30., 100., 300., 1000.0],
         gamma=[0.01, 0.03, 0.1, 0.3, 1.0, 3.0]),
]

# 5-fold accuracy search; verbose=2 prints one line per individual fit.
grid_search = GridSearchCV(
    estimator=svm_clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=2,
)
grid_search.fit(X_train_preprocessed, y_train)

Fitting 5 folds for each of 46 candidates, totalling 230 fits
[CV] END ..............................C=10.0, kernel=linear; total time=   0.2s
[CV] END ..............................C=10.0, kernel=linear; total time=   0.0s
[CV] END ..............................C=10.0, kernel=linear; total time=   0.1s
[CV] END ..............................C=10.0, kernel=linear; total time=   0.0s
[CV] END ..............................C=10.0, kernel=linear; total time=   0.0s
[CV] END ..............................C=30.0, kernel=linear; total time=   0.2s
[CV] END ..............................C=30.0, kernel=linear; total time=   0.2s
[CV] END ..............................C=30.0, kernel=linear; total time=   0.3s
[CV] END ..............................C=30.0, kernel=linear; total time=   0.1s
[CV] END ..............................C=30.0, kernel=linear; total time=   0.0s
[CV] END .............................C=100.0, kernel=linear; total time=   0.9s
[CV] END .............................C=100.0, 

In [21]:
# Raw DataFrame in, prediction out: preprocessing followed by the best
# classifier found by the grid search.
full_pipeline = Pipeline(steps=[
    ('preprocess_pipeline', preprocess_pipeline),
    ('classifier', grid_search.best_estimator_),
])

In [22]:
import joblib

# Train on the raw training frame, then persist the fitted pipeline to disk.
full_pipeline.fit(X=X_train, y=y_train)
joblib.dump(value=full_pipeline, filename="model.pkl")



['model.pkl']

In [23]:
# Reload the pipeline saved by the previous cell and predict for the test
# passengers. The file is this notebook's own artifact; joblib files from
# untrusted sources should not be loaded.
model = joblib.load(filename="model.pkl")
predictions = model.predict(X=X_test)

In [25]:
import os

def write_csv(predictions, name='submit.csv', passenger_ids=None):
    """Write a two-column submission file to ``dataset/<name>``.

    Parameters
    ----------
    predictions : sequence
        Predicted ``Survived`` value for each passenger, in row order.
    name : str, default 'submit.csv'
        File name of the CSV, created inside the ``dataset`` directory.
    passenger_ids : sequence, optional
        ``PassengerId`` for each prediction. When omitted, consecutive ids
        starting at 892 are generated, one per prediction; for 418
        predictions this is the range that used to be hard-coded.

    Raises
    ------
    ValueError
        If ``passenger_ids`` and ``predictions`` differ in length.
    """
    predictions = np.asarray(predictions)
    if passenger_ids is None:
        # Size the id range from the data instead of assuming 418 rows.
        passenger_ids = np.arange(892, 892 + len(predictions))
    else:
        passenger_ids = np.asarray(passenger_ids)
    if len(passenger_ids) != len(predictions):
        raise ValueError(
            "passenger_ids and predictions must have the same length "
            f"({len(passenger_ids)} != {len(predictions)})")
    # Building from a dict keeps each column's own dtype, so the ids stay
    # integers whatever the dtype of the predictions.
    dframe = pd.DataFrame({'PassengerId': passenger_ids,
                           'Survived': predictions})
    csv_path = os.path.join("dataset", name)
    # Make sure the target directory exists before writing.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    dframe.to_csv(csv_path, index=False)

# Save the test-set predictions as dataset/predictions.csv.
write_csv(predictions, name="predictions.csv")