In [1]:
import pandas as pd
import numpy as np
import scipy
import joblib
from sklearn import impute, preprocessing, model_selection, base, linear_model, pipeline, ensemble, svm, neighbors, compose, tree

In [2]:
titanic_train, titanic_test = pd.read_csv('data/train.csv'), pd.read_csv('data/test.csv')

X_train, y_train = titanic_train.drop(['Survived', 'PassengerId'], axis=1), titanic_train['Survived']
X_test = titanic_test.drop(['PassengerId'], axis=1)

#### Data Preparation

In [48]:
X_train

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
886,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [49]:
X_train.count()/len(X_train)

Pclass      1.000000
Name        1.000000
Sex         1.000000
Age         0.801347
SibSp       1.000000
Parch       1.000000
Ticket      1.000000
Fare        1.000000
Cabin       0.228956
Embarked    0.997755
dtype: float64

In [50]:
X_train['Cabin'].value_counts(dropna=False)

Cabin
NaN            687
C23 C25 C27      4
G6               4
B96 B98          4
C22 C26          3
              ... 
E34              1
C7               1
C54              1
E36              1
C148             1
Name: count, Length: 148, dtype: int64

In [51]:
X_train['Ticket'].value_counts()

Ticket
347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: count, Length: 681, dtype: int64

First, I am going to extract Title from the full Name, as it might convey information about the status in society. VIP people might have higher chances of surviving. Second, From the SibSp and Parch attributes, I calculate the total number of family members of the passenger on the ship. Family members might help each other survive.

In [52]:
# Extract Title from Name
class TitleExtractor(base.BaseEstimator, base.TransformerMixin):
    def __init__(self):
        self.feature_names_in_= None
    def fit(self, X, y=None):
        self.feature_names_in_=X.columns
        return self
    def transform(self, X):
        return pd.DataFrame(X['Name'].str.extract(' ([A-Za-z]+)\.', expand=False))
    def get_feature_names_out(self, inp=None):
        return ['title']


# Calculating Total Family Members
class FamilyExtractor(base.BaseEstimator, base.TransformerMixin):
    def __init__(self):
        self.feature_names_in_ = None
    def fit(self, X, y=None):
        self.feature_names_in_ = X.columns
        return self
    def transform(self, X):
        return pd.DataFrame(X['Parch']+X['SibSp'])
    def get_feature_names_out(self, inp=None):
        return ['family_mems']

Next, in the following cell, I have created a Custom Sklearn Transformer, which tackles the Cabin and Ticket attribute. The Cabin attribute has 687 null values, but the non null values might reveal a couple of things - the number of cabins and the deck. These seem to be useful as where the cabin was in the ship might be a decisive factor in whether the passenger survived or not.
Also, I am categorizing the Ticket attribute into 13 categories.
To summarize, in the following cell we carry out the following feature engineering steps:   

* Extract Number of Cabins booked by the Passenger
* Extract the Deck the Cabin is in (The first letter of the Cabin)
* Categorize Ticket attribute
* OneHot Encode the Ticket and Deck attributes
* And finally remove the original Ticket and Cabin attribbutes as they are no longer needed.

In [68]:
# Transformer for Cabin and Ticket

class CabinTicketTransformer(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, type):
        self.new_X = pd.DataFrame()
        self.old_X = None
        self.type = type
        if self.type=='Cabin':
            self.encoder = preprocessing.OneHotEncoder(drop=[np.nan], handle_unknown='ignore')
        elif self.type=='Ticket':
            self.encoder = preprocessing.OneHotEncoder(handle_unknown='ignore')
        else:
            raise TypeError('Only Cabin or Ticket columns are supported')

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        self.old_X = pd.Series(X.squeeze())
        if self.type=='Cabin':
            decks = self.encoder.fit_transform(pd.DataFrame(self.old_X.str[0])).toarray()       #toarray because sparse matrix
            cabins = self.old_X.apply(lambda x: 0 if pd.isna(x) else len(str(x).split(' ')))
            self.new_X = pd.DataFrame(np.c_[decks, cabins])
            return self.new_X
        else:
            new_features = pd.DataFrame()

            new_features['Ticket_Cat'] = self.old_X.apply(self.ticket_cat)
            new_features['IsNumeric'] = self.old_X.apply(lambda x: 1 if x.split(' ')[-1].isdigit() else 0)
    
            one_hot_features = self.encoder.fit_transform(new_features[['Ticket_Cat']])
    
            # If one_hot_features is sparse, convert it to dense
            if isinstance(one_hot_features, scipy.sparse.csr_matrix):
                one_hot_features = one_hot_features.toarray()
    
            # Make sure both arrays have the same number of rows
            assert one_hot_features.shape[0] == new_features[['IsNumeric']].shape[0]
    
            # Combine one-hot encoded and numerical features
            final_features = np.hstack([one_hot_features, new_features[['IsNumeric']].values])
    
            return final_features

    def get_feature_names_out(self, inp=None):
        feats = list(self.encoder.get_feature_names_out())
        return feats + ['IsNumeric'] if self.type == 'Ticket' else feats + ['Cabins']

    def ticket_cat(self, row):
        ticket=row
        if ticket.startswith('W'):
            if 'C' in ticket:
                return 1
            else:
                return 2
        if ticket.startswith('S'):
            if 'TON' in ticket:
                return 3
            elif 'PARIS' in ticket.upper():
                return 4
            else:
                return 5
        if ticket.startswith('A'):
            return 6
        if ticket.startswith('C'):
            if 'A' in ticket:
                return 7
            else:
                return 8
        if ticket.startswith('F'):
            return 9
        if ticket.startswith('L'):
            return 10
        if ticket.startswith('P'):
            if 'C' in ticket:
                return 11
            else:
                return 12

In [74]:
# Pipeline for imputing and scaling the ``Age`` and ``Fare`` attribute
num_pipe = pipeline.Pipeline([('num_impute', impute.SimpleImputer(strategy='median')),
                              ('num_scale', preprocessing.StandardScaler())])

# Pipeline for Embarked attribute
emb_pipe = pipeline.Pipeline([('emb_impute', impute.SimpleImputer(strategy='most_frequent')),
                              ('emb_encode', preprocessing.OneHotEncoder(handle_unknown='ignore'))])

# Transformer for creating new attributes - family members and title
column_creator = compose.ColumnTransformer([('title_create', TitleExtractor(), ['Name']),
                                            ('fam_create', FamilyExtractor(), ['Parch', 'SibSp'])], remainder='passthrough')

# Final Column Transformer
column_transformer = compose.ColumnTransformer([('num_pipe', num_pipe, [4,6]),
                                                ('title_encode', preprocessing.OneHotEncoder(handle_unknown='ignore'), [0]),
                                           ('cab_pipe', CabinTicketTransformer('Cabin'), [7]),
                                           ('ticket_pipe', CabinTicketTransformer('Ticket'), [5]),
                                           ('emb_pipe', emb_pipe, [8]),
                                           ('class_encode', preprocessing.OrdinalEncoder(), [2]),
                                           ('sex_pipe', preprocessing.OrdinalEncoder(), [3])], remainder='passthrough', sparse_threshold=0)

# Final Pipeline
full_pipeline = pipeline.Pipeline([('new_cols', column_creator),
                                   ('all_columns_trans', column_transformer)])

X_train_prep = full_pipeline.fit_transform(X_train) 

In [76]:
full_pipeline.get_feature_names_out()

array(['num_pipe__remainder__Age', 'num_pipe__remainder__Fare',
       'title_encode__title_create__title_Capt',
       'title_encode__title_create__title_Col',
       'title_encode__title_create__title_Countess',
       'title_encode__title_create__title_Don',
       'title_encode__title_create__title_Dr',
       'title_encode__title_create__title_Jonkheer',
       'title_encode__title_create__title_Lady',
       'title_encode__title_create__title_Major',
       'title_encode__title_create__title_Master',
       'title_encode__title_create__title_Miss',
       'title_encode__title_create__title_Mlle',
       'title_encode__title_create__title_Mme',
       'title_encode__title_create__title_Mr',
       'title_encode__title_create__title_Mrs',
       'title_encode__title_create__title_Ms',
       'title_encode__title_create__title_Rev',
       'title_encode__title_create__title_Sir', 'cab_pipe__x0_A',
       'cab_pipe__x0_B', 'cab_pipe__x0_C', 'cab_pipe__x0_D',
       'cab_pipe__x0_E', 

In [77]:
X_train_prep

array([[-0.5657364610748746, -0.5024451714361923, 0.0, ..., 2.0, 1.0, 1],
       [0.6638610320657843, 0.7868452935884461, 0.0, ..., 0.0, 0.0, 1],
       [-0.2583370877897099, -0.4888542575852486, 0.0, ..., 2.0, 0.0, 0],
       ...,
       [-0.10463740114712752, -0.17626323901354432, 0.0, ..., 2.0, 0.0,
        3],
       [-0.2583370877897099, -0.04438103794142432, 0.0, ..., 0.0, 1.0, 0],
       [0.20276197213803718, -0.49237782784290063, 0.0, ..., 2.0, 1.0, 0]],
      dtype=object)

We finally end up with the following 50 features, fairly more than the original dataset.

In [78]:
joblib.dump(full_pipeline, 'saved_models/full_pipeline.pkl')

['saved_models/full_pipeline.pkl']

In [79]:
final_df = pd.DataFrame(X_train_prep, columns=list(full_pipeline.get_feature_names_out()))
final_df

Unnamed: 0,num_pipe__remainder__Age,num_pipe__remainder__Fare,title_encode__title_create__title_Capt,title_encode__title_create__title_Col,title_encode__title_create__title_Countess,title_encode__title_create__title_Don,title_encode__title_create__title_Dr,title_encode__title_create__title_Jonkheer,title_encode__title_create__title_Lady,title_encode__title_create__title_Major,...,ticket_pipe__Ticket_Cat_11.0,ticket_pipe__Ticket_Cat_12.0,ticket_pipe__Ticket_Cat_nan,ticket_pipe__IsNumeric,emb_pipe__remainder__Embarked_C,emb_pipe__remainder__Embarked_Q,emb_pipe__remainder__Embarked_S,class_encode__remainder__Pclass,sex_pipe__remainder__Sex,remainder__fam_create__family_mems
0,-0.565736,-0.502445,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,1.0,1
1,0.663861,0.786845,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1
2,-0.258337,-0.488854,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,0
3,0.433312,0.42073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1
4,0.433312,-0.486337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,2.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,-0.181487,-0.386671,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0
887,-0.796286,-0.044381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0
888,-0.104637,-0.176263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,3
889,-0.258337,-0.044381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0


In [80]:
final_df.to_csv('data/final_df.csv')

I am renaming the ``X_train_prep`` to `X_train`, this should not be confused with X_train above, we are no longer going to use that.

In [81]:
# Prepared Dataframe
cols = final_df.columns

# Arrays for modelling phase
X_train = X_train_prep
# y_train

#### Training

In [83]:
sgd_clf = linear_model.SGDClassifier()
knn_clf = neighbors.KNeighborsClassifier()
dt_clf = tree.DecisionTreeClassifier()
svm_clf = svm.SVC()

scores = []
for model in [sgd_clf, knn_clf, dt_clf, svm_clf]:
    scores.append(model_selection.cross_validate(model, X_train, y_train, cv=3, scoring='accuracy'))

In [84]:
[scores[i]['test_score'].mean() for i in range(4)]

[0.7811447811447811,
 0.8114478114478114,
 0.7631874298540966,
 0.8338945005611672]

Clearly, SVC and KNN show the best initial fit results. Let's explore some hyperparameter settings to see if this can be improved.

In [652]:
params_grid = [{'C':[0.1, 1, 10, 100], 'kernel':['rbf', 'sigmoid'], 'gamma':['auto', 'scale']}, {'C':[0.1, 1, 10, 100], 'kernel':['poly'], 'gamma':['auto', 'scale'], 'degree':[2,3,4,5]}]
grid = model_selection.GridSearchCV(svm_clf, params_grid, cv=3, scoring='accuracy', n_jobs=-1)

grid.fit(X_train, y_train)

In [653]:
pd.DataFrame({'scores':grid.cv_results_['mean_test_score'], 'params':grid.cv_results_['params']}).sort_values(by='scores', ascending=False)

Unnamed: 0,scores,params
6,0.833895,"{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}"
35,0.830527,"{'C': 10, 'degree': 3, 'gamma': 'scale', 'kern..."
10,0.829405,"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}"
8,0.829405,"{'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}"
42,0.828283,"{'C': 100, 'degree': 3, 'gamma': 'auto', 'kern..."
33,0.828283,"{'C': 10, 'degree': 2, 'gamma': 'scale', 'kern..."
12,0.82716,"{'C': 100, 'gamma': 'auto', 'kernel': 'rbf'}"
32,0.82716,"{'C': 10, 'degree': 2, 'gamma': 'auto', 'kerne..."
37,0.826038,"{'C': 10, 'degree': 4, 'gamma': 'scale', 'kern..."
25,0.826038,"{'C': 1, 'degree': 2, 'gamma': 'scale', 'kerne..."


In [664]:
params_grid2 = [{'C':[11, 13, 15], 'kernel':['rbf'], 'gamma':['auto', 'scale']},
                {'C':[50, 100, 15], 'kernel':['poly'], 'degree':[2,3,4]}]
grid2 = model_selection.GridSearchCV(svm_clf, params_grid2, cv=3, scoring='accuracy', n_jobs=-1)

grid2.fit(X_train, y_train)

In [758]:
print(f'Best Score: {grid2.best_score_}\nBest Parameters: {grid2.best_params_}')

Best Score: 0.8327721661054994
Best Parameters: {'C': 15, 'degree': 3, 'kernel': 'poly'}


Let's try for KNN as well. After a while of wide search, I end up with following parameters for narrow search.

In [100]:
knn_params = {
    'n_neighbors': [11, 12, 13,14],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree'],
    'p': [1, 2],
    'leaf_size': [10, 20, 30]
}

knn_grid = model_selection.GridSearchCV(knn_clf, knn_params, cv=3, n_jobs=-1, verbose=2)
knn_grid.fit(X_train, y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


In [101]:
pd.DataFrame({'scores':knn_grid.cv_results_['mean_test_score'], 'params':knn_grid.cv_results_['params']}).sort_values(by='scores', ascending=False)

Unnamed: 0,scores,params
0,0.830527,"{'algorithm': 'auto', 'leaf_size': 10, 'n_neig..."
32,0.829405,"{'algorithm': 'kd_tree', 'leaf_size': 10, 'n_n..."
16,0.829405,"{'algorithm': 'ball_tree', 'leaf_size': 10, 'n..."
12,0.823793,"{'algorithm': 'auto', 'leaf_size': 10, 'n_neig..."
44,0.823793,"{'algorithm': 'kd_tree', 'leaf_size': 10, 'n_n..."
28,0.823793,"{'algorithm': 'ball_tree', 'leaf_size': 10, 'n..."
4,0.818182,"{'algorithm': 'auto', 'leaf_size': 10, 'n_neig..."
36,0.818182,"{'algorithm': 'kd_tree', 'leaf_size': 10, 'n_n..."
20,0.818182,"{'algorithm': 'ball_tree', 'leaf_size': 10, 'n..."
40,0.815937,"{'algorithm': 'kd_tree', 'leaf_size': 10, 'n_n..."


And lastly, a Random Forest Classifier.

In [108]:
rf_params = {
    'n_estimators': [40, 50, 60],
    'max_features': ['sqrt', 'log2', 'auto'],
    'max_depth': [40, 50, 60],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [2, 4, 6],
    'bootstrap': [True, False]
}

rf_clf = ensemble.RandomForestClassifier()

rf_grid = model_selection.GridSearchCV(rf_clf, rf_params, cv=3, verbose=1, n_jobs=-1)
rf_grid.fit(X_train, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [109]:
pd.DataFrame({'scores':rf_grid.cv_results_['mean_test_score'], 'params':rf_grid.cv_results_['params']}).sort_values(by='scores', ascending=False)

Unnamed: 0,scores,params
21,0.833895,"{'bootstrap': True, 'max_depth': 50, 'max_feat..."
97,0.833895,"{'bootstrap': False, 'max_depth': 60, 'max_fea..."
5,0.831650,"{'bootstrap': True, 'max_depth': 40, 'max_feat..."
54,0.831650,"{'bootstrap': False, 'max_depth': 40, 'max_fea..."
37,0.831650,"{'bootstrap': True, 'max_depth': 60, 'max_feat..."
...,...,...
31,0.806958,"{'bootstrap': True, 'max_depth': 50, 'max_feat..."
24,0.805836,"{'bootstrap': True, 'max_depth': 50, 'max_feat..."
89,0.805836,"{'bootstrap': False, 'max_depth': 50, 'max_fea..."
13,0.804714,"{'bootstrap': True, 'max_depth': 40, 'max_feat..."


Finally, let's club all three of theses classifiers, with their best parameters, into a Voting Classifier

In [110]:
rf_clf = ensemble.RandomForestClassifier(n_estimators=50, max_features='sqrt', min_samples_leaf=2, min_samples_split=5, max_depth=50, bootstrap=False)
svm_clf = svm.SVC(C=15, kernel='poly', degree=3, probability=True)
knn_clf = neighbors.KNeighborsClassifier(algorithm='auto', leaf_size=10, n_neighbors=11, p=1, weights='uniform')

voting_clf = ensemble.VotingClassifier(estimators=[('rf', rf_clf), ('svm', svm_clf), ('knn', knn_clf)], voting='soft')

voting_clf.fit(X_train, y_train)

In [111]:
joblib.dump(voting_clf, 'saved_models/voting_clf.pkl')

['saved_models/voting_clf.pkl']

In [None]:
X_test_prep = full_pipeline.transform(X_test)

Now, before we go to prediction stage, there is a slight problem. The test set had no people residing in the 'T' Cabin, because of which the `X_test_prep` has one less column. Let us fix that by inserting the Cabin T column at correct index.

In [113]:
list(cols).index('cab_pipe__x0_T')

26

In [114]:
Cabin_T = np.zeros(X_test_prep.shape[0])
X_test_prep = np.insert(X_test_prep, 26, Cabin_T, axis=1)
X_test_prep

array([[0.3948865804412651, -0.49078316061772326, 0.0, ..., 2.0, 1.0, 0],
       [1.3555096219574048, -0.5074788432328381, 0.0, ..., 2.0, 0.0, 1],
       [2.5082572717767726, -0.4533668714188957, 0.0, ..., 1.0, 1.0, 0],
       ...,
       [0.7022859537264299, -0.5024451714361923, 0.0, ..., 2.0, 1.0, 0],
       [-0.10463740114712752, -0.4863374216869257, 0.0, ..., 2.0, 1.0, 0],
       [-0.10463740114712752, -0.19824427701513722, 0.0, ..., 2.0, 1.0,
        2]], dtype=object)

In [119]:
y_pred = voting_clf.predict(X_test_prep)
y_pred

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,

In [120]:
predictions = pd.DataFrame({'PassengerId':titanic_test['PassengerId'], 'Survived':y_pred})
predictions.to_csv('data/predictions.csv')

**...............................................................................................................................................................................................................................**
**...............................................................................................................................................................................................................................**

This model did not live up to expectations and scored a 0.78 on the competetion. Let us go simple and revise feature engineering a little bit.
In the revised version, I am checking if getting rid of column can be useful. Also, I am going to ditch Ticket as well. Moreover, The family members attribute can be replaced by an IsAlone column which I think makes more sense.

In [128]:
X_train_2 = X_train.copy()[['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
X_train_2

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,2,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000,S
887,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000,S
888,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,23.4500,S
889,1,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000,C


In [3]:
# Finding if a person is alone
class IsAlone(base.BaseEstimator, base.TransformerMixin):
    def __init__(self):
        self.feature_names_in_ = None
    def fit(self, X, y=None):
        self.feature_names_in_ = X.columns
        return self
    def transform(self, X):
        return pd.DataFrame((X['Parch']+X['SibSp'])==0)
    def get_feature_names_out(self, inp=None):
        return ['is_alone']

In [178]:
# Pipeline for imputing and scaling the ``Age`` and ``Fare`` attribute
num_pipe = pipeline.Pipeline([('num_impute', impute.SimpleImputer(strategy='median')),
                              ('num_scale', preprocessing.StandardScaler())])

# Pipeline for Embarked attribute
emb_pipe = pipeline.Pipeline([('emb_impute', impute.SimpleImputer(strategy='most_frequent')),
                              ('emb_encode', preprocessing.OneHotEncoder(handle_unknown='ignore'))])

# Transformer for creating new attributes - family members and title
column_creator = compose.ColumnTransformer([
                                            ('is_alone', IsAlone(), ['Parch', 'SibSp'])], remainder='passthrough')

# Final Column Transformer
column_transformer = compose.ColumnTransformer([('num_pipe', num_pipe, [4,5]),
                                                ('isalone', preprocessing.OrdinalEncoder(), [0]),
                                                ('emb_pipe', emb_pipe, [6]),
                                                ('class_encode', preprocessing.OrdinalEncoder(), [1]),
                                                ('sex_pipe', preprocessing.OrdinalEncoder(), [3])], remainder='drop', sparse_threshold=0)

# Final Pipeline
full_pipeline_2 = pipeline.Pipeline([('new_cols', column_creator),
                                   ('all_columns_trans', column_transformer)])


X_train_prep_2 = full_pipeline_2.fit_transform(X_train_2) 

In [179]:
full_pipeline_2.get_feature_names_out()

array(['num_pipe__remainder__Age', 'num_pipe__remainder__Fare',
       'isalone__is_alone__is_alone', 'emb_pipe__remainder__Embarked_C',
       'emb_pipe__remainder__Embarked_Q',
       'emb_pipe__remainder__Embarked_S',
       'class_encode__remainder__Pclass', 'sex_pipe__remainder__Sex'],
      dtype=object)

In [180]:
sgd_clf = linear_model.SGDClassifier()
knn_clf = neighbors.KNeighborsClassifier()
rf_clf = ensemble.RandomForestClassifier()
svm_clf = svm.SVC()

scores = []
for model in [sgd_clf, knn_clf, rf_clf, svm_clf]:
    scores.append(model_selection.cross_validate(model, X_train_prep_2, y_train, cv=3, scoring='accuracy'))

In [181]:
[scores[i]['test_score'].mean() for i in range(4)]

[0.787878787878788, 0.7777777777777777, 0.787878787878788, 0.8159371492704826]

In [182]:
svm_params = [{'C':[0.1, 1, 10, 100], 'kernel':['rbf', 'sigmoid'], 'gamma':['auto', 'scale']}, {'C':[0.1, 1, 10, 100], 'kernel':['poly'], 'gamma':['auto', 'scale'], 'degree':[2,3,4,5]}]
svm_grid = model_selection.GridSearchCV(svm_clf, svm_params, cv=3, scoring='accuracy', n_jobs=-1, verbose=2)

svm_grid.fit(X_train_prep_2, y_train)

print(f'Best Score: {svm_grid.best_score_}\nBest Parameters: {svm_grid.best_params_}')

Fitting 3 folds for each of 48 candidates, totalling 144 fits
Best Score: 0.8159371492704826
Best Parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}


In [183]:
knn_params_2 = {
    'n_neighbors': [11, 12, 13,14],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree'],
    'p': [1, 2],
    'leaf_size': [10, 20, 30]
}

knn_grid_2 = model_selection.GridSearchCV(knn_clf, knn_params_2, cv=3, n_jobs=-1, verbose=2)
knn_grid_2.fit(X_train_prep_2, y_train)

print(f'Best Score: {knn_grid_2.best_score_}\nBest Parameters: {knn_grid_2.best_params_}')

Fitting 3 folds for each of 144 candidates, totalling 432 fits
Best Score: 0.7923681257014591
Best Parameters: {'algorithm': 'auto', 'leaf_size': 10, 'n_neighbors': 13, 'p': 1, 'weights': 'uniform'}


In [184]:
rf_params_2 = {
    'n_estimators': [30, 40, 50, 60],
    'max_features': ['sqrt', 'log2', 'auto'],
    'max_depth': [30, 50, 70],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [2, 4, 6],
    'bootstrap': [True, False]
}

rf_clf_2 = ensemble.RandomForestClassifier()

rf_grid_2 = model_selection.GridSearchCV(rf_clf_2, rf_params_2, cv=3, verbose=1, n_jobs=-1)
rf_grid_2.fit(X_train_prep_2, y_train)

print(f'Best Score: {rf_grid_2.best_score_}\nBest Parameters: {rf_grid_2.best_params_}')

Fitting 3 folds for each of 432 candidates, totalling 1296 fits
Best Score: 0.8282828282828282
Best Parameters: {'bootstrap': False, 'max_depth': 30, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 30}


432 fits failed out of a total of 1296.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
101 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\thory\miniconda3\envs\main\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\thory\miniconda3\envs\main\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\thory\miniconda3\envs\main\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\thory\miniconda3\envs\main\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_

In [185]:
rf_clf_final = ensemble.RandomForestClassifier(**rf_grid_2.best_params_)
svm_clf_final = svm.SVC(**svm_grid.best_params_, probability=True)
knn_clf_final = neighbors.KNeighborsClassifier(**knn_grid_2.best_params_)

voting_clf = ensemble.VotingClassifier(estimators=[('rf', rf_clf_final), ('svm', svm_clf_final), ('knn', knn_clf_final)], voting='soft')

voting_clf.fit(X_train_prep_2, y_train)

In [186]:
X_test_prep_2 = full_pipeline_2.transform(X_test)

In [187]:
simple_predictions = voting_clf.predict(X_test_prep_2)
predictions = pd.DataFrame({'PassengerId':titanic_test['PassengerId'], 'Survived':simple_predictions})
predictions.to_csv('data/simple_predictions.csv')

This simpler set of  12 features performed slightly better(0.78) than the previous one(0.77) with 50 features.