In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import numpy as np
import itertools
import sys
import pickle
from time import time, strftime, gmtime
from collections import Counter
from operator import itemgetter
from math import factorial

from sklearn.metrics.pairwise import euclidean_distances
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.feature_selection import RFECV, RFE
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.multioutput import MultiOutputRegressor

from queue import Queue
from threading import Thread, Lock

nan = np.NaN
# %matplotlib inline

In [18]:
# read csv file and show head
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
original_df = pd.concat([df_train, df_test], ignore_index=True, sort=False)

In [20]:
original_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [21]:
original_df.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1304,1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
1305,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C
1306,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
1307,1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S
1308,1309,,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C


In [23]:
original_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,891.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,0.383838,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.486592,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,1.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292


In [22]:
original_df.describe(include=['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,1309,1309,1309,295,1307
unique,1307,2,929,186,3
top,"Kelly, Mr. James",male,CA. 2343,C23 C25 C27,S
freq,2,843,11,6,914


In [5]:
# check null values
list(zip(original_df.columns, original_df.isnull().any(), original_df.dtypes))

[('PassengerId', False, dtype('int64')),
 ('Survived', True, dtype('float64')),
 ('Pclass', False, dtype('int64')),
 ('Name', False, dtype('O')),
 ('Sex', False, dtype('O')),
 ('Age', True, dtype('float64')),
 ('SibSp', False, dtype('int64')),
 ('Parch', False, dtype('int64')),
 ('Ticket', False, dtype('O')),
 ('Fare', True, dtype('float64')),
 ('Cabin', True, dtype('O')),
 ('Embarked', True, dtype('O'))]

In [14]:
# target = original_df.Survived
dataframe = original_df.copy()

In [15]:
# Create column NameType(Mr, Mrs, ...) from Name
dataframe['NameType'] = [re.sub(r'.+?, (.+?)\..*', r'\1', name) for name in dataframe.Name]
dataframe.NameType = dataframe.NameType.replace(
    ['Sir', 'Capt', 'Major', 'Don', 'Rev', 'Jonkheer', 'Col'],
    "Mr"
)
dataframe.NameType = dataframe.NameType.replace(
    ['Mlle', 'Lady', 'Mme', 'Miss', 'Mrs', 'the Countess', 'Dona'],
    "Ms"
)

# replace row contains sex = female and nameType = Dr by Ms
dataframe.loc[(dataframe.Sex == "female") & (dataframe.NameType == "Dr"), 'NameType'] = "Ms"
# replace row contains sex = male and nameType = Dr by Mr
dataframe.loc[(dataframe.Sex == "male") & (dataframe.NameType == "Dr"), 'NameType'] = "Mr"

In [16]:
# fill the nan value of age by the mean of group of Name type
dataframe.Age = dataframe.groupby(["NameType"]).transform(lambda a: a.fillna(a.mean())).Age

In [17]:
# create family size
dataframe['FamilySize'] = dataframe.SibSp + dataframe.Parch

In [18]:
# create TicketNum from Ticket
# replace Ticket "LINE" by "3" (all ticket "LINE" is Pclass 3)
dataframe['TicketNum'] = [
    re.sub(r'.+?\b(\d+)$', r'\1', re.sub(r'LINE', r'3', ticket)) for ticket in dataframe.Ticket
]
# create TicketNumLen : the nbr of digits in TicketNum
dataframe['TicketNumLen'] = dataframe.TicketNum.apply(lambda tn: len(str(tn)))
# create TicketNumDigitStart : the digit which the ticket num starts
dataframe['TicketNumDigitStart'] = dataframe.TicketNum.apply(lambda tn: str(tn)[0])
# change type TicketNum to numeric
dataframe.TicketNum = dataframe.TicketNum.astype(float)
dataframe.TicketNumDigitStart = dataframe.TicketNumDigitStart.astype(float)

In [19]:
# fillna Embarked by get the closet point of notnull from the nullpoint
columns = ['Pclass', 'Fare', 'TicketNum']
df_embarked_isnull = dataframe[dataframe.Fare.notnull() & dataframe.Embarked.isnull()]
indexes_embarked_isnull = df_embarked_isnull.index.values
df_embarked_notnull = dataframe[dataframe.Fare.notnull() & dataframe.Embarked.notnull()].reset_index(drop=True)
eds = euclidean_distances(df_embarked_isnull[columns].values, df_embarked_notnull[columns].values)
indexes_min_dist_embarked = [min(enumerate(ed), key=itemgetter(1))[0] for ed in eds]
dataframe.loc[indexes_embarked_isnull, 'Embarked'] = [
    df_embarked_notnull.at[index, 'Embarked'] for index in indexes_min_dist_embarked]

In [20]:
# fill the nan value of Fare by the mean of group of 
# Pclass, Embarked, TicketNumLen, TicketNumDigitStart, FamilySize
dataframe.Fare = dataframe.groupby(
    ['Pclass', 'Embarked', 'TicketNumLen', 'TicketNumDigitStart', 'FamilySize']
)['Fare'].transform(lambda f: f.fillna(f.mean()))

In [21]:
# Create Fare per person
dataframe['FarePerPerson'] = dataframe.Fare / (dataframe.FamilySize + 1)

In [22]:
# create the count of Cabin
dataframe['CabinCount'] = dataframe.Cabin.apply(lambda cs: [nan, len(str(cs).strip().split())][cs is not nan])
# Create Column and take the first letter of the cabin which contains a number of cabin
dataframe['CabinCode'] = dataframe.Cabin.apply(
    lambda cs: [
        nan,
        sorted(str(cs).strip().split(), key=len, reverse=True)[0][0]
    ][cs is not nan]
)
# create the number of Cabin (nbr after the letter) (if exist one or more of cabins so get the first else 0)
dataframe['NbrOfCabin'] = dataframe.Cabin.apply(
    lambda cs: [nan, list(filter(lambda c: len(c) > 1, str(cs).strip().split()))][cs is not nan]
)
dataframe.NbrOfCabin = [
    np.mean([int(c[1:]) for c in cs]) if (cs is not nan) and (cs != []) else nan 
    for cs in dataframe.NbrOfCabin
]
# dataframe[dataframe.Cabin.notnull()][
#     ['CabinCode', 'CabinCount', 'NbrOfCabin', 'Cabin', 'Pclass', 'FamilySize', 'FarePerPerson',
#      'Embarked', 'TicketNum', 'TicketNumLen', 'TicketNumDigitStart']]

In [52]:
## fillna of CabinCode by create a classification model
start_time = time()
data = dataframe[dataframe.CabinCode.notnull()][[
    'CabinCode', 'Pclass', 'FamilySize', 'FarePerPerson', 'Fare',
    'Embarked', 'TicketNum', 'TicketNumLen', 'TicketNumDigitStart'
]]
data_x = data.drop(['CabinCode'], 1)
y = data.CabinCode.values

object_cols = data_x.select_dtypes("object").columns
# Convert the categorical data to numbers
label_encoders = {}
for obj_col in object_cols:
    label_encoder = LabelEncoder()
    label_encoder.fit(data_x[obj_col])
    data_x[[obj_col]] = data_x[[obj_col]].apply(label_encoder.transform)
    label_encoders[obj_col] = label_encoder

classifiers = {
    "Logistic Regression": [LogisticRegression, {'random_state': 0}],
    "KNN": [KNeighborsClassifier,
            {'n_neighbors': 5, 'metric': 'minkowski', 'p': 2}],
    "SVM rbf": [SVC, {'kernel': 'rbf', 'random_state': 0}],
#     "SVM poly": [SVC, {'kernel': 'poly', 'random_state': 0}],
    "SVM sigmoid": [SVC, {'kernel': 'sigmoid', 'random_state': 0}],
#     "SVM precomputed": [SVC, {'kernel': 'precomputed', 'random_state': 0}],
#     "SVM linear": [SVC, {'kernel': 'linear', 'random_state': 0}],
    "Naive Bayes": [GaussianNB, {}],
    "Decision Tree": [DecisionTreeClassifier, {'criterion': "entropy",
                                               'random_state': 0}],
    "Random Forest": [RandomForestClassifier,
                      {'n_estimators': 10,
                       'criterion': 'entropy',
                       'random_state': 0}]
}
len_algorithms = classifiers.__len__()
# count of all columns in data
len_cols = data_x.columns.__len__()
max_nbr_f = data_x.columns.__len__()
min_nbr_f = max_nbr_f // 5 + 1
nbr_tests = sum([
    factorial(len_cols) / factorial(i) / factorial(len_cols - i) for i in range(min_nbr_f, max_nbr_f + 1)
]) * len_algorithms
columns_train = [
    [col for index, col in enumerate(data_x.columns) if index in indexes_cols]
    for nbr_features in range(max_nbr_f, min_nbr_f - 1, -1)
    for indexes_cols in itertools.combinations(range(len_cols), nbr_features)
]
current_train = 0
# use K fold
kf = KFold(n_splits=5)

cl_cabin_code = {'accuracy': -1, 'features': data_x.columns}

# generate classifiers
for columns in columns_train:
    if len(cl_cabin_code['features']) - len(columns) == 2:
        break
        
    X = data_x[columns].values.astype(np.float64)

    for name_classifier, [classifier, params] in classifiers.items():
#             print(name_classifier)
        # Start generate model
        ml_classifier = classifier(**params)
        scores = cross_val_score(ml_classifier, X, y, cv=kf)    
        # save the result
        max_accuracy = max(scores)
        # print the progression
        current_train += 1
        progress_value = current_train * 100. / nbr_tests
        sys.stdout.write("\r")
        sys.stdout.write("Progression |%-50s| %.2f %% (%s)" %
                         ("\u2588" * int(progress_value / 2.),
                          progress_value,
                          strftime("%H:%M:%S", gmtime(time() - start_time))
                          )
                         )
        sys.stdout.flush()
        
        if (max_accuracy > cl_cabin_code['accuracy']) or (
            (max_accuracy == cl_cabin_code['accuracy']) and (len(columns) < len(cl_cabin_code['features']))
        ):
            cl_cabin_code = {
                "x_y": [X, y],
#                 "dummy_var": onehotencoder,
#                 "feature_scaling": sc,
                "classifier": ml_classifier,
                "features": list(columns),
                "algorithm": name_classifier,
                "accuracy": max_accuracy
            }
    
# sort and view results
print("\nThe best classifier is %s" % cl_cabin_code['algorithm'])
print("The accuracy : %.2f %%" % (cl_cabin_code['accuracy'] * 100.))
print("The columns : %s" % cl_cabin_code['features'])
# print("The duration of execution : %s" %
#       strftime("%H hours %M minutes %S seconds", gmtime(end_time - start_time))
#       )
print("Number of tests = %d / %d tests" % (current_train, nbr_tests))        
# add the test data to best classifier
X, y = cl_cabin_code['x_y']
cl_cabin_code['classifier'].fit(X, y)

# save label encoders
cl_cabin_code['label_encoders'] = label_encoders

# Save best classifier to pickle as dictionary
with open('classifiers/cl_cabin_code.pik', 'wb') as wp:
    pickle.dump(cl_cabin_code, wp)

Progression |████████████████████████████████                  | 65.99 % (00:00:44)
The best classifier is Decision Tree
The accuracy : 81.36 %
The columns : ['Pclass', 'FamilySize', 'Fare', 'TicketNum', 'TicketNumLen']
Number of tests = 1141 / 1729 tests


In [54]:
print(dataframe.CabinCode.isnull().any())
# fillna CabinCode of dataframe 
print("Predict with %s" % cl_cabin_code['algorithm'])

df = dataframe[dataframe.CabinCode.isnull()][cl_cabin_code['features']]
object_cols = df.select_dtypes("object").columns
# Convert the categorical data to numbers
label_encoders = cl_cabin_code['label_encoders']
for obj_col in object_cols:
    df[[obj_col]] = df[[obj_col]].apply(label_encoders[obj_col].transform)

X = df.values.astype(np.float64)
# predict with classifier
y = cl_cabin_code['classifier'].predict(X)

# dataframe.CabinCode = dataframe.CabinCode.replace([nan] * len(y), y)
indexes_cabin_code_isnull = dataframe[dataframe.CabinCode.isnull()].index
dataframe.loc[indexes_cabin_code_isnull, 'CabinCode'] = y
print(dataframe.CabinCode.isnull().any())
print('Finished.')

True
Predict with Decision Tree
False
Finished.


In [55]:
# Fill the nbr of cabin to 1 for cabin has 1 cabin in data
dataframe.loc[dataframe.CabinCount.notnull() & dataframe.NbrOfCabin.isnull(), 'NbrOfCabin'] = 1

In [60]:
## fillna of CabinCount and NbrOfCabin by create a multi output regression model
start_time = time()
data = dataframe[dataframe.CabinCount.notnull()][[
    'CabinCode', 'CabinCount', 'NbrOfCabin', 'Pclass', 'FamilySize', 'FarePerPerson', 'Fare',
    'Embarked', 'TicketNum', 'TicketNumLen', 'TicketNumDigitStart'
]]
data_x = data.drop(['CabinCount', 'NbrOfCabin'], 1)
y = data[['CabinCount', 'NbrOfCabin']].values

object_cols = data_x.select_dtypes("object").columns
# Convert the categorical data to numbers
label_encoders = {}
for obj_col in object_cols:
    label_encoder = LabelEncoder()
    label_encoder.fit(data_x[obj_col])
    data_x[[obj_col]] = data_x[[obj_col]].apply(label_encoder.transform)
    label_encoders[obj_col] = label_encoder

regressors = {
#     "Polynomial Resgression": [PolynomialFeatures, {}],
    "SVR rbf": [SVR, {'kernel': 'rbf'}],
#     "SVR poly": [SVR, {'kernel': 'poly'}],
#     "SVR sigmoid": [SVR, {'kernel': 'sigmoid'}],
    "Decision Tree": [DecisionTreeRegressor, {'criterion': "mse",
                                              'random_state': 0}],
    "Random Forest": [RandomForestRegressor,
                      {'n_estimators': 10,
                       'criterion': 'mse',
                       'random_state': 0}]
}
len_algorithms = regressors.__len__()
# count of all columns in data
len_cols = data_x.columns.__len__()
max_nbr_f = data_x.columns.__len__()
min_nbr_f = max_nbr_f // 5 + 1
nbr_tests = sum([
    factorial(len_cols) / factorial(i) / factorial(len_cols - i) for i in range(min_nbr_f, max_nbr_f + 1)
]) * len_algorithms
columns_train = [
    [col for index, col in enumerate(data_x.columns) if index in indexes_cols]
    for nbr_features in range(max_nbr_f, min_nbr_f - 1, -1)
    for indexes_cols in itertools.combinations(range(len_cols), nbr_features)
]
current_train = 0

# use K fold
kf = KFold(n_splits=5)

reg_cabin_count_nbr = {'accuracy': -1, 'features': data_x.columns}

# generate classifiers
for columns in columns_train:
    if len(reg_cabin_count_nbr['features']) - len(columns) == 2:
        break
        
    X = data_x[columns].values.astype(np.float64)

    # Feature Scaling
    sc_x = StandardScaler()
    sc_y = StandardScaler()
    X = sc_x.fit_transform(X)
    y_sc = sc_y.fit_transform(y)

    for name_regressor, [regressor, params] in regressors.items():
        # Start generate multi output model
        ml_regressor = MultiOutputRegressor(regressor(**params))
        scores = cross_val_score(ml_regressor, X, y_sc, scoring='neg_mean_squared_error', cv=kf)  
        # save the result
        max_accuracy = max(scores)
        # print the progression
        current_train += 1
        progress_value = current_train * 100. / nbr_tests
        sys.stdout.write("\r")
        sys.stdout.write("Progression |%-50s| %.2f %% (%s)" %
                         ("\u2588" * int(progress_value / 2.),
                          progress_value,
                          strftime("%H:%M:%S", gmtime(time() - start_time))
                          )
                         )
        sys.stdout.flush()

        if (max_accuracy > reg_cabin_count_nbr['accuracy']) or (
            (max_accuracy == reg_cabin_count_nbr['accuracy']) and (
                len(columns) < len(reg_cabin_count_nbr['features']))
        ):
            reg_cabin_count_nbr = {
                "x_y": [X, y_sc],
#                 "dummy_var": onehotencoder,
                "feature_scaling_x": sc_x,
                "feature_scaling_y": sc_y,
                "regressor": ml_regressor,
                "features": list(columns),
                "algorithm": name_regressor,
                "accuracy": max_accuracy
            }

print("\nThe best regressor is %s" % reg_cabin_count_nbr['algorithm'])
print("The accuracy : %.2f %%" % (100 + reg_cabin_count_nbr['accuracy'] * 100.))
print("The columns : %s" % reg_cabin_count_nbr['features'])
# print("The duration of execution : %s" %
#       strftime("%H hours %M minutes %S seconds", gmtime(end_time - start_time))
#       )
print("Number of tests = %d / %d tests" % (current_train, nbr_tests))        
# add the test data to best classifier
X, y = reg_cabin_count_nbr['x_y']
reg_cabin_count_nbr['regressor'].fit(X, y)

# save label encoders
reg_cabin_count_nbr['label_encoders'] = label_encoders

# Save best classifier to pickle as dictionary
with open('classifiers/reg_cabin_count_nbr.pik', 'wb') as wp:
    pickle.dump(reg_cabin_count_nbr, wp)

Progression |████████████                                      | 25.90 % (00:00:32)
The best regressor is Decision Tree
The accuracy : 90.63 %
The columns : ['CabinCode', 'Pclass', 'FamilySize', 'FarePerPerson', 'Embarked', 'TicketNum', 'TicketNumDigitStart']
Number of tests = 390 / 1506 tests


In [61]:
print(dataframe.CabinCount.isnull().any())
# fillna CabinCode of dataframe 
print("Predict with %s" % reg_cabin_count_nbr['algorithm'])

df = dataframe[dataframe.CabinCount.isnull()][reg_cabin_count_nbr['features']]
object_cols = df.select_dtypes("object").columns
# Convert the categorical data to numbers
label_encoders = reg_cabin_count_nbr['label_encoders']
for obj_col in object_cols:
    df[[obj_col]] = df[[obj_col]].apply(label_encoders[obj_col].transform)

X = df.values.astype(np.float64)
# # use dummy variables for columns of types object
# if reg_cabin_count_nbr['dummy_var']:
#     X = reg_cabin_count_nbr['dummy_var'].transform(X).toarray()

# Feature Scaling
X = reg_cabin_count_nbr['feature_scaling_x'].transform(X)

# predict with regressor and reverse feature scaling
y = reg_cabin_count_nbr['feature_scaling_y'].inverse_transform(reg_cabin_count_nbr['regressor'].predict(X))

indexes_cabin_count_isnull = dataframe[dataframe.CabinCount.isnull()].index
dataframe.loc[indexes_cabin_count_isnull, ['CabinCount', 'NbrOfCabin']] = y
print(dataframe.CabinCount.isnull().any())
print('Finished.')

True
Predict with Decision Tree
False
Finished.


In [62]:
# transform Sex to 'M' and 'F'
dataframe.Sex = dataframe.Sex.transform(lambda s: s[0])
# compute size of male and female in familySize + 1 by groupby of Ticket
df_sum_sex = dataframe.groupby(['TicketNum', 'TicketNumLen', 'TicketNumDigitStart']).Sex.transform('sum')
dataframe['MaleCount'] = df_sum_sex.transform(lambda s: s.count('m'))
dataframe['FemaleCount'] = df_sum_sex.transform(lambda s: s.count('f'))

In [63]:
# make diff that 1st class is better than 2nd class and 3rd class
dataframe['PclassPower'] = 3. / dataframe.Pclass

In [64]:
# PclassPower and Fare
dataframe['PclassPowerFare'] = dataframe.PclassPower * dataframe.Fare

In [65]:
# person per Cabin 
dataframe['PersonPerCabin'] = (dataframe.FamilySize + 1) / dataframe.CabinCount

In [66]:
# Fare of cabin
dataframe['FarePerCabin'] = dataframe.Fare / dataframe.CabinCount

In [67]:
dataframe.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], 1, inplace=True)

In [68]:
# save the dataframe
with open('classifiers/dataframe_titanic.pik', 'wb') as wp:
    pickle.dump(dataframe, wp)

In [96]:
# generate pickle file which saves information for starting training 
df = dataframe.drop(['Survived'], 1)
len_cols = len(df.columns)
max_nbr_f = len_cols
min_nbr_f = max_nbr_f // 5 + 1
len_algorithms = 6
nbr_tests = sum([
    factorial(len_cols) / factorial(i) / factorial(len_cols - i) for i in range(min_nbr_f, max_nbr_f + 1)
]) * len_algorithms
columns_train = [
    [col for index, col in enumerate(df.columns) if index in indexes_cols]
    for nbr_features in range(max_nbr_f, min_nbr_f - 1, -1)
    for indexes_cols in itertools.combinations(range(len_cols), nbr_features)
]
infos = {
    'nbr_of_trains': nbr_tests,
    'columns_train': columns_train,
    'nbr_cols_done': 35443
}
# save the infos
with open('classifiers/infos.pik', 'wb') as wp:
    pickle.dump(infos, wp)

17

In [69]:
# After load the packages, start from here (ignore data preprocessing)

In [2]:
with open('classifiers/dataframe_titanic.pik', 'rb') as rp:
    dataframe = pickle.load(rp)

In [3]:
df_prep = dataframe
# get the cols of type "object"
object_cols = df_prep.select_dtypes("object").columns
# Convert the categorical data to numbers
label_encoders = {}
for obj_col in object_cols:
    label_encoder = LabelEncoder()
    label_encoder.fit(df_prep[obj_col])
    df_prep[[obj_col]] = df_prep[[obj_col]].apply(label_encoder.transform)
    label_encoders[obj_col] = label_encoder

In [5]:
X_train = df_prep.drop(['Survived'], 1)[:891].values.astype(np.float64)
X_test = df_prep.drop(['Survived'], 1)[891:].values.astype(np.float64)
y_train = df_prep[:891].Survived.values
# Applying PCA
# We declare n_components = None to find n_componenets by view explained_variance_ratio_
# then set the nbr 
from sklearn.decomposition import PCA
pca = PCA(n_components=15)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
pca.explained_variance_ratio_

array([9.99999931e-01, 6.43825599e-08, 2.14996993e-09, 1.17170709e-09,
       4.01908731e-10, 2.78368187e-10, 1.07089918e-10, 5.87058521e-12,
       4.16331480e-12, 2.50661982e-12, 2.21458436e-12, 1.43217804e-12,
       1.11400652e-12, 9.37836783e-13, 6.86510923e-13])

In [17]:
start_time = time()
# generate the classifiers with PCA algorithm
# key : name of algorithm
# value : list of size 2 which contains algorithm and params of algo
classifiers = {
    "Logistic Regression": [LogisticRegression, {'random_state': 0}],
    "KNN": [KNeighborsClassifier,
            {'n_neighbors': 1, 'metric': 'minkowski', 'p': 2}],
    "SVM rbf": [SVC, {'kernel': 'rbf', 'random_state': 0}],
#     "SVM poly": [SVC, {'kernel': 'poly', 'random_state': 0}],
    "SVM sigmoid": [SVC, {'kernel': 'sigmoid', 'random_state': 0}],
#     "SVM precomputed": [SVC, {'kernel': 'precomputed', 'random_state': 0}],
#     "SVM linear": [SVC, {'kernel': 'linear', 'random_state': 0}],
    "Naive Bayes": [GaussianNB, {}],
    "Decision Tree": [DecisionTreeClassifier, {'criterion': "entropy",
                                               'random_state': 0}],
    "Random Forest": [RandomForestClassifier,
                      {'n_estimators': 10,
                       'criterion': 'entropy',
                       'random_state': 0}]
}
# generate threads for multiple ML algorithms
len_algorithms = classifiers.__len__()
# count of all columns in data
len_cols = len(df_prep.columns)
# use K fold
kf = KFold(n_splits=9)

results = []
best_classifier = {'accuracy': -1}

# generate classifiers
for name_classifier, [classifier, params] in classifiers.items():
    ml_classifier = classifier(**params)
    scores = cross_val_score(ml_classifier, X_train, y_train, cv=kf)
    # save the result
    # print(name_classifier)
    results.append({
        "classifier": ml_classifier,
        "algorithm": name_classifier,
        "accuracy": np.mean(scores)
    })

    # print the progression
    progress_value = results.__len__() * 100. / len_algorithms
    sys.stdout.write("\r")
    sys.stdout.write("Progression |%-50s| %.2f %% (%d / %d) (%s)" %
                     ("\u2588" * int(progress_value / 2.),
                      progress_value,
                      results.__len__(), len_algorithms,
                      strftime("%H:%M:%S", gmtime(time() - start_time))
                      )
                     )
    sys.stdout.flush()

# sort and view results
best_classifier = max(results, key=lambda d: d['accuracy'])

end_time = time()

print("\nThe best classifier is %s" % best_classifier['algorithm'])
print("The accuracy : %.2f %%" % (best_classifier['accuracy'] * 100.))
print("The duration of execution : %s" %
      strftime("%H hours %M minutes %S seconds", gmtime(end_time - start_time))
      )
print("Number of tests = %d / %d tests" % (results.__len__(), len_algorithms))

# add the test data to best classifier
best_classifier['classifier'].fit(X_train, y_train)
best_classifier['pca'] = pca
# save label encoders
best_classifier['label_encoders'] = label_encoders

# Save best classifier to pickle as dictionary
with open('classifiers/best_classifier_pca.pik', 'wb') as wp:
    pickle.dump(best_classifier, wp)

Progression |██████████████████████████████████████████████████| 100.00 % (7 / 7) (00:00:01)
The best classifier is Random Forest
The accuracy : 79.24 %
The duration of execution : 00 hours 00 minutes 01 seconds
Number of tests = 7 / 7 tests


In [28]:
# feature selection
    
df_prep = dataframe
# get the list of target Survived
y = df_prep[:891].Survived.values
df_prep = df_prep.drop(['Survived'], axis=1)

# get the cols of type "object"
object_cols = df_prep.select_dtypes("object").columns
# Convert the categorical data to numbers
label_encoders = {}
for obj_col in object_cols:
    label_encoder = LabelEncoder()
    label_encoder.fit(df_prep[obj_col])
    df_prep[[obj_col]] = df_prep[[obj_col]].apply(label_encoder.transform)
    label_encoders[obj_col] = label_encoder
    
classifiers = {
    "Logistic Regression": [LogisticRegression, {'random_state': 0}],
#     "KNN": [KNeighborsClassifier,
#             {'n_neighbors': 1, 'metric': 'minkowski', 'p': 2}],
#     "SVM rbf": [SVC, {'kernel': 'rbf', 'C': 1, 'random_state': 0}],
    # "SVM poly": [SVC, {'kernel': 'poly', 'random_state': 0}],
    # "SVM sigmoid": [SVC, {'kernel': 'sigmoid', 'random_state': 0}],
    # "SVM precomputed": [SVC, {'kernel': 'precomputed', 'random_state': 0}],
#     "SVM linear": [SVC, {'kernel': 'linear', 'C': 1, 'random_state': 0}],
#     "Naive Bayes": [GaussianNB, {}],
    "Decision Tree": [DecisionTreeClassifier, {'criterion': "entropy",
                                               'random_state': 0}],
    "Random Forest": [RandomForestClassifier,
                      {'n_estimators': 10,
                       'criterion': 'entropy',
                       'random_state': 0}]
}

len_algorithms = classifiers.__len__()
# use K fold
kf = KFold(n_splits=9)

x = df_prep[:891].values.astype(np.float64)

count_support = {}

for name_classifier, [classifier, params] in classifiers.items():
    selector = RFECV(estimator=classifier(**params), step=1, cv=kf, scoring='accuracy')
    selector = selector.fit(x, y)
    columns = list(df_prep.columns[selector.support_])
    count_support[name_classifier] = len(columns)
    print(name_classifier)
    print('-' * len(name_classifier))
    print("Columns :", columns)
    print("Len of features : %d columns" % len(columns))
    print()
    
print("Count of supports :", count_support)

Logistic Regression
-------------------
Columns : ['Pclass', 'Sex', 'SibSp', 'NameType', 'FamilySize', 'TicketNumDigitStart', 'CabinCount', 'PclassPower', 'PersonPerCabin']
Len of features : 9 columns

Decision Tree
-------------
Columns : ['NameType', 'TicketNum', 'FarePerPerson']
Len of features : 3 columns

Random Forest
-------------
Columns : ['Sex', 'Age', 'Fare', 'NameType', 'FamilySize', 'TicketNum', 'TicketNumDigitStart', 'FarePerPerson', 'CabinCode', 'NbrOfCabin', 'MaleCount', 'FemaleCount', 'PclassPower', 'PclassPowerFare', 'FarePerCabin']
Len of features : 15 columns

Count of supports : {'Logistic Regression': 9, 'Decision Tree': 3, 'Random Forest': 15}


In [None]:
start_time = time()
df_prep = dataframe[:891]
# get the list of target Survived
y = df_prep.Survived.values
df_prep = df_prep.drop(['Survived'], axis=1)

# get the cols of type "object"
object_cols = df_prep.select_dtypes("object").columns
# Convert the categorical data to numbers
label_encoders = {}
for obj_col in object_cols:
    label_encoder = LabelEncoder()
    label_encoder.fit(df_prep[obj_col])
    df_prep[[obj_col]] = df_prep[[obj_col]].apply(label_encoder.transform)
    label_encoders[obj_col] = label_encoder
# df_prep[object_cols] = df_prep[object_cols].apply(LabelEncoder().fit_transform)

# generate the classifiers without PCA Algorithm
# key : name of algorithm
# value : list of size 2 which contains algorithm and params of algo
classifiers = {
#     "Logistic Regression": [LogisticRegression, {'random_state': 0}],
#     "KNN": [KNeighborsClassifier,
#             {'n_neighbors': 1, 'metric': 'minkowski', 'p': 2}],
#     "SVM rbf": [SVC, {'kernel': 'rbf', 'random_state': 0}],
    # "SVM poly": [SVC, {'kernel': 'poly', 'random_state': 0}],
    # "SVM sigmoid": [SVC, {'kernel': 'sigmoid', 'random_state': 0}],
    # "SVM precomputed": [SVC, {'kernel': 'precomputed', 'random_state': 0}],
    # "SVM linear": [SVC, {'kernel': 'linear', 'random_state': 0}],
#     "Naive Bayes": [GaussianNB, {}],
#     "Decision Tree": [DecisionTreeClassifier, {'criterion': "entropy",
#                                                'random_state': 0}]
    "Random Forest": [RandomForestClassifier,
                      {'n_estimators': 10,
                       'criterion': 'entropy',
                       'random_state': 0}]
}
# generate threads for multiple ML algorithms
len_algorithms = classifiers.__len__()
# count of all columns in data
len_cols = len(df_prep.columns)
# max_nbr_f = df_prep.columns.__len__()
# max_nbr_f = 3
# min_nbr_f = max_nbr_f // 5 + 1
# start_nbr_f = start_nbr_f if start_nbr_f != 0 else 1
nbr_tests = sum(
    len(list(itertools.combinations(range(len_cols), nbr_features))) for
    nbr_features in [nbrf for namec, nbrf in count_support.items() if namec in classifiers.keys()]
)
# use K fold
kf = KFold(n_splits=9)

results = []
best_classifier = {'accuracy': -1}

# generate classifiers
for name_classifier, [classifier, params] in classifiers.items():
    for indexes_cols in itertools.combinations(range(len_cols), count_support[name_classifier]):
        columns = [col for index, col in enumerate(df_prep.columns) if index in indexes_cols]
        x = df_prep[columns].values.astype(np.float64)
        
        ml_classifier = classifier(**params)
        scores = cross_val_score(ml_classifier, x, y, cv=kf)
        # save the result
        # print(name_classifier)
        results.append({
            "x_y": [x, y],
#                 "dummy_var": onehotencoder,
#                 "feature_scaling": sc,
            "classifier": ml_classifier,
            "features": list(columns),
            "algorithm": name_classifier,
            "accuracy": max(scores)
        })

        # print the progression
        progress_value = results.__len__() * 100. / nbr_tests
        sys.stdout.write("\r")
        sys.stdout.write("Progression |%-50s| %.2f %% (%d / %d) (%s)" %
                         ("\u2588" * int(progress_value / 2.),
                          progress_value,
                          results.__len__(), nbr_tests,
                          strftime("%H:%M:%S", gmtime(time() - start_time))
                          )
                         )
        sys.stdout.flush()

# sort and view results
max_accuracy = max([d['accuracy'] for d in results])
best_classifier = min(
    filter(lambda d: d['accuracy'] == max_accuracy, results),
    key=lambda d: len(d['features'])
)

end_time = time()

print("\nThe best classifier is %s" % best_classifier['algorithm'])
print("The accuracy : %.2f %%" % (best_classifier['accuracy'] * 100.))
print("The columns : %s" % best_classifier['features'])
print("The duration of execution : %s" %
      strftime("%H hours %M minutes %S seconds", gmtime(end_time - start_time))
      )
print("Number of tests = %d / %d tests" % (results.__len__(), nbr_tests))

# add the test data to best classifier
x, y = best_classifier['x_y']
best_classifier['classifier'].fit(x, y)

# save label encoders
best_classifier['label_encoders'] = label_encoders

# Save best classifier to pickle as dictionary
with open('best_classifier.pik', 'wb') as wp:
    pickle.dump(best_classifier, wp)