# Using scikit models to test againt the data 

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import cross_val_score
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier 


## Load Data (1)-- categorical

In [2]:
train_data = pd.read_csv('train_processed.csv', dtype=str, usecols=range(1,10))
train_X = train_data.drop(['Survived'], axis=1)
train_Y = train_data['Survived']
test_X = pd.read_csv('test_processed.csv', usecols=range(1,9), dtype=str)
test_ID = pd.read_csv('gender_submission.csv')['PassengerId']
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,1,1,0,0,2,1
1,1,1,1,2,1,0,3,0,3
2,1,3,1,1,0,0,1,2,2
3,1,1,1,2,1,0,3,2,3
4,0,3,0,2,0,0,1,2,1


## ## Load Data (2)-- one-hot encoding

In [4]:
# one-hot encoding
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
enc_train_X = enc.fit_transform(train_X)
enc_test_X = enc.fit_transform(test_X)
enc_train_X = pd.DataFrame(enc_train_X.toarray())
enc_test_X = pd.DataFrame(enc_test_X.toarray())

## Applyig a series of ML approaches without Tuning

In [5]:
# To Sum up all the tuned models at once
# exclude gausssianClassifier because this only accepts dense and it performs awfully
from sklearn.model_selection import cross_val_score

classifiers = [
    MLPClassifier(),
    LogisticRegression(),
    SVC(probability=True),
    KNeighborsClassifier(n_neighbors=30),
    MultinomialNB(),
    Perceptron(),
    SGDClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=700)]
columns = ['Classifier', 'Accuracy(categorical)', 'Accuracy(onehot)']
lists = []
datasets = [('cat', train_X, train_Y), ('hot', enc_train_X, train_Y)]
acc_dict = {}
for cls in classifiers:
    name = cls.__class__.__name__
    for coding, X, Y in datasets:
        scores = cross_val_score(cls, X, Y, cv=10)
        acc = round(scores.mean()*100, 2)
        acc_dict[coding] = acc
    lists.append([name, acc_dict['cat'],acc_dict['hot']])
record = pd.DataFrame(lists, columns = columns)
print(record)

               Classifier  Accuracy(categorical)  Accuracy(onehot)
0           MLPClassifier                  76.79             80.03
1      LogisticRegression                  79.80             81.27
2                     SVC                  81.03             79.34
3    KNeighborsClassifier                  78.91             81.37
4           MultinomialNB                  76.78             77.56
5              Perceptron                  64.10             69.38
6           SGDClassifier                  73.94             71.72
7  DecisionTreeClassifier                  80.48             79.81
8  RandomForestClassifier                  81.49             81.72


# Now, tune each classifier to better accuracy
1. Evaluation Method is 10-fold cross validation

In [6]:
# build the final df used for showing all the records
columns = ['Classifier', 'Accuracy(categorical)', 'Accuracy(onehot)']
lists = []
datasets = [('cat', train_X, train_Y), ('hot', enc_train_X, train_Y)]
acc_dict = {}
name = 'classifierName'

## KNN: find the best K is 17

In [7]:
# KNN 
for coding, X, Y in datasets:
    res = []
    for i in range(1, 50):
        cls = KNeighborsClassifier(n_neighbors=i)
        name = cls.__class__.__name__
        scores = cross_val_score(cls, X, Y, cv=10)
        acc_cls = round(scores.mean()*100, 2)
        res.append(acc_cls)
    acc_dict[coding]=max(res)
    best_idx = res.index(acc_dict[coding])+1
lists.append([name, acc_dict['cat'], acc_dict['hot']])
print(lists)
print("The ideal K should be %d" % best_idx)

[['KNeighborsClassifier', 81.38, 81.83]]
The ideal K should be 22


## LogisticRegression

In [54]:
# LogisticRegression
for coding, X, Y in datasets:
    cls = LogisticRegression()
    name = cls.__class__.__name__
    scores = cross_val_score(cls, X, Y, cv=10)
    acc_cls = round(scores.mean()*100, 2)
    acc_dict[coding] = acc_cls
lists.append([name, acc_dict['cat'], acc_dict['hot']])
print(lists)

[['KNeighborsClassifier', 81.38, 81.83], ['LogisticRegression', 79.8, 81.27]]


## SVM 

In [55]:
# SVM 
for coding, X, Y in datasets:
    cls = SVC(probability=True)
    name = cls.__class__.__name__
    scores = cross_val_score(cls, X, Y, cv=10)
    acc_cls = round(scores.mean()*100, 2)
    acc_dict[coding] = acc_cls
lists.append([name, acc_dict['cat'], acc_dict['hot']])
print(lists)

[['KNeighborsClassifier', 81.38, 81.83], ['LogisticRegression', 79.8, 81.27], ['SVC', 81.03, 79.34]]


## Gaussian Naive Bayes (dense vectors are required)

In [56]:
# Gaussian Naive Bayes (dense vectors are required)
cls = GaussianNB()
name = cls.__class__.__name__
scores = cross_val_score(cls, train_X, train_Y, cv=10)
acc_cls = round(scores.mean()*100, 2)
acc_dict['cat'] = acc_cls
lists.append([name, acc_dict['cat'], '--'])
print(lists)

[['KNeighborsClassifier', 81.38, 81.83], ['LogisticRegression', 79.8, 81.27], ['SVC', 81.03, 79.34], ['GaussianNB', 74.88, '--']]


## Multinomial Naive Bayes 

In [57]:
# Multinomial Naive Bayes (this is comparable with our implemented naive bayers)
for coding, X, Y in datasets:
    cls = MultinomialNB()
    name = cls.__class__.__name__
    scores = cross_val_score(cls, X, Y, cv=10)
    acc_cls = round(scores.mean()*100, 2)
    acc_dict[coding] = acc_cls
lists.append([name, acc_dict['cat'], acc_dict['hot']])
print(lists)

[['KNeighborsClassifier', 81.38, 81.83], ['LogisticRegression', 79.8, 81.27], ['SVC', 81.03, 79.34], ['GaussianNB', 74.88, '--'], ['MultinomialNB', 76.78, 77.56]]


## Perceptron

In [58]:
# Perceptron (not using one hot is better)
for coding, X, Y in datasets:
    cls = Perceptron()
    name = cls.__class__.__name__
    scores = cross_val_score(cls, X, Y, cv=10)
    acc_cls = round(scores.mean()*100, 2)
    acc_dict[coding] = acc_cls
lists.append([name, acc_dict['cat'], acc_dict['hot']])
print(lists)

[['KNeighborsClassifier', 81.38, 81.83], ['LogisticRegression', 79.8, 81.27], ['SVC', 81.03, 79.34], ['GaussianNB', 74.88, '--'], ['MultinomialNB', 76.78, 77.56], ['Perceptron', 64.1, 69.38]]


## Stochastic Gradient Descent

In [59]:
# Stochastic Gradient Descent (score not stable)
for coding, X, Y in datasets:
    cls = SGDClassifier()
    name = cls.__class__.__name__
    scores = cross_val_score(cls, X, Y, cv=10)
    acc_cls = round(scores.mean()*100, 2)
    acc_dict[coding] = acc_cls
lists.append([name, acc_dict['cat'], acc_dict['hot']])
print(lists)

[['KNeighborsClassifier', 81.38, 81.83], ['LogisticRegression', 79.8, 81.27], ['SVC', 81.03, 79.34], ['GaussianNB', 74.88, '--'], ['MultinomialNB', 76.78, 77.56], ['Perceptron', 64.1, 69.38], ['SGDClassifier', 70.62, 72.83]]


## Decision Tree

In [60]:
# Decision Tree
from sklearn.model_selection import GridSearchCV
alphas = [0.46,0.1,0.2]
decisionTreeModel = DecisionTreeClassifier()
Grid = GridSearchCV(estimator=decisionTreeModel, param_grid=dict(min_impurity_split=alphas))
for coding, X, Y in datasets:
    cls = Grid
    name = decisionTreeModel.__class__.__name__
    scores = cross_val_score(cls, X, Y, cv=10)
    acc_cls = round(scores.mean()*100, 2)
    acc_dict[coding] = acc_cls
lists.append([name, acc_dict['cat'], acc_dict['hot']])
print(lists)

[['KNeighborsClassifier', 81.38, 81.83], ['LogisticRegression', 79.8, 81.27], ['SVC', 81.03, 79.34], ['GaussianNB', 74.88, '--'], ['MultinomialNB', 76.78, 77.56], ['Perceptron', 64.1, 69.38], ['SGDClassifier', 70.62, 72.83], ['DecisionTreeClassifier', 80.82, 81.04]]


## Random Forest
1. feature selection
2. tuning using grid search

### feature selecton: abandon useless features

In [49]:
# Feature Selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
clf = ExtraTreesClassifier(n_estimators=700)
clf = clf.fit(enc_train_X, train_Y)
features = pd.DataFrame()
features['feature'] = enc_train_X.columns
features['importance'] = clf.feature_importances_
features.sort(['importance'],ascending=False)
print(features)
# by choosing threshold 0.01, we get rid of two useless features
model = SelectFromModel(clf, prefit=True, threshold=0.01)
train_new = model.transform(enc_train_X)
train_new.shape
test_new = model.transform(enc_test_X)
print('We original had 26 features for one hot encoding data. Now we only have 24 features.')
print(test_new.shape)



    feature  importance
0         0    0.051105
1         1    0.035337
2         2    0.087180
3         3    0.117864
4         4    0.133788
5         5    0.018265
6         6    0.022546
7         7    0.024186
8         8    0.015837
9         9    0.003118
10       10    0.019163
11       11    0.019156
12       12    0.019515
13       13    0.018937
14       14    0.019730
15       15    0.020703
16       16    0.025648
17       17    0.031344
18       18    0.022943
19       19    0.014744
20       20    0.024382
21       21    0.155377
22       22    0.034885
23       23    0.037883
24       24    0.016454
25       25    0.009908
We original had 26 features for one hot encoding data. Now we only have 24 features.
(418, 24)


### tuning RF 

In [50]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score
forest = RandomForestClassifier(max_features='sqrt')

parameter_grid = {
                 'max_depth' : [4,5,6,7,8],
                 'n_estimators': [200,450,700],
                 'criterion': ['gini','entropy']
                 }

cross_validation = StratifiedKFold(train_Y, n_folds=10)

grid_search = GridSearchCV(forest,
                           param_grid=parameter_grid,
                           cv=cross_validation)

grid_search.fit(train_new, train_Y)

print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

Best score: 0.826038159371
Best parameters: {'n_estimators': 700, 'criterion': 'entropy', 'max_depth': 5}


In [64]:
# Random Forest
cls = RandomForestClassifier(n_estimators = 700)
scores = cross_val_score(cls, train_X, train_Y, cv=10)
acc_cls = round(scores.mean()*100, 2)
acc_dict['cat'] = acc_cls
acc_dict['hot'] = round(grid_search.best_score_*100,2)
lists.append(['RandomForestClassifier', acc_dict['cat'], acc_dict['hot']])
print(lists)

[['KNeighborsClassifier', 81.38, 81.83], ['LogisticRegression', 79.8, 81.27], ['SVC', 81.03, 79.34], ['GaussianNB', 74.88, '--'], ['MultinomialNB', 76.78, 77.56], ['Perceptron', 64.1, 69.38], ['SGDClassifier', 70.62, 72.83], ['DecisionTreeClassifier', 80.82, 81.04], ['RandomForestClassifier', 80.71, 0.8260381593714927], ['MLPClassifier', 78.58, 80.48], ['RandomForestClassifier', 77.46, 82.6]]


In [68]:
# MLP
for coding, X, Y in datasets:
    cls = MLPClassifier()
    name = cls.__class__.__name__
    scores = cross_val_score(cls, X, Y, cv=10)
    acc_cls = round(scores.mean()*100, 2)
    acc_dict[coding] = acc_cls
lists.append([name, acc_dict['cat'], acc_dict['hot']])
print(lists)

[['KNeighborsClassifier', 81.38, 81.83], ['LogisticRegression', 79.8, 81.27], ['SVC', 81.03, 79.34], ['GaussianNB', 74.88, '--'], ['MultinomialNB', 76.78, 77.56], ['Perceptron', 64.1, 69.38], ['SGDClassifier', 70.62, 72.83], ['DecisionTreeClassifier', 80.82, 81.04], ['RandomForestClassifier', 80.71, 0.8260381593714927], ['MLPClassifier', 78.58, 80.48], ['RandomForestClassifier', 77.46, 82.6], ['MLPClassifier', 77.11, 79.8], ['MLPClassifier', 77.46, 79.8], ['MLPClassifier', 77.67, 79.47], ['MLPClassifier', 77.46, 80.59]]


In [63]:
record = pd.DataFrame(lists, columns = columns)
print(record)

               Classifier  Accuracy(categorical) Accuracy(onehot)
0    KNeighborsClassifier                  81.38            81.83
1      LogisticRegression                  79.80            81.27
2                     SVC                  81.03            79.34
3              GaussianNB                  74.88               --
4           MultinomialNB                  76.78            77.56
5              Perceptron                  64.10            69.38
6           SGDClassifier                  70.62            72.83
7  DecisionTreeClassifier                  80.82            81.04
8  RandomForestClassifier                  80.71         0.826038
9           MLPClassifier                  78.58            80.48


### The best Model (1) to submit: SVM with categorical dataset

In [71]:
svmModel = SVC(probability=True)
svmModel.fit(train_X, train_Y)
print('The accuracy on the training set is %f' % cross_val_score(svmModel, train_X, train_Y, cv=10).mean())
preds = svmModel.predict(test_X)
test_ID = pd.read_csv('gender_submission.csv')
submission = pd.DataFrame({
        "PassengerId": test_ID["PassengerId"],
        "Survived": preds
    })
print(submission.head())
submission.to_csv('titanic.csv', index=False)

The accuracy on the training set is 0.810346
   PassengerId Survived
0          892        0
1          893        1
2          894        0
3          895        0
4          896        1


### The best Model (2) to submit: RandomForest with one hot encoding

In [73]:
print('The accuracy on the training set is %f' % grid_search.best_score_)
model = grid_search
preds = model.predict(test_new)
test_ID = pd.read_csv('gender_submission.csv')
submission = pd.DataFrame({
        "PassengerId": test_ID["PassengerId"],
        "Survived": preds
    })
print(submission.head())
submission.to_csv('titanic.csv', index=False)

The accuracy on the training set is 0.826038
   PassengerId Survived
0          892        0
1          893        1
2          894        0
3          895        0
4          896        0


### The best Model (3) to submit: Logistic Regression with one-hot encoding

In [40]:
cls = LogisticRegression()
cls.fit(enc_train_X, train_Y)
print('The accuracy on the training set is %f' % cross_val_score(cls, enc_train_X, train_Y, cv=10).mean())
preds = cls.predict(enc_test_X)
test_ID = pd.read_csv('gender_submission.csv')
submission = pd.DataFrame({
        "PassengerId": test_ID["PassengerId"],
        "Survived": preds
    })
print(submission.head())
submission.to_csv('titanic.csv', index=False)

The accuracy on the training set is 0.812668
   PassengerId Survived
0          892        0
1          893        1
2          894        0
3          895        0
4          896        1


### The best Model (4) to submit: KNN with categorical data

In [71]:
cls = KNeighborsClassifier(n_neighbors=20)
cls.fit(train_X, train_Y)
print('The accuracy on the training set is %f' % cross_val_score(cls, train_X, train_Y, cv=10).mean())
preds = cls.predict(test_X)
test_ID = pd.read_csv('gender_submission.csv')
submission = pd.DataFrame({
        "PassengerId": test_ID["PassengerId"],
        "Survived": preds
    })
print(submission.head())
submission.to_csv('titanic.csv', index=False)

The accuracy on the training set is 0.813779
   PassengerId Survived
0          892        0
1          893        1
2          894        0
3          895        0
4          896        1


### The best Model (5) to submit: MLP with categorical data

In [75]:
from sklearn.neural_network import MLPClassifier 
model = MLPClassifier()
model.fit(train_X, train_Y)
preds = model.predict(test_X)
print('The accuracy on the training set is %f' % cross_val_score(model, train_X, train_Y, cv=10).mean())
test_ID = pd.read_csv('gender_submission.csv')
submission = pd.DataFrame({
        "PassengerId": test_ID["PassengerId"],
        "Survived": preds
    })
print(submission.head())
submission.to_csv('titanic.csv', index=False)

The accuracy on the training set is 0.777948
   PassengerId Survived
0          892        0
1          893        0
2          894        0
3          895        0
4          896        1
