In [42]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import pickle
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import accuracy_score
import warnings
import src.Utils as utils
from sklearn import metrics

import sklearn.ensemble as ske
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

#hack to avoid showing deprecationg warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=FutureWarning) 

# Environment settings
data_path = 'Data/'

# Deserialize previously saved data from "preprocessing"
with open(data_path+'train_pp.obj', 'rb') as train_pp, \
open(data_path+'test_pp.obj', 'rb') as test_pp:
    train_df = pickle.load(train_pp)
    test_df = pickle.load(test_pp)

In [43]:
#Separate training set from target variables
passengerIds = test_df['PassengerId'].values
train_df=train_df.drop(['PassengerId'],axis=1)
test_df=test_df.drop(['PassengerId'],axis=1)
X,y = utils.train_test_separator(train_df)
#Create training set and test set with split 0.3
X_train , X_test ,y_train,y_test = train_test_split(X,
                                                    y, 
                                                    test_size=0.3, 
                                                    random_state=1, 
                                                    stratify=y)

In [44]:
model_rf = ske.RandomForestClassifier(n_estimators=50,
                                criterion='entropy',
                                max_depth = 5,
                                max_features = 6,
                                bootstrap=True).fit(X_train,y_train)

model_svm= svm.SVC(C=100,
                  gamma=0.01,
                  kernel='rbf').fit(X_train,y_train)

model_log=LogisticRegression().fit(X_train,y_train)

# voting_clf = VotingClassifier(estimators=[('Random Forest', model_rf), 
#                                          ('Support Vector machine', model_svm), 
#                                          ('Logistic Regression', model_log)], 
#                              voting='hard').fit(X,y)

models={'Random Forest': model_rf,
        'Support Vector Machine': model_svm,
        'Logistic Regression': model_log}


In [45]:
pred_rf = model_rf.predict(X)
pred_svm = model_svm.predict(X)
pred_log = model_log.predict(X)

print("RF score: {}".format(metrics.accuracy_score(y,pred_rf)))
print("SVM score: {}".format(metrics.accuracy_score(y,pred_svm)))
print("LR score: {}".format(metrics.accuracy_score(y,pred_log)))

pred_rf = pred_rf.astype(int)
pred_svm = pred_svm.astype(int)
pred_log = pred_log.astype(int)

p_avg = []
for i in range(len(y)):
    avg = (pred_rf[i]+pred_svm[i]+pred_log[i])/3
    if avg >= 0.50:
        p_avg.append(1)
    else:
        p_avg.append(0)

print("Ensemble score: {}".format(metrics.accuracy_score(y,p_avg)))


RF score: 0.867564534231201
SVM score: 0.8451178451178452
LR score: 0.8125701459034792
Ensemble score: 0.8462401795735129


In [47]:
#Prediction on real test set using random forest
model_rf.fit(X,y)
model_svm.fit(X,y)
model_log.fit(X,y)

pred_rf = model_rf.predict(test_df).astype(int)
pred_svm = model_svm.predict(test_df).astype(int)
pred_log = model_log.predict(test_df).astype(int)

ens_avg = []
for i in range(len(test_df)):
    avg = (pred_rf[i]+pred_svm[i]+pred_log[i])/3
    if avg >= 0.60:
        ens_avg.append(1)
    else:
        ens_avg.append(0)

result_df = pd.DataFrame(data={'PassengerId': passengerIds,
                               'Survived': ens_avg})
result_df['Survived'] = [0 if x == False else 1 for x in result_df['Survived']]
#Create output csv file
result_df.to_csv(data_path+"outputs/Ensemble_rf_svm_log.csv", index=False)