# Ensemble Methods 

In [2]:
%matplotlib inline

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
import seaborn as sns



# Load data

In [4]:
DT=pd.read_csv('titanic_train.csv')


In [5]:
DT.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
DT.shape

(891, 12)

In [7]:
DT.drop(['Name','PassengerId','Cabin','Embarked','Ticket'],1,inplace=True)

In [8]:
DT.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


# Prepare data for machine learning algorithm

In [9]:
DT.dropna(axis=0,inplace=True)

In [10]:
x=DT.drop(['Survived','SibSp','Parch'],1)
y=DT['Survived']

# Dummie variable

In [11]:
sex = pd.get_dummies(DT['Sex'],drop_first=True)
x.drop(['Sex'],axis=1,inplace=True)
x=pd.concat([x, sex], axis=1)

In [12]:
x.head(n=5)

Unnamed: 0,Pclass,Age,Fare,male
0,3,22.0,7.25,1
1,1,38.0,71.2833,0
2,3,26.0,7.925,0
3,1,35.0,53.1,0
4,3,35.0,8.05,1


# Cross validation

In [13]:
from sklearn import model_selection
kfold = model_selection.KFold(n_splits=10, random_state=3)

#Provides train/test indices to split data in train/test sets. 
#Split dataset into k consecutive folds (without shuffling by default).

#Each fold is then used once as a validation while the k - 1 remaining folds form the training set.

# Decision Tree
Apply the decision tree and present the accuracy

In [14]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.30)

In [15]:
from sklearn.tree import DecisionTreeClassifier

In [16]:
dtree = DecisionTreeClassifier()

In [None]:
#create object dtree use the  fit method using the x train and y train data asets

In [17]:
dtree.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [18]:
#Evaluate the decision tree
predictions = dtree.predict(X_test)

In [19]:
from sklearn.metrics import classification_report,confusion_matrix

In [20]:
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.79      0.86      0.82       132
          1       0.74      0.64      0.68        83

avg / total       0.77      0.77      0.77       215



In [None]:
#default decision tree shows about 79% and 74% precision on correctly classifying survived and not survived


# Bagging Tree
Apply Bagging tree and present the accuracy


In [22]:
from sklearn.ensemble import BaggingClassifier

In [23]:
dtreeBag = BaggingClassifier()

In [24]:
dtreeBag.fit(X_train,y_train)

BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
         verbose=0, warm_start=False)

In [25]:
predictionsBag = dtreeBag.predict(X_test)

In [26]:
print(classification_report(y_test,predictionsBag))

             precision    recall  f1-score   support

          0       0.78      0.88      0.83       132
          1       0.76      0.60      0.67        83

avg / total       0.77      0.77      0.77       215



In [None]:
#The bagging method did not give any significant diffrence in the prediction

# Random Forest

Apply the Random Forest and present the accuracy

In [49]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [50]:
rfc_pred = rfc.predict(X_test)

In [51]:
print(confusion_matrix(y_test,rfc_pred))

[[114  18]
 [ 25  58]]


In [52]:
print(classification_report(y_test,rfc_pred))

             precision    recall  f1-score   support

          0       0.82      0.86      0.84       132
          1       0.76      0.70      0.73        83

avg / total       0.80      0.80      0.80       215



In [None]:
#Random forest gave a slightly higher accurary with an average of the total 80% 
#compared to the other two models with an aveage of 77%

# Boosting Trees
Apply Boosting Tree cassifier and calculate the aaccuracy

In [31]:
from sklearn.ensemble import GradientBoostingClassifier

In [32]:
dtreeBoost = GradientBoostingClassifier()

In [33]:
dtreeBoost.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [34]:
predictionsBoost = dtreeBoost.predict(X_test)

In [35]:
print(classification_report(y_test,predictionsBoost))

             precision    recall  f1-score   support

          0       0.81      0.89      0.85       132
          1       0.79      0.67      0.73        83

avg / total       0.80      0.80      0.80       215



In [None]:
#in this case boosting did not have any significant change in the prediction rate with the percision averaging at 80% 