# Ensemble/Voting Classification in Python with Scikit-Learn
ref：https://www.kaggle.com/c/titanic/submit

In [133]:
import pandas as pd
import numpy as np
import warnings

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, log_loss
from sklearn.model_selection import train_test_split, KFold, cross_val_score

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier

In [134]:
training_data = pd.read_csv("data/train.csv")
testing_data = pd.read_csv("data/test.csv")
def get_nulls(training, testing):
    print("Training Data:")
    print(pd.isnull(training).sum())
    print("Testing Data:")
    print(pd.isnull(testing).sum())

get_nulls(training_data, testing_data)

Training Data:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
Testing Data:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [135]:
# Drop the cabin column, as there are too many missing values
# Drop the ticket numbers too, as there are too many categories
# Drop names as they won't really help predict survivors

training_data.drop(labels=["Cabin","Ticket","Name"],axis=1,inplace=True)
testing_data.drop(labels=["Cabin","Ticket","Name"],axis=1,inplace=True)

# Taking the mean/average value would be impacted by the skew
# so we should use the median value to impute missing values
training_data["Age"].fillna(training_data["Age"].median(),inplace=True)
testing_data["Age"].fillna(training_data["Age"].median(),inplace=True)
training_data["Embarked"].fillna("S",inplace=True)
testing_data["Fare"].fillna(training_data["Fare"].median(),inplace=True)


get_nulls(training_data, testing_data)

Training Data:
PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64
Testing Data:
PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64


In [136]:
# Fit the encoder on the data (Feature: Sex)
encoder_1 = LabelEncoder()
encoder_1.fit(training_data["Sex"])

# Transform and replace training data
training_sex_encoded = encoder_1.transform(training_data["Sex"])
training_data["Sex"] = training_sex_encoded
test_sex_encoded = encoder_1.transform(testing_data["Sex"])
testing_data["Sex"] = test_sex_encoded

# Fit the encoder on the data (Feature: Embarked)
encoder_2 = LabelEncoder()
encoder_2.fit(training_data["Embarked"])

training_Embarked_encoded = encoder_2.transform(training_data["Embarked"])
training_data["Embarked"] = training_Embarked_encoded
test_Embarked_encoded = encoder_2.transform(testing_data["Embarked"])
testing_data["Embarked"] = test_Embarked_encoded

# Any value we want to reshape needs be turned into array first
ages_train = np.array(training_data["Age"]).reshape(-1, 1)
ages_test = np.array(testing_data["Age"]).reshape(-1, 1)
fares_train = np.array(training_data["Fare"]).reshape(-1, 1)
fares_test = np.array(testing_data["Fare"]).reshape(-1, 1)


# Scaler takes arrays
scaler = StandardScaler()

training_data["Age"] = scaler.fit_transform(ages_train)
testing_data["Age"] = scaler.fit_transform(ages_test)
training_data["Fare"] = scaler.fit_transform(fares_train)
testing_data["Fare"] = scaler.fit_transform(fares_test)


In [166]:
# Now to select our training/testing data
X_features = training_data.drop(labels=['PassengerId', 'Survived'], axis=1)
y_labels = training_data['Survived']

print(X_features.head(5))
print(y_labels.head(5))

# Make the train/test data from validation

X_train, X_val, y_train, y_val = train_test_split(X_features, y_labels, test_size=0.1,random_state=85)

   Pclass  Sex       Age  SibSp  Parch      Fare  Embarked
0       3    1 -0.565736      1      0 -0.502445         2
1       1    0  0.663861      1      0  0.786845         0
2       3    0 -0.258337      0      0 -0.488854         2
3       1    0  0.433312      1      0  0.420730         2
4       3    1  0.433312      0      0 -0.486337         2
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


## Simple Averaging Approach

In [167]:
LogReg_clf = LogisticRegression()
DTree_clf = DecisionTreeClassifier()
SVC_clf = SVC()

LogReg_clf.fit(X_train, y_train)
DTree_clf.fit(X_train, y_train)
SVC_clf.fit(X_train, y_train)

LogReg_pred = LogReg_clf.predict(X_val)
DTree_pred = DTree_clf.predict(X_val)
SVC_pred = SVC_clf.predict(X_val)

averaged_preds = (LogReg_pred + DTree_pred + SVC_pred)//3
acc = accuracy_score(y_val, averaged_preds)
print(acc)

0.8555555555555555


## Bagging Classification Example

In [168]:
logreg_bagging_model = BaggingClassifier(base_estimator=LogReg_clf, n_estimators = 80, random_state=2)
dtree_bagging_model = BaggingClassifier(base_estimator=DTree_clf, n_estimators = 80, random_state=2)
random_forest = RandomForestClassifier(n_estimators = 100, random_state=150)
extra_trees = ExtraTreesClassifier(n_estimators = 100, random_state=150)

def bagging_ensemble(model):
    k_folds = KFold(n_splits=80, random_state=2,shuffle=True)
    results = cross_val_score(model, X_train, y_train, cv=k_folds)
    return results.mean()  #改用return 回傳數值給下面Print

# bagging_ensemble(logreg_bagging_model)
print(f'logreg = {bagging_ensemble(logreg_bagging_model)}')
# bagging_ensemble(dtree_bagging_model)
print(f'dtree = {bagging_ensemble(dtree_bagging_model)}')
# bagging_ensemble(random_forest)
print(f'random_forest = {bagging_ensemble(random_forest)}')
# bagging_ensemble(extra_trees)
print(f'extra_trees = {bagging_ensemble(extra_trees)}')


logreg = 0.7902272727272728
dtree = 0.8140909090909091
random_forest = 0.8054545454545454
extra_trees = 0.7918181818181818


## Boosting Classification Example

In [169]:
k_folds = KFold(n_splits=20, random_state=12,shuffle=True)
num_estimators = [40, 45, 50, 55, 60]

for i in num_estimators:
    ada_boost = AdaBoostClassifier(n_estimators = i, random_state=0)
    results = cross_val_score(ada_boost,X_train, y_train , cv=k_folds)
    print("Results for {} estimators:".format(i))
    print(results.mean())


Results for 40 estimators:
0.8077439024390243
Results for 45 estimators:
0.8102439024390243
Results for 50 estimators:
0.8089939024390244
Results for 55 estimators:
0.8077134146341465
Results for 60 estimators:
0.8077134146341463


## voting\Stacking Classification Example

In [170]:
voting_clf = VotingClassifier(estimators=[('SVC', SVC_clf), ('DTree', DTree_clf), ('LogReg', LogReg_clf)], voting='hard')
voting_clf.fit(X_train, y_train)
preds = voting_clf.predict(X_val)
acc = accuracy_score(y_val, preds)
l_loss = log_loss(y_val, preds)
f1 = f1_score(y_val, preds)


print("Accuracy is: " + str(acc))
print("Log Loss is: " + str(l_loss))
print("F1 Score is: " + str(f1))

Accuracy is: 0.8666666666666667
Log Loss is: 4.605223492483438
F1 Score is: 0.7857142857142857


In [171]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
884,3,1,-0.335187,0,0,-0.506472,2
57,3,1,-0.066212,0,0,-0.502864,0
78,2,1,-2.192648,0,2,-0.064516,2
856,1,0,1.201810,1,1,2.671118,2
265,2,1,0.510161,0,0,-0.437007,2
...,...,...,...,...,...,...,...
564,3,0,-0.104637,0,0,-0.486337,2
563,3,1,-0.104637,0,0,-0.486337,2
708,1,0,-0.565736,0,0,2.402990,2
160,3,1,1.124960,0,1,-0.324253,2


In [172]:
testing_data

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,1,0.371062,0,0,-0.497413,1
1,893,3,0,1.358985,1,0,-0.512278,2
2,894,2,1,2.544493,0,0,-0.464100,1
3,895,3,1,-0.221692,0,0,-0.482475,2
4,896,3,0,-0.616861,1,1,-0.417492,2
...,...,...,...,...,...,...,...,...
413,1305,3,1,-0.142658,0,0,-0.493455,2
414,1306,1,0,0.726714,0,0,1.314435,0
415,1307,3,1,0.687197,0,0,-0.507796,2
416,1308,3,1,-0.142658,0,0,-0.493455,2


In [173]:
test = testing_data.drop("PassengerId",axis=1,)
test.shape

(418, 7)

In [174]:
submission_df = pd.DataFrame(columns = ["PassengerId","Survived"])
submission_df["PassengerId"] = testing_data["PassengerId"]
preds = voting_clf.predict(test)
submission_df["Survived"] = preds
print(preds)
submission_df.to_csv("submission.csv",header = True, index = False)

[0 0 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 1 0
 1 0 0 1 0 1 1 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 1 0 1 1 0 1 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 1 1 1 1 0 0 0 0 0 1 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 0 0 1 1 0
 0 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 0]
