# Ensemble

In [169]:
import pandas as pd
import glob
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier 
from sklearn import cross_validation, grid_search
from math import sqrt

## Prep Prediction Files

I was trying to find a graceful way to easily handle all the different submission files that everyone was sending me because I need to combine them all into master file. The code below essentially just opens each file that each person sends me and adds a column to their submission with their intials + unique index num. I later pivot on this column so that each prediction will be in its own column.

In [170]:
team = ["vv", "ar", "sj", "ns", "js"]
# Training
for m in team:
    path = "ensemble/training/%s_*.csv" % m
    for i, f in enumerate(glob.glob(path)):
        df = pd.read_csv(f, header=0)

        if "member" in df.columns:
            df.drop('member', axis=1, inplace=True)
            
        df["member"] = m+str(i)
        df.to_csv(f, index=False)

In [171]:
# Testing
for m in team:
    path = "ensemble/testing/%s_*.csv" % m
    for i, f in enumerate(glob.glob(path)):
        df = pd.read_csv(f, header=0)

        if "member" in df.columns:
            df.drop('member', axis=1, inplace=True)
            
        df["member"] = m+str(i)
        df.to_csv(f, index=False)

## Simple Bagging Code

Take majority vote with equal weight to each submission.

In [172]:
ensemble = pd.DataFrame()
for path in glob.glob('ensemble/testing/*.csv'):
    df = pd.read_csv(path)
    ensemble = ensemble.append(df, ignore_index=True)

In [173]:
prediction = ensemble.pivot("passenger_id", "member", "survived")
prediction = prediction.mode(axis=1, numeric_only=True)
prediction = prediction[0].reset_index()
prediction.columns = ["passenger_id", "survived"]
print prediction.sum()
prediction.astype(int).to_csv("results/bagging_ensemble_1.csv", index=False)

passenger_id    548366
survived           198
dtype: int64


## Fancy Ensembling

### Random Forests

In [138]:
ensemble_X = pd.DataFrame()
for path in glob.glob('ensemble/training/*.csv'):
    df = pd.read_csv(path)
    ensemble_X = ensemble_X.append(df, ignore_index=True)

In [139]:
X_features = ensemble_X.pivot("passenger_id", "member", "survived")
training_data = pd.read_csv("train.csv")
Y = training_data["survived"]
X_features.sum()

member
ar0       254
js0       244
ns0       245
sj0       252
vv0       256
dtype: float64

In [140]:
forest = RandomForestClassifier()
tuned_parameters = [{'max_features': ['sqrt', 'log2'], 'n_estimators': [100, 200, 500, 1000]}]
rf = grid_search.GridSearchCV(forest, tuned_parameters, cv=5).fit(X_features, Y)

test_predictions = rf.predict(X_features)

print test_predictions.shape
print rf.best_estimator_

(785,)
RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='sqrt',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=None, verbose=0)


#### Predict on the Testing Set

In [142]:
ensemble_testing_X = pd.DataFrame()
for path in glob.glob('ensemble/testing/*.csv'):
    df = pd.read_csv(path)
    ensemble_testing_X = ensemble_testing_X.append(df, ignore_index=True)

X_testing_features = ensemble_testing_X.pivot("passenger_id", "member", "survived")
X_testing_features.sum()

member
ar0       190
js0       185
ns0       169
sj0       209
vv0       207
dtype: int64

In [143]:
# Take the same decision trees and run it on the test data
test_predictions = rf.predict(X_testing_features)

In [144]:
testing_data = pd.read_csv("test.csv")
final_predictions = zip(testing_data["passenger_id"],test_predictions.astype(int))
output_columns = "passenger_id", "survived"
final_predictions = pd.DataFrame(final_predictions, columns=output_columns)
print final_predictions.sum()
print len(final_predictions)

passenger_id    548366
survived           175
dtype: int64
524


#### ### Remember to change the file name with each iteration or risk losing a good submission!

In [145]:
final_predictions.to_csv("results/2_ensemble_random_forest.csv", index=False)

### Adaboost

In [149]:
clf = AdaBoostClassifier(n_estimators=1000)
clf = clf.fit(X_features, Y)
test_predictions = clf.predict(X_testing_features)

In [150]:
testing_data = pd.read_csv("test.csv")
final_predictions = zip(testing_data["passenger_id"],test_predictions.astype(int))
output_columns = "passenger_id", "survived"
final_predictions = pd.DataFrame(final_predictions, columns=output_columns)
print final_predictions.sum()
print len(final_predictions)

passenger_id    548366
survived           172
dtype: int64
524


#### ### Remember to change the file name with each iteration or risk losing a good submission!

In [148]:
final_predictions.to_csv("results/2_ensemble_adaboost.csv", index=False)