# Stacking

In [1]:
import csv
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import KFold

## Data Processing

In [2]:
#########################
# Load the datasets
# The last column of data.csv is used as the label; every column of quiz.csv
# is used as a feature (these are the rows predicted at the end).
train = pd.read_csv('data.csv', sep=",")
test_data = pd.read_csv('quiz.csv', sep=",")
train_data = train.iloc[:, :-1]
train_labels = train.iloc[:, -1]

# Stack train and quiz rows so the dummy encoding below produces the same
# columns for both; the row counts let us split them apart again by position.
all_data = pd.concat([train_data, test_data])
train_obs = len(train_data)
test_obs = len(test_data)

# Change dtype of categorical columns
categorical_columns = ['0','5','7','8','9','14','16','17','18','20','23','25','26','56','57','58']
# Iterate over the names directly: no index bookkeeping, and it runs unchanged
# on both Python 2 and Python 3.
for column in categorical_columns:
    all_data[column] = all_data[column].astype('category')

#########################
# Only numerical data
print('Processing numerical data...')
all_data_num = all_data.drop(categorical_columns, axis=1)
train_data_num = all_data_num.iloc[:train_obs]
test_data_num = all_data_num.iloc[train_obs:]

#########################
# Only categorical data (one-hot encoded)
print('Processing categorical data...')
all_data_cat = pd.get_dummies(all_data[categorical_columns])
train_data_cat = all_data_cat.iloc[:train_obs]
test_data_cat = all_data_cat.iloc[train_obs:]

#########################
# Ignoring two large columns ('slim')
print('Processing categorical data (slim)...')
# Derive the slim list from the full one so the two cannot drift apart.
large_categorical_columns = ['23', '58']
categorical_columns_slim = [column for column in categorical_columns
                            if column not in large_categorical_columns]

all_data_cat_slim = pd.get_dummies(all_data[categorical_columns_slim])
train_data_cat_slim = all_data_cat_slim.iloc[:train_obs]
test_data_cat_slim = all_data_cat_slim.iloc[train_obs:]

#########################
# Combined sets
print('Combining data...')
train_data_combo = pd.concat([train_data_num, train_data_cat], axis=1)
test_data_combo = pd.concat([test_data_num, test_data_cat], axis=1)

train_data_combo_slim = pd.concat([train_data_num, train_data_cat_slim], axis=1)
test_data_combo_slim = pd.concat([test_data_num, test_data_cat_slim], axis=1)

#########################
# Clear memory
# Rebind the large intermediates to None so only the combined frames and
# train_labels stay alive in the kernel.
all_data, train, train_data, test_data = None, None, None, None
all_data_num, train_data_num, test_data_num = None, None, None
all_data_cat, train_data_cat, test_data_cat = None, None, None
all_data_cat_slim, train_data_cat_slim, test_data_cat_slim = None, None, None

print('Finished processing!')

Processing numerical data...
Processing categorical data...
Processing categorical data (slim)...
Combining data...
Finished processing!


In [3]:
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, SVR
from sklearn.tree import ExtraTreeClassifier

def pred_and_error(model, test_data, test_labels):
    """Predict with a fitted model and compute its misclassification rate.

    Parameters
    ----------
    model : object with a ``predict(test_data)`` method
        Already-fitted estimator.
    test_data : array-like
        Features, passed unchanged to ``model.predict``.
    test_labels : array-like
        True labels, compared element-wise with the predictions.

    Returns
    -------
    tuple
        ``(preds, error)``: the predictions exactly as returned by the model,
        and the fraction of predictions that differ from ``test_labels``.

    Raises
    ------
    ZeroDivisionError
        If ``test_labels`` is empty.
    """
    preds = model.predict(test_data)
    # Compare plain arrays so matching is strictly positional (a pandas index
    # on test_labels plays no role), and count the matches in one vectorised
    # call instead of iterating element by element with the builtin sum().
    matches = np.asarray(preds) == np.asarray(test_labels)
    error = 1 - np.count_nonzero(matches) / float(len(test_labels))
    return preds, error

def cv_run_ada(train_data, train_labels, test_data, test_labels):
    """Fit a default AdaBoost classifier and score it on the held-out split.

    Returns the ``(preds, error)`` pair produced by ``pred_and_error``.
    """
    # Seeded like the other ensemble runners in this notebook so repeated
    # runs of the same fold give the same error.
    model = AdaBoostClassifier(random_state=1).fit(train_data, train_labels)
    return pred_and_error(model, test_data, test_labels)

def cv_run_bag(train_data, train_labels, test_data, test_labels):
    """Fit a bagging ensemble and score it on the held-out split.

    Uses 20 estimators, each trained on 75% of the features, with a fixed
    seed and all available cores.

    Returns the ``(preds, error)`` pair produced by ``pred_and_error``.
    """
    model = BaggingClassifier(max_features=0.75,
                              n_estimators=20,
                              random_state=1, n_jobs=-1).fit(train_data, train_labels)
    # pred_and_error performs the prediction itself; predicting here as well
    # would only repeat the work and throw the result away.
    return pred_and_error(model, test_data, test_labels)

def cv_run_et(train_data, train_labels, test_data, test_labels):
    """Fit the tuned extra-trees base learner and score it on the held-out split.

    Returns the ``(preds, error)`` pair produced by ``pred_and_error``.
    """
    # NOTE(review): max_features=1743 is hard-coded; presumably tuned for the
    # width of the combined feature frame -- confirm before reusing elsewhere.
    extra_trees = ExtraTreesClassifier(
        n_estimators=40,
        max_features=1743,
        max_depth=None,
        min_samples_split=3,
        min_samples_leaf=1,
        random_state=1,
        n_jobs=-1,
    )
    extra_trees.fit(train_data, train_labels)
    return pred_and_error(extra_trees, test_data, test_labels)

def cv_run_et_meta(train_data, train_labels, test_data, test_labels):
    """Fit an extra-trees classifier that considers every feature at each split.

    Returns the ``(preds, error)`` pair produced by ``pred_and_error``.
    """
    extra_trees = ExtraTreesClassifier(random_state=1, n_jobs=-1, max_features=None)
    extra_trees.fit(train_data, train_labels)
    return pred_and_error(extra_trees, test_data, test_labels)

def cv_run_gb(train_data, train_labels, test_data, test_labels):
    """Fit a 200-stage gradient-boosting classifier with exponential loss.

    Returns the ``(preds, error)`` pair produced by ``pred_and_error``.
    """
    booster = GradientBoostingClassifier(
        loss='exponential',
        n_estimators=200,
        max_features=None,
        random_state=1,
    )
    booster.fit(train_data, train_labels)
    return pred_and_error(booster, test_data, test_labels)

def cv_run_knn(train_data, train_labels, test_data, test_labels, n_neigh):
    """Fit a k-nearest-neighbours classifier and score it on the held-out split.

    ``n_neigh`` is the number of neighbours consulted for each prediction.

    Returns the ``(preds, error)`` pair produced by ``pred_and_error``.
    """
    neighbours = KNeighborsClassifier(n_jobs=-1, n_neighbors=n_neigh)
    neighbours.fit(train_data, train_labels)
    return pred_and_error(neighbours, test_data, test_labels)

def cv_run_logistic(train_data, train_labels, test_data, test_labels):
    """Fit an L1-regularised logistic regression and score it on the held-out split.

    Returns the ``(preds, error)`` pair produced by ``pred_and_error``.
    """
    # The solver is pinned because an L1 penalty is only accepted by some
    # solvers; relying on the library default breaks on releases where that
    # default does not support L1. liblinear ignores n_jobs, so it is not
    # passed.
    model = LogisticRegression(penalty='l1',
                               solver='liblinear',
                               C=0.9029677391429398,
                               random_state=1).fit(train_data, train_labels)
    return pred_and_error(model, test_data, test_labels)

def cv_run_nb(train_data, train_labels, test_data, test_labels):
    """Fit a Gaussian naive Bayes classifier with default settings and score it.

    Returns the ``(preds, error)`` pair produced by ``pred_and_error``.
    """
    naive_bayes = GaussianNB()
    naive_bayes.fit(train_data, train_labels)
    return pred_and_error(naive_bayes, test_data, test_labels)

def cv_run_neural(train_data, train_labels, test_data, test_labels):
    """Fit a default multi-layer perceptron and score it on the held-out split.

    Returns the ``(preds, error)`` pair produced by ``pred_and_error``.
    """
    # Weight initialisation and batch shuffling are random; seed them like the
    # other runners in this notebook so fold errors are repeatable.
    model = MLPClassifier(random_state=1).fit(train_data, train_labels)
    return pred_and_error(model, test_data, test_labels)

def cv_run_rf(train_data, train_labels, test_data, test_labels):
    """Fit the tuned random-forest base learner and score it on the held-out split.

    Returns the ``(preds, error)`` pair produced by ``pred_and_error``.
    """
    # NOTE(review): max_features=1148 is hard-coded; presumably tuned for the
    # width of the combined feature frame -- confirm before reusing elsewhere.
    forest = RandomForestClassifier(
        n_estimators=71,
        max_features=1148,
        max_depth=None,
        min_samples_split=4,
        min_samples_leaf=1,
        random_state=1,
        n_jobs=-1,
    )
    forest.fit(train_data, train_labels)
    return pred_and_error(forest, test_data, test_labels)

def cv_run_rf_meta(train_data, train_labels, test_data, test_labels):
    """Fit a random forest that considers every feature at each split.

    Returns the ``(preds, error)`` pair produced by ``pred_and_error``.
    """
    forest = RandomForestClassifier(n_jobs=-1, random_state=1, max_features=None)
    forest.fit(train_data, train_labels)
    return pred_and_error(forest, test_data, test_labels)

def cv_run_sgd(train_data, train_labels, test_data, test_labels):
    """Fit a perceptron-loss SGD classifier and score it on the held-out split.

    Returns the ``(preds, error)`` pair produced by ``pred_and_error``.
    """
    # SGD shuffles the training rows; seed it like the other runners in this
    # notebook so fold errors are repeatable.
    model = SGDClassifier(loss='perceptron', random_state=1).fit(train_data, train_labels)
    return pred_and_error(model, test_data, test_labels)

def cv_run_svm(train_data, train_labels, test_data, test_labels):
    """Fit a default support-vector classifier and score it on the held-out split.

    Returns the ``(preds, error)`` pair produced by ``pred_and_error``.
    """
    # Must be the classifier (SVC), not the regressor (SVR): pred_and_error
    # scores by exact equality with the labels, which continuous regression
    # outputs would practically never satisfy.
    model = SVC().fit(train_data, train_labels)
    return pred_and_error(model, test_data, test_labels)

## Part One - Cross validation predictions and errors

In [4]:
# Out-of-fold predictions: in each fold the base learners are fitted on the
# training part and predict the held-out part, so every training row ends up
# with one prediction per learner from a model that never saw it.
# `n_splits` is the keyword sklearn.model_selection.KFold accepts.
kf = KFold(n_splits=5, shuffle=True, random_state=1)

# (runner, uses the slim feature set, extra positional args), listed in the
# column order of the stacked prediction matrix.
BASE_CV_RUNNERS = [
    (cv_run_et, False, ()),
    (cv_run_rf, False, ()),
    (cv_run_bag, False, ()),
    (cv_run_logistic, False, ()),
    (cv_run_knn, True, (1,)),
    (cv_run_knn, True, (2,)),
    (cv_run_knn, True, (5,)),
    (cv_run_knn, True, (10,)),
]

fold_test_indices = []
fold_pred_blocks = []
for i, (train, test) in enumerate(kf.split(train_data_combo)):
    # Collect the indices used for test sets as we go
    fold_test_indices.append(test)

    # Split into train and testing data/labels. kf.split yields row
    # positions, so everything (labels included) is indexed with .iloc.
    cv_train_data = train_data_combo.iloc[train, :]
    cv_train_data_slim = train_data_combo_slim.iloc[train, :]
    cv_train_labels = train_labels.iloc[train]

    cv_test_data = train_data_combo.iloc[test, :]
    cv_test_data_slim = train_data_combo_slim.iloc[test, :]
    cv_test_labels = train_labels.iloc[test]

    # CV predictions & errors for each classifier
    print("Starting fold #{}".format(i+1))
    fold_columns = []
    for runner, use_slim, extra_args in BASE_CV_RUNNERS:
        fit_features = cv_train_data_slim if use_slim else cv_train_data
        eval_features = cv_test_data_slim if use_slim else cv_test_data
        learner_preds, learner_error = runner(fit_features, cv_train_labels,
                                              eval_features, cv_test_labels, *extra_args)
        print("Error: {}".format(learner_error))
        fold_columns.append(learner_preds)

    # Collect all the fold predictions together, fold_length * 8
    fold_pred_blocks.append(np.column_stack(fold_columns))

    print('')

# Concatenate once after the loop: this keeps `indices` an integer array
# (starting from an empty list would silently promote it to float) and avoids
# re-copying the growing prediction matrix on every fold.
indices = np.concatenate(fold_test_indices)
cv_preds = np.vstack(fold_pred_blocks)

Starting fold #1
Error: 0.0544386628824
Error: 0.0538867865027


descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensio

Error: 0.0542415641753
Error: 0.105763166194
Error: 0.0829785556607
Error: 0.0808104698833
Error: 0.108207190161
Error: 0.124487543362

Starting fold #2
Error: 0.0539262062441
Error: 0.0538473667613


descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensio

Error: 0.0526253547777
Error: 0.107379375591
Error: 0.0829785556607
Error: 0.0823084200568
Error: 0.110257016714
Error: 0.121688741722

Starting fold #3
Error: 0.0551898135373
Error: 0.0555446051957


descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensio

Error: 0.0564118736942
Error: 0.111246895573
Error: 0.0830212480782
Error: 0.0818386092167
Error: 0.110221941893
Error: 0.127646154453

Starting fold #4
Error: 0.0557417116726
Error: 0.0538889107896


descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensio

Error: 0.0553080774234
Error: 0.107620136398
Error: 0.0830212480782
Error: 0.0816415027398
Error: 0.107738400284
Error: 0.124058816573

Starting fold #5
Error: 0.053928332085
Error: 0.0543225450388


descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensio

Error: 0.0555446051957
Error: 0.101312729136
Error: 0.0836914100997
Error: 0.0816020814444
Error: 0.106003863287
Error: 0.121851224031



descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensio

### Average CV errors for each classifier

In [5]:
# Row k of cv_preds was predicted for training row indices[k], so put the
# labels in that same order before comparing. The cast tolerates `indices`
# having been accumulated as floats, and `.values` yields the plain ndarray
# that the later positional indexing (cv_labels[train]) expects.
cv_labels = train_labels.iloc[np.asarray(indices, dtype=int)].values
for i in range(cv_preds.shape[1]):
    method_error = 1 - np.count_nonzero(cv_preds[:, i] == cv_labels) / float(len(cv_labels))
    print("Method #{}: {}".format(i, method_error))

Method #0: 0.0546449379913
Method #1: 0.054298036062
Method #2: 0.0548262730907
Method #3: 0.106664459109
Method #4: 0.0831382009981
Method #5: 0.0816402153946
Method #6: 0.108485694237
Method #7: 0.123946482493


## Part Two - Combining fold predictions

In [13]:
# The out-of-fold base-learner predictions become the features of the
# second-level (meta) learners, which are compared here by 10-fold CV.
cv_preds_stack = pd.DataFrame(cv_preds)

# `n_splits` is the keyword sklearn.model_selection.KFold accepts.
kf = KFold(n_splits=10, shuffle=True, random_state=1)

# (runner, extra positional args); list position is the method number printed below.
META_CV_RUNNERS = [
    (cv_run_ada, ()),
    (cv_run_et_meta, ()),
    (cv_run_rf_meta, ()),
    (cv_run_bag, ()),
    (cv_run_logistic, ()),
    (cv_run_knn, (1,)),
    (cv_run_knn, (2,)),
    (cv_run_knn, (5,)),
    (cv_run_knn, (10,)),
]

cv_errors = []
for i, (train, test) in enumerate(kf.split(cv_preds_stack)):
    cv_train_data = cv_preds_stack.iloc[train, :]
    cv_train_labels = cv_labels[train]
    cv_test_data = cv_preds_stack.iloc[test, :]
    cv_test_labels = cv_labels[test]

    print("Starting fold #{}".format(i+1))
    fold_errors = []
    for runner, extra_args in META_CV_RUNNERS:
        # Only the error is needed here; the meta-level predictions are discarded.
        _, meta_error = runner(cv_train_data, cv_train_labels,
                               cv_test_data, cv_test_labels, *extra_args)
        print("Error: {}".format(meta_error))
        fold_errors.append(meta_error)
    # Blank line between folds
    print('')

    cv_errors.append(fold_errors)

# One row per fold, one column per method -> column means are the CV errors.
method_errors = pd.DataFrame(cv_errors).mean(axis=0)
for i, method_error in enumerate(method_errors):
    print("Error for method #{}: {}".format(i, method_error))

best_method = method_errors.idxmin()
print('\nBest method is #{}: {}'.format(best_method, method_errors[best_method]))

Starting fold #1
Error: 0.0555818353832
Error: 0.052270577105
Error: 0.0521917376222
Error: 0.0518763796909
Error: 0.0536108483128
Error: 0.0553453169347
Error: 0.0554241564175
Error: 0.0532166508988
Error: 0.0536896877956

Starting fold #2
Error: 0.0562125512457
Error: 0.05100914538
Error: 0.0513245033113
Error: 0.0511668243456
Error: 0.0520340586566
Error: 0.0821507410911
Error: 0.0767896562599
Error: 0.0519552191738
Error: 0.0510879848628

Starting fold #3
Error: 0.0579470198675
Error: 0.0536108483128
Error: 0.0536108483128
Error: 0.0521917376222
Error: 0.0551087984863
Error: 0.0623620309051
Error: 0.0606275622832
Error: 0.0592872910754
Error: 0.054793440555

Starting fold #4
Error: 0.0601545253863
Error: 0.056922106591
Error: 0.0564490696941
Error: 0.0572374645222
Error: 0.0577893409019
Error: 0.108956165248
Error: 0.0808104698833
Error: 0.0639388205613
Error: 0.0571586250394

Starting fold #5
Error: 0.0555029959003
Error: 0.0502995900347
Error: 0.0500630715863
Error: 0.05006307158

## Train all models for export

In [7]:
# Refit every base learner on the full training set and predict the quiz set.
# Column order must match the cross-validated prediction matrix, because the
# meta-learner is trained on those columns.
# Each entry: (estimator class, constructor kwargs, uses the slim feature set).
FINAL_BASE_MODELS = [
    (ExtraTreesClassifier,
     dict(n_jobs=-1, min_samples_leaf=1, n_estimators=40,
          min_samples_split=3, random_state=1,
          max_features=1743, max_depth=None),
     False),
    (RandomForestClassifier,
     dict(n_jobs=-1, min_samples_leaf=1, n_estimators=71,
          min_samples_split=4, random_state=1, max_features=1148,
          max_depth=None),
     False),
    (BaggingClassifier,
     dict(max_features=0.75, n_estimators=20, random_state=1, n_jobs=-1),
     False),
    # The solver is pinned because an L1 penalty is only accepted by some
    # solvers; liblinear ignores n_jobs, so it is not passed.
    (LogisticRegression,
     dict(penalty='l1', solver='liblinear', C=0.9029677391429398, random_state=1),
     False),
    (KNeighborsClassifier, dict(n_neighbors=1, n_jobs=-1), True),
    (KNeighborsClassifier, dict(n_neighbors=2, n_jobs=-1), True),
    (KNeighborsClassifier, dict(n_neighbors=5, n_jobs=-1), True),
    (KNeighborsClassifier, dict(n_neighbors=10, n_jobs=-1), True),
]

test_pred_columns = []
for number, (estimator_class, params, use_slim) in enumerate(FINAL_BASE_MODELS, start=1):
    print('Model {}'.format(number))
    fit_features = train_data_combo_slim if use_slim else train_data_combo
    predict_features = test_data_combo_slim if use_slim else test_data_combo
    # Instantiate inside the loop and rebind `model` each time so only one
    # fitted estimator is kept alive at once.
    model = estimator_class(**params).fit(fit_features, train_labels)
    test_pred_columns.append(model.predict(predict_features))

preds = np.column_stack(test_pred_columns)

Model 1
Model 2
Model 3
Model 4
Model 5
Model 6
Model 7
Model 8


In [14]:
# Final meta-learner: a random forest that considers every feature at each
# split, fitted on the out-of-fold predictions and applied to the base-learner
# predictions for the quiz set.
model = RandomForestClassifier(n_jobs=-1, random_state=1, max_features=None)
model.fit(cv_preds_stack, cv_labels)
results = model.predict(preds)

In [15]:
# Write the submission file: a 1-based row Id next to each predicted label.
# pandas manages the file and its line endings itself. Feeding csv.writer a
# handle opened in plain 'w' mode produces blank rows on Windows.
submission = pd.DataFrame({"Id": np.arange(1, len(results) + 1),
                           "Prediction": results},
                          columns=["Id", "Prediction"])
submission.to_csv('results/20160414-3(stack).csv', index=False)