# ML Kaggle Competition

In [1]:
import csv
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import KFold

train = pd.read_csv('data.csv', sep=",")
train_data = train.iloc[:,:-1]
train_labels = train.iloc[:,-1]

test_data = pd.read_csv('quiz.csv', sep=",")

all_data = pd.concat([train_data, test_data])
train_obs = len(train_data)
test_obs = len(test_data)

# Change dtype of categorical columns
categorical_columns = ['0','5','7','8','9','14','16','17','18','20','23','25','26','56','57','58']
for i in range(0,len(categorical_columns)):
    all_data[categorical_columns[i]] = all_data[categorical_columns[i]].astype('category')

In [7]:
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVR
from xgboost import XGBClassifier

from sklearn.tree import ExtraTreeClassifier

def pred_and_error(model, test_data, test_labels):
    preds = model.predict(test_data)
    error = 1 - sum(preds == test_labels)/float(len(test_labels))
    return preds, error

def cv_run_ada(train_data, train_labels, test_data, test_labels):
    model = AdaBoostClassifier().fit(train_data, train_labels)
    return pred_and_error(model, test_data, test_labels)

def cv_run_bag(train_data, train_labels, test_data, test_labels):
    model = BaggingClassifier(max_features=0.394512412319, n_estimators=435,
                              random_state=1, n_jobs=-1).fit(train_data, train_labels)
    return pred_and_error(model, test_data, test_labels)

def cv_run_et(train_data, train_labels, test_data, test_labels):
    model = ExtraTreesClassifier(n_jobs=-1, min_samples_leaf=2, n_estimators=99,
                                 min_samples_split=3, random_state=1,
                                 max_features=1611, max_depth=None).fit(train_data, train_labels)
    return pred_and_error(model, test_data, test_labels)

def cv_run_gb(train_data, train_labels, test_data, test_labels):
    model = GradientBoostingClassifier(loss='exponential', n_estimators=200, 
                                       max_features=None, random_state=1).fit(train_data, train_labels)
    return pred_and_error(model, test_data, test_labels)

def cv_run_knn(train_data, train_labels, test_data, test_labels, n_neigh):
    model = KNeighborsClassifier(n_jobs=-1, n_neighbors=n_neigh).fit(train_data, train_labels)
    return pred_and_error(model, test_data, test_labels)

def cv_run_logistic(train_data, train_labels, test_data, test_labels):
    model = LogisticRegression(penalty='l1', C=0.9029677391429398,
                               n_jobs=-1, random_state=1).fit(train_data, train_labels)
    return pred_and_error(model, test_data, test_labels)

def cv_run_nb(train_data, train_labels, test_data, test_labels):
    model = GaussianNB().fit(train_data, train_labels)
    return pred_and_error(model, test_data, test_labels)

def cv_run_neural(train_data, train_labels, test_data, test_labels):
    model = MLPClassifier(hidden_layer_sizes=(1000,)).fit(train_data, train_labels)
    return pred_and_error(model, test_data, test_labels)

def cv_run_rf(train_data, train_labels, test_data, test_labels):
    model = RandomForestClassifier(n_jobs=-1, random_state=1).fit(train_data, train_labels)
    return pred_and_error(model, test_data, test_labels)

def cv_run_sgd(train_data, train_labels, test_data, test_labels):    
    model = SGDClassifier(loss='perceptron').fit(train_data, train_labels)
    return pred_and_error(model, test_data, test_labels)

def cv_run_svm(train_data, train_labels, test_data, test_labels):    
    model = SVR().fit(train_data, train_labels)
    return pred_and_error(model, test_data, test_labels)

def cv_run_xg(train_data, train_labels, test_data, test_labels):
    model = XGBClassifier().fit(train_data, train_labels)
    return pred_and_error(model, test_data, test_labels)

## Simple prediction with only numerical variables

Quick benchmarking predictions only using the numerical variables.

In [3]:
all_data_num = all_data.drop(categorical_columns, axis=1)
train_data_num = all_data_num.iloc[0:train_obs,]
test_data_num = all_data_num.iloc[train_obs:,]

In [4]:
train_data_num['div'] = (train_data_num.loc[:,'60'] / train_data_num.loc[:,'59'])
train_data_num.loc[:,'div'] = train_data_num.loc[:,'div'].fillna(0)

test_data_num['div'] = (test_data_num.loc[:,'60'] / test_data_num.loc[:,'59'])
test_data_num.loc[:,'div'] = test_data_num.loc[:,'div'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [39]:
# from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler

# min_max_scaler = MinMaxScaler()
# train_data_num = pd.DataFrame(min_max_scaler.fit_transform(train_data_num))

# knn_classifier = KNeighborsClassifier(n_neighbors=1, n_jobs=-1).fit(train_data_num, train_labels)
# dists1 = knn_classifier.kneighbors(train_data_num, n_neighbors=1)
# dists1 = [i[0] for i in dists1[1]]

# dists2 = knn_classifier.kneighbors(train_data_num, n_neighbors=2)
# dists2 = [i+j for (i,j) in dists2[1]]

# dists4 = knn_classifier.kneighbors(train_data_num, n_neighbors=4)
# dists4 = [sum(i) for i in dists4[1]]

# Feature engineering

# train_data_num = pd.concat((train_data_num,
#                             np.sum(train_data_num, axis=1),
#                             np.sum(train_data_num != 0, axis=1),
#                             pd.Series(dists1), pd.Series(dists2), pd.Series(dists4)), axis=1)
# train_data_num = pd.concat((train_data_num,
#                             np.log(train_data_num.loc[:,'59']+1),
#                             np.log(train_data_num.loc[:,'60']+1)), axis=1)
# train_data_num = pd.concat((train_data_num, np.sum(train_data_num, axis=1)), axis=1)
# poly = PolynomialFeatures(2)
# train_data_num = pd.DataFrame(poly.fit_transform(train_data_num))

In [81]:
n_folds = 5
kf = KFold(n_folds=n_folds, shuffle=True, random_state=1)
cv_errors = []
for i, (train, test) in enumerate(kf.split(train_data_num)):
    cv_train_data = train_data_num.iloc[train,:]
    cv_train_labels = train_labels[train]
    cv_test_data = train_data_num.iloc[test,:]
    cv_test_labels = train_labels[test]
   
    print("Starting fold #{}".format(i+1))
    _, fold_error = cv_run_neural(cv_train_data, cv_train_labels, cv_test_data, cv_test_labels)
    print(" Fold error (#{}): {}".format(i+1, fold_error))
    cv_errors.append(fold_error)

print(sum(cv_errors)/float(n_folds))

Starting fold #1
 Fold error (#1): 0.2408152002522863
Starting fold #2
 Fold error (#2): 0.2505124566382845
Starting fold #3
 Fold error (#3): 0.2454369850593291
Starting fold #4
 Fold error (#4): 0.2454369850593291
Starting fold #5
 Fold error (#5): 0.24456971656088622
0.245354268714


In [None]:
model = AdaBoostClassifier().fit(train_data_num, train_labels)
preds = model.predict(test_data_num)

# with open('simple_results.csv', 'w', newline='') as f:
#     writer = csv.writer(f)
#     writer.writerow(("Id","Prediction"))
#     writer.writerows(zip(range(1,len(preds)+1), preds))

## Incorporating Categorical Variable with One Hot Encoding

Combine training and testing data so that the encoder sees every category, the split again once the columns are ready.

In [5]:
all_data_cat = pd.get_dummies(all_data[categorical_columns])
train_data_cat = all_data_cat.iloc[0:train_obs,]
test_data_cat = all_data_cat.iloc[train_obs:,]

train_data_combo = pd.concat([train_data_num, train_data_cat], axis=1)
test_data_combo = pd.concat([test_data_num, test_data_cat], axis=1)

In [None]:
random.seed(1)
n_folds = 5
num_train = len(train_data_combo)
indices = random.sample(range(num_train), num_train)
cv_folds = np.array_split(indices, n_folds)

cv_errors = []
for i, fold in enumerate(cv_folds):
    cv_train_index = np.setxor1d(indices, fold)
    cv_test_index = fold
    
    cv_train_data = train_data_combo.iloc[cv_train_index,:]
    cv_train_labels = train_labels[cv_train_index]

    cv_test_data = train_data_combo.iloc[cv_test_index,:]
    cv_test_labels = train_labels[cv_test_index]
    
    print("Starting fold #{}".format(i+1))
    _, fold_error = cv_run_bag(cv_train_data, cv_train_labels, cv_test_data, cv_test_labels)
    print(" Fold error (#{}): {}".format(i+1, fold_error))
    cv_errors.append(fold_error)

print(sum(cv_errors)/float(n_folds))

In [None]:
model = RandomForestClassifier(n_jobs=-1, min_samples_leaf=1, n_estimators=77,
                                   min_samples_split=2, random_state=1, max_features=771,
                                   max_depth=None).fit(train_data_combo, train_labels)
preds = model.predict(test_data_combo)

with open('results/20160420(rf).csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(("Id","Prediction"))
    writer.writerows(zip(range(1,len(preds)+1), preds))

## Ignoring two large columns

Columns 23 and 58 contibute 3,031 and 2,090 categories/columns (total 5,121). Removing them should make training a lot easier for some methods.

In [6]:
# all_data['23'].cat.categories
# all_data['58'].cat.categories
categorical_columns_tiny = ['0','5','7','8','9','14','16','17','18','20','25','26','56','57']

all_data_cat_tiny = pd.get_dummies(all_data[categorical_columns_tiny])
train_data_cat_tiny = all_data_cat_tiny.iloc[0:train_obs,]
test_data_cat_tiny = all_data_cat_tiny.iloc[train_obs:,]

train_data_combo_tiny = pd.concat([train_data_num, train_data_cat_tiny], axis=1)
test_data_combo_tiny = pd.concat([test_data_num, test_data_cat_tiny], axis=1)

In [None]:
random.seed(1)
n_folds = 5
num_train = len(train_data_combo_tiny)
indices = random.sample(range(num_train), num_train)
cv_folds = np.array_split(indices, n_folds)

cv_errors = []
for i, fold in enumerate(cv_folds):
    cv_train_index = np.setxor1d(indices, fold)
    cv_test_index = fold
    
    cv_train_data = train_data_combo_tiny.iloc[cv_train_index,:]
    cv_train_labels = train_labels[cv_train_index]

    cv_test_data = train_data_combo_tiny.iloc[cv_test_index,:]
    cv_test_labels = train_labels[cv_test_index]
    
    print("Starting fold #{}".format(i+1))
    _, fold_error = cv_run_neural(cv_train_data, cv_train_labels, cv_test_data, cv_test_labels)
    print(" Fold error (#{}): {}".format(i+1, fold_error))
    cv_errors.append(fold_error)

print(sum(cv_errors)/float(n_folds))