In [17]:
import numpy as np
import pandas as pd
import sklearn.cross_validation
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from IPython.display import display, HTML

%matplotlib inline

In [18]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv', index_col='id')

display(train.head(5))

# Break the training data into a target ("dependent") and inputs ("inpedendents")
y_train = train.ACTION
X_train = train.drop(["ACTION"], axis=1)
print("Number of instances:{}".format(len(train.index)))

from collections import Counter

def tally_predictions(predictions):
    count = Counter()
    for pred in predictions:
        if pred == 1:
            count[1] += 1
        else:
            count[0] += 1
    print(count[0])
    print(count[1])

Unnamed: 0,ACTION,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,1,39353,85475,117961,118300,123472,117905,117906,290919,117908
1,1,17183,1540,117961,118343,123125,118536,118536,308574,118539
2,1,36724,14457,118219,118220,117884,117879,267952,19721,117880
3,1,36135,5396,117961,118343,119993,118321,240983,290919,118322
4,1,42680,5905,117929,117930,119569,119323,123932,19793,119325


Number of instances:32769


In [19]:
# Drop unnecessary features
X_train = X_train.drop(["ROLE_CODE"], axis=1)
test = test.drop(["ROLE_CODE"], axis=1)

In [20]:
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score

X_t, X_val, y_t, y_val = train_test_split(X_train, y_train, test_size=0.33, random_state=42)
y_val = np.array(y_val)

In [10]:
import xgboost as xgb

# Test out a simple XGBClassifier()
xgmodel = xgb.XGBClassifier(max_depth=10, n_estimators=2000)
xgmodel.fit(X_t, y_t)
preds = xgmodel.predict_proba(X_val)[:, 1]

print(preds)
roc_auc_score(y_val, preds)

[ 0.99938524  0.9999845   0.99974126 ...,  0.99933136  0.99996018
  0.99939835]


0.84019967107364113

In [21]:
from sklearn.grid_search import RandomizedSearchCV, GridSearchCV

In [22]:
# Grid search XGB
parameters = {
    'max_depth': [8],
    'learning_rate': [0.3],
    'n_estimators': [155],
    'min_child_weight': [0.6],
    'colsample_bytree': [0.45],
    'subsample': [1.0]
}

xg_clf = GridSearchCV(xgb.XGBClassifier(), parameters, cv=10, n_jobs=-1, scoring='roc_auc')
xg_clf.fit(X_train, y_train)

print("Best parameter set found on development set with cv=10:\n")
print(xg_clf.best_params_)
print()
for params, mean_score, scores in xg_clf.grid_scores_:
    print("{0:.3f} (+/-{1:.03f}) for {2}".format(mean_score, scores.std() * 2, params))
print()

Best parameter set found on development set with cv=10:

{'learning_rate': 0.3, 'n_estimators': 155, 'min_child_weight': 0.6, 'max_depth': 8, 'subsample': 1.0, 'colsample_bytree': 0.45}

0.863 (+/-0.029) for {'learning_rate': 0.3, 'n_estimators': 155, 'min_child_weight': 0.6, 'max_depth': 8, 'subsample': 1.0, 'colsample_bytree': 0.45}



In [23]:
preds = xg_clf.predict_proba(test)[:, 1]

# Spit out predictions to a file
pred_write = enumerate(preds, start=1)
with open('output/xgb_155trees_minchildweight.6_colsampletree.45_learningrate0.3_maxdepth8_.86815.csv', 'w') as f:
    f.write('Id,Action\n')
    for instance, prediction in pred_write:
        f.write('{},{}\n'.format(instance, prediction))

In [15]:
from sklearn.ensemble import RandomForestClassifier

# Try random forest
rf = RandomForestClassifier(n_estimators=2000, criterion='entropy', max_features='auto', bootstrap=True)
score = cross_val_score(rf, X=X_train, y=np.array(y_train), scoring='roc_auc', cv=10, n_jobs=-1)
print(score)
print(score.mean())

rf.fit(X_train, y_train)

[ 0.85855945  0.85386471  0.89336266  0.86917975  0.8627291   0.86269671
  0.85022676  0.8687164   0.88585175  0.85088089]
0.865606816459


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [24]:
predictions = rf.predict_proba(test)[:, 1]

# Spit out predictions to a file
pred_write = enumerate(predictions, start=1)
with open('output/rf_2ktrees_entropy_auto_bootstrapped_.863.csv', 'w') as f:
    f.write('Id,Action\n')
    for instance, prediction in pred_write:
        f.write('{},{}\n'.format(instance, prediction))

In [26]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

encoder = OneHotEncoder()
encoder.fit(np.vstack((X_train, test)))
encoded_X_train = encoder.transform(X_train)  # Returns a sparse matrix (see numpy.sparse)
encoded_test = encoder.transform(test)

In [27]:
from sklearn.svm import SVC

# SVM
svm = SVC()
score = cross_val_score(svm, X=encoded_X_train, y=np.array(y_train), scoring='roc_auc', cv=10, n_jobs=-1)
print(score)
print(score.mean())

svm.fit(encoded_X_train, y_train)

[ 0.78081453  0.77015612  0.77309515  0.75117812  0.73461545  0.7074003
  0.73489677  0.7131545   0.77324434  0.77506029]
0.751361556939


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [32]:
svm = SVC(C=2.0, kernel='rbf', gamma=0.1)
score = cross_val_score(svm, X=encoded_X_train, y=np.array(y_train), scoring='roc_auc', cv=10, n_jobs=-1)
print(score)
print(score.mean())

svm.fit(encoded_X_train, y_train)

KeyboardInterrupt: 

In [30]:
# Grid search XGB
parameters = {
    'C': [1.5],
    'kernel': ['rbf', 'linear', 'poly'],
    'gamma': ['auto', 0.5, 1.0],
    'probability': [True]
}

svm_clf = RandomizedSearchCV(SVC(), parameters, n_iter=5, cv=10, n_jobs=-1, scoring='roc_auc')
svm_clf.fit(encoded_X_train, y_train)

print("Best parameter set found on development set with cv=10:\n")
print(svm_clf.best_params_)
print()
for params, mean_score, scores in svm_clf.grid_scores_:
    print("{0:.3f} (+/-{1:.03f}) for {2}".format(mean_score, scores.std() * 2, params))
print()

KeyboardInterrupt: 