In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline
from scipy import stats,sparse
from sklearn.base import TransformerMixin
from datetime import datetime as dt
from math import isnan
from numpy import ma
import cPickle as pickle
import xgboost as xgb
import time
from ModelClassifier import XGBoostClassifier,PAClassifier,SGDSVMClassifier

In [2]:
import json
from sklearn.metrics import roc_curve, auc
from re import sub
from collections import defaultdict

In [3]:
from sklearn.cross_validation import StratifiedKFold,cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix

### Read in basic numerical features

In [4]:
# Load the basic numerical feature matrix from a NumPy .npy file.
# NOTE(review): the file name suggests the columns are already standardised -- confirm upstream.
num_basic = np.load('data/nxtrain_standard_original0.npy')

In [5]:
# Display (rows, columns) of the numerical block as a quick sanity check.
num_basic.shape

(145231, 1448)

### Categorical features 

In [6]:
# Load the pickled categorical feature matrix (it is densified with
# .toarray() in a later cell).
# NOTE: pickle.load can execute arbitrary code -- only load files that were
# produced by this project.
with open('data/cat_sparse_th60_train2.dat', 'rb') as infile:
    cat_sparse = pickle.load(infile)

In [7]:
# Display (rows, columns) of the categorical block as a quick sanity check.
cat_sparse.shape

(145231, 211)

In [12]:
# Convert the categorical matrix to a dense array so it can be stacked with
# the other dense feature blocks via np.hstack.
# NOTE(review): this materialises every entry of the matrix in memory.
cat_mt = cat_sparse.toarray()

In [8]:
# Load the pickled object holding the numerically encoded categorical features.
# NOTE(review): presumably a pandas DataFrame, since it is converted to an
# array in a later cell -- confirm against the script that wrote the file.
# pickle.load can execute arbitrary code; only load trusted files.
with open('data/cat_numeric_th60_train2.dat', 'rb') as infile:
    cat_ordinal = pickle.load(infile)

In [11]:
# Extract the underlying NumPy array.  `.values` returns the same array as the
# deprecated `as_matrix()` and is available on both old and new pandas.
ordi_mt = cat_ordinal.values

### Time series features

In [14]:
# Load the pickled features derived from the time-series columns.
# pickle.load can execute arbitrary code; only load trusted files.
with open('data/time_series_derived_train2.dat', 'rb') as infile:
    time_df = pickle.load(infile)

In [15]:
# Extract the underlying NumPy array.  `.values` returns the same array as the
# deprecated `as_matrix()` and is available on both old and new pandas.
time_mt = time_df.values

##### So far, the imported feature blocks are time_mt, cat_mt, num_basic and ordi_mt

In [16]:
# Concatenate the four feature blocks column-wise into one design matrix.
# All blocks must have the same number of rows for hstack to succeed.
xtrain = np.hstack([cat_mt, time_mt, num_basic, ordi_mt])

In [17]:
# Display (rows, columns) of the combined matrix; the column count should be
# the sum of the four blocks' widths.
xtrain.shape

(145231, 1942)

### Read in the target variable

In [18]:
# Load the pickled target values.
# pickle.load can execute arbitrary code; only load trusted files.
with open('data/ytrain2.dat', 'rb') as infile:
    ytraindf = pickle.load(infile)

In [19]:
# Turn the target into a flat 1-D NumPy array.  `.values` returns the same
# array as the deprecated `as_matrix()` and works on both old and new pandas;
# flatten() still returns a fresh 1-D copy.
ytrain = ytraindf.values.flatten()

In [20]:
# Display the target's shape; it should be 1-D with one entry per row of xtrain.
ytrain.shape

(145231,)

### Sample one tenth of the data set to test speed

In [21]:
# Seed NumPy's global random generator so the subsample drawn in the next
# cell is reproducible.
np.random.seed(123)

In [22]:
# Draw a random tenth of the row indices without replacement.  Floor division
# (//) keeps `size` an integer on Python 3 as well; on Python 2 it gives the
# same value as the original int / int.
indices = np.random.choice(ytrain.shape[0], size=ytrain.shape[0] // 10, replace=False)

In [23]:
# Targets for the sampled rows (same order as `indices`).
ytrain_small = ytrain[indices]

In [24]:
# Feature rows for the sampled indices; all columns are kept.
xtrain_small = xtrain[indices, :]

In [25]:
# Display (rows, columns) of the subsample; rows should be a tenth of xtrain.
xtrain_small.shape

(14523, 1942)

## Helper function to measure model fitting time

In [30]:
def testtime(model):
    """
    test time complexity based on one tenth of the training data
    """
    starttime = time.time()
    model.fit(xtrain_small,ytrain_small)
    print 'execution takes {}'.format(time.time()-starttime)

## 1. SGD: Online Logistic Regression

In [26]:
from sklearn.linear_model import SGDClassifier

In [27]:
# SGD classifier with logistic loss and otherwise default settings; it is
# passed to testtime() in the next cell.
clf_small = SGDClassifier(loss='log')

In [31]:
# Time one fit of the default logistic SGD model on the one-tenth subsample.
testtime(clf_small)

execution takes 0.268999099731


### Grid search on the complete training dataset

In [32]:
# Hyper-parameter grid: three regularisation strengths (1e-5, 1e-4, 1e-3)
# crossed with the two extreme elastic-net mixes (l1_ratio 0 and 1).
param_sgd = dict(
    alpha=np.logspace(-5, -3, 3),
    l1_ratio=[0, 1],
)

In [33]:
# Base estimator for the grid search: logistic loss with an elastic-net
# penalty, seeded so repeated fits are reproducible.
clf_sgd = SGDClassifier(loss='log', penalty='elasticnet', random_state=123)

In [34]:
# Exhaustive search over param_sgd using 3-fold stratified cross-validation,
# scored by ROC AUC and run on two parallel workers.
gs_sgd = GridSearchCV(
    clf_sgd,
    param_grid=param_sgd,
    cv=StratifiedKFold(ytrain, n_folds=3),
    scoring='roc_auc',
    n_jobs=2,
    verbose=1,
)

In [35]:
# Run the grid search on the complete training set.
gs_sgd.fit(xtrain, ytrain)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=2)]: Done   1 jobs       | elapsed:   38.2s
[Parallel(n_jobs=2)]: Done  16 out of  18 | elapsed:  2.3min remaining:   17.1s
[Parallel(n_jobs=2)]: Done  18 out of  18 | elapsed:  2.4min finished


GridSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[0 0 ..., 1 0], n_folds=3, shuffle=False, random_state=None),
       error_score='raise',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
       penalty='elasticnet', power_t=0.5, random_state=123, shuffle=True,
       verbose=0, warm_start=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=2,
       param_grid={'alpha': array([  1.00000e-05,   1.00000e-04,   1.00000e-03]), 'l1_ratio': [0, 1]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring='roc_auc', verbose=1)

In [36]:
# Report the best cross-validated score and the selected value of each tuned
# hyper-parameter.  Single-argument print(...) produces identical output on
# Python 2 and is also valid Python 3.
print('Best Score is {}'.format(gs_sgd.best_score_))
print('Best parameters set:')
best_parameters = gs_sgd.best_estimator_.get_params()
# Only the parameters that were part of the grid are listed, in sorted order.
for param_name in sorted(param_sgd):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))

Best Score is 0.652888625839
Best parameters set:
	alpha: 0.0001
	l1_ratio: 1


In [40]:
# Keep a handle on the estimator the grid search selected as best.
sgd_opt = gs_sgd.best_estimator_

### Dump the model to a pickle file

In [47]:
# Persist the tuned classifier to disk using binary pickle protocol 2.
with open('sgd_opt.pkl', 'wb') as fid:
    pickle.dump(sgd_opt, fid, protocol=2)

### Read the model back in

In [48]:
# Reload the pickled model into `test` (presumably a round-trip check of the
# file written above).  pickle.load can execute arbitrary code; only load
# files you created yourself.
with open('sgd_opt.pkl', 'rb') as fid:
    test = pickle.load(fid)