In [1]:
import csv
import numpy as np
import pandas as pd
import random

from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

In [2]:
train = pd.read_csv('data.csv', sep=",")
test_data = pd.read_csv('quiz.csv', sep=",")

train_data = train.iloc[:,:-1]
train_labels = train.iloc[:,-1]

train_data, holdout_data, train_labels, holdout_label = train_test_split(train_data, train_labels, 
                                                                         test_size=0.2, random_state=1)
all_data = pd.concat([train_data, test_data])
train_obs = len(train_data)
test_obs = len(test_data)

# Change dtype of categorical columns
categorical_columns = ['0','5','7','8','9','14','16','17','18','20','23','25','26','56','57','58']
for i in range(0,len(categorical_columns)):
    all_data[categorical_columns[i]] = all_data[categorical_columns[i]].astype('category')
    
# Only numerical data
print('Processing numerical data...')
all_data_num = all_data.drop(categorical_columns, axis=1)
train_data_num = all_data_num.iloc[0:train_obs,]
test_data_num = all_data_num.iloc[train_obs:,]

# Only categorical data
print('Processing categorical data...')
all_data_cat = pd.get_dummies(all_data[categorical_columns])
train_data_cat = all_data_cat.iloc[0:train_obs,]
test_data_cat = all_data_cat.iloc[train_obs:,]

# Combine the two
print('Combining data...')
train_data_combo = pd.concat([train_data_num, train_data_cat], axis=1)
test_data_combo = pd.concat([test_data_num, test_data_cat], axis=1)

print('Processing finished!')

Processing numerical data...
Processing categorical data...


KeyboardInterrupt: 

In [19]:
params = {'penalty': ['l1','l2'],
          'C': sp_uniform(0,1),
          'n_jobs': [-1]
}
classifier = LogisticRegression()
search_results = RandomizedSearchCV(estimator = classifier,
                                        param_distributions = params,
                                        n_iter = 10, n_jobs = 1,
                                        cv = 3, verbose = 3 )
search_results.fit(train_data_combo, train_labels)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] penalty=l2, C=0.505552636179, n_jobs=-1 .........................
[CV]  penalty=l2, C=0.505552636179, n_jobs=-1, score=0.890581 -  25.6s
[CV] penalty=l2, C=0.505552636179, n_jobs=-1 .........................
[CV]  penalty=l2, C=0.505552636179, n_jobs=-1, score=0.890962 -  24.7s
[CV] penalty=l2, C=0.505552636179, n_jobs=-1 .........................
[CV]  penalty=l2, C=0.505552636179, n_jobs=-1, score=0.895364 -  25.6s
[CV] penalty=l1, C=0.902967739143, n_jobs=-1 .........................
[CV]  penalty=l1, C=0.902967739143, n_jobs=-1, score=0.891083 -  32.7s
[CV] penalty=l1, C=0.902967739143, n_jobs=-1 .........................
[CV]  penalty=l1, C=0.902967739143, n_jobs=-1, score=0.891021 -  35.1s
[CV] penalty=l1, C=0.902967739143, n_jobs=-1 .........................
[CV]  penalty=l1, C=0.902967739143, n_jobs=-1, score=0.896014 -  29.9s
[CV] penalty=l2, C=0.595889106942, n_jobs=-1 .........................
[CV]  penalty=l2

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 15.3min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'penalty': ['l1', 'l2'], 'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x106fe2190>, 'n_jobs': [-1]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=None, verbose=3)

In [20]:
search_results.best_score_

0.892706146704905

In [21]:
search_results.grid_scores_

[mean: 0.89230, std: 0.00217, params: {'penalty': 'l2', 'C': 0.5055526361793926, 'n_jobs': -1},
 mean: 0.89271, std: 0.00234, params: {'penalty': 'l1', 'C': 0.9029677391429398, 'n_jobs': -1},
 mean: 0.89233, std: 0.00222, params: {'penalty': 'l2', 'C': 0.595889106941752, 'n_jobs': -1},
 mean: 0.89107, std: 0.00182, params: {'penalty': 'l1', 'C': 0.2882093450562242, 'n_jobs': -1},
 mean: 0.89245, std: 0.00211, params: {'penalty': 'l1', 'C': 0.6788856401352185, 'n_jobs': -1},
 mean: 0.88665, std: 0.00193, params: {'penalty': 'l1', 'C': 0.09133627165533109, 'n_jobs': -1},
 mean: 0.89234, std: 0.00223, params: {'penalty': 'l2', 'C': 0.39636430045354476, 'n_jobs': -1},
 mean: 0.88778, std: 0.00209, params: {'penalty': 'l1', 'C': 0.115057282439877, 'n_jobs': -1},
 mean: 0.89239, std: 0.00212, params: {'penalty': 'l1', 'C': 0.7280559476243866, 'n_jobs': -1},
 mean: 0.89267, std: 0.00228, params: {'penalty': 'l1', 'C': 0.896475755924302, 'n_jobs': -1}]