# Generates results data for various solvers

In [1]:
%matplotlib inline
from pprint import pprint
from collections import defaultdict

import numpy as np
import math
import matplotlib.pyplot as plt
import warnings
import cPickle as pickle

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import sklearn.ensemble
from sklearn.naive_bayes import GaussianNB
#from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV

from sklearn.grid_search import GridSearchCV
from sklearn import metrics

from sklearn.learning_curve import learning_curve
from sklearn.learning_curve import validation_curve

import sys
sys.path.append('../dev')
from ml_plot import plot_validation_curve
from ml_plot import PredictCV, my_plot_learning_curve
from ml_plot import plot_prediction_curve
from ml_plot import get_dataset, eval_predictions

from ml_plot import getClassifierProbs
from ml_plot import plotThresholdDistribuition, plotPredictionStats
from ml_plot import plotCombinedResults

from ml_plot import PredictCV_TrainTest

In [2]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import f_regression, f_classif

In [3]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Configuration

In [4]:
# Which project's dataset to load; passed to get_dataset() together with
# IMPORTANCE. Uncomment exactly one alternative to switch projects.
PROJECT = 'nova'
# PROJECT = 'swift'
# PROJECT = 'cinder'
# PROJECT = 'heat'
# PROJECT = 'glance'

# Importance selector passed to get_dataset() alongside PROJECT.
# NOTE(review): presumably 'med+' means "medium importance and above" --
# confirm against get_dataset().
# IMPORTANCE = 'crit'
# IMPORTANCE = 'high+'
IMPORTANCE = 'med+'
# IMPORTANCE = 'low+'

# NOTE(review): SIZE is not referenced by any cell visible in this notebook
# section; the int-vs-float alternatives suggest an absolute count or a
# fraction -- confirm where it is consumed.
# SIZE = 100
#SIZE = 250
# SIZE = 0.1
SIZE = 0.5

# Scoring metric name (not referenced by the cells visible in this section).
# SCORING = 'f1'         # 2 * (precision * recall) / (precision + recall)
# SCORING = 'accuracy'   # (TP + TN) / all values
# SCORING = 'precision'  # TP / (TP + FP)
# SCORING = 'recall'     # TP / (TP + FN)
# SCORING = 'average_precision'
SCORING = 'roc_auc'

# Parallelism / verbosity knobs (not referenced by the cells visible here).
JOBS = 4
VERBOSE = 0

In [5]:
warnings.filterwarnings("ignore", 'F-score is ill-defined')
warnings.filterwarnings("ignore", 'RuntimeWarning: invalid value encountered in divide')

# Code

In [6]:
def getClassifierProbs2(clf, X, Y, history=2000, future=500,
                       n_iter=10, seed=None):
    """Gets probabilities for a given classifier"""
    results = []
    for (X_train, Y_train,
         X_test, Y_test) in PredictCV_TrainTest(X, Y,  history=history,
                                                future=future,
                                                n_iter=n_iter, seed=seed):

        clf.fit(X_train, Y_train)
        y_predict = clf.predict(X_test)
        y_prob = clf.predict_proba(X_test)
        y_log_prob = clf.predict_proba(X_test)

        results.append({'Y_test': Y_test, 'y_predict': y_predict,
                        'y_prob': y_prob, 'y_log_prob': y_log_prob})
        print '*',
        sys.stdout.flush()
    return results

In [7]:
def results_to_matrix(all_results, start=0, end=None):
    """Stacks per-classifier fold outputs into one feature matrix and labels.

    For every fold index in ``range(start, end)`` and every classifier key in
    `all_results`, three columns are produced: ``y_predict`` as float,
    ``1 - y_predict``, and ``y_prob``. Column order follows the iteration
    order of ``all_results.keys()``.

    Parameters
    ----------
    all_results : dict
        Maps classifier name -> list of per-fold dicts holding
        ``'y_predict'``, ``'y_prob'`` and ``'Y_test'``.
        NOTE(review): the vstack below only works if ``y_predict`` and
        ``y_prob`` are 1-D arrays of equal length -- confirm what
        getClassifierProbs stores under ``'y_prob'``.
    start : int
        First fold index to include.
    end : int or None
        One past the last fold index; ``None`` means "all folds".

    Returns
    -------
    (allX, allY) : tuple of numpy arrays
        Rows of `allX` are test samples, concatenated fold after fold;
        `allY` holds the matching ``'Y_test'`` labels.
    """
    # The reference classifier supplies the fold count and the labels. 'LR' is
    # preferred for backward compatibility; otherwise any classifier will do.
    # With an empty dict this still raises KeyError('LR'), as before.
    ref_key = 'LR' if 'LR' in all_results else next(iter(all_results), 'LR')
    reference = all_results[ref_key]

    # `is None` rather than truthiness, so an explicit end=0 is honoured.
    if end is None:
        end = len(reference)

    allX = []
    allY = []
    for i in range(start, end):
        composite = []
        for k in all_results.keys():
            fold = all_results[k][i]
            composite.append(fold['y_predict'] + 0.0)
            composite.append(1.0 - fold['y_predict'])
            composite.append(fold['y_prob'])
            #composite.append(fold['y_prob'][:,1] -
            #                 fold['y_prob'][:,0]
            #                )

        allX.append(np.vstack(composite).transpose())
        allY.append(reference[i]['Y_test'])

    if allX:
        allX = np.vstack(allX)
        allY = np.hstack(allY)
    else:
        # Empty fold range: np.vstack([]) would raise, so return empty arrays
        # with the usual three columns per classifier.
        allX = np.empty((0, 3 * len(all_results)))
        allY = np.empty((0,))

    # Single-argument form prints the same text under Python 2 and 3.
    print('%s %s' % (allX.shape, allY.shape))
    return allX, allY

# Load Data

In [8]:
%%capture
# Load the dataset selected by PROJECT / IMPORTANCE. Note the return order:
# the first value is bound to Y and the second to X. %%capture hides whatever
# get_dataset prints while loading.
Y, X = get_dataset(PROJECT, IMPORTANCE)

In [9]:
# Shared scaler instance; it is fit wherever scaler.fit_transform(X) is called
# in the cells below rather than here.
scaler = MinMaxScaler()
#X_scaled = scaler.fit_transform(X)

In [10]:
# Number of features kept by the chi-squared selector.
n_feats = 2000
# Scale X with the shared MinMaxScaler, then keep the n_feats columns with the
# best chi2 score against Y.
# NOTE(review): presumably the scaling is there because chi2 expects
# non-negative inputs -- confirm.
# NOTE(review): both scaler and selector are fit on the full X/Y before any
# train/test splitting, so test rows influence feature selection -- confirm
# this is acceptable for the reported results.
newX = SelectKBest(chi2, k=n_feats).fit_transform(scaler.fit_transform(X), Y)

  chisq /= f_exp


### Select Split Strategy

In [11]:
# Split generator over len(Y) samples, using the same history/future window
# sizes that are passed to getClassifierProbs below.
# NOTE(review): `cv` is not referenced by any cell visible in this section.
cv = PredictCV(len(Y), history=2000, future=500, n_iter=10, seed=87654321)

In [12]:
def _make_lr(class_weight):
    """Returns the shared L2 / liblinear logistic regression; only
    `class_weight` differs between the variants."""
    return LogisticRegression(
        C=0.31622776601683794, class_weight=class_weight, dual=False,
        fit_intercept=True, intercept_scaling=1, max_iter=100,
        multi_class='ovr', penalty='l2', random_state=None,
        solver='liblinear', tol=0.0001, verbose=0)


def _make_svc(kernel):
    """Returns the shared probability-enabled SVC; only `kernel` differs."""
    return SVC(C=7.743, cache_size=200, class_weight='auto', coef0=0.0,
               degree=3, gamma=0.0043, kernel=kernel, max_iter=-1,
               probability=True, random_state=None, shrinking=True,
               tol=0.001, verbose=False)


def _make_adaboost():
    """Returns a fresh SAMME AdaBoost classifier with the shared settings."""
    return sklearn.ensemble.AdaBoostClassifier(
        algorithm='SAMME',
        base_estimator=None, learning_rate=1.2,
        n_estimators=40, random_state=None)


def _isotonic(base_clf):
    """Wraps `base_clf` in 3-fold isotonic probability calibration."""
    return CalibratedClassifierCV(base_clf, cv=3, method='isotonic')


# Name -> unfitted estimator. Every entry gets its own estimator instance;
# the "- cal" entries are the isotonic-calibrated twins of the entry above.
all_classifiers = {
    'LR': _make_lr('auto'),
    'LR5': _make_lr({1:5}),
    'LR-none': _make_lr(None),
    'svc': _make_svc('rbf'),
    'svc - cal': _isotonic(_make_svc('rbf')),
    'svclin': _make_svc('linear'),
    'svclin - cal': _isotonic(_make_svc('linear')),
    'adaboost': _make_adaboost(),
    'adaboost - cal': _isotonic(_make_adaboost()),
    'GaussianNB': GaussianNB(),
    'nb - cal': _isotonic(GaussianNB()),
}

# Get Results for All Classifiers

In [13]:
all_results = {}

for clf_name, clf in all_classifiers.items():
    print 'Classifier:', clf_name
    results = getClassifierProbs(clf,
                                 SelectKBest(chi2, k=n_feats).fit_transform(scaler.fit_transform(X), Y),
                                 Y, history=2000, future=500, n_iter=100, seed=12345678)
    all_results[clf_name] = results
    print
    sys.stdout.flush()


Classifier: adaboost - cal
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
Classifier: nb - cal
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
Classifier: svclin - cal
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
Classifier: LR5
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
Classifier: GaussianNB
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 

In [20]:
if True:
    with open('all_probability_results2.pkl', 'wb') as output:
        pickle.dump(all_results, output)
        print 'Done'

Done
