In [2]:
import pandas as pd
import numpy as np

import sklearn
from sklearn import preprocessing
from sklearn import svm
from sklearn.model_selection import GridSearchCV

import pickle

import OU

%load_ext autoreload
%autoreload 2

In [3]:
# Directory holding the notebook's input (info.npy) and generated artifacts.
# The value has no trailing path separator, so code that builds a file path
# from it must add the '/' itself (e.g. save_dir + "/info.npy").
# NOTE(review): this is an absolute, machine-specific path; it has to be
# edited before the notebook can run anywhere else.
save_dir = '/Users/answer/Desktop/paper/DATA'

In [4]:
# Load the per-fold container. It is indexed further down as
# info[i]['train' | 'test']['df_scale' | 'labels'].
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects from the file, so only point this at a file you trust.
info_path = save_dir + "/info.npy"
info = np.load(info_path, allow_pickle=True)

In [36]:
# Stack every fold's train rows followed by its test rows into one combined
# feature frame / label series, and record for each fold which rows of the
# combined data are its train part and which are its test part. The resulting
# (train_index, test_index) pairs are what GridSearchCV later receives as `cv`.
#
# assumes each 'df_scale' is a DataFrame and each 'labels' is a Series
# (both are used with .shape / .values below) -- TODO confirm
splits = []
fold_frames = []
fold_labels = []
row_offset = 0  # row position in the combined data where the current fold starts

for i in range(len(info)):
    train = info[i]['train']['df_scale']
    train_labels = info[i]['train']['labels']

    test = info[i]['test']['df_scale']
    test_labels = info[i]['test']['labels']

    train_len = train.shape[0]
    test_len = test.shape[0]

    # Collect the pieces and concatenate once after the loop: growing the
    # frame with .append() per fold is quadratic and .append() no longer
    # exists in pandas >= 2.0.
    fold_frames.extend([train, test])
    fold_labels.extend([train_labels, test_labels])

    # Index ranges are derived from a running offset rather than negative
    # slicing of the combined frame, which misbehaves when test_len == 0
    # (iloc[-0:] selects every row instead of none).
    train_stop = row_offset + train_len
    test_stop = train_stop + test_len
    splits.append((pd.RangeIndex(row_offset, train_stop),
                   pd.RangeIndex(train_stop, test_stop)))
    row_offset = test_stop

if fold_frames:
    # pd.concat copies the data, so the objects stored in `info` are untouched.
    combined_df = pd.concat(fold_frames, ignore_index=True)
    combined_labels = pd.concat(fold_labels, ignore_index=True)
else:
    # No folds at all: keep the same (empty) result types as before.
    combined_df = pd.DataFrame()
    combined_labels = pd.Series(dtype='float64')

# Quality assurance: every recorded index range must select exactly the rows
# that the corresponding fold contributed.
for i, (train_idx, test_idx) in enumerate(splits):
    assert np.array_equal(combined_df.loc[train_idx].values,
                          info[i]['train']['df_scale'].values)
    assert np.array_equal(combined_labels.loc[train_idx].values,
                          info[i]['train']['labels'].values)
    assert np.array_equal(combined_df.loc[test_idx].values,
                          info[i]['test']['df_scale'].values)
    assert np.array_equal(combined_labels.loc[test_idx].values,
                          info[i]['test']['labels'].values)

# Pack the pairs into an (n_folds, 2) object array by hand. np.array(splits)
# on ragged (train, test) pairs raises on NumPy >= 1.24, and would silently
# turn into a 3-D integer array if train and test ever had equal length.
split_pairs = splits
splits = np.empty((len(split_pairs), 2), dtype=object)
for i, (train_idx, test_idx) in enumerate(split_pairs):
    splits[i, 0] = train_idx
    splits[i, 1] = test_idx

# save_dir has no trailing separator, so add it here (as done for info.npy);
# without it the file is written next to the data directory, not inside it.
np.save(save_dir + '/splits.npy', splits)

  This is separate from the ipykernel package so we can avoid doing imports until


In [37]:
# Persist the combined features and labels inside save_dir. The explicit '/'
# is required because save_dir carries no trailing separator; without it the
# files end up beside the directory with names like 'DATAdf.csv'.
combined_df.to_csv(save_dir + '/df.csv')
combined_labels.to_csv(save_dir + '/labels.csv')

In [38]:
# Hyper-parameter search space for the SVC grid search: one sub-grid per
# kernel. Both sub-grids share the same C / gamma / cache_size values; the
# 'poly' grid additionally varies `degree` and leaves out the most skewed
# class weighting ({0: 0.8, 1: 0.2}).
def _class_weights(pairs):
    """Turn (weight_for_class_0, weight_for_class_1) pairs into class_weight dicts."""
    return [{0: w0, 1: w1} for w0, w1 in pairs]


weight_pairs = [(0.5, 0.5), (0.6, 0.4), (0.7, 0.3), (0.8, 0.2)]

rbf_grid = dict(
    kernel=['rbf'],
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.001, 0.0001],
    cache_size=[2000],
    class_weight=_class_weights(weight_pairs),
)

poly_grid = dict(
    kernel=['poly'],
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.001, 0.0001],
    degree=[3, 5],
    cache_size=[2000],
    class_weight=_class_weights(weight_pairs[:3]),
)

params = [rbf_grid, poly_grid]

In [40]:
# Exhaustive search over `params` using the predefined folds: each entry of
# list(splits) is one (train_index, test_index) pair, so no re-splitting of
# the combined data takes place. Candidates are scored on precision only, and
# refit=False means no final model is retrained on the full data afterwards.
gridcv = GridSearchCV(
    estimator=svm.SVC(),
    param_grid=params,
    scoring=['precision'],
    cv=list(splits),
    refit=False,
    n_jobs=-1,   # use every available core
    verbose=1,
)

gridcv.fit(combined_df, combined_labels)

Fitting 448 folds for each of 160 candidates, totalling 71680 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   25.7s
[Parallel(n_jobs=-1)]: Done 948 tasks      | elapsed:   45.9s
[Parallel(n_jobs=-1)]: Done 1848 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 2948 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 4248 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 5748 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 7448 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 9348 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 11448 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 13748 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 16248 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 18948 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done 21848 tasks   

GridSearchCV(cv=[array([RangeIndex(start=0, stop=2000, step=1),
       RangeIndex(start=2000, stop=2100, step=1)], dtype=object),
                 array([RangeIndex(start=2100, stop=4100, step=1),
       RangeIndex(start=4100, stop=4200, step=1)], dtype=object),
                 array([RangeIndex(start=4200, stop=6200, step=1),
       RangeIndex(start=6200, stop=6300, step=1)], dtype=object),
                 array([RangeIndex(start=6300, s...
             param_grid=[{'C': [0.1, 1, 10, 100], 'cache_size': [2000],
                          'class_weight': [{0: 0.5, 1: 0.5}, {0: 0.6, 1: 0.4},
                                           {0: 0.7, 1: 0.3}, {0: 0.8, 1: 0.2}],
                          'gamma': [1, 0.1, 0.001, 0.0001], 'kernel': ['rbf']},
                         {'C': [0.1, 1, 10, 100], 'cache_size': [2000],
                          'class_weight': [{0: 0.5, 1: 0.5}, {0: 0.6, 1: 0.4},
                                           {0: 0.7, 1: 0.3}],
                          'd