Prepare the data.

In [1]:
import pandas as pd
import csv

Transform raw data file into CSV file.

In [2]:
def parse_data(data_file, out_file):
    with open(data_file) as df:
        list=[]
        with open(out_file, mode='w') as training:
            data_writer = csv.writer(training, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            for x in range(0,37):
                if x==0:
                    feature="class"
                else:
                    feature ="feature"+str(x)
                list.append(feature)
            data_writer.writerow(list)
        line = df.readline()
        while line:
            data = line.split()
            parsedDataList=[]
            for x in range(0,37):
                parsedDataList.append("")

            for item in data:
                index=item.find(":")
                if index>0:
                    featureNumber=item[0:index]
                    featureData=item[index+1:]
                    parsedDataList[int(featureNumber)]=featureData
                else:
                    parsedDataList[0]=item
            with open(out_file, mode='a') as training:
                data_writer = csv.writer(training, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                data_writer.writerow(parsedDataList)
            line = df.readline()

In [3]:
parse_data("satimage.scale.training" , 'parsed_training.csv')
parse_data("satimage.scale.t" , 'parsed_testing.csv')

In [2]:
df_train = pd.read_csv('parsed_training.csv')
df_test  = pd.read_csv('parsed_testing.csv')

In [3]:
X = df_train.drop(['class'],axis=1)
y = df_train[['class']]

Normalize the data.

In [4]:
normal_df_train=(df_train-df_train.mean())/df_train.std()
normal_df_test=(df_test-df_test.mean())/df_test.std()

In [5]:
X =(X - X.mean())/ X.std()

In [6]:
X=X.fillna(0)

In [7]:
class_dict = {1:0, 2:0, 3:0, 4:0, 5:0, 6:1}
y=y['class'].map(class_dict)

In [8]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import Imputer


def cross_validate_single_kernel(X, y, kernel, n_folds, args):
    
    #auc = []
    error = []
    nsv = 0
    nsvm = 0 
    skf=StratifiedKFold(n_splits=n_folds, shuffle=True)
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index].values.ravel(), y.iloc[test_index].values.ravel()

        ##train/test
        svm = SVC(C=args["c_val"], kernel=kernel, tol=1e-7, shrinking=False, degree = args["d_val"], gamma = 'auto')
        svm.fit(X_train, y_train)

        error.append(len(y_test)-((svm.score(X_test, y_test))*len(y_test)))
        sv_list   =  svm.support_
        svx       =  svm.support_vectors_
        beta_list = abs(np.float32((svm.decision_function(svx))))
        #m = max(beta_list)
        index = [i for i, j in enumerate(beta_list) if j == 1]

        #pred   =   svm.predict(X_test)
        #auc.append(roc_auc_score(y_test, pred))
        nsv   +=   np.mean(svm.n_support_)/n_folds
        nsvm  +=   len(index)/n_folds
        beta_sortr= np.sort((svm.decision_function(svx)))
        beta_sort = np.sort(beta_list)
    
    return error, nsv, nsvm, beta_sortr, beta_sort, max(beta_list)

In [9]:
c_vals     = 5
d_vals     = 10
cross_validate_single_kernel(X, y, "poly", 10, {"c_val":c_vals, "d_val":d_vals})

NameError: name 'np' is not defined

Make it Parallel

In [13]:
from joblib import Parallel, delayed
import multiprocessing 
from itertools import product

num_processes = 8

def cross_validate_kernels(X, y, kernel, n_folds, args):
    
    for key in args.keys():
        if len(args[key])==0:
            args[key]=[1]

    results = Parallel(n_jobs=num_processes)(delayed(cross_validate_single_kernel)\
              (X, y, kernel, n_folds,{"c_val":c_val, "d_val":d_val})\
              for (c_val, d_val) in product(args["c_vals"], args["d_vals"]))
    
    names   = ["kernel:%s__log10_C:%s__degree:%s"%(kernel, str(np.log10(c_val)), str(d_val))\
              for (c_val, d_val) in product(args["c_vals"], args["d_vals"])]
    
    result_dict = dict(zip(names, results))
        
    return result_dict

In [14]:
import os

rslt_addr = "./results/HW2T1_1/ada3"
if not os.path.exists(rslt_addr):
    os.makedirs(rslt_addr)

In [15]:
def name_creator(kernel, n_folds, args):

    name = "kernel%s__n_folds%d"%(kernel, n_folds)
    
    def describe_in_string(vec, is_degree=False):
        
        if len(vec)==0:
            return "NA"
        
        if not is_degree:
            vec = np.log10(vec)
            
        if len(vec)==1:
            return "%d"%vec[0]
    
        return "%d~%d"%(np.min(vec), np.max(vec))
    
    for arg in sorted(args.keys()):
        if arg!="d_vals":
            name += "__log10_%s%s"%(arg, describe_in_string(args[arg]))
        else:
            name += "__%s%s"%(arg, describe_in_string(args[arg], True))
    
    return name

In [16]:
import pickle
import os
import numpy as np

def load_it_or_compute_it (X, y, kernel, n_folds, args):
    
    rslt_dict_name = name_creator(kernel, n_folds, args)
    rslt_dict_addr = os.path.join(rslt_addr, rslt_dict_name)
    
    print (rslt_dict_name)
    
    if os.path.isfile(rslt_dict_addr):
        print ("FOUND!")
        with open(rslt_dict_addr,"rb") as rslt_dict_handle:
            rslt_dict = pickle.load(rslt_dict_handle)
    else:
        rslt_dict = cross_validate_kernels(X, y, kernel, n_folds, args)
        with open(rslt_dict_addr,"wb") as rslt_dict_handle:
            pickle.dump(rslt_dict, rslt_dict_handle)

    return rslt_dict

In [19]:
c_vals     = np.power(float(5), range(-5, 5 + 1))
d_vals     = range( 1, 4 + 1)

load_it_or_compute_it (X, y, "poly", 10, {"c_vals":c_vals, "d_vals":d_vals})

kernelpoly__n_folds10__log10_c_vals-5~5__d_vals1~4


KeyError: 32

In [20]:
rslt_dict = load_it_or_compute_it (X, y, "poly", 10, {"c_vals":c_vals, "d_vals":d_vals})
value_list = list(rslt_dict.items())

kernelpoly__n_folds10__log10_c_vals-5~5__d_vals1~4


KeyError: 35

In [None]:
test_list = np.reshape(value_list,(11, 8))
test_list = np.delete(test_list, [0 ,2, 4, 6] , axis = 1)
##test_list = test_list.astype(np.float)
test_list = np.asfarray(test_list,float)

In [None]:
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size':18})
import numpy as np

y_label = np.linspace(0,1.0,num=11)

def plot_errorC(c_vals, list):
    plt.figure(figsize = (16,10))
    plt.plot(range(len(c_vals)), [(i[0]) for i in list], color = 'g', label='d = 1')
    plt.plot(range(len(c_vals)), [(i[1]) for i in list], color = 'm', label='d = 2')
    plt.plot(range(len(c_vals)), [(i[2]) for i in list], color = 'c', label='d = 3')
    plt.plot(range(len(c_vals)), [(i[3]) for i in list], color = 'y', label='d = 4')
    
    plt.title('SVM Cross-Validation Error for Penalty Parameter C')
    plt.xlabel('C')
    ##plt.yticks(range(len(y_label)), y_label)
    plt.ylabel('Error')
    plt.xticks(range(len(c_vals)), c_vals, rotation='vertical')
    plt.legend(loc = 'best')
    plt.show()

In [None]:
plot_errorC(c_vals, test_list)

In [None]:
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size':18})
import numpy as np

y_label = np.linspace(0,1.0,num=11)

##print_list = float(float(list[10][1::2]))
def plot_errord(d_vals, list):
    plt.figure(figsize = (16,10))
    plt.plot(range(len(d_vals)), (list[10]), color = 'black')
    
    plt.title('SVM Cross-Validation Error for Polynomial Degree d')
    plt.xlabel('d')
    ##plt.yticks(range(len(y_label)), y_label)
    plt.ylabel('Error')
    plt.xticks(range(len(d_vals)), d_vals)
    plt.legend(loc = 'best')
    plt.show()

In [None]:
plot_errord(d_vals, test_list)