# saving demo for sklearn models

In [23]:
import numpy as np
import pickle
from sklearn.linear_model import LogisticRegression, Lasso
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, RobustScaler
import sklearn.model_selection 
from sklearn.model_selection import GridSearchCV
from sklearn import svm

In [24]:
# load training data
Xtr_path = 'Xtr.csv'
ytr_path = 'ytr.csv'
Xts_path = 'Xts.csv'

Xtr = np.loadtxt(Xtr_path, delimiter=",")
ytr = np.loadtxt(ytr_path, delimiter=",")
Xts = np.loadtxt(Xts_path, delimiter=",")

In [25]:
# Standardize the data

scaler = StandardScaler() # could also use RobustScaler() here!
Xtr = scaler.fit_transform(Xtr) # compute standardization from training data and apply to training data
Xts = scaler.fit_transform(Xts) # compute standardization from testing data and apply to testing data

In [26]:
# Feature Selection

import sklearn.model_selection 

# Manual approach using 2 for-loops

# Create a k-fold cross validation object
nfold = 10
kf = sklearn.model_selection.KFold(n_splits=nfold,shuffle=True,random_state=2)

# Create the LASSO model.  We use the `warm start` parameter so that the fit will start at the previous value.
# This speeds up the fitting.
lasso = Lasso(fit_intercept=False,warm_start=True)

# Regularization values to test
nalpha = 100
alphas = np.logspace(-3,1,nalpha)

# MSE for each alpha and fold value
mse = np.zeros((nalpha,nfold))
for ifold, ind in enumerate(kf.split(Xtr)):
    
    # Get the training data in the split
    Itr,Its = ind
    X_tr = Xtr[Itr,:]
    y_tr = ytr[Itr]
    
    # Compute the lasso path for the split
    for ia, a in enumerate(alphas):
        
        # Fit the model on the training data
        lasso.alpha = a
        lasso.fit(X_tr,y_tr)
        
        # Compute the prediction error on the test data
        y_tr_pred = lasso.predict(X_tr)
        mse[ia,ifold] = np.mean((y_tr_pred-y_tr)**2)

# Compute the MSE mean over the folds and its standard error
mse_cv = np.mean(mse,axis=1)
mse_se = np.std(mse,axis=1,ddof=1) / np.sqrt(nfold)

# Find the minimum MSE
imin = np.argmin(mse_cv)
alpha_min = alphas[imin]
mse_cv_lasso = mse_cv[imin]

# Manual approach to refitting LASSO on entire training data
lasso.alpha = alpha_min
lasso.fit(Xtr,ytr)
print('Manual LASSO refit:')
print(" intrcpt %f" % lasso.intercept_)
for i, c in enumerate(lasso.coef_):
    print("%8s %f" % (i, c))


Manual LASSO refit:
 intrcpt 0.000000
       0 0.710419
       1 -1.194056
       2 1.003718
       3 0.828824
       4 -0.119656
       5 2.336672
       6 0.304381
       7 -0.929152
       8 0.008433
       9 -0.044465
      10 -0.139905
      11 0.060662
      12 0.074280
      13 0.066365
      14 -0.020887
      15 -0.066928


In [27]:
# Sort coefficient
np.sort(abs(lasso.coef_))

array([0.00843319, 0.02088663, 0.04446468, 0.0606617 , 0.06636467,
       0.06692842, 0.07427952, 0.11965633, 0.13990481, 0.30438119,
       0.71041853, 0.82882422, 0.92915234, 1.00371826, 1.19405587,
       2.33667166])

In [28]:
# Select important features 
Xtr = Xtr[:,[0,1,2,3,4,5,6,7]]
Xts = Xts[:,[0,1,2,3,4,5,6,7]]

In [None]:
# Best Model

svc = svm.SVC(probability=False, kernel="rbf", C=47.5, gamma=1.05, verbose=1)
#svc = svm.SVC(probability=False, kernel="rbf", C=47.25, gamma=1.025, verbose=1) 91.760
svc.fit(Xtr,ytr)

acc = np.mean(svc.predict(Xtr)==ytr)
print('training accuracy: ',acc)

In [12]:
import pandas as pd
np.savetxt("Xtr2.csv", Xtr, delimiter=",")
np.savetxt("Xts2.csv", Xts, delimiter=",")

In [13]:
# save the model using the extension .pkl 
save_path = 'model.pkl'
pickle.dump(svc, open(save_path, 'wb'))

In [14]:
# verify that the saved model works with the validation script
Xts_path = 'Xts2.csv' # custom test features
Xtr_path = 'Xtr2.csv' # custom training features
!python {"validation.py " + save_path + " --Xts_path " + Xts_path + " --Xtr_path " + Xtr_path}

training accuracy =  0.9945
test label predictions saved in yts_hat.csv
