# <center> AWS-foryou </center>
### <center> examples </center>
---

### Example 1
Running sklearndiabetes.py as the user's algorithm.

In [59]:
import numpy as np
import os
import pandas as pd
from sklearn import datasets
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
import time

os.chdir("..")

import awsforyou

In [23]:
def get_diabetes(multiplier=1):
    x,y = datasets.load_diabetes(return_X_y=True)
    
    means = np.mean(x, axis=0)
    std = np.std(x)
    y_mean = np.mean(y)
    y_std = np.std(y)
    
    n,d = x.shape
    
    new_x = x
    new_y = y
    
    for i in range(0,multiplier):
        mock_x = x + np.random.normal(loc=0, scale=std, size=(n, d))
        mock_y = y + np.random.normal(loc=0, scale=y_std, size=(n, 1)).ravel()

        new_x = np.append(new_x, mock_x, axis=0)
        new_y = np.append(new_y, mock_y, axis=0)
    return new_x, new_y

In [36]:
x,y = get_diabetes(2)

In [37]:
np.savetxt("x_diabetes.csv", x, delimiter=",")

In [38]:
np.savetxt("y_diabetes.csv", y, delimiter=",")

In [39]:
data_loc = './x_diabetes.csv'
target_loc = './y_diabetes.csv'

In [40]:
def sklearn_diabetes(data_loc, target_loc):
    x = np.array(pd.read_csv(data_loc))
    y = np.array(pd.read_csv(target_loc))
    
    X_train, X_test, y_train, y_test = train_test_split(x,y.ravel(), random_state=0)
    
    # Standardize the data
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Fit regression model
    model = LinearRegression()
    model.fit(X_train, y_train)
    linear_score = model.score(X_test, y_test)

    print("linear regression score = %f" % linear_score)

    # Fit svr model
    parameters = \
        {
            'gamma':('scale', 'auto'),
            'kernel':('linear', 'rbf', 'poly', 'sigmoid'),
            'C':[0.0001, 0.001, 0.01, 0.1, 1, 2, 3, 5, 10, 20, 30, 40, 50],
            'degree':[3,4,5,6,7,8],
        }

    svr = SVR()
    grid_svr = GridSearchCV(svr, parameters, cv=5, n_jobs=-1, iid=False)
    grid_svr.fit(X_train, y_train)

    best_estimator = grid_svr.best_estimator_
    print("best hyperparameters estimate from grid search = \n %s " % best_estimator)

    best_estimator_score = grid_svr.best_score_
    print("score from using best hyperparameters = %f" % best_estimator_score)

    print("begining 6-components PCA decomposition")

    components = 6
    pca = PCA(n_components = components, svd_solver='full')
    pca.fit(X_train)
    varratio = np.sum(pca.explained_variance_ratio_)

    print("percentage of variance explained = %f" % varratio)

    pca_X_train = pca.transform(X_train)
    pca_X_test = pca.transform(X_test)

    print("repeat grid search with PCA-transformed data")

    # Fit svr model
    parameters = \
        {
            'gamma':('scale', 'auto'),
            'kernel':('linear', 'rbf', 'poly', 'sigmoid'),
            'C':[0.0001, 0.001, 0.01, 0.1, 1, 2, 3, 5, 10, 20, 30, 40, 50],
            'degree':[3,4,5,6,7,8],
        }

    svr = SVR()
    grid_svr = GridSearchCV(svr, parameters, cv=5, n_jobs=-1, iid=False)
    grid_svr.fit(pca_X_train, y_train)

    best_estimator = grid_svr.best_estimator_
    print("best hyperparameters estimate from grid search = \n %s " % best_estimator)

    best_estimator_score = grid_svr.best_score_
    print("score from using best hyperparameters = %f" % best_estimator_score)

In [42]:
start = time.time()
sklearn_diabetes(data_loc, target_loc)
finish = time.time()
runtime = finish - start
print("runtime is %f seconds" % runtime)

linear regression score = 0.247268
best hyperparameters estimate from grid search = 
 SVR(C=20, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False) 
score from using best hyperparameters = 0.237037
begining 6-components PCA decomposition
percentage of variance explained = 0.780063
repeat grid search with PCA-transformed data
best hyperparameters estimate from grid search = 
 SVR(C=40, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False) 
score from using best hyperparameters = 0.229473
runtime is 107.853706 seconds


'/mnt/c/Users/winsu/Documents/Seattle/U-of-Washington/Masters-in-Data-Science/2019_Spring/DATA_515/final-project/AWS-foryou/examples'