In [5]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os
import pandas as pd

# to make this notebook's output stable across runs
# any number will do, as long as it is used consistently
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

from sklearn.linear_model import SGDRegressor

## STEP ONE: gather prepped data

In [2]:
# Read the preprocessed dataset from the working directory, then preview its
# first five rows as this cell's output.
prepped_data = pd.read_csv(filepath_or_buffer="prepped_data.csv")
prepped_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0.0,0,0,0,7541
1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0.0,0,0,-1,5506
2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0.0,0,0,1,2035
3,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0.0,0,-1,0,3942
4,4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0.0,0,-1,-1,2914


In [3]:
# The CSV carries leftover row-index columns ("Unnamed: 0", "Unnamed: 0.1")
# in front of the real columns "0".."16". Slicing the raw array by position
# would feed those row indices to the model as features and pick column "14"
# as the target, so drop every "Unnamed..." column before splitting.
is_index_col = prepped_data.columns.astype(str).str.startswith("Unnamed")
model_data = prepped_data.loc[:, ~is_index_col]

# First 16 remaining columns are the features; the 17th (column "16") is
# presumably the regression target -- TODO confirm against the data prep step.
prepped_x = model_data.values[:, :16]
prepped_y = model_data.values[:, 16]
print(prepped_x.shape, prepped_y.shape)

(100046, 16) (100046,)


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing (fixed seed so the split is repeatable)
# and cast each of the four resulting arrays to float64 in the same pass.
X_train, X_test, y_train, y_test = (
    split.astype("float64")
    for split in train_test_split(prepped_x, prepped_y, test_size=0.2, random_state=42)
)

In [6]:
# Candidate values for each SGDRegressor hyper-parameter explored by the search.
# NOTE(review): scikit-learn 1.0+ spells 'squared_loss' as 'squared_error' --
# confirm against the installed version before running the search.
loss = [
    'squared_loss',
    'huber',
    'epsilon_insensitive',
    'squared_epsilon_insensitive',
]
penalty = [
    'l1',
    'l2',
    'elasticnet',
]
# Regularisation strengths spanning 1e-4 .. 1e3, one value per decade.
alpha = [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]
learning_rate = [
    'constant',
    'optimal',
    'invscaling',
    'adaptive',
]

# Map each hyper-parameter name to its list of candidate values.
param_grid = dict(
    loss=loss,
    penalty=penalty,
    alpha=alpha,
    learning_rate=learning_rate,
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Base estimator whose hyper-parameters are being tuned.
sgd_model = SGDRegressor(random_state=42)

# 25 sampled parameter settings x 5 folds = 125 rounds of training
# (the search then refits the best setting once more by default).
rnd_search = RandomizedSearchCV(
    estimator=sgd_model,
    param_distributions=param_grid,
    n_iter=25,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=42,
)
rnd_search.fit(X=X_train, y=y_train)



In [None]:
# Keep the best estimator found by the randomized search, and show it as the
# cell output.
final_sgd_model = rnd_search.best_estimator_
final_sgd_model

In [None]:
# cross_val_score is not imported anywhere else in the notebook; without this
# import the cell fails with NameError on a fresh kernel.
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the tuned model on the training set. The scorer
# returns negated MSE values, so flip the sign before taking the square root
# to obtain one RMSE per fold.
scores = cross_val_score(final_sgd_model, X_train, y_train,
                         scoring="neg_mean_squared_error", cv=10)
# NOTE: despite the "tree" prefix these are the SGD model's RMSE values; the
# name is kept because the display cell below reads it.
tree_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    """Print a collection of scores followed by its mean and standard deviation.

    Args:
        scores: object exposing ``mean()`` and ``std()`` methods
            (e.g. a NumPy array of per-fold scores).

    Returns:
        None. The three summary lines are written to standard output.
    """
    # Each statistic is computed lazily, right before its line is printed,
    # so the lines appear in order: raw scores, mean, standard deviation.
    report = (
        ("Scores:", lambda values: values),
        ("Mean:", lambda values: values.mean()),
        ("Standard deviation:", lambda values: values.std()),
    )
    for label, compute in report:
        print(label, compute(scores))

# Report the per-fold RMSE values along with their mean and standard deviation.
display_scores(scores=tree_rmse_scores)