# Testing regressions models: Linear Regression, Batch Gradient Descent, Stochastic Gradient Descent, and SVM Regression

Workbook setup code copied from in-course examples.

First, let's make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures:

In [0]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(98)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "training_linear_models"

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

# Import data

In [0]:
#code to authorize access to Google Drive
from google.colab import auth
auth.authenticate_user()

In [0]:
import gspread
from oauth2client.client import GoogleCredentials

gc = gspread.authorize(GoogleCredentials.get_application_default())

In [0]:
#code to impport data
#worksheet = gc.open('Dummy data_reduced').sheet1

worksheet = gc.open('FData_explored_full').sheet1

# Create data frame from imported data

In [0]:
# Read contents of CSV file
df_rows = worksheet.get_all_values()

# Convert to a DataFrame and render.
import pandas as pd
cols = df_rows[0]
df  = pd.DataFrame.from_records(df_rows, columns = cols)

#drop the duplicate header row
df = df.reindex(df.index.drop(0))

In [0]:
#set target variable column name
target = df_rows[0][0]
#set column names of all features
features = df_rows[0][1:]

import numpy as np

# Prepare the data
X = np.c_[df[features]]
y = np.c_[df[target]]

In [0]:
X=X.astype(float)
y=y.astype(float)

Data comes in as mixed categorical and numerical values. Need to tell the dataframe that. If not, everything comes in as object datatype, which makes processing difficult later

In [0]:
#from sklearn.pipeline import Pipeline
#from sklearn.compose import ColumnTransformer
#from sklearn.impute import SimpleImputer
#from sklearn.preprocessing import StandardScaler, OneHotEncoder

#create the preprocessing pipeline for both numeric and categorical data.
#numeric_features = ['Eng_Response_time (minutes)', 
#                    'Eng_Fire_Under_Control_Time (minutes)', 
#                    'Eng_Last_TFS_Unit_Clear_Time (minutes)']
#numeric_transformer = Pipeline(steps=[
#    ('imputer', SimpleImputer(strategy='median')),
#    ('scaler', StandardScaler())])

#categorical_features = ['Extent_Of_Fire', 'Fire_Alarm_System_Operation', 
#                        'Incident_Ward', 'Method_Of_Fire_Control', 
#                        'Possible_Cause', 'Property_Use', 
#                        'Smoke_Alarm_at_Fire_Origin', 
#                        'Sprinkler_System_Operation', 
#                        'Status_of_Fire_On_Arrival']
#categorical_transformer = Pipeline(steps=[
#    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
#    ('onehot', OneHotEncoder(handle_unknown='ignore'))])


#preprocessor = ColumnTransformer(
#    transformers=[
#        ('num', numeric_transformer, numeric_features),
#        ('cat', categorical_transformer, categorical_features)])


Split dataset into training and test data

In [0]:
#split dataset into 80/20 dataset
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size= 0.2)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5900, 90), (1476, 90), (5900, 1), (1476, 1))

# Normalize data

In [0]:
#from sklearn.preprocessing import StandardScaler
# Define Scaling technique
#scaler = StandardScaler()

In [0]:
# Train escaling object 
#X_train_escaler = scaler.fit(X_train)

# Apply scaling model to the data
#X_train_escaled = X_train_escaler.transform(X_train)
#X_train_escaled

# Test Linear Regression model

In [0]:
#import sklearn linear regression model
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

lr.fit(X_train, y_train)
lr.intercept_, lr.coef_

(array([5.30230897e+09]),
 array([[-6.85965784e+02, -6.85837906e+02, -6.85886226e+02,
         -6.85951955e+02, -6.85964706e+02, -6.85868840e+02,
         -6.85972312e+02, -6.85918784e+02, -6.85924026e+02,
         -6.85963501e+02, -6.85924500e+02, -6.85930398e+02,
         -6.85790147e+02, -6.85908528e+02, -6.86071945e+02,
         -6.85908424e+02, -6.85900867e+02, -6.85989240e+02,
         -6.86092920e+02, -6.86113537e+02, -6.85971752e+02,
         -6.86040577e+02, -6.86105879e+02, -6.85878310e+02,
         -6.85788798e+02, -6.85877550e+02, -6.86017409e+02,
         -6.86074981e+02, -6.86061140e+02, -6.86098668e+02,
         -6.86091068e+02, -6.86031757e+02, -6.85972473e+02,
         -6.85930235e+02, -6.85893803e+02, -6.85866535e+02,
         -6.85949047e+02, -6.85777492e+02, -6.85790954e+02,
         -6.85790429e+02, -6.85861137e+02, -6.85879372e+02,
         -6.85872147e+02, -6.85888284e+02, -6.85914331e+02,
         -2.74576178e+09, -2.74576177e+09, -2.74576177e+09,
          1.76

In [0]:
lr.score(X_test, y_test)

0.31601942037496766

# Test Batch Gradient Descent model

In [0]:
eta = 0.1
n_iterations = 1000
m = len(X_train)
theta = np.random.randn(len(X_train[0]),1)

In [0]:
for iteration in range(n_iterations):
    gradients = 2/m * X_train.T.dot(X_train.dot(theta) - y_train)
    theta = theta - eta * gradients

  This is separate from the ipykernel package so we can avoid doing imports until


In [0]:
theta

# Test Stochastic Gradient Descent model

In [0]:
theta_path_sgd = []
m = len(X_train)
np.random.seed(42)

In [0]:
n_epochs = 50
t0, t1 = 5, 50  # learning schedule hyperparameters

def learning_schedule(t):
    return t0 / (t + t1)

theta = np.random.randn(len(X_train[0]),1)  # random initialization

for epoch in range(n_epochs):
    for i in range(m):
        random_index = np.random.randint(m)
        xi = X_train[random_index:random_index+1]
        yi = y_train[random_index:random_index+1]
        gradients = 2 * xi.T.dot(xi.dot(theta) - yi)
        eta = learning_schedule(epoch * m + i)
        theta = theta - eta * gradients
        theta_path_sgd.append(theta)

  
  app.launch_new_instance()


In [0]:
theta

# Test Support Vector Machines

In [0]:
from sklearn.svm import LinearSVR

svm_reg = LinearSVR(epsilon=1.5, random_state=42)
svm_reg.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LinearSVR(C=1.0, dual=True, epsilon=1.5, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
          random_state=42, tol=0.0001, verbose=0)

In [0]:
svm_reg.score(X_test, y_test)

0.14401759067097286

# Tune SVM regression hyperparameters

In [0]:
svm_reg2 = LinearSVR(epsilon=1.3, random_state=42)
svm_reg2.fit(X_train, y_train)
svm_reg2.score(X_test, y_test)

  y = column_or_1d(y, warn=True)


0.17673933316195578

In [0]:
svm_reg3 = LinearSVR(epsilon=1.1, random_state=42)
svm_reg3.fit(X_train, y_train)
svm_reg3.score(X_test, y_test)

  y = column_or_1d(y, warn=True)


0.1535655471989784

# Conclusion

Regression models seemed to perform poorly on this dataset.

- Target variable actually behaves more like categories despite having numerical values. I.e. dollar amounts are typically rounded to the nearest thousand, ten thousand, hundred thousand etc.
- Too many categorical features. We did not do enough exploration to estimate the potential predicting power of these features. We likely did not choose useful features to keep.
- Features were not scaled. Did not figure out how to scale only some of the numerical features while leaving the categorical features alone.
- Did not try non-linear regression. Already too many features so were concerned with computational cost. Also did not have time to try it.