In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np 
import pandas as pd 
import os

# to make this notebook's output stable across runs
# any number will do, as long as it is used consistently
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [2]:
from sklearn.model_selection import train_test_split
# m o d e l s 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score 

## STEP ONE: Gather the prepped data
We load the data produced by the data_wrangling file. This data has already been standardized, which should help the model predict $\hat{y}$ more accurately.

In [3]:
# Read the pre-processed dataset from the working directory and preview it.
# NOTE(review): the preview shows two "Unnamed: ..." columns ahead of the
# numbered ones; they look like row indexes saved by earlier to_csv calls --
# confirm whether they should be dropped before modelling.
prepped_data = pd.read_csv(filepath_or_buffer="prepped_data.csv")
prepped_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0.0,0,0,0,7541
1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0.0,0,0,-1,5506
2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0.0,0,0,1,2035
3,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0.0,0,-1,0,3942
4,4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0.0,0,-1,-1,2914


In [4]:
# Separate the loaded frame into a feature matrix (first 16 columns) and a
# target vector (the column at position 16), then report both shapes.
# NOTE(review): the slicing is positional. Judging by the head() preview, the
# first two positions are the "Unnamed: ..." columns, so they end up in
# prepped_x, and position 16 is the column labelled "14" rather than the last
# column ("16") -- confirm this is the intended feature/target selection.
prepped_x, prepped_y = prepped_data.values[:, :16], prepped_data.values[:, 16]
print(prepped_x.shape, prepped_y.shape)

(100046, 16) (100046,)


## STEP TWO: Split data sets
We will split the data into a training set and a dev set (called the test set here) to see how the Linear Regression model performs. 

In [5]:
# Hold out 20% of the rows for evaluation (fixed random_state so the split is
# repeatable) and cast every resulting array to float64 in one pass.
X_train, X_test, y_train, y_test = (
    subset.astype("float64")
    for subset in train_test_split(
        prepped_x, prepped_y, test_size=0.2, random_state=42
    )
)

In [6]:
# Fit an ordinary least-squares model on the training split; fit() returns the
# estimator itself, so the fitted model can be bound directly.
lin_model = LinearRegression().fit(X_train, y_train)
# Leave the fitted estimator as the cell's last expression so it is displayed.
lin_model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)


## STEP THREE: Model
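Using 10-fold cross-validation scored with negative mean squared error, and converting each fold's score to a root mean squared error (RMSE), we found the Linear Regression model's average RMSE to be around <b>0.634</b> (standard deviation ≈ 0.003). Note that this is an error measure, not a percentage, so lower is better. This is quite poor. 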

In [7]:
# 10-fold cross-validation on the training split. The scorer reports the
# *negated* MSE for each fold (higher is better by scikit-learn convention).
scores = cross_val_score(
    estimator=lin_model,
    X=X_train,
    y=y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
# Flip the sign back to a positive MSE and take the square root: RMSE per fold.
lin_model_scores = np.sqrt(np.negative(scores))

def display_scores(scores):
    """Print the scores, a blank gap, then their mean and standard deviation.

    `scores` must provide ``mean()`` and ``std()`` methods (for example a
    NumPy array). Nothing is returned; everything goes to standard output.
    """
    print("Scores:", scores)
    print("\n")
    summary = (("Mean:", scores.mean), ("Standard deviation:", scores.std))
    for label, compute in summary:
        print(label, compute())

# Report the per-fold RMSE values together with their mean and spread.
display_scores(lin_model_scores)

Scores: [0.63386171 0.64220493 0.63521905 0.62989046 0.63275965 0.63157864
 0.63237326 0.63638339 0.63555342 0.63069614]


Mean: 0.6340520655417548
Standard deviation: 0.003387390007298002
