In [1]:
# import dependencies and global settings
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_validate, cross_val_predict
from sklearn.metrics import mean_absolute_error
import pickle

In [2]:
# read the feature matrix (X) and the target (y) from their CSV files;
# index_col=0 makes the first column of each file the row index
X, y = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        './data_frames/housing_X_features.csv',
        './data_frames/housing_y_target.csv',
    )
)

In [3]:
# one-hot encode every categorical column of X; drop_first=True omits the
# first level of each feature so the dummies are not perfectly collinear
X = pd.get_dummies(data=X, drop_first=True)

In [4]:
# baseline: k-fold cross validation of a plain linear regression on the train set
lr_base = LinearRegression()
k = 5
# shuffled folds with a fixed seed, so both CV calls below split the data the same way
cv = KFold(n_splits=k, shuffle=True, random_state=12)

# per-fold scores from the estimator's default scorer, plus train scores and
# the fitted estimator of each fold
cv_results = cross_validate(
    lr_base, X, y,
    cv=cv,
    return_train_score=True,
    return_estimator=True,
)
# out-of-fold prediction for every row (refits the model on each fold)
cv_pred = cross_val_predict(lr_base, X, y, cv=cv)

# NOTE(review): the recorded run shows R^2 far below zero on four of the five
# folds, so the mean score is dominated by those folds - investigate why the
# fit is unstable across folds before relying on this baseline.
for test_score in cv_results['test_score']:
    print(test_score)

print('Mean Score (r^2)=', cv_results['test_score'].mean(), sep='')
print('Mean Absolute Error (cross validation):', mean_absolute_error(y, cv_pred), sep='')

0.9028933450524776
-318545.90356938815
-171.4842837493234
-17550102.27134023
-219105542.0635812
Mean Score (r^2)=-47394872.163976245
Mean Absolute Error (cross validation):19639414.219514396


In [5]:
# train the baseline on the entire data set (this is the model that gets saved);
# fit() trains the estimator in place, so lr_baseline is the fitted model
lr_baseline = LinearRegression()
lr_baseline.fit(X, y)

In [6]:
# save lr_baseline to pickle file
filename = './models/lr_baseline.pkl'
# the context manager closes the file even if pickle.dump raises,
# so the handle is never leaked
with open(filename, 'wb') as outfile:
    pickle.dump(lr_baseline, outfile)