In [1]:
import numpy
from numpy import arange
import pandas as pd
from pandas import read_csv
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

## Load the data

In [2]:
# Load from a CSV file for educational purpose
boston_housing = "https://raw.githubusercontent.com/noahgift/boston_housing_pickle/master/housing.csv"
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
df = read_csv(boston_housing, delim_whitespace=True, names=names)

In [3]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [4]:
prices = df['MEDV']
df = df.drop(['CRIM','ZN','INDUS','NOX','AGE','DIS','RAD'], axis = 1)
features = df.drop('MEDV', axis = 1)
df.head()

Unnamed: 0,CHAS,RM,TAX,PTRATIO,B,LSTAT,MEDV
0,0,6.575,296.0,15.3,396.9,4.98,24.0
1,0,6.421,242.0,17.8,396.9,9.14,21.6
2,0,7.185,242.0,17.8,392.83,4.03,34.7
3,0,6.998,222.0,18.7,394.63,2.94,33.4
4,0,7.147,222.0,18.7,396.9,5.33,36.2


## Modeling

In [5]:
# Split-out validation dataset
array = df.values
X = array[:,0:6]
Y = array[:,6]
validation_size = 0.20
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=validation_size, random_state=seed)

In [6]:
for sample in list(X_test)[0:2]:
    print(f"X_validation {sample}")

X_validation [  1.      6.395 666.     20.2   391.34   13.27 ]
X_validation [  0.      5.895 224.     20.2   394.81   10.56 ]


## Linear Regression

In [7]:
# Create linear regression object
linreg = LinearRegression()

# Train the model using the training sets
linreg.fit(X_train,Y_train)

# Predict the values using the model
predictions = linreg.predict(X_test)

print("Mean Squared Error: \n")
print(mean_squared_error(Y_test, predictions))

Mean Squared Error: 

39.95883743552906


## Stochastic Gradient Descent

In [8]:
sgdreg = SGDRegressor(penalty='l2', alpha=0.15)

# Train the model using the training sets
sgdreg.fit(X_train, Y_train) 

# Predict the values using the model
predictions = sgdreg.predict(X_test)

print("Mean Squared Error: \n")
print(mean_squared_error(Y_test, predictions))

Mean Squared Error: 

4.724156104223365e+28


## Gradient Boosting Regressor

In [9]:
# prepare the model
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
model = GradientBoostingRegressor(random_state=seed, n_estimators=400)
model.fit(rescaledX, Y_train)

# transform the validation dataset
rescaledValidationX = scaler.transform(X_test)
predictions = model.predict(rescaledValidationX)
print("Mean Squared Error: \n")
print(mean_squared_error(Y_test, predictions))

Mean Squared Error: 

26.086121710797425


## Generate the joblib file

In [10]:
import joblib
# save the pre-trained Model to a file
joblib.dump(linreg, 'LinearRegression.joblib')
joblib.dump(sgdreg, 'StochasticGradientDescent.joblib')
joblib.dump(model, 'GradientBoostingRegressor.joblib')

['GradientBoostingRegressor.joblib']

## How to reload the Joblib file

In [15]:
# Load the pre-trained Model from a file
sample_model = joblib.load('LinearRegression.joblib')
# Make predictions
predictions = sample_model.predict(X_test)

In [16]:
predictions=predictions.astype(int)
evaluate = pd.DataFrame({
        "Org House Price": Y_test,
        "Pred House Price": predictions
    })
evaluate["difference"] = evaluate["Org House Price"]-evaluate["Pred House Price"]
evaluate.head()

Unnamed: 0,Org House Price,Pred House Price,difference
0,21.7,24,-2.3
1,18.5,20,-1.5
2,22.2,20,2.2
3,20.4,20,0.4
4,8.8,11,-2.2


In [13]:
evaluate.describe()

Unnamed: 0,Org House Price,Pred House Price,difference
count,102.0,102.0,102.0
mean,22.573529,22.117647,0.455882
std,9.033622,8.758921,5.154438
min,6.3,8.0,-34.1
25%,17.35,17.0,-0.8
50%,21.8,20.5,0.6
75%,24.8,25.0,2.2
max,50.0,56.0,22.0
