In [2]:
# importing packages
import pickle
import numpy as np
from sklearn import linear_model
import sklearn.metrics as sm
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Path of the comma-separated data file that the next cell reads.
input_file = "data_multivar_regr.txt"

In [6]:
# Read the comma-separated file into a 2-D float array.
data = np.loadtxt(input_file, delimiter=',')

# Separate inputs from the target: every column except the last is a feature,
# the last column is the value to predict. The array shown below has four
# columns and the sample predicted later in the notebook has three values, so
# X is the first three columns and Y is the fourth.
X, Y = data[:, :-1], data[:, -1]

# Echo the raw array as the cell output.
data

array([[ 2.06,  3.48,  7.21, 15.69],
       [ 6.37,  3.01,  7.27, 15.34],
       [ 1.18,  1.2 ,  5.42,  0.66],
       ...,
       [ 3.61,  2.22,  5.77,  6.33],
       [ 1.38,  2.69, -0.33, 26.66],
       [ 5.35,  5.  ,  5.08, 25.13]])

In [9]:
# Splitting the data into training and testing sets (80 : 20, no shuffling):
# the first 80% of the rows train the model, the remaining rows test it.
training_no = int(0.8 * len(X))

# The test set is "everything after the training rows" (see the slices below),
# so its size is the complement of training_no. Computing it separately as
# int(0.2 * len(X)) truncates independently and can under-count by one
# (e.g. 9 rows -> 7 training rows and 2 testing rows, not 1).
testing_no = len(X) - training_no

# picking the training data
X_train, Y_train = X[:training_no], Y[:training_no]

# picking the testing data
X_test, Y_test = X[training_no:], Y[training_no:]


In [10]:
# Linear regression estimator with scikit-learn's default settings.
linear_regressor = linear_model.LinearRegression()

# Fit on the training split. Kept as the cell's final expression so the
# fitted estimator is echoed as the cell output.
linear_regressor.fit(X=X_train, y=Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [11]:
# Run the fitted linear model on the held-out inputs; the predictions are
# compared against Y_test in the next cell.
Y_test_predict = linear_regressor.predict(X=X_test)

In [12]:
# Report how the linear model performs on the test split. Every score is
# computed from (Y_test, Y_test_predict) and rounded to two decimals.
print('linear regressor performance')

# (label, metric function) pairs, in the order they are printed.
metric_table = (
    ('Mean absolute error = ', sm.mean_absolute_error),
    ('Mean squared error = ', sm.mean_squared_error),
    ('Median absolute error =', sm.median_absolute_error),
    ('Explain variance score =', sm.explained_variance_score),
    ("R2 score =", sm.r2_score),
)
for label, metric_fn in metric_table:
    print(label, round(metric_fn(Y_test, Y_test_predict), 2))

linear regressor performance
Mean absolute error =  3.58
Mean squared error =  20.31
Median absolute error = 2.99
Explain variance score = 0.86
R2 score = 0.86


In [16]:
# Creating and training a degree-10 polynomial regressor on the multivariable
# independent variables.

# Feature expander: generates the polynomial terms of the inputs up to degree 10.
polynomial = PolynomialFeatures(degree=10)

# Fit the expander on the training inputs and expand them in one step.
X_train_transformed = polynomial.fit_transform(X_train)

# Single sample used to compare the predictions of the linear and the
# polynomial regression models.
datapoint = [[7.75, 6.35, 5.56]]

# Expand the sample with the expander that was already fitted on the training
# data. transform() is used instead of fit_transform() because data seen at
# prediction time must never re-fit a preprocessing step.
poly_datapoint = polynomial.transform(datapoint)

# A linear model trained on the expanded features acts as the polynomial regressor.
poly_linear_model = linear_model.LinearRegression()

# Train the polynomial model. Kept as the cell's final expression so the
# fitted estimator is echoed as the cell output.
poly_linear_model.fit(X_train_transformed, Y_train)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [17]:
# Predict the dependent variable for `datapoint` with both models:
#   output1 - linear regression on the raw sample
#   output2 - polynomial regression on the expanded sample
output1, output2 = (
    linear_regressor.predict(datapoint),
    poly_linear_model.predict(poly_datapoint),
)


In [18]:
# Show each model's prediction under a heading naming the model.
for heading, prediction in (
    ('Linear regression model', output1),
    ('Polynomial regression model', output2),
):
    print(heading)
    print(prediction)

Linear regression model
[36.05286276]
Polynomial regression model
[41.45234835]


In [27]:
# Model persistence: pickle the fitted polynomial regressor so it can be
# reused for prediction later.
# NOTE(review): the polynomial model was chosen as the "better" one, but this
# notebook only scores the linear model on the test split - confirm the choice.
# Only the regressor is written; inputs still have to be expanded with
# `polynomial` before they are passed to the reloaded model.
regressor_model_file = 'multivariable_regressor_model.pkl'

# Write the model in binary mode; the file is closed when the block exits.
with open(regressor_model_file, mode='wb') as model_out:
    pickle.dump(poly_linear_model, model_out)

In [26]:
# Restore the pickled model from disk. Unpickling can execute arbitrary code,
# so only load pickle files that come from a trusted source.
with open(regressor_model_file, mode='rb') as model_in:
    regressor_model = pickle.load(model_in)

In [25]:
# Check the reloaded model by predicting the already-expanded sample.
regressor_model.predict(X=poly_datapoint)

array([41.45234835])