In [1]:
"""
Multiple Linear Regression Model

1.    Consider the columns, ‘RM’, ‘DIS’, ‘TAX’, ‘INDUS’ as predictors, and ‘MEDV’ as the target variable

2.    Calculate the Variance Inflation Factor for each of the selected predictors. Based on the VIF factor, finalize the list of predictors.

3.    For every predictor identified, visualize its association with the target column using scatter plot.

4.    Split the data into train and test datasets, in the ratio of 67:33.

5.    Build a Linear Regression model, to predict the target variable using the selected predictors.

6.    Observe the coefficients and intercept values for the model. 

7.    Evaluate the model using mean squared error values, R-squared values, and adjusted R-Squared values, on the train and the test data.

"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


boston_housing = pd.read_csv("/Users/thushan/Downloads/datasets/boston_housing.csv")

x = boston_housing[["RM","DIS","TAX","INDUS"]]
y = boston_housing["MEDV"]

np.corrcoef(boston_housing["RM"], boston_housing["DIS"])

from statsmodels.stats.outliers_influence import variance_inflation_factor
#calculating the VIF for each attributes
vif = pd.Series([variance_inflation_factor(x.values,idx) 
           for idx in range(x.shape[1])],
          index=x.columns)
print(vif)

# RM and TAX are too high, use DIS and INDUS as the predictor variables

#plt.scatter(boston_housing["DIS"], boston_housing["MEDV"], color = "blue", marker = ".")
#plt.scatter(boston_housing["INDUS"], boston_housing["MEDV"], color = "blue", marker = ".")

x_new = boston_housing[["DIS","INDUS"]]
y = boston_housing["MEDV"]

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_new, y, train_size = 0.67, random_state = 0)

from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(x_train,y_train)

print("Model coefficient: ", model.coef_)
print("Model intercept: ", model.intercept_)

R2_train = model.score(x_train,y_train)
R2_test = model.score(x_test,y_test)

print("R2 train: ", R2_train)
print("R2 test: ", R2_test)

from sklearn.metrics import mean_squared_error

test_predictions = model.predict(x_test)
test_RMSE = mean_squared_error(y_test,test_predictions)**0.5

train_predictions = model.predict(x_train)
train_RMSE = mean_squared_error(y_train,train_predictions)**0.5

print("Train RMSE", train_RMSE)
print("Test RMSE", test_RMSE)


model1 = LinearRegression()

features = ["RM"]
target = ["MEDV"]
model1.fit(boston_housing[features],boston_housing[target])
print(model1.score(boston_housing[features],boston_housing[target]))

model2 = LinearRegression()
features = ["RM","TAX"]
target = ["MEDV"]
model2.fit(boston_housing[features],boston_housing[target])
print(model2.score(boston_housing[features],boston_housing[target]))

X = boston_housing[features]
y = boston_housing[target]
adjusted_rscore = 1 - (1-model2.score(X, y))*(len(y)-1)/(len(y)-X.shape[1]-1)
print("Adjusted R2 score:", adjusted_rscore)

RM       16.747965
DIS       6.880607
TAX      13.861510
INDUS     9.598374
dtype: float64
Model coefficient:  [-0.87622458 -0.8498507 ]
Model intercept:  35.540505419718116
R2 train:  0.27589603455671663
R2 test:  0.19052213277175478
Train RMSE 7.893119796595275
Test RMSE 8.078479851195771
0.4835254559913343
0.5605639377690896
Adjusted R2 score: 0.5588166770842748
