### What is covered?
#### 1. VIF: Details in Week#3:Linear Regression Assumtions
#### 2. Evaluation Matrix for Regression Algorithms: Details in Week#5

In [1]:
#Load Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
#Load Dataset
data=pd.read_csv('./mtcars.csv')
data.head()

Unnamed: 0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [3]:
#Drop Model Column
data2=data.drop('model',axis=1)
data2.head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [4]:
#VIF Factors
from statsmodels.stats.outliers_influence import variance_inflation_factor

df = data2._get_numeric_data() #This line will drop non-numeric cols
x = data2.drop('mpg',axis=1)

# For each X, calculate VIF and save in dataframe
vif = pd.DataFrame()
vif['VIF Factor'] = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]
vif['features'] = x.columns
vif.sort_values('VIF Factor')

Unnamed: 0,VIF Factor,features
7,7.41202,am
6,8.752581,vs
9,32.213836,carb
2,56.047781,hp
1,98.930791,disp
0,112.629828,cyl
8,119.804879,gear
3,132.214353,drat
4,182.948049,wt
5,317.534376,qsec


In [5]:
#Remove high VIF Variables (greater than 100) and Run Multivariate Regression

#Define x and y variable
x2 = data2.drop(['mpg','cyl','gear','drat','wt','qsec'],axis=1).to_numpy()
y2 = data2['mpg'].to_numpy()

#Create Train and Test Datasets
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x2,y2,test_size=0.2,random_state=100)

#Scale the Data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train2 = sc.fit_transform(x_train)
x_test2 = sc.transform(x_test)

#Model
from sklearn.linear_model import LinearRegression

In [6]:
#Create Standard Model - Removing VIF Variable

from sklearn import metrics

for name,method in [('Linear regression', LinearRegression())]: 
    method.fit(x_train2,y_train)
    predict = method.predict(x_test2)

print('\n Regression Model - using VIF information')
print('\nMethod: {}'.format(name))   

#Coefficents
print('\nIntercept: {:.2f}'.format(float(method.intercept_)))
coeff_table=pd.DataFrame(np.transpose(method.coef_),
                         data2.drop(['mpg','cyl','gear','drat','wt','qsec'],axis=1).columns,
                         columns=['Coefficients'])
print(coeff_table)
    
#R2,MAE,MSE and RMSE
print('\nR2: {:.2f}'.format(metrics.r2_score(y_test,predict)))
adjusted_r_squared = 1-(1-metrics.r2_score(y_test,predict))*(len(y2)-1)/(len(y2)-x2.shape[1]-1)
print('Adj_R2: {:0.2f}'.format(adjusted_r_squared))
print('Mean Absolute Error: {:.2f}'.format(metrics.mean_absolute_error(y_test, predict)))  
print('Mean Squared Error: {:.2f}'.format(metrics.mean_squared_error(y_test, predict)))  
print('Root Mean Squared Error: {:.2f}'.format(np.sqrt(metrics.mean_squared_error(y_test, predict)))) 


 Regression Model - using VIF information

Method: Linear regression

Intercept: 19.87
      Coefficients
disp     -2.281568
hp       -0.136462
vs        0.713722
am        1.982224
carb     -2.017801

R2: 0.83
Adj_R2: 0.79
Mean Absolute Error: 1.78
Mean Squared Error: 4.55
Root Mean Squared Error: 2.13


### Try it
Use all the features in the dataset and check the evaluation matrix.