# Handling multicollinearity with VIF

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [2]:
data = pd.read_csv('./Concrete_Data.csv')

In [3]:
data.head(10)

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3
5,266.0,114.0,0.0,228.0,0.0,932.0,670.0,90,47.03
6,380.0,95.0,0.0,228.0,0.0,932.0,594.0,365,43.7
7,380.0,95.0,0.0,228.0,0.0,932.0,594.0,28,36.45
8,266.0,114.0,0.0,228.0,0.0,932.0,670.0,28,45.85
9,475.0,0.0,0.0,228.0,0.0,932.0,594.0,28,39.29


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   cement            1030 non-null   float64
 1   slag              1030 non-null   float64
 2   flyash            1030 non-null   float64
 3   water             1030 non-null   float64
 4   superplasticizer  1030 non-null   float64
 5   coarseaggregate   1030 non-null   float64
 6   fineaggregate     1030 non-null   float64
 7   age               1030 non-null   int64  
 8   csMPa             1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB


In [5]:
data.describe()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


In [6]:
data.isnull().mean()*100

cement              0.0
slag                0.0
flyash              0.0
water               0.0
superplasticizer    0.0
coarseaggregate     0.0
fineaggregate       0.0
age                 0.0
csMPa               0.0
dtype: float64

In [7]:
data.duplicated().sum()

25

In [8]:
data.drop_duplicates(inplace=True)

In [9]:
data.columns

Index(['cement', 'slag', 'flyash', 'water', 'superplasticizer',
       'coarseaggregate', 'fineaggregate', 'age', 'csMPa'],
      dtype='object')

In [10]:
data.shape

(1005, 9)

#### Extract the independent variables/features into a separate DataFrame

In [11]:
x = data[['cement', 'slag', 'flyash', 'water', 'superplasticizer',
       'coarseaggregate', 'fineaggregate', 'age']]

#### Add a constant to the independent variables DataFrame

In [12]:
x = sm.add_constant(x)

In [13]:
x

Unnamed: 0,const,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age
0,1.0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28
1,1.0,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28
2,1.0,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270
3,1.0,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365
4,1.0,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360
...,...,...,...,...,...,...,...,...,...
1025,1.0,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28
1026,1.0,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28
1027,1.0,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28
1028,1.0,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28


#### Calculate VIF

In [14]:
vif = pd.DataFrame()

In [15]:
vif["Variable"] = x.columns

In [16]:
vif

Unnamed: 0,Variable
0,const
1,cement
2,slag
3,flyash
4,water
5,superplasticizer
6,coarseaggregate
7,fineaggregate
8,age


In [17]:
vif["VIF"] = [sm.OLS(x[col], x.drop(col, axis=1)).fit().rsquared for col in x.columns]

In [18]:
vif

Unnamed: 0,Variable,VIF
0,const,0.999849
1,cement,0.865754
2,slag,0.862321
3,flyash,0.835649
4,water,0.853769
5,superplasticizer,0.651244
6,coarseaggregate,0.798233
7,fineaggregate,0.856575
8,age,0.107233


<li> The code above calculates VIF for each independent variable by fitting a simple linear regression model for each variable while excluding it and checking the coefficient of determination (the r-squared value). Higher VIF values indicate higher collinearity.