<h2><i>Variance Inflation Factor (VIF) Calculation</i></h2>

In [1]:
# importing required libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Read the bank dataset from the local CSV file and preview its first five rows.
# NOTE(review): the path is absolute and machine-specific; it will only resolve
# on a machine that has the file at this exact location.
data = pd.read_csv(
    r"C:\Users\venka\OneDrive\Documents\PROJECTS\bank_dataset.csv"
)
data.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,Target
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [3]:
# Split the frame into the response column (y) and the remaining predictors (x).
y = data['Target']
x = data.drop('Target', axis=1)

In [4]:
# Separate the predictors by dtype: this cell keeps the object-dtype
# (categorical) columns; the numeric columns are selected in the next cell.
df_c = x.select_dtypes(include=["object"])
df_c

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome
0,management,married,tertiary,no,yes,no,unknown,may,unknown
1,technician,single,secondary,no,yes,no,unknown,may,unknown
2,entrepreneur,married,secondary,no,yes,yes,unknown,may,unknown
3,blue-collar,married,unknown,no,yes,no,unknown,may,unknown
4,unknown,single,unknown,no,no,no,unknown,may,unknown
...,...,...,...,...,...,...,...,...,...
45206,technician,married,tertiary,no,no,no,cellular,nov,unknown
45207,retired,divorced,primary,no,no,no,cellular,nov,unknown
45208,retired,married,secondary,no,no,no,cellular,nov,success
45209,blue-collar,married,secondary,no,no,no,telephone,nov,unknown


In [5]:
# Keep only the numeric-dtype predictors.
df_n = x.select_dtypes(include=["number"])
df_n

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
0,58,2143,5,261,1,-1,0
1,44,29,5,151,1,-1,0
2,33,2,5,76,1,-1,0
3,47,1506,5,92,1,-1,0
4,33,1,5,198,1,-1,0
...,...,...,...,...,...,...,...
45206,51,825,17,977,3,-1,0
45207,71,1729,17,456,2,-1,0
45208,72,5715,17,1127,5,184,3
45209,57,668,17,508,4,-1,0


In [6]:
# One-hot encode the categorical columns held in df_c.
# drop_first=True drops the first level of each variable, leaving k-1 dummy
# columns for a variable with k levels.
df = pd.get_dummies(df_c, drop_first=True)
df

Unnamed: 0,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
45207,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
45208,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
45209,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1


In [7]:
# Place the dummy columns (df) and the numeric columns (df_n) side by side
# (axis=1) to form the predictor matrix used for the VIF computation.
result = pd.concat([df, df_n], axis = 1)
result

Unnamed: 0,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,...,poutcome_other,poutcome_success,poutcome_unknown,age,balance,day,duration,campaign,pdays,previous
0,0,0,0,1,0,0,0,0,0,0,...,0,0,1,58,2143,5,261,1,-1,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,1,44,29,5,151,1,-1,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,1,33,2,5,76,1,-1,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,1,47,1506,5,92,1,-1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,33,1,5,198,1,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,0,0,0,0,0,0,0,0,1,0,...,0,0,1,51,825,17,977,3,-1,0
45207,0,0,0,0,1,0,0,0,0,0,...,0,0,1,71,1729,17,456,2,-1,0
45208,0,0,0,0,1,0,0,0,0,0,...,0,1,0,72,5715,17,1127,5,184,3
45209,1,0,0,0,0,0,0,0,0,0,...,0,0,1,57,668,17,508,4,-1,0


In [8]:
# Start from an empty DataFrame; its columns are assigned in the following cells.
vif = pd.DataFrame()

In [9]:
# Store the predictor column names of `result` in the "Variable" column.
vif["Variable"] = result.columns

In [10]:
# Calculate the VIF of every predictor and store it in the "VIF" column.
# The matrix is converted to float once, up front, for two reasons:
#   * newer pandas versions emit boolean dummies, and a bool/int mix makes
#     `result.values` an object array, which the least-squares fit inside
#     variance_inflation_factor typically rejects;
#   * `result.values` would otherwise be rebuilt on every loop iteration.
# NOTE(review): no intercept column is added, so each auxiliary regression is
# fit without a constant term -- confirm this is the intended VIF definition.
design_matrix = result.to_numpy(dtype=float)
vif["VIF"] = [variance_inflation_factor(design_matrix, i) for i in range(design_matrix.shape[1])]

In [11]:
# Display the table of predictor names and their VIF values.
vif

Unnamed: 0,Variable,VIF
0,job_blue-collar,2.974646
1,job_entrepreneur,1.334005
2,job_housemaid,1.297748
3,job_management,3.738496
4,job_retired,1.794999
5,job_self-employed,1.353591
6,job_services,1.754023
7,job_student,1.235343
8,job_technician,2.459455
9,job_unemployed,1.263647


The Variance Inflation Factor (VIF) measures the extent to which the variance of an estimated regression coefficient is increased due to multicollinearity. Higher VIF values suggest a higher degree of multicollinearity, which can make it challenging to interpret the effects of individual variables in a regression model.

Here are some inferences based on the provided VIF values:

Marital Status (marital_married): The VIF value of 5.824 suggests that there is a significant multicollinearity issue with the 'marital_married' variable. This high VIF indicates that this variable is highly correlated with other variables in the dataset.

Month of Contact (month_may): The VIF value of 6.255 suggests a significant multicollinearity issue with the 'month_may' variable. This high VIF indicates that the 'month_may' variable is highly correlated with other month-related variables.

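Age: The VIF value of 19.406 indicates a very high level of multicollinearity with the 'age' variable. This suggests that 'age' is highly correlated with other independent variables in the dataset. Note, however, that the VIFs above were computed on a design matrix without an intercept (constant) column, which can inflate the VIF of variables whose mean is far from zero, such as 'age'; this value should be re-checked with a constant term included before drawing conclusions.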

Default (default_yes): The VIF value of 1.034 suggests a low level of multicollinearity with the 'default_yes' variable. This indicates that 'default_yes' is not highly correlated with other variables in the dataset.

Previous Outcome (poutcome_unknown): The VIF value of 23.655 indicates a very high level of multicollinearity with the 'poutcome_unknown' variable. This suggests that 'poutcome_unknown' is highly correlated with other variables in the dataset.

Balance: The VIF value of 1.264 suggests a relatively low level of multicollinearity with the 'balance' variable.

Day: The VIF value of 5.899 suggests a significant multicollinearity issue with the 'day' variable. This high VIF indicates that 'day' is highly correlated with other variables in the dataset.

In general, variables with high VIF values (greater than 5-10) indicate a potential multicollinearity problem. It may be necessary to consider removing some of these highly correlated variables or applying dimensionality reduction techniques to address multicollinearity and improve the stability and interpretability of your regression model.

In [13]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calculate_vif(data_path, target_col="Target", add_constant=False):
    """Compute the Variance Inflation Factor (VIF) of every predictor in a CSV file.

    The file is read with ``pd.read_csv``, the target column is dropped,
    object-dtype columns are one-hot encoded (first level dropped), and the
    resulting dummies are combined with the numeric columns. One VIF is then
    computed per column of that combined matrix.

    Parameters
    ----------
    data_path : str or path-like
        Location of the CSV file to load.
    target_col : str, default "Target"
        Name of the response column to exclude from the predictors.
    add_constant : bool, default False
        When True, a column of ones is prepended to the design matrix before
        the VIFs are computed, so each auxiliary regression includes an
        intercept. The constant itself is not reported. The default (False)
        keeps the original behaviour of using the predictor matrix as-is.

    Returns
    -------
    pandas.DataFrame
        Two columns: "Variable" (predictor name) and "VIF" (its VIF value),
        one row per predictor, in the column order of the design matrix.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of the loaded file.
    """
    # Load the data from the specified path
    data = pd.read_csv(data_path)

    # Only the predictors enter the VIF computation, so the target is dropped.
    x = data.drop(columns=[target_col])

    # Separate the categorical (object-dtype) and numerical columns
    df_c = x.select_dtypes(include="object")
    df_n = x.select_dtypes(include="number")

    # Convert categorical variables into dummy variables
    df = pd.get_dummies(df_c, drop_first=True)

    # Concatenate dummy and numerical variables
    result = pd.concat([df, df_n], axis=1)

    # Cast to float once: newer pandas versions emit boolean dummies, and a
    # bool/int mix would otherwise produce an object array that the regression
    # inside variance_inflation_factor typically rejects. astype returns a new
    # frame, so `result` itself is left untouched.
    design = result.astype(float)
    offset = 0
    if add_constant:
        # allow_duplicates guards against a predictor that is already named "const".
        design.insert(0, "const", 1.0, allow_duplicates=True)
        offset = 1
    matrix = design.to_numpy()

    # Initialize a dataframe to store VIF values
    vif = pd.DataFrame()
    vif["Variable"] = result.columns

    # Calculate VIF for each predictor (skipping the constant, when present)
    vif["VIF"] = [
        variance_inflation_factor(matrix, i + offset)
        for i in range(result.shape[1])
    ]

    return vif

if __name__ == "__main__":
    # Run the helper on the local copy of the bank dataset and print the
    # resulting table as plain text under a heading line.
    data_path = r"C:\Users\venka\OneDrive\Documents\PROJECTS\bank_dataset.csv"
    vif_result = calculate_vif(data_path)
    print("VIF Values: ", vif_result, sep="\n")

VIF Values: 
               Variable        VIF
0       job_blue-collar   2.974646
1      job_entrepreneur   1.334005
2         job_housemaid   1.297748
3        job_management   3.738496
4           job_retired   1.794999
5     job_self-employed   1.353591
6          job_services   1.754023
7           job_student   1.235343
8        job_technician   2.459455
9        job_unemployed   1.263647
10          job_unknown   1.097503
11      marital_married   5.824164
12       marital_single   3.435989
13  education_secondary   4.767843
14   education_tertiary   4.433690
15    education_unknown   1.337650
16          default_yes   1.033826
17          housing_yes   3.126835
18             loan_yes   1.261312
19    contact_telephone   1.169101
20      contact_unknown   3.400850
21            month_aug   3.319335
22            month_dec   1.077775
23            month_feb   1.882490
24            month_jan   1.511183
25            month_jul   3.368762
26            month_jun   3.615882
27     