<a href="https://colab.research.google.com/github/yohanesnuwara/66DaysOfData/blob/main/D03_Multicollinearity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multicollinearity

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

Access Australian rain dataset.

In [None]:
!wget 'https://raw.githubusercontent.com/yohanesnuwara/datasets/master/weatherAUS.csv'

# Read CSV
df = pd.read_csv('/content/weatherAUS.csv')

df.head()

--2021-07-06 03:48:31--  https://raw.githubusercontent.com/yohanesnuwara/datasets/master/weatherAUS.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14094055 (13M) [text/plain]
Saving to: ‘weatherAUS.csv’


2021-07-06 03:48:33 (35.3 MB/s) - ‘weatherAUS.csv’ saved [14094055/14094055]



Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,WNW,20.0,24.0,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,WSW,4.0,22.0,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,WSW,19.0,26.0,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,E,11.0,9.0,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,NW,7.0,20.0,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [None]:
# Drop few columns: the first two columns are skipped by position and the
# categorical wind-direction columns by name; rows containing any missing
# value are then removed and the index is renumbered from 0.
category = ['WindGustDir', 'WindDir9am', 'WindDir3pm']
df = df.iloc[:,2:].drop(category, axis=1).dropna().reset_index(drop=True)

# Feature and target
# X is everything except the last two columns. The explicit .copy() gives X
# its own data, so columns added to X later are not written onto a slice of
# df (which is what triggers pandas' SettingWithCopyWarning).
X = df.iloc[:,:-2].copy()
# y is the last column of the cleaned frame.
y = df.iloc[:,-1]

X.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
0,17.9,35.2,0.0,12.0,12.3,48.0,6.0,20.0,20.0,13.0,1006.3,1004.4,2.0,5.0,26.6,33.4
1,18.4,28.9,0.0,14.8,13.0,37.0,19.0,19.0,30.0,8.0,1012.9,1012.1,1.0,1.0,20.3,27.0
2,19.4,37.6,0.0,10.8,10.6,46.0,30.0,15.0,42.0,22.0,1012.3,1009.2,1.0,6.0,28.7,34.9
3,21.9,38.4,0.0,11.4,12.2,31.0,6.0,6.0,37.0,22.0,1012.7,1009.1,1.0,5.0,29.1,35.6
4,24.2,41.0,0.0,11.2,8.4,35.0,17.0,13.0,19.0,15.0,1010.7,1007.4,1.0,6.0,33.6,37.6


Calculate the Variance Inflation Factor (VIF) to identify which features are multicollinear. The higher a feature's VIF, the more strongly it is collinear with the other features.

In [None]:
# Variance Inflation Factor (VIF) of every feature column.
# The raw feature matrix is passed as-is; no constant column is added to it.
feature_matrix = X.values
vif_scores = [
    variance_inflation_factor(feature_matrix, col_idx)
    for col_idx in range(feature_matrix.shape[1])
]

# One row per feature, shown from the highest VIF down
vif_info = pd.DataFrame({'VIF': vif_scores, 'Column': X.columns})
vif_info.sort_values('VIF', ascending=False)

Unnamed: 0,VIF,Column
10,425849.003879,Pressure9am
11,424035.416584,Pressure3pm
15,673.638775,Temp3pm
1,608.073689,MaxTemp
14,208.616616,Temp9am
8,60.741577,Humidity9am
0,57.81633,MinTemp
9,47.893909,Humidity3pm
5,26.320937,WindGustSpeed
4,17.288336,Sunshine


Removing multicollinearity by engineering features.

In [None]:
# Engineering features: for each quantity measured at both 9am and 3pm,
# add a '<name>Diff' column holding the 3pm reading minus the 9am reading.
for quantity in ['Temp', 'Humidity', 'Cloud', 'WindSpeed', 'Pressure']:
    X[quantity + 'Diff'] = X[quantity + '3pm'] - X[quantity + '9am']

X.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,TempDiff,HumidityDiff,CloudDiff,WindSpeedDiff,PressureDiff
0,17.9,35.2,0.0,12.0,12.3,48.0,6.0,20.0,20.0,13.0,1006.3,1004.4,2.0,5.0,26.6,33.4,6.8,-7.0,3.0,14.0,-1.9
1,18.4,28.9,0.0,14.8,13.0,37.0,19.0,19.0,30.0,8.0,1012.9,1012.1,1.0,1.0,20.3,27.0,6.7,-22.0,0.0,0.0,-0.8
2,19.4,37.6,0.0,10.8,10.6,46.0,30.0,15.0,42.0,22.0,1012.3,1009.2,1.0,6.0,28.7,34.9,6.2,-20.0,5.0,-15.0,-3.1
3,21.9,38.4,0.0,11.4,12.2,31.0,6.0,6.0,37.0,22.0,1012.7,1009.1,1.0,5.0,29.1,35.6,6.5,-15.0,4.0,0.0,-3.6
4,24.2,41.0,0.0,11.2,8.4,35.0,17.0,13.0,19.0,15.0,1010.7,1007.4,1.0,6.0,33.6,37.6,4.0,-4.0,5.0,-4.0,-3.3


In [None]:
# Remove the raw 9am/3pm readings from the feature table
paired_readings = [
    'Temp3pm', 'Temp9am',
    'Humidity3pm', 'Humidity9am',
    'Cloud3pm', 'Cloud9am',
    'WindSpeed3pm', 'WindSpeed9am',
    'Pressure3pm', 'Pressure9am',
]
X = X.drop(columns=paired_readings)

X.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,TempDiff,HumidityDiff,CloudDiff,WindSpeedDiff,PressureDiff
0,17.9,35.2,0.0,12.0,12.3,48.0,6.8,-7.0,3.0,14.0,-1.9
1,18.4,28.9,0.0,14.8,13.0,37.0,6.7,-22.0,0.0,0.0,-0.8
2,19.4,37.6,0.0,10.8,10.6,46.0,6.2,-20.0,5.0,-15.0,-3.1
3,21.9,38.4,0.0,11.4,12.2,31.0,6.5,-15.0,4.0,0.0,-3.6
4,24.2,41.0,0.0,11.2,8.4,35.0,4.0,-4.0,5.0,-4.0,-3.3


In [None]:
# Recompute the Variance Inflation Factor (VIF) on the current feature set.
# The raw feature matrix is passed as-is; no constant column is added to it.
feature_matrix = X.values
vif_scores = [
    variance_inflation_factor(feature_matrix, col_idx)
    for col_idx in range(feature_matrix.shape[1])
]

# One row per feature, shown from the highest VIF down
vif_info = pd.DataFrame({'VIF': vif_scores, 'Column': X.columns})
vif_info.sort_values('VIF', ascending=False)

Unnamed: 0,VIF,Column
1,91.131365,MaxTemp
0,34.935952,MinTemp
6,14.507768,TempDiff
4,8.94055,Sunshine
5,6.924693,WindGustSpeed
3,5.858551,Evaporation
7,5.583406,HumidityDiff
10,3.786961,PressureDiff
9,1.388523,WindSpeedDiff
2,1.217792,Rainfall


Eliminate features with VIF ≥ 7, keeping only those with VIF < 7.

In [None]:
# Names of the features whose VIF is below 7
low_vif_mask = vif_info['VIF'] < 7
list_column_vif = vif_info.loc[low_vif_mask, 'Column'].values

# Keep only those features in X
X = X[list_column_vif]

X.head()

Unnamed: 0,Rainfall,Evaporation,WindGustSpeed,HumidityDiff,CloudDiff,WindSpeedDiff,PressureDiff
0,0.0,12.0,48.0,-7.0,3.0,14.0,-1.9
1,0.0,14.8,37.0,-22.0,0.0,0.0,-0.8
2,0.0,10.8,46.0,-20.0,5.0,-15.0,-3.1
3,0.0,11.4,31.0,-15.0,4.0,0.0,-3.6
4,0.0,11.2,35.0,-4.0,5.0,-4.0,-3.3


Machine learning using these features: train an SVM classifier to predict the Yes/No value of 'RainTomorrow'.

In [None]:
# Encode target labels as integers; classes_ shows which label maps to which code
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)
print(encoder.classes_)
print(y_encoded)

# Train test split
# A fixed random_state makes the split, and therefore the score, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X.values, y_encoded, random_state=42
)

# Fit with SVC model
# Fit on the training portion only: fitting on the full data would let the
# model see the test rows and make the score below an optimistic estimate.
model = SVC()
model.fit(X_train, y_train)

# Accuracy on the held-out test rows
print(model.score(X_test, y_test))

['No' 'Yes']
[0 0 0 ... 0 0 0]
0.8220064724919094


References:
* https://towardsdatascience.com/how-to-remove-multicollinearity-using-python-4da8d9d8abb2