# Feature Selection Techniques

In [1]:
## Import pandas library

import pandas as pd

In [2]:
## Read the required dataset

df=pd.read_csv("C:/Users/INDIA/Downloads/day.csv")

In [3]:
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,01-01-2018,1,0,1,0,6,0,2,14.110847,18.18125,80.5833,10.749882,331,654,985
1,2,02-01-2018,1,0,1,0,0,0,2,14.902598,17.68695,69.6087,16.652113,131,670,801
2,3,03-01-2018,1,0,1,0,1,1,1,8.050924,9.47025,43.7273,16.636703,120,1229,1349
3,4,04-01-2018,1,0,1,0,2,1,1,8.2,10.6061,59.0435,10.739832,108,1454,1562
4,5,05-01-2018,1,0,1,0,3,1,1,9.305237,11.4635,43.6957,12.5223,82,1518,1600


## Data Cleaning

In [4]:
## Target variable
y=df['cnt']

### Remove the Target column and feature columns that does not help to predict Target variable

In [5]:
df=df.drop(['instant','dteday','casual','registered'],axis=1)

In [6]:
df.head()

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1,0,1,0,6,0,2,14.110847,18.18125,80.5833,10.749882,985
1,1,0,1,0,0,0,2,14.902598,17.68695,69.6087,16.652113,801
2,1,0,1,0,1,1,1,8.050924,9.47025,43.7273,16.636703,1349
3,1,0,1,0,2,1,1,8.2,10.6061,59.0435,10.739832,1562
4,1,0,1,0,3,1,1,9.305237,11.4635,43.6957,12.5223,1600


### Convert season value indexes to string 

In [7]:
df['season'].value_counts()

3    188
2    184
1    180
4    178
Name: season, dtype: int64

In [8]:
df['season']=df['season'].replace(1,'spring')
df['season']=df['season'].replace(2,'summer')
df['season']=df['season'].replace(3,'fall')
df['season']=df['season'].replace(4,'winter')

In [9]:
df['season'].value_counts()

fall      188
summer    184
spring    180
winter    178
Name: season, dtype: int64

### Convert Month value indexes to string

In [10]:
df['mnth'].value_counts()

12    62
10    62
8     62
7     62
5     62
3     62
1     62
11    60
9     60
6     60
4     60
2     56
Name: mnth, dtype: int64

In [11]:
def object_map(x):
    return x.map({1:'Jan',2:'Feb',3:'Mar',4:'Apr',5:'May',6:'June',7:'July',8:'Aug',9:'Sept',10:'Oct',11:'Nov',12:'Dec'})

In [12]:
df[['mnth']]=df[['mnth']].apply(object_map)

In [13]:
df['mnth'].value_counts()

Jan     62
May     62
Mar     62
Oct     62
July    62
Dec     62
Aug     62
Nov     60
Sept    60
Apr     60
June    60
Feb     56
Name: mnth, dtype: int64

### Convert weekday values indexes to string

In [14]:
df['weekday'].value_counts()

6    105
1    105
0    105
5    104
4    104
2    104
3    103
Name: weekday, dtype: int64

In [15]:
def str_map(x):
    return x.map({0:'Tues',1:'Wed',2:'Thur',3:'Fri',4:'Sat',5:'Sun',6:'Mon'})

In [16]:
df[['weekday']]=df[['weekday']].apply(str_map)

In [17]:
df['weekday'].value_counts()

Wed     105
Tues    105
Mon     105
Thur    104
Sat     104
Sun     104
Fri     103
Name: weekday, dtype: int64

### Convert weathersit values indexes to string

In [18]:
df['weathersit'].value_counts()

1    463
2    246
3     21
Name: weathersit, dtype: int64

In [19]:
df['weathersit']=df['weathersit'].replace(1,'clear')
df['weathersit']=df['weathersit'].replace(2,'cloudy')
df['weathersit']=df['weathersit'].replace(3,'light rain')
df['weathersit']=df['weathersit'].replace(4,'heavy rain')

In [20]:
df['weathersit'].value_counts()

clear         463
cloudy        246
light rain     21
Name: weathersit, dtype: int64

## Checking for missing values

In [21]:
df.isna().sum()

season        0
yr            0
mnth          0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
cnt           0
dtype: int64

# Machine Learning Workflow 

### In this analysis, i would like to build a regrssion models with scikit-learn as well as statsmodels

## 1. Extract Features

In [22]:
x=df.drop('cnt',axis=1)

In [23]:
x.head()

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed
0,spring,0,Jan,0,Mon,0,cloudy,14.110847,18.18125,80.5833,10.749882
1,spring,0,Jan,0,Tues,0,cloudy,14.902598,17.68695,69.6087,16.652113
2,spring,0,Jan,0,Wed,1,clear,8.050924,9.47025,43.7273,16.636703
3,spring,0,Jan,0,Thur,1,clear,8.2,10.6061,59.0435,10.739832
4,spring,0,Jan,0,Fri,1,clear,9.305237,11.4635,43.6957,12.5223


In [24]:
y=df['cnt']

### 1.1 Feature and target should not have null values

In [25]:
x.isna().sum()

season        0
yr            0
mnth          0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
dtype: int64

In [26]:
y.isna().sum()

0

### 1.2 Features should be numeric in nature

In [27]:
x.dtypes

season         object
yr              int64
mnth           object
holiday         int64
weekday        object
workingday      int64
weathersit     object
temp          float64
atemp         float64
hum           float64
windspeed     float64
dtype: object

Convert non-numeric values to numeric<br/>
In linear regression, i would like to apply dummy encoding to do feature selection 

In [28]:
string_columns=x.columns[x.dtypes.values=='object']

In [29]:
x=pd.get_dummies(x,columns=string_columns,drop_first=True)

In [30]:
x.dtypes

yr                         int64
holiday                    int64
workingday                 int64
temp                     float64
atemp                    float64
hum                      float64
windspeed                float64
season_spring              uint8
season_summer              uint8
season_winter              uint8
mnth_Aug                   uint8
mnth_Dec                   uint8
mnth_Feb                   uint8
mnth_Jan                   uint8
mnth_July                  uint8
mnth_June                  uint8
mnth_Mar                   uint8
mnth_May                   uint8
mnth_Nov                   uint8
mnth_Oct                   uint8
mnth_Sept                  uint8
weekday_Mon                uint8
weekday_Sat                uint8
weekday_Sun                uint8
weekday_Thur               uint8
weekday_Tues               uint8
weekday_Wed                uint8
weathersit_cloudy          uint8
weathersit_light rain      uint8
dtype: object

### 1.3 Features should be of the type array / Dataframe

In [31]:
type(x)

pandas.core.frame.DataFrame

### 1.4 Features should have some rows and columns

In [32]:
x.shape

(730, 29)

## Step2- Split the dataset into training and testing datasets

In [33]:
from sklearn.model_selection import train_test_split

In [34]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=23)

### 2.1 Features should be on the same scale

In [35]:
x.head()

Unnamed: 0,yr,holiday,workingday,temp,atemp,hum,windspeed,season_spring,season_summer,season_winter,...,mnth_Oct,mnth_Sept,weekday_Mon,weekday_Sat,weekday_Sun,weekday_Thur,weekday_Tues,weekday_Wed,weathersit_cloudy,weathersit_light rain
0,0,0,0,14.110847,18.18125,80.5833,10.749882,1,0,0,...,0,0,1,0,0,0,0,0,1,0
1,0,0,0,14.902598,17.68695,69.6087,16.652113,1,0,0,...,0,0,0,0,0,0,1,0,1,0
2,0,0,1,8.050924,9.47025,43.7273,16.636703,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,1,8.2,10.6061,59.0435,10.739832,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,1,9.305237,11.4635,43.6957,12.5223,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
## All the features are not in the same scale so we'll do feature scaling

from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()

In [37]:
## Convert x_train and x_test from arrays to dataframe so that the indexes will be the same for x nd y

x_train[x_train.columns]=scaler.fit_transform(x_train[x_train.columns])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train[x_train.columns]=scaler.fit_transform(x_train[x_train.columns])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())


In [38]:
x_train[x_train.columns].head(2)

Unnamed: 0,yr,holiday,workingday,temp,atemp,hum,windspeed,season_spring,season_summer,season_winter,...,mnth_Oct,mnth_Sept,weekday_Mon,weekday_Sat,weekday_Sun,weekday_Thur,weekday_Tues,weekday_Wed,weathersit_cloudy,weathersit_light rain
368,1.0,0.0,1.0,0.013227,0.025139,0.426306,0.334607,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
575,1.0,0.0,1.0,0.828813,0.82947,0.687661,0.270528,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [39]:
x_test[x_test.columns]=scaler.transform(x_test[x_test.columns])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test[x_test.columns]=scaler.transform(x_test[x_test.columns])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())


In [40]:
x_test[x_test.columns].head(2)

Unnamed: 0,yr,holiday,workingday,temp,atemp,hum,windspeed,season_spring,season_summer,season_winter,...,mnth_Oct,mnth_Sept,weekday_Mon,weekday_Sat,weekday_Sun,weekday_Thur,weekday_Tues,weekday_Wed,weathersit_cloudy,weathersit_light rain
476,1.0,0.0,0.0,0.391581,0.409305,0.859041,0.664138,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
662,1.0,0.0,1.0,0.592206,0.608641,0.823051,0.210244,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


## Step-3 Train the model on the training dataset

In [41]:
## Start with RFE [Recursive Feature Elimintion]
## i will give random number of 15 and see how the model perform

In [42]:
## import RFE library

from sklearn.feature_selection import RFE

In [43]:
## RFE works on a model 
## Use linear regression model 

from sklearn.linear_model import LinearRegression

In [44]:
lm=LinearRegression()
rfe=RFE(lm,15)
rfe.fit(x_train,y_train)



RFE(estimator=LinearRegression(), n_features_to_select=15)

In [45]:
## To know which of the columns that have included in the top 15

rfe.support_

array([ True,  True, False,  True,  True,  True,  True,  True, False,
        True, False,  True, False, False,  True, False, False, False,
        True, False,  True,  True, False, False, False, False, False,
        True,  True])

In [46]:
## import statsmodels library

In [47]:
import statsmodels.api as sm

In [48]:
col=x_train.columns[rfe.support_]  ## Gives the column names

In [49]:
col

Index(['yr', 'holiday', 'temp', 'atemp', 'hum', 'windspeed', 'season_spring',
       'season_winter', 'mnth_Dec', 'mnth_July', 'mnth_Nov', 'mnth_Sept',
       'weekday_Mon', 'weathersit_cloudy', 'weathersit_light rain'],
      dtype='object')

In [50]:
x_train_rfe=x_train[col]  ## fetch only those columns from x_train

In [51]:
x_train_rfe.head()

Unnamed: 0,yr,holiday,temp,atemp,hum,windspeed,season_spring,season_winter,mnth_Dec,mnth_July,mnth_Nov,mnth_Sept,weekday_Mon,weathersit_cloudy,weathersit_light rain
368,1.0,0.0,0.013227,0.025139,0.426306,0.334607,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
575,1.0,0.0,0.828813,0.82947,0.687661,0.270528,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
604,1.0,0.0,0.825542,0.802542,0.637532,0.34744,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
114,0.0,0.0,0.666351,0.672362,0.798629,0.335912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
663,1.0,0.0,0.586754,0.598776,0.829905,0.226926,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [52]:
## I have to give intercept in statsmodel so add constant value 

x_train_rfe=sm.add_constant(x_train_rfe)

In [53]:
x_train_rfe.head()

Unnamed: 0,const,yr,holiday,temp,atemp,hum,windspeed,season_spring,season_winter,mnth_Dec,mnth_July,mnth_Nov,mnth_Sept,weekday_Mon,weathersit_cloudy,weathersit_light rain
368,1.0,1.0,0.0,0.013227,0.025139,0.426306,0.334607,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
575,1.0,1.0,0.0,0.828813,0.82947,0.687661,0.270528,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
604,1.0,1.0,0.0,0.825542,0.802542,0.637532,0.34744,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
114,1.0,0.0,0.0,0.666351,0.672362,0.798629,0.335912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
663,1.0,1.0,0.0,0.586754,0.598776,0.829905,0.226926,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [54]:
## Perform Linear Regression

lm=sm.OLS(y_train,x_train_rfe).fit()
lm.summary()

0,1,2,3
Dep. Variable:,cnt,R-squared:,0.852
Model:,OLS,Adj. R-squared:,0.847
Method:,Least Squares,F-statistic:,189.3
Date:,"Thu, 24 Jun 2021",Prob (F-statistic):,6.9e-194
Time:,10:30:40,Log-Likelihood:,-4117.9
No. Observations:,511,AIC:,8268.0
Df Residuals:,495,BIC:,8336.0
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3358.9564,282.677,11.883,0.000,2803.563,3914.350
yr,2014.7603,70.505,28.576,0.000,1876.234,2153.287
holiday,-867.9570,229.073,-3.789,0.000,-1318.032,-417.882
temp,-1883.1971,1947.001,-0.967,0.334,-5708.602,1942.207
atemp,6043.3740,2008.999,3.008,0.003,2096.156,9990.592
hum,-1987.3596,336.921,-5.899,0.000,-2649.332,-1325.387
windspeed,-1420.6034,240.042,-5.918,0.000,-1892.230,-948.977
season_spring,-985.5302,133.006,-7.410,0.000,-1246.857,-724.204
season_winter,722.3095,115.348,6.262,0.000,495.678,948.941

0,1,2,3
Omnibus:,84.767,Durbin-Watson:,2.07
Prob(Omnibus):,0.0,Jarque-Bera (JB):,207.595
Skew:,-0.849,Prob(JB):,8.339999999999999e-46
Kurtosis:,5.621,Cond. No.,133.0


In [55]:
## Drop the constant from x_train_rfe because it is not a feature, we added it

a=x_train_rfe.drop('const',axis=1)

In [56]:
a.columns

Index(['yr', 'holiday', 'temp', 'atemp', 'hum', 'windspeed', 'season_spring',
       'season_winter', 'mnth_Dec', 'mnth_July', 'mnth_Nov', 'mnth_Sept',
       'weekday_Mon', 'weathersit_cloudy', 'weathersit_light rain'],
      dtype='object')

# VIF (Variance Inflation Factor)

In [57]:
## import VIF library from statsmodels

from statsmodels.stats.outliers_influence import variance_inflation_factor

In [58]:
## Calculate VIF

## I would like to use a list comprehension just to make a code concise

In [59]:
vif=pd.DataFrame()
vif["Features"]=a.columns
vif["VIF"]=[ variance_inflation_factor(a.values,i) for i in range(a.shape[1])]
vif["VIF"]= round(vif["VIF"],2)
vif.sort_values(by = "VIF",ascending=False)

Unnamed: 0,Features,VIF
3,atemp,1087.13
2,temp,1022.93
4,hum,25.64
5,windspeed,4.74
7,season_winter,2.87
6,season_spring,2.74
13,weathersit_cloudy,2.47
0,yr,2.05
10,mnth_Nov,1.79
9,mnth_July,1.42


In [60]:
## Drop the column that is havong both bad p-Value and bad VIF value

a.drop('temp',axis=1,inplace=True)
col=a.columns

In [61]:
x_train_rfe=x_train[col]
x_train_rfe=sm.add_constant(x_train_rfe)
lm=sm.OLS(y_train,x_train_rfe).fit()
lm.summary()

0,1,2,3
Dep. Variable:,cnt,R-squared:,0.851
Model:,OLS,Adj. R-squared:,0.847
Method:,Least Squares,F-statistic:,202.8
Date:,"Thu, 24 Jun 2021",Prob (F-statistic):,7.570000000000001e-195
Time:,10:30:41,Log-Likelihood:,-4118.4
No. Observations:,511,AIC:,8267.0
Df Residuals:,496,BIC:,8330.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3401.9633,279.139,12.187,0.000,2853.522,3950.405
yr,2015.7872,70.493,28.596,0.000,1877.286,2154.288
holiday,-874.9112,228.945,-3.821,0.000,-1324.733,-425.089
atemp,4118.1201,272.120,15.133,0.000,3583.471,4652.769
hum,-1964.5352,336.072,-5.846,0.000,-2624.836,-1304.235
windspeed,-1473.3277,233.755,-6.303,0.000,-1932.600,-1014.055
season_spring,-980.3799,132.891,-7.377,0.000,-1241.479,-719.281
season_winter,742.3217,113.469,6.542,0.000,519.382,965.262
mnth_Dec,-462.2108,140.731,-3.284,0.001,-738.713,-185.709

0,1,2,3
Omnibus:,82.635,Durbin-Watson:,2.081
Prob(Omnibus):,0.0,Jarque-Bera (JB):,202.074
Skew:,-0.829,Prob(JB):,1.32e-44
Kurtosis:,5.596,Cond. No.,19.4


In [62]:
vif=pd.DataFrame()
vif["Features"]=a.columns
vif["VIF"]=[ variance_inflation_factor(a.values,i) for i in range(a.shape[1])]
vif["VIF"]= round(vif["VIF"],2)
vif.sort_values(by = "VIF",ascending=False)

Unnamed: 0,Features,VIF
3,hum,24.43
2,atemp,16.24
4,windspeed,4.61
6,season_winter,2.74
5,season_spring,2.7
12,weathersit_cloudy,2.44
0,yr,2.05
9,mnth_Nov,1.78
7,mnth_Dec,1.39
8,mnth_July,1.39


In [63]:
a.drop("hum",axis=1,inplace=True)
col=a.columns

In [64]:
x_train_rfe=x_train[col]
x_train_rfe=sm.add_constant(x_train_rfe)
lm=sm.OLS(y_train,x_train_rfe).fit()
lm.summary()

0,1,2,3
Dep. Variable:,cnt,R-squared:,0.841
Model:,OLS,Adj. R-squared:,0.837
Method:,Least Squares,F-statistic:,202.3
Date:,"Thu, 24 Jun 2021",Prob (F-statistic):,7.269999999999999e-189
Time:,10:30:41,Log-Likelihood:,-4135.4
No. Observations:,511,AIC:,8299.0
Df Residuals:,497,BIC:,8358.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2351.2692,220.581,10.659,0.000,1917.883,2784.656
yr,2085.1362,71.769,29.054,0.000,1944.129,2226.143
holiday,-942.8654,236.157,-3.993,0.000,-1406.854,-478.877
atemp,3714.8573,271.872,13.664,0.000,3180.697,4249.018
windspeed,-1093.6432,231.922,-4.716,0.000,-1549.312,-637.974
season_spring,-1050.4138,136.695,-7.684,0.000,-1318.985,-781.842
season_winter,658.1912,116.248,5.662,0.000,429.792,886.590
mnth_Dec,-513.6262,145.067,-3.541,0.000,-798.646,-228.606
mnth_July,-607.9282,151.066,-4.024,0.000,-904.735,-311.121

0,1,2,3
Omnibus:,86.67,Durbin-Watson:,2.121
Prob(Omnibus):,0.0,Jarque-Bera (JB):,216.718
Skew:,-0.86,Prob(JB):,8.72e-48
Kurtosis:,5.687,Cond. No.,14.9


In [65]:
vif=pd.DataFrame()
vif["Features"]=a.columns
vif["VIF"]=[ variance_inflation_factor(a.values,i) for i in range(a.shape[1])]
vif["VIF"]= round(vif["VIF"],2)
vif.sort_values(by = "VIF",ascending=False)

Unnamed: 0,Features,VIF
2,atemp,5.07
3,windspeed,4.54
5,season_winter,2.39
0,yr,2.04
4,season_spring,1.77
8,mnth_Nov,1.74
11,weathersit_cloudy,1.54
7,mnth_July,1.35
6,mnth_Dec,1.34
9,mnth_Sept,1.2


In [66]:
a.drop("atemp",axis=1,inplace=True)
col=a.columns

In [67]:
x_train_rfe=x_train[col]
x_train_rfe=sm.add_constant(x_train_rfe)
lm=sm.OLS(y_train,x_train_rfe).fit()
lm.summary()

0,1,2,3
Dep. Variable:,cnt,R-squared:,0.781
Model:,OLS,Adj. R-squared:,0.776
Method:,Least Squares,F-statistic:,148.3
Date:,"Thu, 24 Jun 2021",Prob (F-statistic):,1.03e-155
Time:,10:30:42,Log-Likelihood:,-4216.9
No. Observations:,511,AIC:,8460.0
Df Residuals:,498,BIC:,8515.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4946.9590,131.362,37.659,0.000,4688.867,5205.051
yr,2165.7646,83.807,25.842,0.000,2001.105,2330.424
holiday,-919.6176,276.700,-3.324,0.001,-1463.260,-375.975
windspeed,-1596.6808,268.300,-5.951,0.000,-2123.820,-1069.542
season_spring,-2387.6081,111.826,-21.351,0.000,-2607.316,-2167.900
season_winter,169.1291,129.592,1.305,0.192,-85.486,423.744
mnth_Dec,-951.4626,165.778,-5.739,0.000,-1277.173,-625.753
mnth_July,90.9438,166.551,0.546,0.585,-236.286,418.174
mnth_Nov,-928.3182,179.931,-5.159,0.000,-1281.835,-574.801

0,1,2,3
Omnibus:,47.962,Durbin-Watson:,2.062
Prob(Omnibus):,0.0,Jarque-Bera (JB):,112.0
Skew:,-0.503,Prob(JB):,4.78e-25
Kurtosis:,5.061,Cond. No.,10.9


In [68]:
vif=pd.DataFrame()
vif["Features"]=a.columns
vif["VIF"]=[ variance_inflation_factor(a.values,i) for i in range(a.shape[1])]
vif["VIF"]= round(vif["VIF"],2)
vif.sort_values(by = "VIF",ascending=False)

Unnamed: 0,Features,VIF
2,windspeed,2.68
4,season_winter,2.37
0,yr,1.79
7,mnth_Nov,1.72
3,season_spring,1.59
10,weathersit_cloudy,1.48
5,mnth_Dec,1.33
9,weekday_Mon,1.16
8,mnth_Sept,1.13
6,mnth_July,1.11


In [69]:
best_features=vif["Features"]

In [70]:
best_features.values

array(['yr', 'holiday', 'windspeed', 'season_spring', 'season_winter',
       'mnth_Dec', 'mnth_July', 'mnth_Nov', 'mnth_Sept', 'weekday_Mon',
       'weathersit_cloudy', 'weathersit_light rain'], dtype=object)

## "best_features.values" are the best columns that we have selected using feature selection techniques<br/>
## These are the best features that will help to predict our Target