# P-value for feature reduction

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')
import statsmodels.api as sm
from sklearn.datasets import fetch_california_housing

## Fitting OLS model

In [2]:
x = fetch_california_housing() # Load the California Housing Dataset
y = x.target # price
X = pd.DataFrame(x.data, columns = x.feature_names)

X_added_constant = sm.add_constant(X) # we need to add this constant for the intercept w/ OLS
X_added_constant # input features for linear regression use OLS

Unnamed: 0,const,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,1.0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,1.0,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,1.0,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,1.0,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,1.0,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...,...
20635,1.0,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,1.0,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.0,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.0,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [3]:
model = sm.OLS(y,X_added_constant).fit() # ordinary least squares regression model from statsmodels
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.606
Model:,OLS,Adj. R-squared:,0.606
Method:,Least Squares,F-statistic:,3970.0
Date:,"Mon, 27 Nov 2023",Prob (F-statistic):,0.0
Time:,15:20:18,Log-Likelihood:,-22624.0
No. Observations:,20640,AIC:,45270.0
Df Residuals:,20631,BIC:,45340.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-36.9419,0.659,-56.067,0.000,-38.233,-35.650
MedInc,0.4367,0.004,104.054,0.000,0.428,0.445
HouseAge,0.0094,0.000,21.143,0.000,0.009,0.010
AveRooms,-0.1073,0.006,-18.235,0.000,-0.119,-0.096
AveBedrms,0.6451,0.028,22.928,0.000,0.590,0.700
Population,-3.976e-06,4.75e-06,-0.837,0.402,-1.33e-05,5.33e-06
AveOccup,-0.0038,0.000,-7.769,0.000,-0.005,-0.003
Latitude,-0.4213,0.007,-58.541,0.000,-0.435,-0.407
Longitude,-0.4345,0.008,-57.682,0.000,-0.449,-0.420

0,1,2,3
Omnibus:,4393.65,Durbin-Watson:,0.885
Prob(Omnibus):,0.0,Jarque-Bera (JB):,14087.596
Skew:,1.082,Prob(JB):,0.0
Kurtosis:,6.42,Cond. No.,238000.0


## Dropping Insignificant Features using p-value thresholding

In practice, p-value is used for feature selection in the following way: we set the significance level alpha to 0.05, so that when p<0.05, we reject the null hypothesis and consider the feature to be important. p-value in this context corresponds to the probability that a particular feature doesn't affect the model (null hypothesis is true).

In [4]:
alpha                = 0.05 # significance threshold
sig_param_names      = model.params[list(np.where(model.pvalues < alpha)[0])].iloc[0:].index.tolist() # list of names of signficant parameters
significant_features = X_added_constant[sig_param_names] # only select vars having p-value < 0.05
significant_features # print out the significant feature values

Unnamed: 0,const,MedInc,HouseAge,AveRooms,AveBedrms,AveOccup,Latitude,Longitude
0,1.0,8.3252,41.0,6.984127,1.023810,2.555556,37.88,-122.23
1,1.0,8.3014,21.0,6.238137,0.971880,2.109842,37.86,-122.22
2,1.0,7.2574,52.0,8.288136,1.073446,2.802260,37.85,-122.24
3,1.0,5.6431,52.0,5.817352,1.073059,2.547945,37.85,-122.25
4,1.0,3.8462,52.0,6.281853,1.081081,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.0,1.5603,25.0,5.045455,1.133333,2.560606,39.48,-121.09
20636,1.0,2.5568,18.0,6.114035,1.315789,3.122807,39.49,-121.21
20637,1.0,1.7000,17.0,5.205543,1.120092,2.325635,39.43,-121.22
20638,1.0,1.8672,18.0,5.329513,1.171920,2.123209,39.43,-121.32


In [5]:
# rerun the model with only the features having p<0.05
model            = sm.OLS(y,X_added_constant).fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.606
Model:,OLS,Adj. R-squared:,0.606
Method:,Least Squares,F-statistic:,3970.0
Date:,"Mon, 27 Nov 2023",Prob (F-statistic):,0.0
Time:,15:20:18,Log-Likelihood:,-22624.0
No. Observations:,20640,AIC:,45270.0
Df Residuals:,20631,BIC:,45340.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-36.9419,0.659,-56.067,0.000,-38.233,-35.650
MedInc,0.4367,0.004,104.054,0.000,0.428,0.445
HouseAge,0.0094,0.000,21.143,0.000,0.009,0.010
AveRooms,-0.1073,0.006,-18.235,0.000,-0.119,-0.096
AveBedrms,0.6451,0.028,22.928,0.000,0.590,0.700
Population,-3.976e-06,4.75e-06,-0.837,0.402,-1.33e-05,5.33e-06
AveOccup,-0.0038,0.000,-7.769,0.000,-0.005,-0.003
Latitude,-0.4213,0.007,-58.541,0.000,-0.435,-0.407
Longitude,-0.4345,0.008,-57.682,0.000,-0.449,-0.420

0,1,2,3
Omnibus:,4393.65,Durbin-Watson:,0.885
Prob(Omnibus):,0.0,Jarque-Bera (JB):,14087.596
Skew:,1.082,Prob(JB):,0.0
Kurtosis:,6.42,Cond. No.,238000.0


<b> We see that after dropping one insignificant variable, the adjusted R-squared improved slightly (from 0.734
to 0.735)