In [61]:
import statsmodels.formula.api as smf

def forward_selected(data, response):
    """
    Source: https://planspace.org/20150423-forward_selection_with_statsmodels/
    Linear model designed by forward selection.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response

    response: string, name of response column in data

    Returns:
    --------
    model: an "optimal" fitted statsmodels linear model
           with an intercept
           selected by forward selection
           evaluated by adjusted R-squared
    """
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score, best_new_score = 0.0, 0.0
    while remaining and current_score == best_new_score:
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(response,
                                           ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()
        best_new_score, best_candidate = scores_with_candidates.pop()
        if current_score < best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
    formula = "{} ~ {} + 1".format(response,
                                   ' + '.join(selected))
    model = smf.ols(formula, data).fit()
    return model

In [88]:
import pandas as pd

# url = "http://data.princeton.edu/wws509/datasets/salary.dat"
# data = pd.read_csv(url, sep='\\s+')
data = pd.read_csv("./data/cleaned.csv")

In [78]:
data1 = data.copy()

In [71]:
model = forward_selected(data1, 'price')

print (model.model.formula)
# sl ~ rk + yr + 1

print (model.rsquared_adj)
# 0.835190760538

price ~ sqft_living + price_per_sqft + yr_built + waterfront + grade + floors + sqft_above + lat + zipcode + sqft_lot + distance_from_downtown_mile + sqft_living15 + bedrooms + condition + long + is_renovated + sqft_lot15 + total_sqft_larger_than_neighbours + bathrooms + sqft_basement + 1
0.7826373522029644


In [68]:
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.783
Model:,OLS,Adj. R-squared:,0.783
Method:,Least Squares,F-statistic:,3857.0
Date:,"Fri, 16 Apr 2021",Prob (F-statistic):,0.0
Time:,13:06:53,Log-Likelihood:,-288540.0
No. Observations:,21419,AIC:,577100.0
Df Residuals:,21398,BIC:,577300.0
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.932e+07,2.95e+06,23.536,0.000,6.35e+07,7.51e+07
sqft_living,101.8966,15.432,6.603,0.000,71.649,132.144
price_per_sqft,3162.5309,40.883,77.355,0.000,3082.396,3242.665
yr_built,-1863.1806,63.164,-29.498,0.000,-1986.986,-1739.375
waterfront,7.359e+05,1.45e+04,50.767,0.000,7.08e+05,7.64e+05
grade,5.284e+04,1915.485,27.583,0.000,4.91e+04,5.66e+04
floors,-1.391e+05,3494.239,-39.822,0.000,-1.46e+05,-1.32e+05
sqft_above,146.6458,15.418,9.511,0.000,116.426,176.866
lat,1.575e+05,1.18e+04,13.375,0.000,1.34e+05,1.81e+05

0,1,2,3
Omnibus:,19974.197,Durbin-Watson:,1.988
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3032632.241
Skew:,4.028,Prob(JB):,0.0
Kurtosis:,60.733,Cond. No.,253000000.0


In [89]:
X = data1.drop(columns='price').copy()
y = data1['price'].copy()

In [90]:
X

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,grade,sqft_above,sqft_basement,yr_built,zipcode,lat,long,sqft_living15,sqft_lot15,distance_from_downtown_mile,total_sqft_larger_than_neighbours,price_per_sqft,is_renovated
0,3,1.00,1180,5650,1.0,0,3,7,1180,0,1955,98178,47.5112,-122.257,1340,5650,7.43,0,32.49,0
1,3,2.25,2570,7242,2.0,0,3,7,2170,400,1951,98125,47.7210,-122.319,1690,7639,7.95,1,54.83,1
2,2,1.00,770,10000,1.0,0,3,6,770,0,1933,98028,47.7379,-122.233,2720,8062,10.19,0,16.71,0
3,4,3.00,1960,5000,1.0,0,5,7,1050,910,1965,98136,47.5208,-122.393,1360,5000,6.54,1,86.78,0
4,3,2.00,1680,8080,1.0,0,3,8,1680,0,1987,98074,47.6168,-122.045,1800,7503,13.38,1,52.25,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21414,3,2.50,1530,1131,3.0,0,3,8,1530,0,2009,98103,47.6993,-122.346,1530,1509,6.46,0,135.29,0
21415,4,2.50,2310,5813,2.0,0,3,8,2310,0,2014,98146,47.5107,-122.362,1830,7200,6.74,0,49.24,0
21416,2,0.75,1020,1350,2.0,0,3,7,1020,0,2009,98144,47.5944,-122.299,1020,2007,1.74,0,169.66,0
21417,3,2.50,1600,2388,2.0,0,3,8,1600,0,2004,98027,47.5345,-122.069,1410,1287,13.22,1,100.30,0


In [91]:
import statsmodels as sm

In [92]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()
selector = RFE(linreg, n_features_to_select = 7)
selector = selector.fit(X, y.values.ravel()) # convert y to 1d np array to prevent DataConversionWarning
selector.support_ 

array([False,  True, False, False, False,  True,  True,  True, False,
       False, False, False,  True,  True, False, False, False, False,
       False,  True])

In [93]:
list(zip(data1.columns.to_list(), selector.support_.tolist()))

[('price', False),
 ('bedrooms', True),
 ('bathrooms', False),
 ('sqft_living', False),
 ('sqft_lot', False),
 ('floors', True),
 ('waterfront', True),
 ('condition', True),
 ('grade', False),
 ('sqft_above', False),
 ('sqft_basement', False),
 ('yr_built', False),
 ('zipcode', True),
 ('lat', True),
 ('long', False),
 ('sqft_living15', False),
 ('sqft_lot15', False),
 ('distance_from_downtown_mile', False),
 ('total_sqft_larger_than_neighbours', False),
 ('price_per_sqft', True)]

In [94]:
df__ = pd.DataFrame(list(zip(data1.columns.to_list(), selector.support_.tolist())),columns=['Feature','keep'])

In [48]:
df__.sort_values(by="keep",ascending=False)

Unnamed: 0,Feature,keep
19,price_per_sqft,True
12,zipcode,True
5,floors,True
6,waterfront,True
7,condition,True
1,bedrooms,True
13,lat,True
18,total_sqft_larger_than_neighbours,False
17,distance_from_downtown_mile,False
16,sqft_lot15,False
