In [1]:
import warnings

import numpy as np
import pandas as pd

from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression, Lasso, RidgeCV, LassoCV, Ridge
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error

### Load the data

In [2]:
# Read the raw train and test tables from the ../datasets directory.
df_train = pd.read_csv(filepath_or_buffer='../datasets/train.csv')
df_test = pd.read_csv(filepath_or_buffer='../datasets/test.csv')

In [3]:
# Normalise column labels on both frames: lower-case, spaces replaced by underscores.
for frame in (df_train, df_test):
    frame.columns = [label.lower().replace(' ', '_') for label in frame.columns]

In [4]:
# Keep only the numeric (and boolean) columns of the training frame.
# Uses the public select_dtypes API instead of the private _get_numeric_data(),
# which is not part of pandas' supported interface.
df_train = df_train.select_dtypes(include=['number', 'bool'])

In [5]:
# Keep only the numeric (and boolean) columns of the test frame.
# Uses the public select_dtypes API instead of the private _get_numeric_data(),
# which is not part of pandas' supported interface.
df_test = df_test.select_dtypes(include=['number', 'bool'])

In [6]:
# note: df.select_dtypes(include='number') is the public pandas API for selecting numeric columns

### Data cleaning: origin

In [7]:
# Row/column counts of the train and test frames after keeping numeric columns only.
df_train.shape, df_test.shape

((2051, 39), (878, 38))

In [8]:
# Columns that exist in train but not in test.
set(df_train.columns).difference(df_test.columns)

{'saleprice'}

In [9]:
# Null count per training column, largest first, showing only columns with at least one null.
train_null_counts = df_train.isnull().sum().sort_values(ascending=False)
train_null_counts[train_null_counts > 0]

lot_frontage      330
garage_yr_blt     114
mas_vnr_area       22
bsmt_half_bath      2
bsmt_full_bath      2
bsmtfin_sf_1        1
garage_cars         1
garage_area         1
total_bsmt_sf       1
bsmt_unf_sf         1
bsmtfin_sf_2        1
dtype: int64

In [10]:
# Null count per test column, largest first, showing only columns with at least one null.
test_null_counts = df_test.isnull().sum().sort_values(ascending=False)
test_null_counts[test_null_counts > 0]

lot_frontage     160
garage_yr_blt     45
mas_vnr_area       1
dtype: int64

In [11]:
# Drop every training COLUMN that contains at least one null
# (axis=1 removes columns, not rows, so the row count is unchanged).
# Rebinding instead of inplace=True avoids hidden in-place mutation of the frame.
df_train = df_train.dropna(axis=1)

In [12]:
# Restrict test to the columns that survived in train (same order), minus the target.
kept_columns = [name for name in df_train.columns if name != 'saleprice']
df_test = df_test[kept_columns].copy()

In [13]:
# Shapes after removing null-containing columns; train keeps one extra column (saleprice).
df_train.shape, df_test.shape

((2051, 28), (878, 27))

#### Preprocessing the terrible threes

In [14]:
# Preview the first rows of the cleaned training frame.
df_train.head()

Unnamed: 0,id,pid,ms_subclass,lot_area,overall_qual,overall_cond,year_built,year_remod/add,1st_flr_sf,2nd_flr_sf,...,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,saleprice
0,109,533352170,60,13517,6,8,1976,2005,725,754,...,0,44,0,0,0,0,0,3,2010,130500
1,544,531379050,60,11492,7,5,1996,1997,913,1209,...,0,74,0,0,0,0,0,4,2009,220000
2,153,535304180,20,7922,5,7,1953,2007,1057,0,...,0,52,0,0,0,0,0,1,2010,109000
3,318,916386060,60,9802,5,5,2006,2007,744,700,...,100,0,0,0,0,0,0,4,2010,174000
4,255,906425045,50,14235,6,8,1900,1993,831,614,...,0,59,0,0,0,0,0,3,2010,138500


In [15]:
# Target is `saleprice`; features are all remaining columns except `pid`.
# (`id` stays in X for now and is removed before the polynomial expansion.)
y = df_train['saleprice']
X = df_train.drop(columns=['pid', 'saleprice'])

In [16]:
# Remove `pid` from the test features so they mirror the columns of X.
# Rebind rather than mutate in place, matching how X is derived from df_train.
df_test = df_test.drop(columns='pid')

In [17]:
# Hold out 20% of the rows as a validation set; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=13)

In [18]:
# check shapes of train/val/test

In [19]:
# Confirm train, validation and test share the same number of feature columns.
X_train.shape, X_val.shape, df_test.shape

((1640, 26), (411, 26), (878, 26))

### Feature Engineering

In [20]:
#instantiate polyfeat to simulate our engineering process

In [21]:
# Stand-in for a manual feature-engineering step: expand the features into
# polynomial and interaction terms up to degree 3, without a constant (bias) column.
poly = PolynomialFeatures(degree=3, interaction_only=False, include_bias=False)

In [22]:
# Set the id columns aside before they are dropped from the feature matrices.
tr_id, val_id = X_train['id'], X_val['id']

In [23]:
# Learn the expansion from the training rows only, then apply it unchanged to validation.
train_poly = poly.fit_transform(X_train.drop(columns='id'))
val_poly = poly.transform(X_val.drop(columns='id'))

# Rebuild labelled frames. Both get a fresh default row index, so they line up
# with y_train / y_val by position rather than by label.
poly_columns = poly.get_feature_names_out()
X_train = pd.DataFrame(train_poly, columns=poly_columns)
X_val = pd.DataFrame(val_poly, columns=poly_columns)

### More Preprocessing!

In [24]:
# Scaler used to standardise the expanded features before the regularised models.
ss = StandardScaler()

In [25]:
# Fit the scaler on training data only and reuse its statistics for validation.
train_scaled = ss.fit_transform(X_train)
val_scaled = ss.transform(X_val)

scaled_columns = ss.get_feature_names_out()
Xs_train = pd.DataFrame(train_scaled, columns=scaled_columns)
Xs_val = pd.DataFrame(val_scaled, columns=scaled_columns)

In [26]:
# Baseline: ordinary least squares on the scaled, polynomial-expanded training data.
lr = LinearRegression()
lr.fit(X=Xs_train, y=y_train)

In [27]:
# Mean cross-validated score of the unregularised model on the training set.
# NOTE(review): the polynomial expansion and scaler were fitted on all of Xs_train
# before this call, so the CV folds are not independent of the preprocessing.
cross_val_score(lr, Xs_train, y_train).mean()

-7.878772546484077e+16

In [28]:
# Score on both train and validation: a large gap between the two signals overfitting.

lr.score(Xs_train, y_train), lr.score(Xs_val,y_val)

(0.99999917729904, -6.12217708493759e+23)

In [29]:
# Training RMSE of the linear model (square root of the MSE).
mean_squared_error(y_train, lr.predict(Xs_train))**.5

71.992547324696

In [30]:
# Validation RMSE of the linear model (square root of the MSE).
mean_squared_error(y_val, lr.predict(Xs_val))**.5

6.157470937400543e+16

### How to combat overfitting
1. More data
2. Feature selection (reduce complexity)
3. Regularization

In [31]:
# Ridge with built-in cross-validation over 100 log-spaced penalties from 1e0 to 1e5.
ridge_alphas = np.logspace(0, 5, 100)
ridge = RidgeCV(alphas=ridge_alphas)

# Fit on the scaled training data.
ridge.fit(X=Xs_train, y=y_train)

In [32]:
# Penalty strength chosen by RidgeCV from the candidate grid.
ridge.alpha_

4862.601580065353

In [33]:
# Ridge score on the training set (compare with the validation score in the next cell).
ridge.score(Xs_train, y_train)

0.8993853316820469

In [34]:
# Ridge score on the held-out validation set.
ridge.score(Xs_val, y_val)

0.6579512153779864

In [35]:
# Training RMSE of the ridge model.
mean_squared_error(y_train, ridge.predict(Xs_train))**.5

25176.606286945058

In [36]:
# Validation RMSE of the ridge model.
mean_squared_error(y_val, ridge.predict(Xs_val))**.5

46024.96350008821

### Lasso time!

In [37]:
import warnings

In [38]:
# Lasso with built-in cross-validation over the penalties 0.001, 1.001, ..., 19.001.
lasso = LassoCV(alphas = np.arange(.001,20,1))

# Fit on the scaled training data. Only scikit-learn's ConvergenceWarning is
# silenced (presumably what the solver emits on this wide feature matrix — confirm);
# any other warning category stays visible instead of being hidden by a blanket 'ignore'.
with warnings.catch_warnings():
    warnings.simplefilter('ignore', category=ConvergenceWarning)
    lasso.fit(Xs_train, y_train)


In [39]:
# Penalty strength chosen by LassoCV from the candidate grid.
lasso.alpha_

17.000999999999998

In [40]:
# Lasso score on the training set (compare with the validation score in the next cell).
lasso.score(Xs_train, y_train)

0.9541108150646257

In [41]:
# Lasso score on the held-out validation set.
lasso.score(Xs_val, y_val)

-1.5098785458100172

In [42]:
# Training RMSE of the lasso model.
mean_squared_error(y_train,lasso.predict(Xs_train))**.5

17002.849492571037

In [43]:
# Validation RMSE of the lasso model.
mean_squared_error(y_val,lasso.predict(Xs_val))**.5

124673.96745408075

In [44]:
#take the time to look at coefficients, 
#see which ones were zeroed out

### at last: prediction time

In [45]:
# Preview the test features before they are expanded and scaled.
df_test.head()

Unnamed: 0,id,ms_subclass,lot_area,overall_qual,overall_cond,year_built,year_remod/add,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,...,fireplaces,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold
0,2658,190,9142,6,8,1910,1950,908,1020,0,...,0,0,60,112,0,0,0,0,4,2006
1,2718,90,9662,5,4,1977,1977,1967,0,0,...,0,170,0,0,0,0,0,0,8,2006
2,2414,60,17104,7,5,2006,2006,664,832,0,...,1,100,24,0,0,0,0,0,9,2006
3,1989,30,8520,5,6,1923,2006,968,0,0,...,0,0,0,184,0,0,0,0,7,2007
4,625,20,9500,6,5,1963,1963,1394,0,0,...,2,0,76,0,0,185,0,0,7,2009


In [46]:
# Set the ids aside for the submission file.
test_id = df_test['id']

# Apply the already-fitted polynomial expansion to the remaining test columns.
# Note: df_test is rebound here to the array returned by poly.transform.
test_features = df_test.drop(columns='id')
df_test = poly.transform(test_features)

In [47]:
#standard scale and recreate dataframe
Xs_test = pd.DataFrame(
    ss.transform(df_test),
    columns = poly.get_feature_names_out()
)



In [48]:
# Plain Lasso for the submission, reusing the penalty selected by LassoCV.
lasso_sub = Lasso(alpha = lasso.alpha_)

In [49]:
# Fit the submission model on the scaled training split (validation rows are not included).
lasso_sub.fit(Xs_train, y_train)

  model = cd_fast.enet_coordinate_descent(


In [50]:
# Predict sale prices for the scaled test features.
preds = lasso_sub.predict(X=Xs_test)

In [51]:
# Sanity check: one prediction per test row.
preds.shape

(878,)

In [52]:
# Wrap the predictions in a one-column frame using the submission's column name.
preds = pd.DataFrame(data=preds, columns=['SalePrice'])

In [53]:
# Attach the ids as the first column, matched by row position.
# test_id is a Series that still carries df_test's row labels; inserting it
# directly would align on those labels, so pass the raw values instead.
preds.insert(loc = 0, column= 'Id', value = test_id.to_numpy())

In [54]:
# Inspect the submission frame (Id, SalePrice) before writing it out.
preds

Unnamed: 0,Id,SalePrice
0,2658,153184.730208
1,2718,142415.989993
2,2414,183923.012326
3,1989,112071.958068
4,625,159215.397782
...,...,...
873,1662,177266.997781
874,1234,180706.695005
875,1373,129316.822789
876,1672,121207.955685


In [55]:
#save submission csv and get on that leaderboard!

In [56]:
# Write the submission file without the row index.
preds.to_csv(path_or_buf='./lasso_3_simple.csv', index=False)

In [57]:
# TODO: create a function that stores predictions in a DataFrame