# Read in Data

In [32]:
# import data
import pandas as pd
import numpy as np

train = pd.read_csv("https://raw.githubusercontent.com/esnt/Data/main/Ames/ames_train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/esnt/Data/main/Ames/ames_test.csv")
pid = test['PID'] # seperating the PID for the later export
X_test = test.iloc[:,1:80] # Removing PID from the feature list

# split training into X and y (test doesn't have y values)
X_train = train.iloc[:,1:80]
ytrain = train["SalePrice"]

# combine X_train and X_test for standardization
X_all = pd.concat([X_train, X_test])

# Data Cleaning

In [33]:
# NA values - fill w/ 0 or DNA
cont = ['Lot Frontage', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Garage Yr Blt', 'Garage Cars', 'Garage Area']
cat = ['Alley', 'Mas Vnr Type', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Electrical', 'Fireplace Qu', 'Garage Type', 'Garage Finish', 'Garage Qual', 'Garage Cond', 'Pool QC', 'Fence', 'Misc Feature']

X_all.loc[:,cont] = X_all.loc[:,cont].fillna(0)
X_all.loc[:,cat] = X_all.loc[:,cat].fillna("Does Not Apply")

# Categorical to dummy
X_all = pd.get_dummies(X_all)
columns = X_all.columns # Pulling out column names to determine most important features later

# standardize
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
Xall = scaler.fit_transform(X_all)

# split back into training/test
Xtrain = Xall[:1758,:]
Xtest = Xall[1758:, :]

# Lasso

In [34]:
# tune alpha hyperparameter
from sklearn.linear_model import Lasso, LassoCV

l_cv = LassoCV(cv = 10)
l_cv.fit(Xtrain, ytrain)
l_cv.alpha_

617.1456961471429

In [35]:
# fit Lasso with best alpha
lasso = Lasso(alpha=l_cv.alpha_)
lasso.fit(Xtrain, ytrain)

Lasso(alpha=617.1456961471429)

In [36]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# metrics for Lasso
print(f"MSE: {round(mean_squared_error(ytrain, l_cv.predict(Xtrain)),2)}")
print(f"RMSE: {round(np.sqrt(mean_squared_error(ytrain, l_cv.predict(Xtrain))),2)}")
print(f"MAE: {round(mean_absolute_error(ytrain, l_cv.predict(Xtrain)),2)}")
print(f"Y Standard Deviation: {round(np.std(ytrain), 2)}")

MSE: 488753684.09
RMSE: 22107.77
MAE: 13952.36
Y Standard Deviation: 82296.03


In [41]:
feature_df = pd.DataFrame({
    'Features' : columns,
    'Coefficients' : lasso.coef_
})
feature_df.sort_values(by='Coefficients', ascending=False).head()['Features']

15      Gr Liv Area
3      Overall Qual
8      BsmtFin SF 1
5        Year Built
11    Total Bsmt SF
Name: Features, dtype: object

# PCA + Lasso

In [6]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# run through PCA, then put reduced Xtrain through lasso
pipe = Pipeline([('pca', PCA()),
                 ('model', Lasso())])

# tune both PCA and Lasso hyperparameters
params = {"pca__n_components": list(np.arange(.9, 1.0, 0.01)),
               "model__alpha": list(range(200, 402, 5))}

search = GridSearchCV(pipe, params, n_jobs=-1, scoring="neg_mean_squared_error")

search.fit(Xtrain, ytrain)

search.best_params_

{'model__alpha': 235, 'pca__n_components': 0.9900000000000001}

In [7]:
# fit PCA and Lasso with best hyperparameters
lasso_pca = Pipeline([('pca', PCA(n_components= search.best_params_.get('pca__n_components'))),
                      ('model', Lasso(alpha = search.best_params_.get('model__alpha')))])

lasso_pca.fit(Xtrain, ytrain)

Pipeline(steps=[('pca', PCA(n_components=0.9900000000000001)),
                ('model', Lasso(alpha=235))])

In [15]:
from sklearn.model_selection import cross_val_score

# PCA and Lasso metrics
mse = cross_val_score(lasso_pca, Xtrain, ytrain, cv=10, scoring="neg_mean_squared_error").mean()*-1
print(mse)
print(np.sqrt(mse))
mae = cross_val_score(lasso_pca, Xtrain, ytrain, cv=10, scoring="neg_mean_absolute_error").mean()*-1
print(mae)

755566303.5758082
27487.566345091524
16536.25419683217


# PCA + KNN

In [9]:
from sklearn.neighbors import KNeighborsRegressor

# run through PCA, then put reduced Xtrain through KNN
pipe = Pipeline([('pca', PCA()),
                 ('model', KNeighborsRegressor())])

# tune hyperparameters for PCA and KNN together
params = {"pca__n_components": list(np.arange(.9, 1.0, 0.01)),
               "model__n_neighbors": list(range(2, 52))}

search = GridSearchCV(pipe, params, n_jobs=-1, scoring="neg_mean_squared_error")

search.fit(Xtrain, ytrain)

search.best_params_

{'model__n_neighbors': 9, 'pca__n_components': 0.9}

In [10]:
# fit PCA and KNN with best hyperparameters
knn_pca = Pipeline([('pca', PCA(n_components= search.best_params_.get('pca__n_components'))),
                      ('model', KNeighborsRegressor(n_neighbors = search.best_params_.get('model__n_neighbors')))])

knn_pca.fit(Xtrain, ytrain)

Pipeline(steps=[('pca', PCA(n_components=0.9)),
                ('model', KNeighborsRegressor(n_neighbors=9))])

In [11]:
# PCA and KNN metrics
mse = cross_val_score(knn_pca, Xtrain, ytrain, cv=10, scoring="neg_mean_squared_error").mean()*-1
print(mse)
print(np.sqrt(mse))
mae = cross_val_score(knn_pca, Xtrain, ytrain, cv=10, scoring="neg_mean_absolute_error").mean()*-1
print(mae)

1651576436.432738
40639.59198162228


# Elastic Net

In [19]:
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV

# try elastic net without PCA
# train alpha and l1 ratio hyperparameters
en_cv = ElasticNetCV(cv = 10)
en_cv.fit(Xtrain, ytrain)
print(en_cv.alpha_)
print(en_cv.l1_ratio_)


132.34902023666493
0.5


In [20]:
# fit elastic net with best hyperparameters
elastic_net = ElasticNet(alpha = en_cv.alpha_, l1_ratio = en_cv.l1_ratio_)
elastic_net.fit(Xtrain, ytrain)

ElasticNet(alpha=132.34902023666493)

In [21]:
# metrics for elastic net
print(mean_squared_error(ytrain, en_cv.predict(Xtrain)))
print(np.sqrt(mean_squared_error(ytrain, en_cv.predict(Xtrain))))
print(mean_absolute_error(ytrain, en_cv.predict(Xtrain)))
print(np.std(ytrain))

4774793411.214735
69099.8799652701
48333.00229243753
82296.03061151424
