In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Read the raw train and test CSV files into two separate frames.
housing_prices_df_raw, test_housing_prices_df_raw = (
    pd.read_csv(csv_path)
    for csv_path in ('../src/data/train.csv', '../src/data/test.csv')
)

In [None]:
# Work on copies so the raw frames stay untouched and this cell can be re-run.
train_hdf = housing_prices_df_raw.copy()
test_hdf = test_housing_prices_df_raw.copy()

# Stack train and test so both go through the same preprocessing.
# ignore_index=True gives every row a unique label. Without it the test rows
# reuse the labels 0..n-1 already carried by the train rows, and any later
# index-aligned assignment (for example assigning a freshly built Series as a
# new column) silently pairs test rows with values computed for train rows.
hdf = pd.concat([train_hdf, test_hdf], axis=0, sort=False, ignore_index=True)

# Feature Selection/Data Wrangling

In [None]:
# Ordinal features that are already stored as numbers.
ord_feat_num = [
    'OverallQual', 'OverallCond',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'TotRmsAbvGrd', 'Fireplaces', 'BedroomAbvGr', 'KitchenAbvGr',
    'GarageCars',
]

# Ordinal features stored as text labels.
ord_feat_cat = [
    'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'HeatingQC', 'KitchenQual', 'FireplaceQu',
    'GarageQual', 'GarageCond',
    'PoolQC',
]

# All ordinal features: numeric ones first, then the text-coded ones.
ord_feat = [*ord_feat_num, *ord_feat_cat]

# Nominal (unordered) features.
nom_feat = [
    'MSSubClass', 'MSZoning', 'Alley', 'LotShape',
    'LandContour', 'Utilities', 'Neighborhood',
    'Condition1', 'Condition2', 'BldgType', 'RoofStyle',
    'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'Foundation', 'Heating', 'CentralAir', 'Electrical',
    'GarageType', 'MiscFeature', 'SaleType', 'SaleCondition',
    'GarageFinish', 'PavedDrive', 'Fence', 'Functional',
    'HouseStyle', 'LotConfig', 'Street', 'LandSlope',
]

# Every categorical feature: nominal first, then ordinal.
cat_feat = [*nom_feat, *ord_feat]

# Continuous features.
cont_feat = [
    'LotFrontage', 'LotArea', 'YearBuilt',
    'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
    'BsmtFinSF2', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
    'GarageYrBlt', 'WoodDeckSF', 'OpenPorchSF',
    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
    'MiscVal', 'MoSold', 'YrSold', 'BsmtUnfSF', 'GarageArea',
    'LowQualFinSF', 'GrLivArea',
]

In [None]:
# Distinct raw values of FireplaceQu (missing values included).
hdf.loc[:, 'FireplaceQu'].unique()

## Filling NA's

In [None]:
# hdf.loc[:,hdf.isnull().sum() > 0]
# hdf.isnull().sum()[hdf.isnull().sum() > 0]

In [None]:
# Share of missing values per column, in percent; only columns with at least
# one missing value are listed.
print(hdf.shape)
null_counts = hdf.isnull().sum()
round(100 * null_counts[null_counts > 0] / len(hdf), 2)

In [None]:
from scipy import stats


# --- Ordinal encoding -------------------------------------------------------
# Text quality levels become integers; a missing value becomes 0.
# NOTE(review): presumably a missing value in these columns means the feature
# is absent (no basement, no garage, ...) -- confirm against the data dictionary.
quality_levels = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, np.nan: 0}
exposure_levels = {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, np.nan: 0}
finish_levels = {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, np.nan: 0}
fence_levels = {'MnPrv': 'HasFence', 'GdWo': 'HasFence', 'GdPrv': 'HasFence',
                'MnWw': 'HasFence', np.nan: 'NoFence'}

# First pass covers every text-coded ordinal column. Labels that are not part
# of the generic scale (the BsmtExposure / BsmtFinType levels) are left as-is
# and handled by the dedicated passes below. 'Gd' maps to 4 in both the generic
# and the exposure scale, so the first pass does not corrupt BsmtExposure.
hdf[ord_feat_cat] = hdf[ord_feat_cat].replace(quality_levels)
hdf[['BsmtExposure']] = hdf[['BsmtExposure']].replace(exposure_levels)
hdf[['BsmtFinType1', 'BsmtFinType2']] = hdf[['BsmtFinType1', 'BsmtFinType2']].replace(finish_levels)

# Fence is collapsed to a two-level flag.
hdf[['Fence']] = hdf[['Fence']].replace(fence_levels)


# --- Impute LotFrontage -----------------------------------------------------
# Missing frontage is replaced by the median frontage of the row's neighborhood.
# transform() returns one value per row, and np.where combines the two columns
# by position, so the result does not depend on the row labels.
neighborhood_median = hdf.groupby('Neighborhood')['LotFrontage'].transform('median')
hdf['LotFrontage'] = np.where(hdf['LotFrontage'].isna(),
                              neighborhood_median,
                              hdf['LotFrontage'])


# --- Impute everything else (never the target) ------------------------------
# Numeric columns get their median, all other columns their most frequent
# value. Series.median()/Series.mode() skip missing values themselves, which
# avoids running a numeric routine over mixed string/NaN data.
fill_values = {}
for col in hdf.columns.difference(['SalePrice']):
    column = hdf[col]
    if not column.isna().any():
        continue
    if pd.api.types.is_numeric_dtype(column):
        fill_values[col] = column.median()
    else:
        most_frequent = column.mode()
        # An entirely empty column has no mode; leave it untouched.
        if not most_frequent.empty:
            fill_values[col] = most_frequent.iloc[0]

hdf.fillna(fill_values, inplace=True)
        
        
# hdf.fillna({'PoolQC':}, inplace = True)
# hdf.fillna({'FireplaceQu':0}, inplace = True)
# hdf.fillna({'Alley':0}, inplace = True)
# hdf.fillna({gar:'NoGar' for gar in ['GarageFinish', 'GarageQual', 'GarageCond', 'GarageType']}, inplace=True)
# hdf.fillna({'GarageYrBlt':int(stats.mode(hdf['GarageYrBlt']).mode[0])}, inplace = True)
# hdf.fillna({bsmt:'NoBsmt' for bsmt in ['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2']},
#             inplace = True)

In [None]:
# Percentage of missing values still present per column after imputation.
remaining_nulls = hdf.isnull().sum()
round(100 * remaining_nulls[remaining_nulls > 0] / len(hdf), 2)

# Feature generation

In [None]:
# 1 when the remodel year is later than the build year, otherwise 0.
# The flag is computed as a column-to-column comparison so it stays attached to
# the rows it was computed from. Building a separate pd.Series from a Python
# list would give it its own 0..N-1 labels, and assigning that to hdf aligns on
# labels -- which mixes up rows whenever hdf's labels are not exactly 0..N-1
# (as after concatenating train and test without resetting the index).
hdf['remodeled'] = (hdf['YearRemodAdd'] > hdf['YearBuilt']).astype(int)

# Total outdoor porch/deck area.
hdf['TotalPorchAreasSF'] = (
    hdf['OpenPorchSF']
    + hdf['EnclosedPorch']
    + hdf['3SsnPorch']
    + hdf['ScreenPorch']
    + hdf['WoodDeckSF']
)

# Bathrooms above grade and in the basement; half baths count as 0.5.
hdf['TotalBath'] = hdf['FullBath'] + hdf['BsmtFullBath'] + .5 * (hdf['HalfBath'] + hdf['BsmtHalfBath'])

# Above-grade rooms minus kitchens and full baths.
hdf['RestRooms'] = hdf['TotRmsAbvGrd'] - hdf['KitchenAbvGr'] - hdf['FullBath']

In [None]:
# Column groups removed from hdf, dropped one group at a time in this order.
column_groups_to_drop = (
    ['Utilities', 'Street', 'PoolQC'],
    ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF'],  # there is totalbsmt
    ['FullBath', 'BsmtFullBath', 'HalfBath', 'BsmtHalfBath'],
    ['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'WoodDeckSF'],
    ['MiscFeature'],  # drop feature (since price of feature is in miscval)
    ['TotRmsAbvGrd'],
)
for column_group in column_groups_to_drop:
    hdf.drop(columns=column_group, inplace=True)

In [None]:
# Keep the feature-name collections in sync with the columns dropped from hdf.
dropped_categorical = {'MiscFeature', 'Utilities', 'Street',
                       'FullBath', 'PoolQC', 'HalfBath', 'BsmtHalfBath',
                       'BsmtFullBath', 'TotRmsAbvGrd'}

dropped_any = {'Utilities', 'Street', 'PoolQC',
               'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
               'FullBath', 'BsmtFullBath', 'HalfBath', 'BsmtHalfBath',
               'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'WoodDeckSF',
               'MiscFeature', 'TotRmsAbvGrd'}

dropped_nominal = {'MiscFeature', 'Utilities', 'Street'}

# cat_feat stays a set, as before.
cat_feat = set(cat_feat) - dropped_categorical

# cont_feat and nom_feat stay lists, but now keep their original order.
# Going through set() produced an order that depends on string hashing, which
# changes between kernel sessions; nom_feat drives pd.get_dummies, so the
# order of the model's columns changed from run to run.
# dict.fromkeys() removes duplicates (as set() did) while preserving order.
cont_feat = [feat for feat in dict.fromkeys(cont_feat) if feat not in dropped_any]
nom_feat = [feat for feat in dict.fromkeys(nom_feat) if feat not in dropped_nominal]

# Train and test split/Remove outliers

In [None]:
# Dummify the nominal features and transform the price to log price.
X = pd.get_dummies(hdf, columns=nom_feat, drop_first=True)

# Rows with a SalePrice are training rows; rows without one are the test set.
has_price = X['SalePrice'].notna()

# Outlier removal applies to training rows only.
is_inlier = (X['GrLivArea'] < 4000) & (X['LotArea'] < 100000) & (X['LotFrontage'] < 250)

# .copy() makes the training rows an independent frame. Dropping or assigning
# columns on a bare .loc slice of X triggers pandas' SettingWithCopy warning
# and leaves it unclear whether X itself is affected.
train_rows = X.loc[has_price & is_inlier, :].copy()

# The model target is log(1 + SalePrice).
y_train = np.log1p(train_rows['SalePrice'])

# Id is kept in X_test (needed for the submission) but removed from X_train.
X_train = train_rows.drop(columns=['Id', 'SalePrice'])
X_test = X.loc[~has_price, :].drop(columns=['SalePrice'])

# Normalization of the columns if needed

In [None]:
# Normalization. Done by hand instead of through the model option because the
# test dataset has to be normalized as well.
#
# Both sets are scaled with the mean and standard deviation of the TRAINING
# rows. Scaling the test set with its own statistics would put the two sets on
# different scales, so coefficients fitted on X_train would not apply to X_test.
feature_cols = X_train.columns          # X_test additionally carries 'Id', which is left alone
train_features = X_train.astype(float)  # dummy columns may be boolean

train_mean = train_features.mean()
train_std = train_features.std(ddof=1)
# A column that is constant in the training rows has zero spread. Dividing by 1
# leaves it at 0 after centring instead of filling the column with NaN.
train_std = train_std.where(train_std > 0, 1.0)

X_train = (train_features - train_mean) / train_std

X_test = X_test.copy()
X_test[feature_cols] = (X_test[feature_cols].astype(float) - train_mean) / train_std

# Initialize

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Drop columns with low variance/Not advised

In [None]:
# One (variance, column name) pair per column of X.
vr = [(np.var(X[column_name]), column_name) for column_name in X.columns]

In [None]:
# Threshold p * (1 - p) with p = 0.8: the variance of a 0/1 column that takes
# the same value in 80% of the rows.
low_variance_threshold = .8 * (1 - .8)

# Drop the columns whose variance is BELOW the threshold. The previous filter
# used '>' and therefore removed the high-variance columns -- the opposite of
# what this section sets out to do.
low_variance_cols = [name for variance, name in vr if variance < low_variance_threshold]

X_LV = X.drop(columns=low_variance_cols)

In [None]:
# Normalization: centre every column on its mean and divide by its sample
# standard deviation (ddof=1).
X_LV = X_LV.apply(lambda column: (column - column.mean()) / column.std(ddof=1))

# Lasso feature selection

In [None]:
from sklearn.linear_model import Lasso
import seaborn as sns

In [None]:
# max_iter is an iteration count and must be an integer; 1e7 is a float, which
# recent scikit-learn releases reject during parameter validation.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2. It is kept here because removing it changes the effective scale of
# alpha; on a newer scikit-learn drop it and re-tune the alpha range.
lasso_lm = Lasso(max_iter=int(1e7), warm_start=True, normalize=True)

In [None]:
## Try this as well. takes a long time!
# 1000 log-spaced alphas from 1e-6 to 1e1, plus an empty list for the fitted coefficients.
alphas = np.logspace(start=-6, stop=1, num=1000)
lasso_coefs = list()

In [None]:
# Quick version of the sweep: only the two end points 10**-2 and 10**0.5.
alphas = np.logspace(start=-2, stop=0.5, num=2)
lasso_coefs = list()

In [None]:
# Start from an empty list so that re-running this cell does not stack a second
# sweep on top of the first (the list must have exactly one entry per alpha).
lasso_coefs = []
for alpha in alphas:
    lasso_lm.set_params(alpha=alpha).fit(X_train, y_train)
    # Store a copy. With warm_start=True the next fit starts from the current
    # coefficient array and may update it in place; keeping only a reference
    # would then make every stored entry show the last alpha's coefficients.
    lasso_coefs.append(lasso_lm.coef_.copy())

In [None]:
# One row per alpha, one column per feature; plotted on a log-scaled alpha axis.
title = 'Lasso coefficients as a function of the regularization'
df_coef = pd.DataFrame(lasso_coefs, index=alphas, columns=X_train.columns)

coef_ax = df_coef.plot(logx=True, title=title, legend=False)
coef_ax.set_xlabel('alpha')
coef_ax.set_ylabel('coefficients')
plt.show()

In [None]:
# Coefficient paths of five feature columns picked at random (without replacement).
sampled_positions = np.random.choice(range(df_coef.shape[1]), size=5, replace=False)
df_coef.iloc[:, sampled_positions].plot(logx=True, title=title)

In [None]:
# For every alpha (one row of df_coef), list the features the lasso kept, i.e.
# whose coefficient is clearly non-zero. The magnitude is compared: a feature
# with a large negative coefficient is selected just as much as one with a
# large positive coefficient, and a plain `> 0.001` test would skip it.
coef_orders = [
    list(coefs.index[coefs.abs() > 0.001])
    for _, coefs in df_coef.iterrows()
]

In [None]:
# Walk through the distinct feature selections from smallest to largest and
# print what each one adds compared with the previous one; `s` counts the
# additions.
#
# Selections are de-duplicated as tuples. np.unique is not suited to a list of
# lists: equal-length lists are flattened into individual strings, and lists of
# different lengths cannot be turned into a regular array at all.
# Sorting by (length, content) keeps the order of equally long selections stable.
unique_selections = sorted({tuple(selection) for selection in coef_orders},
                           key=lambda selection: (len(selection), selection))

last = []
s = 0
for selection in unique_selections:
    newly_selected = set(selection) - set(last)
    print(newly_selected)
    s += len(newly_selected)
    last = selection

print(s)

In [None]:
# sns.pairplot(hdf[['YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'TotalBsmtSF', 'GrLivArea', 'GarageArea']])

In [None]:
# Correlation matrix of the numeric columns only. hdf still holds the nominal
# features as text (the dummy encoding lives in X, not in hdf), and recent
# pandas versions raise when corr() meets non-numeric columns instead of
# silently skipping them. select_dtypes works on old and new pandas alike.
corr = hdf.select_dtypes(include=['number', 'bool']).corr()

f, ax = plt.subplots(figsize=(11, 9))
sns.heatmap(corr, ax=ax)

In [None]:
# Sale price (log-scaled axis) against the remodel year.
hdf.plot.scatter(x='YearRemodAdd', y='SalePrice', logy=True)

In [None]:
# Sale price (log-scaled axis) against the construction year.
hdf.plot.scatter(x='YearBuilt', y='SalePrice', logy=True)

# ------------------------------------------------------------------------------------------

# Grid search CV in lasso regression

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

In [None]:
# max_iter is an iteration count and must be an integer; 1e7 is a float, which
# recent scikit-learn releases reject during parameter validation.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2. It is kept here because removing it changes the effective scale of the
# alpha grid searched below; on a newer scikit-learn drop it and re-tune.
lasso_gs = Lasso(max_iter=int(1e7), warm_start=True, normalize=True)

In [None]:
# Coarse search: 10 log-spaced alphas between 1e-6 and 1e-2.
coarse_alphas = np.logspace(start=-6, stop=-2, num=10)
grid_params = [dict(alpha=coarse_alphas)]

In [None]:
# 10-fold cross-validated grid search over alpha, scored by R^2; training
# scores are recorded as well so they can be plotted next to the test scores.
lasso_param_search = GridSearchCV(
    lasso_gs,
    grid_params,
    scoring='r2',
    cv=10,
    verbose=2,
    return_train_score=True,
)

In [None]:
# Run the coarse grid search on the training data.
lasso_param_search.fit(X=X_train, y=y_train)

In [None]:
# Pull the per-alpha score summaries out of the coarse search results.
coarse_cv_results = lasso_param_search.cv_results_

mean_test_score = coarse_cv_results['mean_test_score']
std_test_score = coarse_cv_results['std_test_score']
mean_train_score = coarse_cv_results['mean_train_score']
std_train_score = coarse_cv_results['std_train_score']

# The alpha value of every evaluated candidate, in evaluation order.
params = [candidate['alpha'] for candidate in coarse_cv_results['params']]

In [None]:
# One row per alpha with the mean and spread of the train and test scores.
lasso_vis_df = pd.DataFrame(dict(
    param=params,
    mean_test_score=mean_test_score,
    std_test_score=std_test_score,
    mean_train_score=mean_train_score,
    std_train_score=std_train_score,
))

In [None]:
# Mean train and test R^2 as a function of alpha.
lasso_vis_df.plot.line(x='param', y=['mean_train_score', 'mean_test_score'])

In [None]:
# Lasso without the estimator-side normalisation, used for the fine search.
# max_iter is an iteration count and must be an integer; 1e7 is a float, which
# recent scikit-learn releases reject during parameter validation.
lasso_gs2 = Lasso(max_iter=int(1e7), warm_start=True)

In [None]:
# Fine search: 100 evenly spaced alphas between 0.002 and 0.004.
fine_alphas = np.linspace(start=0.002, stop=0.004, num=100)
grid_params2 = [dict(alpha=fine_alphas)]

In [None]:
# 10-fold cross-validated grid search over the fine alpha grid, scored by R^2;
# training scores are recorded as well.
lasso_param_search2 = GridSearchCV(
    lasso_gs2,
    grid_params2,
    scoring='r2',
    cv=10,
    verbose=2,
    return_train_score=True,
)

In [None]:
# Run the fine grid search on the training data.
lasso_param_search2.fit(X=X_train, y=y_train)

In [None]:
# Pull the per-alpha score summaries out of the fine search results
# (these names are reused from the coarse search and overwritten here).
fine_cv_results = lasso_param_search2.cv_results_

mean_test_score = fine_cv_results['mean_test_score']
std_test_score = fine_cv_results['std_test_score']
mean_train_score = fine_cv_results['mean_train_score']
std_train_score = fine_cv_results['std_train_score']

# The alpha value of every evaluated candidate, in evaluation order.
params = [candidate['alpha'] for candidate in fine_cv_results['params']]

In [None]:
# One row per alpha of the fine search with the mean and spread of the scores.
lasso_vis_df = pd.DataFrame(dict(
    param=params,
    mean_test_score=mean_test_score,
    std_test_score=std_test_score,
    mean_train_score=mean_train_score,
    std_train_score=std_train_score,
))

In [None]:
# Mean train and test R^2 as a function of alpha (fine search).
lasso_vis_df.plot.line(x='param', y=['mean_train_score', 'mean_test_score'])

In [None]:
# Alpha of the best candidate found by the fine search.
best_alpha = lasso_param_search2.best_params_['alpha']

In [None]:
# Predict with the refitted best model on every test column except Id, and undo
# the log1p transform applied to the training target.
test_features = X_test.drop(columns='Id')
y_predict = np.expm1(lasso_param_search2.predict(test_features))

# Submission table: one predicted SalePrice per test Id.
submission = pd.DataFrame({'Id': X_test['Id'], 'SalePrice': y_predict})
submission