In [1]:
import pandas as pd
import numpy as np

train = pd.read_csv('../Datasets/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('../Datasets/house-prices-advanced-regression-techniques/test.csv')
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [2]:
### EDA

# Defining plots design
def plots_design():
    fig.patch.set_facecolor('black')
    ax.patch.set_facecolor('black')
    ax.tick_params(axis='both', which='major', labelsize=8)
    ax.yaxis.set_label_coords(0, 0)
    ax.grid(color='white', linewidth=2)
    # Remove ticks
    ax.xaxis.set_ticks_position('none')
    ax.yaxis.set_ticks_position('none')
    # Remove axes splines
    for i in ['top', 'bottom', 'left', 'right']:
        ax.spines[i].set_visible(False)
    ax.tick_params(axis='x', colors='white')
    ax.tick_params(axis='y', colors='white')
    # Font
    mpl.rcParams['font.family'] = 'Source Sans Pro'

In [3]:

# let's save the ID of each dataset
train_id = train['Id']
test_id = test['Id']
del train['Id']
del test['Id']

In [4]:
train1 = train.copy()
train1 = train1.drop(train1[(train1['GarageArea']>1200) & (train1['SalePrice']<300000)].index)
train1 = train1.drop(train1[(train1['GrLivArea']>4000) & (train1['SalePrice']<300000)].index)
train1 = train1.drop(train1[(train1['TotalBsmtSF']>5000)].index)

In [5]:
print('Outliers removed =' , train.shape[0] - train1.shape[0])

Outliers removed = 5


In [6]:
# Split X and y (in train dataset)
X = train1.drop('SalePrice', axis=1)
y = train1['SalePrice'].to_frame()

# Add variable
X['train'] = 1
test['train'] = 0

# Combining train and test for data cleaning 
df = pd.concat([test, X])

In [7]:
print('Count of Features per Data Type:')
df.dtypes.value_counts()  

Count of Features per Data Type:


object     43
int64      26
float64    11
dtype: int64

In [8]:
# Do we have duplicates?
print('Number of Duplicates:', len(df[df.duplicated()]))

# Do we have missing values?
print('Number of Missing Values:', df.isnull().sum().sum())

Number of Duplicates: 0
Number of Missing Values: 13945


In [9]:
print('Missing Values per Column:')
df.isnull().sum().sort_values(ascending=False).head(25)

Missing Values per Column:


PoolQC          2905
MiscFeature     2810
Alley           2716
Fence           2343
FireplaceQu     1419
LotFrontage      485
GarageCond       159
GarageQual       159
GarageYrBlt      159
GarageFinish     159
GarageType       157
BsmtCond          82
BsmtExposure      82
BsmtQual          81
BsmtFinType2      80
BsmtFinType1      79
MasVnrType        24
MasVnrArea        23
MSZoning           4
BsmtHalfBath       2
Functional         2
BsmtFullBath       2
Utilities          2
BsmtUnfSF          1
KitchenQual        1
dtype: int64

In [10]:
df['PoolQC'] = df['PoolQC'].fillna('None')
df['MiscFeature'] = df['MiscFeature'].fillna('None')
df['Alley'] = df['Alley'].fillna('None')
df['Fence'] = df['Fence'].fillna('None')
df['FireplaceQu'] = df['FireplaceQu'].fillna('None')
df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(lambda i: i.fillna(i.median()))

In [11]:
# Let's take a look at the "Garage" features
garage_cols = [col for col in df if col.startswith('Garage')]
df[garage_cols]

Unnamed: 0,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond
0,Attchd,1961.0,Unf,1.0,730.0,TA,TA
1,Attchd,1958.0,Unf,1.0,312.0,TA,TA
2,Attchd,1997.0,Fin,2.0,482.0,TA,TA
3,Attchd,1998.0,Fin,2.0,470.0,TA,TA
4,Attchd,1992.0,RFn,2.0,506.0,TA,TA
...,...,...,...,...,...,...,...
1455,Attchd,1999.0,RFn,2.0,460.0,TA,TA
1456,Attchd,1978.0,Unf,2.0,500.0,TA,TA
1457,Attchd,1941.0,RFn,1.0,252.0,TA,TA
1458,Attchd,1950.0,Unf,1.0,240.0,TA,TA


In [12]:
# For the numerical features:
for i in df[garage_cols].select_dtypes(exclude='object').columns:
    df[i] = df[i].fillna(0)

# For the categorical features:
for i in df[garage_cols].select_dtypes(include='object').columns:
    df[i] = df[i].fillna('None')

In [13]:
bsmt_cols = [col for col in df if col.startswith('Bsmt')]

# For the numerical features:
for i in df[bsmt_cols].select_dtypes(exclude='object').columns:
    df[i] = df[i].fillna(0)

# For the categorical features:
for i in df[bsmt_cols].select_dtypes(include='object').columns:
    df[i] = df[i].fillna('None')

In [14]:
mas_cols = [col for col in df if col.startswith('Mas')]

# For the numerical features:
for i in df[mas_cols].select_dtypes(exclude='object').columns:
    df[i] = df[i].fillna(0)

# For the categorical features:
for i in df[mas_cols].select_dtypes(include='object').columns:
    df[i] = df[i].fillna('None')

In [15]:
df['MSZoning'] = df.groupby('Neighborhood')['MSZoning'].transform(lambda i: i.fillna(i.value_counts().index[0]))
print('Missing Values left:')
df.isnull().sum().sort_values(ascending=False).head(10)

Missing Values left:


Functional      2
Utilities       2
Electrical      1
TotalBsmtSF     1
KitchenQual     1
Exterior1st     1
Exterior2nd     1
SaleType        1
BsmtHalfBath    0
BsmtFullBath    0
dtype: int64

In [16]:
# replace missing values for mode of each column
df = df.fillna(df.mode().iloc[0])

In [17]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MSSubClass,2914.0,57.112217,42.474217,20.0,20.0,50.0,70.0,190.0
LotFrontage,2914.0,69.406829,21.19113,21.0,60.0,70.0,80.0,313.0
LotArea,2914.0,10128.200755,7798.584415,1300.0,7473.0,9450.0,11546.25,215245.0
OverallQual,2914.0,6.087509,1.405287,1.0,5.0,6.0,7.0,10.0
OverallCond,2914.0,5.566232,1.113182,1.0,5.0,5.0,6.0,9.0
YearBuilt,2914.0,1971.291352,30.286886,1872.0,1953.25,1973.0,2001.0,2010.0
YearRemodAdd,2914.0,1984.254633,20.887641,1950.0,1965.0,1993.0,2004.0,2010.0
MasVnrArea,2914.0,100.879204,178.071569,0.0,0.0,0.0,162.75,1600.0
BsmtFinSF1,2914.0,438.919012,444.059991,0.0,0.0,368.0,732.75,4010.0
BsmtFinSF2,2914.0,49.650309,169.311762,0.0,0.0,0.0,0.0,1526.0


In [18]:
df['MSSubClass'] = df['MSSubClass'].astype(str)
df['MoSold'] = df['MoSold'].astype(str)           # months is always categorical
df['YrSold'] = df['YrSold'].astype(str)           # year sold just have 5 years
df['Total_House_SF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
df['Total_Home_Quality'] = (df['OverallQual'] + df['OverallCond'])/2
df['Total_Bathrooms'] = (df['FullBath'] + (0.5 * df['HalfBath']) + df['BsmtFullBath'] + (0.5 * df['BsmtHalfBath']))

In [19]:
numeric_cols = df.select_dtypes(exclude='object').columns

skew_limit = 0.5
skew_vals = df[numeric_cols].skew()

skew_cols = (skew_vals
             .sort_values(ascending=False)
             .to_frame()
             .rename(columns={0:'Skew'})
             .query('abs(Skew) > {0}'.format(skew_limit)))

skew_cols

Unnamed: 0,Skew
MiscVal,21.949442
PoolArea,17.688586
LotArea,13.168399
LowQualFinSF,12.084424
3SsnPorch,11.371955
KitchenAbvGr,4.300206
BsmtFinSF2,4.144176
EnclosedPorch,4.002083
ScreenPorch,3.944742
BsmtHalfBath,3.929621


In [20]:
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Normalize skewed features
for col in skew_cols.index:
    df[col] = boxcox1p(df[col], boxcox_normmax(df[col] + 1))



In [21]:
# log(1+x) transform
y["SalePrice"] = np.log1p(y["SalePrice"])

In [22]:
categ_cols = df.dtypes[df.dtypes == np.object]        # filtering by categorical variables
categ_cols = categ_cols.index.tolist()                # list of categorical fields

df_enc = pd.get_dummies(df, columns=categ_cols, drop_first=True)   # One hot encoding

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categ_cols = df.dtypes[df.dtypes == np.object]        # filtering by categorical variables


In [23]:
X = df_enc[df_enc['train']==1]
test = df_enc[df_enc['train']==0]
X.drop(['train'], axis=1, inplace=True)
test.drop(['train'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(['train'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.drop(['train'], axis=1, inplace=True)


In [24]:
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12345)

In [25]:
def rmse(ytrue, ypredicted):
    return np.sqrt(mean_squared_error(ytrue, ypredicted))

In [27]:
from sklearn.linear_model import Lasso, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import numpy as np

# Assuming X_train, y_train, X_test, y_test are already defined

# Normalize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Cross-validated Lasso to find the best alpha
lassocv = LassoCV(alphas=None, cv=10, max_iter=100000)
lassocv.fit(X_train_scaled, y_train)

# Fit Lasso with the best alpha found
lasso = Lasso(alpha=lassocv.alpha_, max_iter=100000)
lasso.fit(X_train_scaled, y_train)

# Define a function to calculate RMSE
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Print results
print('The Lasso I:')
print("Alpha =", lassocv.alpha_)
print("RMSE =", rmse(y_test, lasso.predict(X_test_scaled)))


  y = column_or_1d(y, warn=True)


The Lasso I:
Alpha = 0.002045046985345592
RMSE = 0.12105392552795358


In [29]:
import numpy as np
from sklearn.linear_model import Lasso, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Assuming X_train, y_train, X_test, y_test are already defined

# Normalize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Setting up the alpha values
alpha = np.geomspace(1e-5, 1e0, num=6)

# Cross-validated Lasso to find the best alpha
lasso_cv_model = LassoCV(alphas=alpha, cv=10, max_iter=100000).fit(X_train_scaled, y_train)

# Fit Lasso with the best alpha found
lasso_tuned = Lasso(max_iter=100000).set_params(alpha=lasso_cv_model.alpha_).fit(X_train_scaled, y_train)

# Define a function to calculate RMSE
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Print results
print('The Lasso II:')
print("Alpha =", lasso_cv_model.alpha_)
print("RMSE =", rmse(y_test, lasso_tuned.predict(X_test_scaled)))


  y = column_or_1d(y, warn=True)


The Lasso II:
Alpha = 0.001
RMSE = 0.12479759062044955


In [30]:
import numpy as np
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Assuming X_train, y_train, X_test, y_test are already defined

# Normalize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Setting up the alpha values
alphas = np.geomspace(1e-9, 5, num=100)

# Cross-validated Ridge to find the best alpha
ridgecv = RidgeCV(alphas=alphas, scoring='neg_mean_squared_error')
ridgecv.fit(X_train_scaled, y_train)

# Fit Ridge with the best alpha found
ridge = Ridge(alpha=ridgecv.alpha_)
ridge.fit(X_train_scaled, y_train)

# Define a function to calculate RMSE
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Print results
print('Ridge Regression:')
print("Alpha =", ridgecv.alpha_)
print("RMSE =", rmse(y_test, ridge.predict(X_test_scaled)))


Ridge Regression:
Alpha = 5.0
RMSE = 0.12735783962446684


In [33]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score

def cross_val(model):
    pred = cross_val_score(model, X, y, cv=10)
    return pred.mean()

def print_evaluate(true, predicted):  
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(metrics.mean_squared_error(true, predicted))
    r2_square = metrics.r2_score(true, predicted)
    print('MAE:', mae)
    print('MSE:', mse)
    print('RMSE:', rmse)
    print('R2 Square', r2_square)
    print('__________________________________')
    
def evaluate(true, predicted):
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(metrics.mean_squared_error(true, predicted))
    r2_square = metrics.r2_score(true, predicted)
    return mae, mse, rmse, r2_square

In [34]:
from sklearn.linear_model import ElasticNet

model = ElasticNet(alpha=0.1, l1_ratio=0.9, selection='random', random_state=42)
model.fit(X_train, y_train)

test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

results_df_2 = pd.DataFrame(data=[["Elastic Net Regression", *evaluate(y_test, test_pred) , cross_val(ElasticNet())]], 
                            columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', "Cross Validation"])
results_df_2

  model = cd_fast.enet_coordinate_descent(


Test set evaluation:
_____________________________________
MAE: 0.10614772658706316
MSE: 0.024379983486398495
RMSE: 0.1561409090738186
R2 Square 0.8414819765164441
__________________________________
Train set evaluation:
_____________________________________
MAE: 0.11063726097714477
MSE: 0.023239400888848696
RMSE: 0.15244474700313126
R2 Square 0.8565146637434446
__________________________________


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


NameError: name 'results_df' is not defined

In [39]:
import pandas as pd
import numpy as np
from sklearn.linear_model import RANSACRegressor, LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

# Assuming X_train, y_train, X_test, y_test are already defined

# Normalize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define evaluation functions
def print_evaluate(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2 = r2_score(true, predicted)
    print(f'MAE: {mae}')
    print(f'MSE: {mse}')
    print(f'RMSE: {rmse}')
    print(f'R2 Square: {r2}')

def evaluate(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2 = r2_score(true, predicted)
    return mae, mse, rmse, r2

def cross_val(model):
    scores = cross_val_score(model, X_train_scaled, y_train, cv=10, scoring='neg_mean_squared_error')
    return np.sqrt(-scores).mean()

# Robust Regression
model = RANSACRegressor(estimator=LinearRegression(), max_trials=100)
model.fit(X_train_scaled, y_train)

test_pred = model.predict(X_test_scaled)
train_pred = model.predict(X_train_scaled)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

results_df_2 = pd.DataFrame(data=[["Robust Regression", *evaluate(y_test, test_pred), cross_val(RANSACRegressor(estimator=LinearRegression(), max_trials=100))]], 
                            columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2 Square', "Cross Validation"])
results_df_2

Test set evaluation:
_____________________________________
MAE: 4976373345.587586
MSE: 8.223440728740376e+20
RMSE: 28676542205.678104
R2 Square: -5.346859940579328e+21
Train set evaluation:
_____________________________________
MAE: 5957795961.344128
MSE: 9.897143911822299e+20
RMSE: 31459726495.667915
R2 Square: -6.110721308864583e+21


Unnamed: 0,Model,MAE,MSE,RMSE,R2 Square,Cross Validation
0,Robust Regression,4976373000.0,8.223441e+20,28676540000.0,-5.34686e+21,24996990000.0
