# Model Training Section

In [43]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn 
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_squared_log_error

## Reading the data

In [44]:
# Load the training data; the relative path is resolved against the
# notebook's working directory.
TRAIN_CSV = "../data/train.csv"
df = pd.read_csv(TRAIN_CSV)
df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Train Test Split

In [45]:
from sklearn.model_selection import train_test_split

# Select the target by name rather than by column position: a positional
# slice silently picks up the wrong columns if the CSV layout changes.
# Every remaining column stays in X; later cells pick the features by name.
TARGET = "SalePrice"
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=TARGET),
    df[TARGET],
    test_size=0.25,
    random_state=42,  # fixed seed so the split is reproducible
)

## One hot encoding of categorical features for train set

In [46]:
from sklearn.preprocessing import OneHotEncoder

# Dense output is needed because the encoded block is later stacked with
# plain numpy arrays. scikit-learn renamed `sparse` to `sparse_output`
# (new name available from 1.2, old name removed in 1.4), so try the new
# keyword first and fall back to the old one on older installations.
try:
    ohe = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(drop='first', sparse=False)

In [47]:
# Learn the category levels from the training split and encode them.
categorical_train = X_train[['GarageCars', 'Neighborhood']]
X_train_new = ohe.fit_transform(categorical_train)

In [48]:
# Build the training design matrix: the two continuous columns first,
# followed by the one-hot encoded categorical columns.
continuous_train = X_train[['GrLivArea', 'LotArea']].values
X_train_neww = np.concatenate([continuous_train, X_train_new], axis=1)

## Fitting the model

In [49]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the assembled training matrix. The bare `fit`
# call is kept as the last expression so the cell displays the estimator.
reg_multiple = LinearRegression()
reg_multiple.fit(X=X_train_neww, y=y_train)

LinearRegression()

# Model Evaluation Section

## One hot encoding of categorical features for test set

In [50]:
# Reuse the encoder that was fitted on the training split. Calling
# `fit_transform` here would refit it on the test data, so the dropped
# level and the column order could differ from what the model was trained on.
# NOTE(review): a category that occurs only in the test split is expected to
# raise here unless the encoder is configured with `handle_unknown` - confirm.
X_test_new = ohe.transform(X_test[['GarageCars','Neighborhood']])

In [51]:
# Build the test design matrix with the same column layout as the training
# one: the two continuous columns first, then the encoded categorical columns.
continuous_test = X_test[['GrLivArea', 'LotArea']].values
X_test_neww = np.concatenate([continuous_test, X_test_new], axis=1)

## Defining functions to assess the performance

In [52]:
def assess_performance(y_true, y_pred):
    """Print MSE, RMSE, MAE and R^2 for a set of regression predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        None. The four metrics are printed with two decimals.
    """
    mse = mean_squared_error(y_true, y_pred)
    # Take the square root explicitly instead of passing `squared=False`:
    # that keyword is deprecated in recent scikit-learn releases, and the
    # result is the same for a single-target regression such as this one.
    rmse = np.sqrt(mse)
    print("Mean square error (MSE): %.2f" % mse)
    print("Root mean square error (RMSE): %.2f" % rmse)
    print("Mean absolute error (MAE): %.2f" % mean_absolute_error(y_true, y_pred))
    print("R^2 score is: %.2f" % r2_score(y_true, y_pred))

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    """Return the root mean squared logarithmic error, rounded.

    Args:
        y_test: Observed target values.
        y_pred: Predicted target values, aligned with ``y_test``.
        precision: Number of decimals kept in the returned value.

    Returns:
        The square root of ``mean_squared_log_error(y_test, y_pred)``,
        rounded to ``precision`` decimals.
    """
    # NOTE(review): mean_squared_log_error is documented to reject negative
    # values - confirm the model cannot produce negative predictions.
    msle = mean_squared_log_error(y_test, y_pred)
    return round(np.sqrt(msle), precision)

## Prediction Quality

In [53]:
# Score the held-out split with the fitted model and report the metrics.
y_pred = reg_multiple.predict(X_test_neww)
assess_performance(y_true=y_test, y_pred=y_pred)

Mean square error (MSE): 1438380558.90
Root mean square error (RMSE): 37925.99
Mean absolute error (MAE): 25087.81
R^2 score is: 0.79


## Model evaluation with the competition metric: root mean squared error between the logarithm of the predicted and the logarithm of the observed sale price (RMSLE)

In [54]:
# Competition metric on the held-out split, at the default two decimals.
compute_rmsle(y_test=y_test, y_pred=y_pred)

0.2