In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [49]:
# Load house_price.csv into the raw working frame; the path is relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer=r"house_price.csv")

In [None]:
# Print the column names, dtypes and non-null counts of the raw frame.
df.info()

In [51]:
# Columns removed from the raw frame before modelling, kept in the order in
# which they were originally listed.
dropColumns = [
    "Id", "MSSubClass", "MSZoning", "Street", "LandContour", "Utilities",
    "LandSlope", "Condition1", "Condition2", "BldgType", "OverallCond",
    "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType",
    "ExterCond", "Foundation", "BsmtCond", "BsmtExposure", "BsmtFinType1",
    "BsmtFinType2", "BsmtFinSF2", "BsmtUnfSF", "Heating", "Electrical",
    "LowQualFinSF", "BsmtFullBath", "BsmtHalfBath", "HalfBath",
    "SaleCondition", "SaleType", "YrSold", "MoSold", "MiscVal", "MiscFeature",
    "Fence", "PoolQC", "PoolArea", "ScreenPorch", "3SsnPorch", "EnclosedPorch",
    "OpenPorchSF", "WoodDeckSF", "PavedDrive", "GarageCond", "GarageQual",
    "GarageType", "FireplaceQu", "Functional", "KitchenAbvGr", "BedroomAbvGr",
]

# `columns=` already names the axis, so no separate `axis` argument is needed.
droppedDf = df.drop(columns=dropColumns)

In [None]:
# import pandas_profiling 

# profile = droppedDf.profile_report(title='Pandas Profiling Report')
# profile.to_file(output_file="Data_Profiling_v3.html")

In [53]:
# Missing-value count per column, most incomplete columns first.
droppedDf.isna().sum().sort_values(ascending=False)

Alley           1369
LotFrontage      259
GarageFinish      81
GarageYrBlt       81
BsmtQual          37
MasVnrArea         8
CentralAir         0
GarageArea         0
GarageCars         0
Fireplaces         0
TotRmsAbvGrd       0
KitchenQual        0
FullBath           0
GrLivArea          0
2ndFlrSF           0
1stFlrSF           0
TotalBsmtSF        0
HeatingQC          0
LotArea            0
BsmtFinSF1         0
ExterQual          0
YearRemodAdd       0
YearBuilt          0
OverallQual        0
HouseStyle         0
Neighborhood       0
LotConfig          0
LotShape           0
SalePrice          0
dtype: int64

In [54]:
# Number of rows with no Alley value.
droppedDf.Alley.isna().sum()

1369

In [55]:
# Replace missing Alley values with the placeholder label "NO".
# The result is assigned back to the column: a chained `fillna(..., inplace=True)`
# on a column selection only updates a temporary object under pandas
# Copy-on-Write, leaving the frame unchanged.
droppedDf["Alley"] = droppedDf["Alley"].fillna("NO")

In [56]:
# Replace missing LotFrontage values with the mean of the LotFrontage column in `df`.
# The result is assigned back to the column: a chained `fillna(..., inplace=True)`
# on a column selection only updates a temporary object under pandas
# Copy-on-Write, leaving the frame unchanged.
droppedDf["LotFrontage"] = droppedDf["LotFrontage"].fillna(df.LotFrontage.mean())


In [57]:
# Replace missing GarageFinish values with the placeholder label "NO".
# The result is assigned back to the column: a chained `fillna(..., inplace=True)`
# on a column selection only updates a temporary object under pandas
# Copy-on-Write, leaving the frame unchanged.
droppedDf["GarageFinish"] = droppedDf["GarageFinish"].fillna("NO")

In [58]:
# Replace missing GarageYrBlt values with the mean of the GarageYrBlt column in `df`.
# The result is assigned back to the column: a chained `fillna(..., inplace=True)`
# on a column selection only updates a temporary object under pandas
# Copy-on-Write, leaving the frame unchanged.
droppedDf["GarageYrBlt"] = droppedDf["GarageYrBlt"].fillna(df.GarageYrBlt.mean())

In [59]:
# Replace missing BsmtQual values with the placeholder label "NO".
# The result is assigned back to the column: a chained `fillna(..., inplace=True)`
# on a column selection only updates a temporary object under pandas
# Copy-on-Write, leaving the frame unchanged.
droppedDf["BsmtQual"] = droppedDf["BsmtQual"].fillna("NO")

In [60]:
# Replace missing MasVnrArea values with 0.
# The result is assigned back to the column: a chained `fillna(..., inplace=True)`
# on a column selection only updates a temporary object under pandas
# Copy-on-Write, leaving the frame unchanged.
droppedDf["MasVnrArea"] = droppedDf["MasVnrArea"].fillna(0)

In [61]:
# Bucket MasVnrArea into size labels. Conditions are checked in order and the
# first one that holds decides the label; rows matching none get "NO".
area = droppedDf["MasVnrArea"]
droppedDf["MasVnrAreaCatg"] = np.select(
    [area > 1000, area > 500, area > 0],
    ["BIG", "MEDIUM", "SMALL"],
    default="NO",
)

In [None]:
# Print the column names, dtypes and non-null counts of the frame after the
# column drops, fills and the added MasVnrAreaCatg column.
droppedDf.info()

In [62]:
# Template for a single prediction input: the first row of the cleaned frame
# without the SalePrice column. `.copy()` makes it independent of droppedDf.
inputDf = droppedDf.drop(columns=["SalePrice"]).iloc[[0]].copy()

In [72]:
# Overwrite the template row with one representative value per column:
# the most frequent value for object columns, the mean for int64/float64
# columns. Columns of any other dtype are left as they are.
for column in inputDf.columns:
    column_dtype = inputDf[column].dtype
    if column_dtype == "object":
        inputDf[column] = droppedDf[column].mode()[0]
    elif column_dtype in ("int64", "float64"):
        inputDf[column] = droppedDf[column].mean()

# Cast every object column of the input row to the pandas category dtype.
obj_feat = inputDf.columns[(inputDf.dtypes == "object").to_numpy()].tolist()
inputDf = inputDf.astype({feature: "category" for feature in obj_feat})

## Modeling

### Importing the libraries

In [64]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [65]:
# Work on an independent copy of the cleaned frame for modelling.
# Note: from here on `df` no longer refers to the raw CSV data.
df = droppedDf.copy(deep=True)

In [66]:
# Cast every object column of the modelling frame to the pandas category dtype.
obj_feat = df.columns[(df.dtypes == "object").to_numpy()].tolist()
df = df.astype({feature: "category" for feature in obj_feat})

In [67]:
# Separate the SalePrice target from the remaining feature columns.
y = df["SalePrice"]
x = df.drop(columns=["SalePrice"])

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=1
)

# Show the feature column names.
x.columns

Index(['LotFrontage', 'LotArea', 'Alley', 'LotShape', 'LotConfig',
       'Neighborhood', 'HouseStyle', 'OverallQual', 'YearBuilt',
       'YearRemodAdd', 'MasVnrArea', 'ExterQual', 'BsmtQual', 'BsmtFinSF1',
       'TotalBsmtSF', 'HeatingQC', 'CentralAir', '1stFlrSF', '2ndFlrSF',
       'GrLivArea', 'FullBath', 'KitchenQual', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea',
       'MasVnrAreaCatg'],
      dtype='object')

In [68]:
# Baseline LightGBM regressor with fixed hyperparameters.
baseline_params = {
    "max_depth": 5,
    "n_estimators": 100,
    "learning_rate": 0.2,
    "min_child_samples": 30,
}
model = lgb.LGBMRegressor(**baseline_params)
model.fit(x_train, y_train)

# Predictions and scores on the training rows.
pred_y_train = model.predict(x_train)
r2_train = metrics.r2_score(y_train, pred_y_train)
msle_train = metrics.mean_squared_log_error(y_train, pred_y_train)

# Predictions and scores on the held-out rows.
pred_y_test = model.predict(x_test)
r2_test = metrics.r2_score(y_test, pred_y_test)
msle_test = metrics.mean_squared_log_error(y_test, pred_y_test)

print(f"Train r2 = {r2_train:.2f} \nTest r2 = {r2_test:.2f}")
print(f"Train msle = {msle_train:.2f} \nTest msle = {msle_test:.2f}")

Train r2 = 0.97 
Test r2 = 0.91
Train msle = 0.01 
Test msle = 0.02


In [69]:
# Grid search over LightGBM hyperparameters with 5-fold cross-validation.
# NOTE: the name `gs_knn` is kept for compatibility; the searched estimator is
# the LightGBM regressor `model`, not a k-NN model.
from sklearn.model_selection import GridSearchCV

params = [{"max_depth": [3, 5],
           "n_estimators": [50, 100],
           "learning_rate": [0.1, 0.2],
           "min_child_samples": [20, 10]}]

gs_knn = GridSearchCV(model,
                      param_grid=params,
                      cv=5)

gs_knn.fit(x_train, y_train)

# Evaluate the tuned model: `gs_knn.predict` delegates to the best estimator
# refitted on the full training set. Predicting with `model` here would only
# re-score the baseline from the previous cell, because the search works on
# clones and leaves `model` untouched.
pred_y_train = gs_knn.predict(x_train)
pred_y_test = gs_knn.predict(x_test)

r2_train = metrics.r2_score(y_train, pred_y_train)
r2_test = metrics.r2_score(y_test, pred_y_test)

msle_train = metrics.mean_squared_log_error(y_train, pred_y_train)
msle_test = metrics.mean_squared_log_error(y_test, pred_y_test)

print(f"Train r2 = {r2_train:.2f} \nTest r2 = {r2_test:.2f}")
print(f"Train msle = {msle_train:.2f} \nTest msle = {msle_test:.2f}")

# Display the winning hyperparameter combination.
gs_knn.best_params_


Train r2 = 0.97 
Test r2 = 0.91
Train msle = 0.01 
Test msle = 0.02


{'learning_rate': 0.2,
 'max_depth': 3,
 'min_child_samples': 10,
 'n_estimators': 100}

In [70]:
# Save the fitted `model` to disk.
import pickle

filename = 'finalized_model.model'

# The context manager closes the file handle even if pickling raises, so the
# file is fully flushed before a later cell reads it back.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [73]:
# Load the model from disk
import pickle

filename = 'finalized_model.model'

# NOTE(security): unpickling can execute arbitrary code, so only load files
# that were produced by a trusted source such as this notebook.
# The context manager closes the file handle once the model is loaded.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Score the reloaded model on the held-out rows.
result = loaded_model.score(x_test, y_test)
print(result)

# predict
print(loaded_model.predict(inputDf))


0.9065596642357174
[159120.15487566]
