# load modules and functions, hide warnings

In [3]:
import numpy as np
import pandas as pd

from scipy import stats
from scipy.stats import norm, skew
import matplotlib.pyplot as plt
import seaport as sns

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor

import warnings
warnings.filterwarnings('ignore')

# load dataset

In [4]:
train = pd.read_csv("house-prices-advanced-regression-techniques/train.csv")
test = pd.read_csv("house-prices-advanced-regression-techniques/test.csv")

# data wrangling:

In [5]:
# Deleting outliers
#train = train.drop(train[(train['GrLivArea']>4000) & (train['SalePrice']<300000)].index)

# extract labels and index
length = len(train)
train_id = train["Id"]
test_id = test["Id"]
train_label = train["SalePrice"]

train = train.drop(columns = "SalePrice")

full = pd.concat([train, test], axis = 0).reset_index()

# based on dataset description:
- "MSSubClass" column actually contains parallel levels, it needs to be converted to categorical;
- "Alley", "FireplaceQu", "PoolQC", "Fence" and "MiscFeature" columns contain few information, which need to be dropped;
- "LotShape", "Utilities", "LandSlope", "ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "HeatingQC", "CentralAir", "Electrical", "KitchenQual", "Functional", "GarageFinish", "GarageQual" and "GarageCond" columns contain levels, which need to be converted to numeric.
- years in any kind need to convert to categorical.

then all the numerical columns are normalize, all the categorical columns are one hot encoded.

In [11]:
def process(x):
    # there are several columns are almost empty, remove them. also remove ID
    x = x.drop(columns=["Id", "Alley", "FireplaceQu", "PoolQC", "Fence", "MiscFeature"])
    # "MSSubClass" actually have parallel levels. turn it to categorical
    x["MSSubClss"] = x["MSSubClass"].astype("object")
    
    for i in range(len(x["LotShape"])):
        if x["LotShape"][i] == "Reg":
            x["LotShape"][i] = 3
        elif x["LotShape"][i] == "IR1":
            x["LotShape"][i] = 2
        elif x["LotShape"][i] == "IR2":
            x["LotShape"][i] = 1
        else:
            x["LotShape"][i] = 0
        
    for i in range(len(x["Utilities"])):
        if x["Utilities"][i] == "AllPub":
            x["Utilities"][i] = 3
        elif x["Utilities"][i] == "NoSewr":
            x["Utilities"][i] = 2
        elif x["Utilities"][i] =="NoSeWa":
            x["Utilities"][i] = 2
        else:
            x["Utilities"][i] = 1

    for i in range(len(x["LandSlope"])):
        if x["LandSlope"][i] == "Gtl":
            x["LandSlope"][i] = 2
        elif x["LandSlope"][i] == "Mod":
            x["LandSlope"][i] = 1
        else:
            x["LandSlope"][i] = 0
        
    for i in range(len(x["ExterQual"])):
        if x["ExterQual"][i] == "Ex":
            x["ExterQual"][i] = 4
        elif x["ExterQual"][i] == "Gd":
            x["ExterQual"][i] = 3
        elif x["ExterQual"][i] == "TA":
            x["ExterQual"][i] = 2
        elif x["ExterQual"][i] == "Fa":
            x["ExterQual"][i] = 1
        else:
            x["ExterQual"][i] = 0
        
    for i in range(len(x["ExterCond"])):
        if x["ExterCond"][i] == "Ex":
            x["ExterCond"][i] = 4
        elif x["ExterCond"][i] == "Gd":
            x["ExterCond"][i] = 3
        elif x["ExterCond"][i] == "TA":
            x["ExterCond"][i] = 2
        elif x["ExterCond"][i] == "Fa":
            x["ExterCond"][i] = 1
        else:
            x["ExterCond"][i] = 0
        
    for i in range(len(x["BsmtQual"])):
        if x["BsmtQual"][i] == "Ex":
            x["BsmtQual"][i] = 5
        elif x["BsmtQual"][i] == "Gd":
            x["BsmtQual"][i] = 4
        elif x["BsmtQual"][i] == "TA":
            x["BsmtQual"][i] = 3
        elif x["BsmtQual"][i] == "Fa":
            x["BsmtQual"][i] = 2
        elif x["BsmtQual"][i] == "Po":
            x["BsmtQual"][i] = 1
        else:
            x["BsmtQual"][i] = 0
        
    for i in range(len(x["BsmtCond"])):
        if x["BsmtCond"][i] == "Ex":
            x["BsmtCond"][i] = 5
        elif x["BsmtCond"][i] == "Gd":
            x["BsmtCond"][i] = 4
        elif x["BsmtCond"][i] == "TA":
            x["BsmtCond"][i] = 3
        elif x["BsmtCond"][i] == "Fa":
            x["BsmtCond"][i] = 2
        elif x["BsmtCond"][i] == "Po":
            x["BsmtCond"][i] = 1
        else:
            x["BsmtCond"][i] = 0

    for i in range(len(x["BsmtExposure"])):
        if x["BsmtExposure"][i] == "Gd":
            x["BsmtExposure"][i] = 4
        elif x["BsmtExposure"][i] == "Av":
            x["BsmtExposure"][i] = 3
        elif x["BsmtExposure"][i] == "Mn":
            x["BsmtExposure"][i] = 2
        elif x["BsmtExposure"][i] == "No":
            x["BsmtExposure"][i] = 1
        else:
            x["BsmtExposure"][i] = 0

    for i in range(len(x["BsmtFinType1"])):
        if x["BsmtFinType1"][i] == "GLQ":
            x["BsmtFinType1"][i] = 6
        elif x["BsmtFinType1"][i] == "ALQ":
            x["BsmtFinType1"][i] = 5
        elif x["BsmtFinType1"][i] == "BLQ":
            x["BsmtFinType1"][i] = 4
        elif x["BsmtFinType1"][i] == "Rec":
            x["BsmtFinType1"][i] = 3
        elif x["BsmtFinType1"][i] == "LwQ":
            x["BsmtFinType1"][i] = 2
        elif x["BsmtFinType1"][i] == "Unf":
            x["BsmtFinType1"][i] = 1
        else:
            x["BsmtFinType1"][i] = 0

    for i in range(len(x["BsmtFinType2"])):
        if x["BsmtFinType2"][i] == "GLQ":
            x["BsmtFinType2"][i] = 6
        elif x["BsmtFinType2"][i] == "ALQ":
            x["BsmtFinType2"][i] = 5
        elif x["BsmtFinType2"][i] == "BLQ":
            x["BsmtFinType2"][i] = 4
        elif x["BsmtFinType2"][i] == "Rec":
            x["BsmtFinType2"][i] = 3
        elif x["BsmtFinType2"][i] == "LwQ":
            x["BsmtFinType2"][i] = 2
        elif x["BsmtFinType2"][i] == "Unf":
            x["BsmtFinType2"][i] = 1
        else:
            x["BsmtFinType2"][i] = 0

    for i in range(len(x["HeatingQC"])):
        if x["HeatingQC"][i] == "Ex":
            x["HeatingQC"][i] = 4
        elif x["HeatingQC"][i] == "Gd":
            x["HeatingQC"][i] = 3
        elif x["HeatingQC"][i] == "TA":
            x["HeatingQC"][i] = 2
        elif x["HeatingQC"][i] == "Fa":
            x["HeatingQC"][i] = 1
        else:
            x["HeatingQC"][i] = 0

    for i in range(len(x["CentralAir"])):
        if x["CentralAir"][i] == "N":
            x["CentralAir"][i] = 0
        else:
            x["CentralAir"][i] = 1

    for i in range(len(x["Electrical"])):
        if x["Electrical"][i] == "SBrkr":
            x["Electrical"][i] = 4
        elif x["Electrical"][i] == "FuseA":
            x["Electrical"][i] = 3
        elif x["Electrical"][i] == "FuseF":
            x["Electrical"][i] = 2
        elif x["Electrical"][i] == "FuseP":
            x["Electrical"][i] = 1
        else:
            x["Electrical"][i] = 0

    for i in range(len(x["KitchenQual"])):
        if x["KitchenQual"][i] == "Ex":
            x["KitchenQual"][i] = 4
        elif x["KitchenQual"][i] == "Gd":
            x["KitchenQual"][i] = 3
        elif x["KitchenQual"][i] == "TA":
            x["KitchenQual"][i] = 2
        elif x["KitchenQual"][i] == "Fa":
            x["KitchenQual"][i] = 1
        else:
            x["KitchenQual"][i] = 0

    for i in range(len(x["Functional"])):
        if x["Functional"][i] == "Typ":
            x["Functional"][i] = 7
        elif x["Functional"][i] == "Min1":
            x["Functional"][i] = 6
        elif x["Functional"][i] == "Min2":
            x["Functional"][i] = 5
        elif x["Functional"][i] == "Mod":
            x["Functional"][i] = 4
        elif x["Functional"][i] == "Maj1":
            x["Functional"][i] = 3
        elif x["Functional"][i] == "Maj2":
            x["Functional"][i] = 2
        elif x["Functional"][i] == "Sev":
            x["Functional"][i] = 1
        else:
            x["Functional"][i] = 0

    for i in range(len(x["GarageFinish"])):
        if x["GarageFinish"][i] == "Fin":
            x["GarageFinish"][i] = 3
        elif x["GarageFinish"][i] == "RFn":
            x["GarageFinish"][i] = 2
        elif x["GarageFinish"][i] == "Unf":
            x["GarageFinish"][i] = 1
        else:
            x["GarageFinish"][i] = 0

    for i in range(len(x["GarageQual"])):
        if x["GarageQual"][i] == "Ex":
            x["GarageQual"][i] = 5
        elif x["GarageQual"][i] == "Gd":
            x["GarageQual"][i] = 4
        elif x["GarageQual"][i] == "TA":
            x["GarageQual"][i] = 3
        elif x["GarageQual"][i] == "Fa":
            x["GarageQual"][i] = 2
        elif x["GarageQual"][i] == "Po":
            x["GarageQual"][i] = 1
        else:
            x["GarageQual"][i] = 0

    for i in range(len(x["GarageCond"])):
        if x["GarageCond"][i] == "Ex":
            x["GarageCond"][i] = 5
        elif x["GarageCond"][i] == "Gd":
            x["GarageCond"][i] = 4
        elif x["GarageCond"][i] == "TA":
            x["GarageCond"][i] = 3
        elif x["GarageCond"][i] == "Fa":
            x["GarageCond"][i] = 2
        elif x["GarageCond"][i] == "Po":
            x["GarageCond"][i] = 1
        else:
            x["GarageCond"][i] = 0
    
    # convert these columns to numeric
    columns = ["LotShape", "Utilities", "LandSlope", "ExterQual", "ExterCond",
          "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "KitchenQual",
          "BsmtFinType2", "HeatingQC", "CentralAir", "Electrical",
          "Functional", "GarageFinish", "GarageQual", "GarageCond"]
    category = ["YearBuilt", "YearRemodAdd", "GarageYrBlt", "MoSold", "YrSold"]
    x[columns] = x[columns].astype("int")
    x[category] = x[category].astype("object")
    
    # select categorical and numerical column names
    categorical = list(x.select_dtypes(include="object").columns)
    numeric = [col for col in x.columns if col not in categorical]
    
    # select categorical and numerical columns
    cat = x[categorical]
    num = x[numeric]

    # impute cateorical data
    imp_cat = SimpleImputer(missing_values=np.NaN, strategy="most_frequent")
    cat_imp = imp_cat.fit_transform(cat)
    ohc = OneHotEncoder(sparse=False)
    cat_ready = pd.DataFrame(ohc.fit_transform(cat_imp))

    # impute numerical data
    imp_num = SimpleImputer(missing_values=np.NaN, strategy="mean")
    num_imp = imp_num.fit_transform(num)
    mms = MinMaxScaler()
    num_ready = pd.DataFrame(mms.fit_transform(num_imp))

    data_ready = pd.concat([cat_ready, num_ready], axis=1)
    
    return data_ready

In [None]:
full = full.replace({"Street":{"Grvl":0, 'Pave':1},
                    "Alley":{"NA":0, "Grvl":1, "Pave":2},
                    "LotShape":{"Reg":3,"IR1":2,"IR2":1,"IR3":0},
                    "Utilities":{"AllPub":3,"NoSewr":2,:"NoSeWa":1,"ELO":0},
                    "LandSlope":{"Gtl":2,'Mod':1,"Sev":0}})

# process data

In [12]:
full_processed = process(full)


In [20]:
train_pro = full_processed[:length]
test_pro = full_processed[length:]

Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]


# define a funcion to calculate RMSE

In [14]:
def rmse(true, test):
    rmse = np.sqrt(mean_squared_error(true, test))
    return rmse

# split data

In [23]:
X_train, y_train, X_test, y_test = train_test_split(train_pro, train_label, test_size=0.2, random_state=1988)

# classic linear regression

In [24]:
lr = LinearRegression()
lr.fit(X_train, X_test)
pred_lr = lr.predict(y_train)
print(rmse(y_test, pred_lr))

1.6466077252159102e+16


# lasso regression with grid of alpha values

In [25]:
param_grid = {"alpha":np.linspace(100, 10000, 10)}

ls = Lasso()
grid_ls = GridSearchCV(ls, param_grid=param_grid, cv=10)
grid_ls.fit(X_train, X_test)
pred_ls = grid_ls.predict(y_train)
print(grid_ls.best_params_)
print(grid_ls.best_score_)
print(rmse(y_test, pred_ls))

{'alpha': 100.0}
0.7881346589500677
24038.52154239867


# ridge regression with grid of alpha values

In [26]:
rg = Ridge()
grid_rg = GridSearchCV(rg, param_grid=param_grid, cv=10)
grid_rg.fit(X_train, X_test)
pred_rg = grid_rg.predict(y_train)
print(grid_rg.best_params_)
print(grid_rg.best_score_)
print(rmse(y_test, pred_rg))

{'alpha': 100.0}
0.728829575684029
37598.76881586574


# lasso regression outperforms other regression.

# XGBoost:

In [1]:
xgb = XGBRegressor()
xgb.fit(X_train, X_test)
pred_xgb = xgb.predict(y_train)
print(rmse(y_test, pred_xgb))

NameError: name 'XGBRegressor' is not defined

# run prediction on test data.