# Notebook 2: Baseline Model and Performance

In [210]:
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder

%matplotlib inline
warnings.filterwarnings('ignore')

# Notebook author metadata.
__author__ = "Vusal Babashov"
__email__ = "vbabashov@gmail.com"
__website__ = 'https://vbabashov.github.io'

In [211]:
# Column groups for the house-prices training frame. They are passed to the
# imputation and encoding helpers when the preprocessing cell runs.

# Nominal (unordered) categorical columns; handed to the encoding helper as the
# columns to one-hot encode.
nominal = ['MSSubClass', 'MSZoning', 'Street', 'LandContour', 'LotConfig', 
                   'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
                   'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 
                   'Foundation', 'Heating', 'CentralAir', 'GarageType', 'MoSold',
                   'SaleType', 'SaleCondition'] # Alley and MiscFeature removed (presumably too sparse -- TODO confirm)

# Ordinal categorical columns. The ORDER of this list matters: it must match the
# order of `ordinal_categories_list` below position by position, because both are
# handed to the ordinal encoder together.
ordinal = ['LotShape', 'Utilities', 'LandSlope', 'OverallQual', 'OverallCond', 
                   'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 
                   'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'Electrical', 'KitchenQual', 
                   'Functional', 'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond',
                   'PavedDrive'] # PoolQC and Fence removed (presumably too sparse -- TODO confirm)


# Numeric columns. Note this list also contains the row identifier 'Id' and the
# target 'SalePrice'; both are dropped from the feature matrix before modelling.
numeric = ['Id','LotFrontage','LotArea','YearBuilt','YearRemodAdd','MasVnrArea','BsmtFinSF1',
                  'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea',
                  'BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr', 'TotRmsAbvGrd',
                  'Fireplaces','GarageCars','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch',
                  '3SsnPorch','ScreenPorch','PoolArea','MiscVal', 'GarageYrBlt', 'YrSold', 'SalePrice']

# All categorical columns (nominal first, then ordinal).
categorical = nominal+ordinal

# Ordinal category values: one list of allowed levels per ordinal column.
# The position of a level inside its list is the integer the ordinal encoder
# assigns to it. Levels are presumably ordered from worst/least to best/most
# according to the dataset's data description -- TODO confirm.
# NOTE(review): several scales start with the string 'NA'. pandas.read_csv treats
# the literal text "NA" as a missing value by default, so those cells likely arrive
# as NaN and are mode-imputed rather than encoded as level 0 -- confirm.
lot_shape = ['IR3','IR2','IR1','Reg']
utilities = ['ELO', 'NoSeWa', 'NoSewr','AllPub']
land_slope = ['Sev','Mod','Gtl']
overall_qual = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  # already numeric and ordered
overall_cond = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  # already numeric and ordered
exter_qual = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
exter_cond = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
bsmt_qual  = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
bsmt_cond  = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
bsmt_exposure  = ['NA', 'No', 'Mn', 'Av', 'Gd']
bsmt_fin_type1 = ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
bsmt_fin_type2 = ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
heating_qual = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
electrical = ['Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr']
kitchen_qual = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
functional = ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ']
fire_place_qual = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
garage_finish = ['NA', 'Unf', 'RFn', 'Fin']
garage_qual = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
garage_cond = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
paved_drive = ['N', 'P', 'Y']

# Level lists in exactly the same order as the column names in `ordinal`
# (21 entries each); keep the two lists in sync when adding or removing a column.
ordinal_categories_list = [lot_shape , utilities, land_slope, overall_qual, overall_cond, exter_qual, exter_cond, bsmt_qual, 
                          bsmt_cond, bsmt_exposure, bsmt_fin_type1, bsmt_fin_type2, heating_qual, electrical, kitchen_qual,
                          functional, fire_place_qual, garage_finish, garage_qual, garage_cond, paved_drive]  

In [212]:
def log_transform(df):
    '''Return a copy of ``df`` whose ``SalePrice`` column is log-transformed.

    The natural logarithm (``np.log``) is applied to the target column. The
    input frame is left untouched, so calling this function (or re-running the
    cell that calls it) never applies the logarithm twice to the same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``SalePrice`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order, ``SalePrice``
        replaced by ``log(SalePrice)``.

    Raises
    ------
    KeyError
        If ``df`` has no ``SalePrice`` column.
    '''
    # assign() builds a new DataFrame instead of writing into the caller's object.
    return df.assign(SalePrice=np.log(df['SalePrice']))
    
def drop_missing_cols_df(df, threshold=0.8):
    '''Drop the columns whose share of missing values is at least ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    threshold : float, default 0.8
        Minimum fraction of missing rows (inclusive) for a column to be
        dropped. The default drops columns with 80% or more missing data.

    Returns
    -------
    dropped_df : pandas.DataFrame
        New frame without the sparse columns.
    dropped_cols : list
        Labels of the columns that were removed, in their original order.
    '''
    # Fraction of missing rows per column. For a frame with zero rows this is
    # NaN, which never satisfies the comparison, so nothing is dropped.
    missing_share = df.isnull().sum() / len(df)
    dropped_cols = missing_share[missing_share >= threshold].index.tolist()
    dropped_df = df.drop(columns=dropped_cols)
    return dropped_df, dropped_cols


def impute_missing_values(df, categorical_features, numeric_features):
    '''Fill missing values: mode for categorical columns, median for numeric ones.

    The imputation is performed on a copy, so the caller's frame is never
    modified. Only columns that actually contain missing values are touched.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain every listed column.
    categorical_features : list
        Columns filled with their most frequent value. When several values are
        tied, the smallest one is used (``Series.mode`` returns sorted values).
    numeric_features : list
        Columns filled with their median (missing values are ignored when the
        median is computed).

    Returns
    -------
    pandas.DataFrame
        New frame with the missing values filled. A column that is entirely
        missing has no mode/median and is returned unchanged.

    Raises
    ------
    KeyError
        If a listed column is not present in ``df``.
    '''
    imputed_df = df.copy()
    for col in categorical_features + numeric_features:
        column = imputed_df[col]
        if not column.isnull().any():
            continue
        # A column listed in both groups is treated as categorical.
        if col in categorical_features:
            modes = column.mode()  # missing values are excluded by default
            if modes.empty:
                continue  # nothing observed, so there is no value to fill with
            fill_value = modes.iloc[0]
        else:
            fill_value = column.median()
            if pd.isnull(fill_value):
                continue  # all values missing
        imputed_df[col] = column.fillna(fill_value)
    return imputed_df


def convert_month_string(df):
    '''Return a copy of ``df`` with the numeric ``MoSold`` month mapped to a label.

    Month numbers 1-12 are replaced by their string labels so the column is
    treated as a category. The input frame is not modified, which keeps the
    function safe to call repeatedly on the same source data (mapping an
    already converted column would turn every value into NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``MoSold`` column holding month numbers 1-12.

    Returns
    -------
    pandas.DataFrame
        New frame with ``MoSold`` replaced by month labels. Values that are
        not keys of the mapping become NaN (behaviour of ``Series.map``).
    '''
    # Labels are kept exactly as before ('June'/'July' are four letters) because
    # they end up in the one-hot column names derived from this column.
    month_labels = {
        1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr',
        5: 'May', 6: 'June', 7: 'July', 8: 'Aug',
        9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec',
    }
    return df.assign(MoSold=df['MoSold'].map(month_labels))


def ordinal_hot_encoding(df, nom_cols, ord_cols, ordinal_categories_list):
    '''Ordinal-encode the ordinal columns and one-hot encode the nominal ones.

    The encoding is done on a copy, so the caller's frame keeps its original
    (string) values.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame containing every column in ``nom_cols`` and ``ord_cols``.
    nom_cols : list
        Nominal columns, expanded into dummy columns with ``pd.get_dummies``.
        The first level of each column is dropped (``drop_first=True``).
    ord_cols : list
        Ordinal columns, replaced by the position of each value inside the
        matching list of ``ordinal_categories_list``.
    ordinal_categories_list : list of list
        Allowed levels per ordinal column, in the same order as ``ord_cols``.

    Returns
    -------
    pandas.DataFrame
        New frame with encoded ordinal columns and dummy columns in place of
        the nominal ones.
    '''
    encoded_df = df.copy()
    # Explicit categories fix the integer assigned to each level; with the
    # encoder's default settings a value outside its list raises an error.
    ord_encoder = OrdinalEncoder(categories=ordinal_categories_list)
    encoded_df[ord_cols] = ord_encoder.fit_transform(encoded_df[ord_cols])
    return pd.get_dummies(encoded_df, drop_first=True, columns=nom_cols)

In [213]:
# Directory holding the raw house-prices files. Set the HOUSE_PRICES_DATA_DIR
# environment variable to run the notebook on another machine; the default is
# the original author's local path, so existing behaviour is unchanged.
DATA_DIR = Path(os.environ.get("HOUSE_PRICES_DATA_DIR",
                               "/Users/vusalbabashov/Desktop/house-prices/data/raw"))
data_train_raw = pd.read_csv(DATA_DIR / "train.csv") # load the data

In [214]:
# Preprocessing pipeline: log target -> drop sparse columns -> impute -> encode.
# A copy of the raw frame is passed in so that `data_train_raw` stays pristine
# and this cell can be re-run safely even if a helper writes into its input
# (otherwise SalePrice would be log-transformed again on every re-run).
# NOTE(review): imputation statistics are computed on the full frame, i.e.
# before the train/test split, so held-out rows influence the fill values.
transformed_df = log_transform(data_train_raw.copy()) # log transform
clean_df, columns = drop_missing_cols_df (transformed_df) # drop the missing columns
imputed_df = impute_missing_values (clean_df, categorical, numeric)  #impute missing values 
encoded_df = ordinal_hot_encoding(convert_month_string(imputed_df), nominal, ordinal, ordinal_categories_list) # one hot and ordinal encoding

In [215]:
# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
# Predictors: every encoded column except the row identifier and the target.
# Target: SalePrice, which is on the log scale at this point.
X_train, X_test, y_train, y_test = train_test_split(
    encoded_df.drop(columns=['Id', 'SalePrice']),
    encoded_df['SalePrice'],
    random_state=0,
    test_size=0.2,
)

# Show the resulting (rows, columns) of both feature matrices.
(X_train.shape, X_test.shape)

((1168, 213), (292, 213))

In [216]:
# Baseline: ordinary least-squares regression fitted on log(SalePrice).
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Targets and predictions are on the log scale; exponentiate both so the errors
# are reported in the original price units.
y_test_price = np.exp(y_test)
y_pred_price = np.exp(y_pred)

print ('\n MAE  for the Baseline Model: %.2f'%  mean_absolute_error(y_test_price, y_pred_price))
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
print (' RMSE for the Baseline Model : %.2f'%   np.sqrt(mean_squared_error(y_test_price, y_pred_price)))


 MAE  for the Baseline Model: 24139.18
 RMSE for the Baseline Model : 149478.70
