In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import train_test_split
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import joblib


In [3]:
# Load the raw house-prices dataset from disk and preview the first rows.
raw_data_path = '/home/kolade/repos/house-prediction/dataset/house_prices_data.csv'
df = pd.read_csv(raw_data_path)
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Keep an independent (deep) copy of the loaded frame.
df_copy = df.copy(deep=True)

In [5]:
# Hold out 15% of the rows as a test split (shuffled, fixed seed).
# The whole frame is passed as X, so 'SalePrice' is also a column of
# train_X / test_X in addition to being the target in train_y / test_y.
train_X, test_X, train_y, test_y = train_test_split(
    df,
    df["SalePrice"],
    test_size=0.15,
    random_state=42,
    shuffle=True,
)

In [6]:
# Show the shape of each of the four split parts.
tuple(part.shape for part in (train_X, test_X, train_y, test_y))

((1241, 81), (219, 81), (1241,), (219,))

In [7]:
# Read the previously selected feature names (single header-less column)
# and add 'LotFrontage' at the end of the list.
selected_features_path = '/home/kolade/repos/house-prediction/dataset/selected_features.csv'
selected_features_series = pd.read_csv(selected_features_path, header=None)[0]
list_selected_feats = [*selected_features_series.values, 'LotFrontage']

In [8]:
# Selected categorical (object-dtype) features with at least one missing value
# in the training split. 'hasFireplaces' is a generated feature (per the
# original note), so it is skipped before train_X is indexed with it.
# The check is "any missing value": the previous `> 1` threshold silently
# ignored columns with exactly one missing entry, leaving them un-imputed.
missing_cat_vars = [
    i
    for i in list_selected_feats
    if i != 'hasFireplaces' and train_X[i].dtypes == 'O' and train_X[i].isnull().any()
]

In [9]:
# Display the categorical features flagged as having missing values.
missing_cat_vars

['BsmtQual', 'GarageCond', 'GarageQual', 'GarageFinish', 'FireplaceQu']

In [10]:
def fill_categorical_na(data, cat_missing_cols):
    """Return a copy of ``data`` with missing values in the given columns labelled.

    Missing entries in ``cat_missing_cols`` are replaced by the literal
    category 'MissingValue'. All other columns are copied unchanged, and the
    input frame itself is not modified.
    """
    replacement = data[cat_missing_cols].fillna(value="MissingValue")
    filled = data.copy()
    filled[cat_missing_cols] = replacement
    return filled

In [11]:
# Label missing categorical values in both splits, using the columns
# that were flagged on the training split.
train_X, test_X = (
    fill_categorical_na(split, missing_cat_vars) for split in (train_X, test_X)
)

In [12]:
# Verify: remaining null counts per filled column, train split first, then test.
train_null_counts = train_X[missing_cat_vars].isna().sum()
test_null_counts = test_X[missing_cat_vars].isna().sum()
display(train_null_counts)
print("----Test")
display(test_null_counts)

BsmtQual        0
GarageCond      0
GarageQual      0
GarageFinish    0
FireplaceQu     0
dtype: int64

----Test


BsmtQual        0
GarageCond      0
GarageQual      0
GarageFinish    0
FireplaceQu     0
dtype: int64

In [13]:
# Selected numerical (non-object dtype) features with at least one missing
# value in the training split. 'hasFireplaces' is a generated feature (per the
# original note), so it is skipped before train_X is indexed with it.
# The check is "any missing value": the previous `> 1` threshold silently
# ignored columns with exactly one missing entry, leaving them un-imputed.
missing_num_vars = [
    i
    for i in list_selected_feats
    if i != 'hasFireplaces' and train_X[i].dtypes != 'O' and train_X[i].isnull().any()
]

In [14]:
# Impute numeric missing values with the median of the TRAINING split.
# The medians are kept in a dict so the same values can be reused later.
missing_feats_median = {col: train_X[col].median() for col in missing_num_vars}

for col, train_median in missing_feats_median.items():
    # The test split is filled with the training median, not its own.
    train_X[col] = train_X[col].fillna(train_median)
    test_X[col] = test_X[col].fillna(train_median)

In [15]:
# Display the per-feature medians computed from the training split.
missing_feats_median

{'LotFrontage': 70.0}

In [None]:
# Persist the training medians for production use.
# NOTE(review): np.save stores a dict as a pickled 0-d object array, so the
# reader presumably needs np.load(..., allow_pickle=True).item() - confirm
# against the production loading code.
median_dict_path = '/home/kolade/repos/house-prediction/persist/dict_median.npy'
np.save(median_dict_path, missing_feats_median)

In [18]:
# Verify: remaining null counts in the imputed numeric columns (train split).
train_X[missing_num_vars].isna().sum()

LotFrontage    0
dtype: int64

In [20]:
# All selected categorical (object-dtype) features, excluding 'hasFireplaces'.
cat_vars = [
    feat
    for feat in list_selected_feats
    if feat != 'hasFireplaces' and train_X[feat].dtypes == 'O'
]

In [25]:
# Display the selected categorical (object-dtype) features.
cat_vars

['ExterQual',
 'BsmtQual',
 'Neighborhood',
 'KitchenQual',
 'GarageCond',
 'GarageQual',
 'GarageFinish',
 'CentralAir',
 'FireplaceQu',
 'LandContour']

In [24]:
#function to capture rare variable
def frequent_values(data, cat_col, value):
    """Return the categories of ``cat_col`` that occur frequently enough.

    For every category of ``cat_col`` the number of non-null 'SalePrice'
    entries is divided by the total number of rows in ``data``; the index of
    categories whose share is strictly greater than ``value`` is returned.
    ``data`` must therefore contain a 'SalePrice' column.
    """
    share_per_category = data.groupby(cat_col)["SalePrice"].count() / len(data)
    is_frequent = share_per_category > value
    return share_per_category[is_frequent].index

In [None]:
# Collapse infrequent categories into a single 'Rare' label.
# The frequent categories (share of rows > 1%) are learned on the training
# split only, stored in frequent_label_dict, and applied to both splits.
frequent_label_dict = {}
for i in cat_vars:
    #note that this is done on the train_set
    frequent_data = frequent_values(train_X, i, 0.01)
    frequent_label_dict[i] = frequent_data
    # Keep a value only when it is one of the frequent categories; everything
    # else becomes 'Rare'. (The original referenced an undefined name,
    # `rare_data`, here, which raised NameError.)
    train_X[i] = np.where(train_X[i].isin(frequent_data), train_X[i], 'Rare')
    test_X[i] = np.where(test_X[i].isin(frequent_data), test_X[i], 'Rare')