# Relevant Imports

In [1]:
import numpy as np
import pandas as pd
import glob
import os
import xgboost
import csv as csv
#import ipychart as ipc
from xgboost import plot_importance
from matplotlib import pyplot
from sklearn.model_selection import cross_val_score,KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
# from sklearn.grid_search import GridSearchCV   #Perforing grid search
from scipy.stats import skew
from collections import OrderedDict

In [2]:
dataFrame = pd.read_csv("./CombinedData.csv", header = 0, low_memory=False)
dataFrame.insert(0, 'ID', range(0,len(dataFrame)))
dataFrame = dataFrame.sample(frac=1).reset_index(drop=True)
print(dataFrame.head())

      ID         SOLD DATE              PROPERTY TYPE  \
0  59483   January-31-2022  Single Family Residential   
1  57536               NaN  Single Family Residential   
2   1965  February-11-2022                Condo/Co-op   
3  65389    August-31-2022  Single Family Residential   
4  24940        May-4-2022  Single Family Residential   

                   ADDRESS         CITY STATE OR PROVINCE ZIP OR POSTAL CODE  \
0          179 Shumaker Dr  Statesville                NC              28625   
1     902 S Williamson Ave         Elon                NC              27244   
2  300 Orchard Trace Ln #2    Charlotte                NC              28213   
3           4240 Oakton Dr   High Point                NC              27265   
4          25 Muirfield Dr        Arden                NC              28704   

    PRICE  BEDS  BATHS         LOCATION  SQUARE FEET  LOT SIZE  YEAR BUILT  \
0  205000   2.0    1.0  Iredell Heights       1200.0   41382.0      1959.0   
1  220000   3.0    1

In [3]:
dataFrame = dataFrame[dataFrame['SQUARE FEET'].notna()]
dataFrame = dataFrame[dataFrame['SOLD DATE'].notna()]
dataFrame = dataFrame[dataFrame['ZIP OR POSTAL CODE'].notna()]
dataFrame = dataFrame[dataFrame['YEAR BUILT'].notna()]
#dataFrame = dataFrame[dataFrame['LOCATION'].notna()]
dataFrame['SOLD DATE'] = pd.to_datetime(dataFrame['SOLD DATE']).apply(lambda x: x.value)

# Normalizing DataFrame Data

In [4]:
dataFrame['ZIP OR POSTAL CODE'] = pd.Series([int(x.split('-')[0].split(' ')[0]) if '-' in x or ' ' in x else int(x) for x in dataFrame['ZIP OR POSTAL CODE']])

dataFrame = dataFrame.drop(columns=["URL (SEE https://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING)", 
                                   'CITY', 'STATE OR PROVINCE', 'ADDRESS', '$/SQUARE FEET'])
# dataFrame['ZIP OR POSTAL CODE'] = dataFrame['ZIP OR POSTAL CODE'].astype(int)
df = dataFrame.copy(deep=True)

In [5]:
h = dataFrame['PRICE'].quantile(0.99)
l = dataFrame['PRICE'].quantile(0.01)
dataFrame = dataFrame[(dataFrame["PRICE"] < h) & (dataFrame["PRICE"] > l)]

In [6]:
category_features = ['PROPERTY TYPE', 'LOCATION']
labels = dataFrame['PRICE']
training = dataFrame.drop(['ID'], axis = 1)

# Manipulating non-normalized dataframe

In [7]:
df['SOLD DATE'] = pd.to_datetime(df['SOLD DATE'])

In [8]:
nan_features = ['LOCATION']
def ConvertToNAString(data, columnsList):
    for x in columnsList:
        data[x].apply(lambda x: x if x else 'NA')
ConvertToNAString(training, nan_features)

In [9]:
training.columns

Index(['SOLD DATE', 'PROPERTY TYPE', 'ZIP OR POSTAL CODE', 'PRICE', 'BEDS',
       'BATHS', 'LOCATION', 'SQUARE FEET', 'LOT SIZE', 'YEAR BUILT',
       'HOA/MONTH', 'LATITUDE', 'LONGITUDE'],
      dtype='object')

In [10]:
training['LOCATION']

0           Iredell Heights
2             Orchard Trace
4                 Brookwood
5        Ridgehaven Estates
7             Eagle Heights
                ...        
79030       Western Heights
79031             Swannanoa
79032     Chancellors Ridge
79034           Alder Ridge
79035            The Valley
Name: LOCATION, Length: 42945, dtype: object

In [11]:
non_categorical_columns = [col for col in training.columns if col not in category_features and col not in ['ID']]
numeric_features = training[non_categorical_columns].dtypes[training.dtypes != "object"].index
training[numeric_features] = np.log1p(training[numeric_features])

In [12]:
training = pd.get_dummies(training, columns=category_features)

In [13]:
training.columns

Index(['SOLD DATE', 'ZIP OR POSTAL CODE', 'PRICE', 'BEDS', 'BATHS',
       'SQUARE FEET', 'LOT SIZE', 'YEAR BUILT', 'HOA/MONTH', 'LATITUDE',
       ...
       'LOCATION_vermillion', 'LOCATION_villages at Rosedale',
       'LOCATION_waters Edge', 'LOCATION_west asheville', 'LOCATION_wexford',
       'LOCATION_white oaks estates', 'LOCATION_windsor grove',
       'LOCATION_windstone Crossing', 'LOCATION_windy hill farms',
       'LOCATION_woodland hills'],
      dtype='object', length=6730)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(training, labels, test_size=.3)
train_dataset = X_train
test_dataset = X_test
# train_dataset = pd.get_dummies(X_train,columns=category_features)
# test_dataset = pd.get_dummies(X_test,columns=category_features)

every_column_except_y= [col for col in train_dataset.columns if col not in ['PRICE','ID']]
train_X = train_dataset[every_column_except_y]
every_column_except_y= [col for col in test_dataset.columns if col not in ['PRICE','ID']]
test_X = test_dataset[every_column_except_y]
train_Y = train_dataset['PRICE']
test_Y = test_dataset['PRICE']

In [15]:


# non_categorical_columns = [col for col in test_X.columns if col not in category_features and col not in ['ID']]
# numeric_features = test_X[non_categorical_columns].dtypes[test_X.dtypes != "object"].index
# test_X[numeric_features] = np.log1p(test_X[numeric_features])

In [16]:
train_Y

28973    13.165812
18323    13.790194
38055    13.151924
32980    12.160034
61262    13.910822
           ...    
51659    12.736410
22295    13.028055
7778     12.524530
32972    12.350177
54069    12.345839
Name: PRICE, Length: 30061, dtype: float64

In [17]:
train_X.columns

Index(['SOLD DATE', 'ZIP OR POSTAL CODE', 'BEDS', 'BATHS', 'SQUARE FEET',
       'LOT SIZE', 'YEAR BUILT', 'HOA/MONTH', 'LATITUDE', 'LONGITUDE',
       ...
       'LOCATION_vermillion', 'LOCATION_villages at Rosedale',
       'LOCATION_waters Edge', 'LOCATION_west asheville', 'LOCATION_wexford',
       'LOCATION_white oaks estates', 'LOCATION_windsor grove',
       'LOCATION_windstone Crossing', 'LOCATION_windy hill farms',
       'LOCATION_woodland hills'],
      dtype='object', length=6729)

In [18]:
def createColumnPerValue(data, columnsList):
  for x in columnsList:
    values = pd.unique(data[x])

    for y in values: 
      column_name = x + "_" + str(y)
      data[column_name]=(data[x]==y).astype(float)
    
    data.drop(x, axis=1, inplace=True)



# Establish ML model

In [19]:
 model = xgboost.XGBRegressor(colsample_bytree=0.4,
                 gamma=0,                 
                 learning_rate=0.07,
                 max_depth=3,
                 tree_method='gpu_hist', 
                 gpu_id=0,
                 min_child_weight=1.5,
                 n_estimators=10000,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42) 
# model = xgboost.XGBRegressor()
# model.load_model("CurrentModel.json")

In [20]:
#df['time'] = df['time'].apply(lambda x: x.value)
print(train_dataset.columns)

Index(['SOLD DATE', 'ZIP OR POSTAL CODE', 'PRICE', 'BEDS', 'BATHS',
       'SQUARE FEET', 'LOT SIZE', 'YEAR BUILT', 'HOA/MONTH', 'LATITUDE',
       ...
       'LOCATION_vermillion', 'LOCATION_villages at Rosedale',
       'LOCATION_waters Edge', 'LOCATION_west asheville', 'LOCATION_wexford',
       'LOCATION_white oaks estates', 'LOCATION_windsor grove',
       'LOCATION_windstone Crossing', 'LOCATION_windy hill farms',
       'LOCATION_woodland hills'],
      dtype='object', length=6730)


In [21]:
print(y_train)

28973     522202
18323     975000
38055     515000
32980     191000
61262    1100000
          ...   
51659     339900
22295     455000
7778      275000
32972     231000
54069     230000
Name: PRICE, Length: 30061, dtype: int64


In [22]:
# every_column_except_y= [col for col in X_train.columns if col not in ['PRICE','ID']]
model.fit(train_X, train_Y)
OrderedDict(sorted(model.get_booster().get_fscore().items(), key=lambda t: t[1], reverse=True))

OrderedDict([('LATITUDE', 8297.0),
             ('LOT SIZE', 7686.0),
             ('YEAR BUILT', 7644.0),
             ('SQUARE FEET', 6900.0),
             ('SOLD DATE', 6178.0),
             ('ZIP OR POSTAL CODE', 5595.0),
             ('HOA/MONTH', 4110.0),
             ('BATHS', 2754.0),
             ('BEDS', 1820.0),
             ('LOCATION_Mooresville', 471.0),
             ('LOCATION_Asheville', 400.0),
             ('PROPERTY TYPE_Single Family Residential', 367.0),
             ('LOCATION_Durham', 339.0),
             ('LOCATION_Johnston', 329.0),
             ('PROPERTY TYPE_Multi-Family (2-4 Unit)', 258.0),
             ('LOCATION_Chatham', 243.0),
             ('PROPERTY TYPE_Condo/Co-op', 242.0),
             ('LOCATION_Gastonia', 202.0),
             ('LOCATION_Weaverville', 201.0),
             ('LOCATION_Statesville', 190.0),
             ('LOCATION_Alamance', 181.0),
             ('PROPERTY TYPE_Townhouse', 173.0),
             ('LOCATION_Salisbury', 172.0),
         

In [36]:
model.save_model("CurrentModel.json")

In [37]:
most_relevant_features= list( dict((k, v) for k, v in model.get_booster().get_fscore().items() if v >= 10).keys())
print(most_relevant_features)

['SOLD DATE', 'ZIP OR POSTAL CODE', 'BEDS', 'BATHS', 'SQUARE FEET', 'LOT SIZE', 'YEAR BUILT', 'HOA/MONTH', 'LATITUDE', 'PROPERTY TYPE_Condo/Co-op', 'PROPERTY TYPE_Mobile/Manufactured Home', 'PROPERTY TYPE_Multi-Family (2-4 Unit)', 'PROPERTY TYPE_Multi-Family (5+ Unit)', 'PROPERTY TYPE_Single Family Residential', 'PROPERTY TYPE_Townhouse', 'LOCATION_*', 'LOCATION_100 Norman Place', 'LOCATION_Afton Village', 'LOCATION_Alamance', 'LOCATION_Alexander', 'LOCATION_Apple Creek', 'LOCATION_Arden', 'LOCATION_Asheboro', 'LOCATION_Ashecroft', 'LOCATION_Asheville', 'LOCATION_Asheville Arras Residence', 'LOCATION_Asheville West', 'LOCATION_Ashley Woods', 'LOCATION_Ashton Manor', 'LOCATION_Barclay Downs', 'LOCATION_Barnardsville', 'LOCATION_Beaucatcher Heights', 'LOCATION_Beaucatcher House', 'LOCATION_Beaver Creek', 'LOCATION_Beaverdam', 'LOCATION_Belmont', 'LOCATION_Bessemer City', 'LOCATION_Beverly Hills', 'LOCATION_Beverly Woods', 'LOCATION_Biddleville', 'LOCATION_Biltmore Forest', 'LOCATION_Bilt

In [39]:
#y_pred = model.predict(test_X)
y_pred = np.expm1(model.predict(test_X))
predictions = [round(value) for value in y_pred]
print(predictions)

[314333, 306472, 441185, 163780, 444265, 366357, 500162, 454160, 513071, 617925, 210782, 295148, 578939, 341134, 506517, 354305, 206868, 289280, 460609, 502559, 292924, 368035, 462100, 581374, 531775, 292827, 376408, 229559, 741702, 337439, 284661, 334587, 368862, 174472, 412139, 224245, 587561, 213911, 405843, 251410, 355369, 253852, 432515, 501891, 372412, 280633, 257654, 464441, 395797, 840091, 504731, 508214, 184144, 675393, 373607, 328891, 1223303, 252809, 216206, 545136, 405480, 238647, 411583, 290868, 228818, 473902, 479455, 290664, 335919, 329134, 461997, 448143, 494042, 203806, 246509, 554593, 798791, 546862, 150623, 310827, 483614, 398604, 335204, 196406, 764803, 225100, 389830, 484584, 295233, 291348, 356276, 437869, 564741, 313540, 303014, 437224, 848922, 397491, 343517, 250512, 351837, 516370, 142175, 567999, 218580, 327297, 355538, 391777, 219348, 189408, 276029, 396861, 430469, 219242, 498692, 381842, 342739, 144313, 274150, 213085, 614691, 434145, 305689, 390895, 229383

In [26]:
y_pred

array([12.658212, 12.632885, 12.997223, ..., 13.301534, 13.056457,
       12.778801], dtype=float32)

In [42]:
test_Y = np.expm1(test_Y)

In [43]:
errlist = []
ty = list(test_Y)
for i, v in enumerate(predictions):
    errlist.append(abs(ty[i]-predictions[i]))

## Mean Prediction Error in USD

In [44]:
sum(errlist)/len(errlist)

58090.810152126665

In [30]:
predictions[2]

13

In [31]:
test_Y.head()

44338    12.528160
20265    12.861001
53991    12.864954
5270     11.898195
34645    13.159661
Name: PRICE, dtype: float64

In [32]:
test_Y

44338    12.528160
20265    12.861001
53991    12.864954
5270     11.898195
34645    13.159661
           ...    
78439    11.695255
21978    13.795309
8979     12.206078
76669    13.224236
6720     12.929994
Name: PRICE, Length: 12884, dtype: float64

In [33]:
test_X.columns

Index(['SOLD DATE', 'ZIP OR POSTAL CODE', 'BEDS', 'BATHS', 'SQUARE FEET',
       'LOT SIZE', 'YEAR BUILT', 'HOA/MONTH', 'LATITUDE', 'LONGITUDE',
       ...
       'LOCATION_vermillion', 'LOCATION_villages at Rosedale',
       'LOCATION_waters Edge', 'LOCATION_west asheville', 'LOCATION_wexford',
       'LOCATION_white oaks estates', 'LOCATION_windsor grove',
       'LOCATION_windstone Crossing', 'LOCATION_windy hill farms',
       'LOCATION_woodland hills'],
      dtype='object', length=6729)

In [34]:
train_X.columns

Index(['SOLD DATE', 'ZIP OR POSTAL CODE', 'BEDS', 'BATHS', 'SQUARE FEET',
       'LOT SIZE', 'YEAR BUILT', 'HOA/MONTH', 'LATITUDE', 'LONGITUDE',
       ...
       'LOCATION_vermillion', 'LOCATION_villages at Rosedale',
       'LOCATION_waters Edge', 'LOCATION_west asheville', 'LOCATION_wexford',
       'LOCATION_white oaks estates', 'LOCATION_windsor grove',
       'LOCATION_windstone Crossing', 'LOCATION_windy hill farms',
       'LOCATION_woodland hills'],
      dtype='object', length=6729)

In [35]:
test_Y

44338    12.528160
20265    12.861001
53991    12.864954
5270     11.898195
34645    13.159661
           ...    
78439    11.695255
21978    13.795309
8979     12.206078
76669    13.224236
6720     12.929994
Name: PRICE, Length: 12884, dtype: float64