# Relevant Imports

In [1]:
import numpy as np
import pandas as pd
import glob
import os
import xgboost
import csv as csv
import ipychart as ipc
from xgboost import plot_importance
from matplotlib import pyplot
from sklearn.model_selection import cross_val_score,KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
# from sklearn.grid_search import GridSearchCV   #Perforing grid search
from scipy.stats import skew
from collections import OrderedDict

In [2]:
dataFrame = pd.read_csv("./CombinedData.csv", header = 0, low_memory=False)
dataFrame.insert(0, 'ID', range(0,len(dataFrame)))
dataFrame = dataFrame.sample(frac=1).reset_index(drop=True)
print(dataFrame.head())

      ID      SOLD DATE              PROPERTY TYPE               ADDRESS  \
0  26439   June-29-2022  Single Family Residential  23 Camelfield Rd #21   
1   7992            NaN  Single Family Residential    2213 Crosswinds Ct   
2  29480  March-17-2022  Single Family Residential      1625 Wisteria Dr   
3  76753   April-1-2022                  Townhouse  4228 Burning Tree Dr   
4  58793            NaN  Single Family Residential      120 Fremont Loop   

            CITY STATE OR PROVINCE ZIP OR POSTAL CODE   PRICE  BEDS  BATHS  \
0    Weaverville                NC              28804  430000   3.0    2.0   
1     Kannapolis                NC              28025  324000   3.0    2.0   
2  Winston Salem                NC              27104  532500   3.0    3.0   
3      Charlotte                NC              28226  372000   3.0    2.0   
4    Mooresville                NC              28117  330000   3.0    2.0   

               LOCATION  SQUARE FEET  LOT SIZE  YEAR BUILT  $/SQUARE FEET 

In [3]:
dataFrame = dataFrame[dataFrame['SQUARE FEET'].notna()]
dataFrame = dataFrame[dataFrame['SOLD DATE'].notna()]
dataFrame = dataFrame[dataFrame['ZIP OR POSTAL CODE'].notna()]
dataFrame = dataFrame[dataFrame['YEAR BUILT'].notna()]

# Normalizing DataFrame Data

In [4]:
dataFrame['ZIP OR POSTAL CODE'] = pd.Series([int(x.split('-')[0].split(' ')[0]) if '-' in x or ' ' in x else int(x) for x in dataFrame['ZIP OR POSTAL CODE']])
# dataFrame['ZIP OR POSTAL CODE'] = dataFrame['ZIP OR POSTAL CODE'].astype(int)
df = dataFrame.copy(deep=True)
dataFrame = dataFrame.drop(columns=["URL (SEE https://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING)", 
                                    "LOCATION", 'PROPERTY TYPE', 'CITY', 'STATE OR PROVINCE', 'ADDRESS', '$/SQUARE FEET'])
df = df.drop(columns=["URL (SEE https://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING)", 
                                    "LOCATION", 'PROPERTY TYPE', 'CITY', '$/SQUARE FEET'])

In [5]:
dataFrame = dataFrame[dataFrame['SQUARE FEET'].notna()]
dataFrame = dataFrame[dataFrame['SOLD DATE'].notna()]
dataFrame = dataFrame[dataFrame['ZIP OR POSTAL CODE'].notna()]
dataFrame = dataFrame[dataFrame['YEAR BUILT'].notna()]

In [6]:
dataFrame['SOLD DATE'] = pd.to_datetime(dataFrame['SOLD DATE']).apply(lambda x: x.value)

In [7]:
# newColumns = ['ID',
# 'SOLD DATE',
# 'PROPERTY TYPE',
# 'ADDRESS',
# 'CITY',
# 'STATE OR PROVINCE',
# 'ZIP OR POSTAL CODE',
# 'BEDS',
# 'BATHS',
# 'SQUARE FEET',
# 'LOT SIZE',
# 'YEAR BUILT',
# '$/SQUARE FEET',
# 'HOA/MONTH',
# 'LATITUDE',
# 'LONGITUDE',
# 'PRICE']
# dataFrame = dataFrame[newColumns]

In [8]:
category_features = []
labels = dataFrame['PRICE']
training = dataFrame.drop(['ID'], axis = 1)

# Manipulating non-normalized dataframe

In [9]:
df['SOLD DATE'] = pd.to_datetime(df['SOLD DATE'])

In [10]:
print(df.set_index('SOLD DATE').loc['2022-3-1':'2022-6-1'])

               ID                                   ADDRESS STATE OR PROVINCE  \
SOLD DATE                                                                       
2022-04-01  76753                      4228 Burning Tree Dr                NC   
2022-04-20  15169                          1143 Larkspur Ln                NC   
2022-03-30  76378  18044 Pear Hawthorne Dr Unit 165 Adriana                NC   
2022-05-25  49750                   1161 Old Rock Quarry Rd                NC   
2022-03-30  64066                      3102 Little River Dr                NC   
...           ...                                       ...               ...   
2022-05-31   1929                    1320 Fillmore Ave #411                NC   
2022-03-03  43052                             8600 Glade Ct                NC   
2022-04-25  58043                           3535 Stanton Ct                NC   
2022-03-29  73032                 5053 Patton Dr Unit 1003D                NC   
2022-05-26  36083           

In [11]:
df.columns

Index(['ID', 'SOLD DATE', 'ADDRESS', 'STATE OR PROVINCE', 'ZIP OR POSTAL CODE',
       'PRICE', 'BEDS', 'BATHS', 'SQUARE FEET', 'LOT SIZE', 'YEAR BUILT',
       'HOA/MONTH', 'LATITUDE', 'LONGITUDE'],
      dtype='object')

In [12]:
dfView = df.set_index('SOLD DATE',drop=False).loc['2022-3-1':'2022-6-1'].sort_values('YEAR BUILT', ascending=False)
indexSeries = pd.Series(dfView.index.tolist())

In [13]:
chart = ipc.lineplot(data = dfView, x= 'YEAR BUILT', y= 'PRICE', hue='')
chart

Chart(layout=Layout(align_self='stretch', height='auto'))

In [14]:
# chart.to_html('./chartExample')

In [15]:
X_train, X_test, y_train, y_test = train_test_split(training, labels, test_size=.3)
train_dataset = pd.get_dummies(X_train,columns=category_features)
test_dataset = pd.get_dummies(X_test,columns=category_features)
every_column_except_y= [col for col in train_dataset.columns if col not in ['PRICE','ID']]
train_X = train_dataset[every_column_except_y]
every_column_except_y= [col for col in test_dataset.columns if col not in ['PRICE','ID']]
test_X = test_dataset[every_column_except_y]
train_Y = train_dataset['PRICE']
test_Y = test_dataset['PRICE']

In [16]:
non_categorical_columns = [col for col in train_X.columns if col not in category_features and col not in ['ID']]
numeric_features = train_X[non_categorical_columns].dtypes[train_X.dtypes != "object"].index
train_X[numeric_features] = np.log1p(train_X[numeric_features])

non_categorical_columns = [col for col in test_X.columns if col not in category_features and col not in ['ID']]
numeric_features = test_X[non_categorical_columns].dtypes[test_X.dtypes != "object"].index
test_X[numeric_features] = np.log1p(test_X[numeric_features])

  train_X[numeric_features] = np.log1p(train_X[numeric_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
  test_X[numeric_features] = np.log1p(test_X[numeric_features])


In [17]:
# nan_features = ['LOCATION']
# def ConvertToNAString(data, columnsList):
#     for x in columnsList:
#         data[x] = str(data[x])
# ConvertToNAString(train_dataset, nan_features)
# ConvertToNAString(test_dataset, nan_features)
X_train.columns

Index(['SOLD DATE', 'ZIP OR POSTAL CODE', 'PRICE', 'BEDS', 'BATHS',
       'SQUARE FEET', 'LOT SIZE', 'YEAR BUILT', 'HOA/MONTH', 'LATITUDE',
       'LONGITUDE'],
      dtype='object')

In [18]:
def createColumnPerValue(data, columnsList):
  for x in columnsList:
    values = pd.unique(data[x])

    for y in values: 
      column_name = x + "_" + str(y)
      data[column_name]=(data[x]==y).astype(float)
    
    data.drop(x, axis=1, inplace=True)

# Establish ML model

In [19]:
 model = xgboost.XGBRegressor(colsample_bytree=0.4,
                 gamma=0,                 
                 learning_rate=0.07,
                 max_depth=3,
                 tree_method='gpu_hist', 
                 gpu_id=0,
                 min_child_weight=1.5,
                 n_estimators=10000,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42) 
# model = xgboost.XGBRegressor()
# model.load_model("CurrentModel.json")

In [20]:
#df['time'] = df['time'].apply(lambda x: x.value)
print(train_dataset.columns)

Index(['SOLD DATE', 'ZIP OR POSTAL CODE', 'PRICE', 'BEDS', 'BATHS',
       'SQUARE FEET', 'LOT SIZE', 'YEAR BUILT', 'HOA/MONTH', 'LATITUDE',
       'LONGITUDE'],
      dtype='object')


In [21]:
print(train_X.columns)

Index(['SOLD DATE', 'ZIP OR POSTAL CODE', 'BEDS', 'BATHS', 'SQUARE FEET',
       'LOT SIZE', 'YEAR BUILT', 'HOA/MONTH', 'LATITUDE', 'LONGITUDE'],
      dtype='object')


In [22]:
# every_column_except_y= [col for col in X_train.columns if col not in ['PRICE','ID']]
model.fit(train_X, train_Y)
OrderedDict(sorted(model.get_booster().get_fscore().items(), key=lambda t: t[1], reverse=True))

OrderedDict([('SQUARE FEET', 10428.0),
             ('LOT SIZE', 9609.0),
             ('LATITUDE', 9510.0),
             ('YEAR BUILT', 8514.0),
             ('SOLD DATE', 8237.0),
             ('ZIP OR POSTAL CODE', 7389.0),
             ('HOA/MONTH', 6965.0),
             ('BATHS', 5233.0),
             ('BEDS', 3114.0)])

In [23]:
model.save_model("CurrentModel.json")

In [24]:
most_relevant_features= list( dict((k, v) for k, v in model.get_booster().get_fscore().items() if v >= 10).keys())
print(most_relevant_features)

['SOLD DATE', 'ZIP OR POSTAL CODE', 'BEDS', 'BATHS', 'SQUARE FEET', 'LOT SIZE', 'YEAR BUILT', 'HOA/MONTH', 'LATITUDE']


In [25]:
y_pred = model.predict(test_X)
predictions = [round(value) for value in y_pred]
print(predictions)

[452721, 537343, 470393, 513354, 552680, 354818, 259376, 277884, 267160, 323831, 577550, 704185, 287556, 397887, 447515, 242662, 453696, 419077, 388172, 275931, 571701, 432776, 280839, 540428, 523155, 606852, 1701388, 311018, 406256, 513608, 250887, 605315, 553223, 267238, 349332, 1023016, 422424, 497016, 162155, 427615, 446183, 237229, 881442, 309824, 276586, 81431, 191180, 431986, 347968, 468803, 432366, 700341, 416322, 185795, 294512, 178706, 344153, 325051, 570248, 344048, 415341, 394046, 612820, 353124, 481677, 475658, 252681, 735422, 302210, 699638, 280628, 280257, 333534, 271419, 374712, 125628, 335489, 255247, 309654, 387626, 206907, 288167, 246207, 504268, 234346, 591716, 192949, 374023, 400993, 891455, 1400322, 541566, 218835, 266662, 313661, 252850, 380236, 844960, 174208, 489991, 404235, 213147, 230524, 462795, 303687, 273664, 475997, 125505, 425852, 535513, 408676, 385593, 392596, 473061, 469279, 175259, 400761, 295502, 307239, 557368, 375621, 483276, 288098, 483767, 37147

In [33]:
print(y_pred)

[452720.78 537343.3  470393.06 ... 373757.2  405479.16 405575.38]


In [26]:
accuracy = accuracy_score(test_Y, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 0.00%


In [27]:
test_Y.head(100)

23234    443500
25340    571567
31310    505000
23207    700000
13049    500000
          ...  
31555    260000
9638     378000
27475    715000
23894    140000
38676    458500
Name: PRICE, Length: 100, dtype: int64

In [28]:
errlist = []
ty = list(test_Y)
for i, v in enumerate(predictions):
    errlist.append(abs(ty[i]-predictions[i]))

sum(errlist)/len(errlist)

86016.60468601008

In [29]:
errlist

[9221,
 34224,
 34607,
 186646,
 52680,
 30182,
 8376,
 10116,
 52840,
 63831,
 42450,
 220815,
 10816,
 52887,
 42515,
 40838,
 63696,
 30923,
 53172,
 24069,
 111801,
 33776,
 15261,
 35928,
 98155,
 6448,
 536388,
 34982,
 44824,
 101392,
 115887,
 189685,
 61147,
 53262,
 4818,
 123016,
 110076,
 27984,
 27845,
 77615,
 9683,
 49229,
 253558,
 84176,
 61586,
 68569,
 243820,
 11986,
 1932,
 38803,
 60366,
 115341,
 38203,
 126205,
 74512,
 151294,
 6653,
 25051,
 95248,
 11548,
 4659,
 144046,
 47680,
 78124,
 68323,
 295658,
 27319,
 134322,
 32790,
 100362,
 24472,
 257,
 102534,
 14419,
 9712,
 49372,
 4511,
 85247,
 90346,
 10126,
 2193,
 3167,
 78793,
 62732,
 9346,
 8284,
 4971,
 55977,
 20993,
 57545,
 325322,
 63434,
 13835,
 38338,
 103661,
 7150,
 2236,
 129960,
 34208,
 31491,
 70765,
 58147,
 22524,
 32205,
 61313,
 51336,
 104997,
 32495,
 30852,
 107487,
 42866,
 54407,
 73596,
 48061,
 9266,
 120259,
 95761,
 75502,
 57761,
 87368,
 1619,
 148276,
 128922,
 136233,
 