In [None]:
#This notebook cleans a real estate dataset and builds a model
#that predicts a house's valuation from its characteristics

#The final model has an accuracy of 97.8% (i.e. a mean absolute percentage error of 2.2%) and a median absolute error of 1.67 on the test split

In [None]:
#IMPORT LIBRARIES

In [None]:
import os; import pandas as pd; import numpy as np

In [None]:
#LOAD DATA

In [None]:
# Path handed to pd.read_csv in the load cell.
# NOTE(review): there is no file extension here — confirm it matches the file on disk.
filename = 'zillow'

In [None]:
# Read the raw CSV named by `filename` into a DataFrame.
master = pd.read_csv(filename) #dataset with 80 variables

# Keep an independent copy of the freshly loaded frame. A plain `backup = master`
# would only create a second name for the same object, so any in-place change to
# `master` would silently change the "backup" as well.
backup = master.copy()

In [None]:
#START RUNNING CODE FROM HERE

# Rebind `master` to the frame stored in `backup`, so the cells below can be
# re-run from the loaded data without reading the CSV again.
master = backup
master.shape  # (rows, columns) of the starting frame

In [None]:
#remove unusable columns

# Start again from `backup` and discard the 'region_2' column.
master = backup.drop(['region_2'], axis=1)
master.shape

In [None]:
#treat +/-inf as missing values, then drop every row that contains a missing value
#(rows are dropped rather than filled with a special value because there are plenty of rows)

master = master.replace([np.inf, -np.inf], np.nan).dropna()
master.shape

In [None]:
#reduce number of rows

# Keep a random 35% of the rows. The seed is fixed so that the same subset,
# and therefore the same downstream metrics, is produced on every run.
master = master.sample(frac=0.35, random_state=42)
master.shape

In [None]:
#identify variables with highest nunique

# Sort descending so the highest-cardinality columns are listed first; with
# many columns the default Series display is truncated in the middle, which
# can hide the very columns this cell is meant to find.
master.nunique().sort_values(ascending=False)

In [None]:
#IDENTIFY RELEVANT COLUMNS

# Feature columns: everything except 'new_id' (presumably a row identifier —
# TODO confirm) and 'points', which is used as the target in the train/test split.
data = master.columns.drop(['new_id', 'points'])
dataset = master[data]

# Show the shape of the frame this cell just built (master is unchanged here).
dataset.shape

In [None]:
#drop the variable with the highest nunique ('designation') so that the
#categorical -> indicator conversion produces fewer columns

data = data.drop(['designation'])
dataset = master.loc[:, data]
dataset.nunique()

In [None]:
#convert the categorical (non-numeric) variables into indicator columns

# Column labels in `data` that are not of a numeric dtype.
numericCols = dataset[data].select_dtypes(include='number').columns
nonNumeric = data.drop(numericCols)

# One indicator column per distinct value of each non-numeric column.
dataNon = dataset[nonNumeric]
dataDummy = pd.get_dummies(dataNon)

In [None]:
#replace categorical variables with indicator variables

# Drop the original non-numeric columns and attach all indicator columns in a
# single concat. Assigning them through `dataset[cols] = ...` inserts the
# columns one at a time, which fragments the frame (and triggers pandas'
# PerformanceWarning) when there are many indicator columns.
dataset = pd.concat([dataset.drop(columns=nonNumeric), dataDummy], axis=1)

In [None]:
#update data then proceed to model

# `data` now holds the column labels of the encoded frame
# (the remaining numeric columns plus the indicator columns).
data = dataset.columns

In [None]:
#SPLIT INTO TRAIN AND TEST

from sklearn.model_selection import train_test_split

# 'points' is the value being predicted; 40% of the rows are held out for
# testing and the split is seeded so it is repeatable.
target = master['points']
trainData, testData, trainTarget, testTarget = train_test_split(
    dataset,
    target,
    test_size=0.4,
    random_state=42,
)


In [None]:
#sanity check

trainData.shape  # with test_size = 0.4 in the split, this should be about 60% of the rows

In [None]:
#   4-STEP MODELLING PROCESS: IMPORT THE MODEL CLASS, CREATE AN INSTANCE OF THE MODEL, TRAIN IT USING FIT, PREDICT TARGET VALUES FOR TESTDATA

In [1]:
from xgboost import XGBRegressor

In [2]:
# Gradient-boosted regressor with the library's default hyperparameters
# (base_score is left at its default) and a fixed seed for repeatable fits.
model = XGBRegressor(random_state=42)

In [None]:
# Train the regressor on the training split.
model.fit(trainData, trainTarget)

In [None]:
prediction = model.predict(testData)   #predicted target values for the test split (regression output, not probabilities)
prediction

In [None]:
# Average predicted value over the test split.
prediction.mean()

In [None]:
#EVALUATE MODEL USING METRICS

In [None]:
# R^2 on both splits. The training score alone overstates how well the model
# generalises, so the held-out score is reported alongside it.
pd.Series(
    {
        'train': model.score(trainData, trainTarget),
        'test': model.score(testData, testTarget),
    },
    name='r2',
)

In [None]:
#mean absolute error of the predictions, as a percentage of the true value

pctError = (prediction - testTarget) / testTarget * 100
pctError.abs().mean()

In [None]:
#median absolute error between the test targets and the predictions

from sklearn.metrics import median_absolute_error as scr

scr(testTarget, prediction)

In [None]:
# Mean squared logarithmic error between the test targets and the predictions.
# NOTE(review): this metric rejects negative values — confirm targets and predictions are non-negative.
from sklearn.metrics import mean_squared_log_error as scor
scor(testTarget, prediction)            #multioutput not necessary

In [None]:
# Explained variance score between the test targets and the predictions.
from sklearn.metrics import explained_variance_score as scorev
scorev(testTarget, prediction)            #multioutput not necessary