In [None]:
# IPython setup: (re)load the autoreload extension and enable mode 2 so edited
# modules are re-imported before each cell runs; draw matplotlib figures inline.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import fastai; fastai.__version__

'1.0.61'

In [None]:
import pandas as pd
from fastai.tabular import *

In [None]:
# Source data: property sales table (the file name suggests Brisbane suburbs, 2020).
# The first CSV column is used as the row index.
sales_csv = "/content/drive/MyDrive/QUTREProject/Brisbane_suburbs_02_sales_2020_property_GB_Prop_dist16.csv"
house_price_df = pd.read_csv(sales_csv, index_col=0)
house_price_df

In [None]:
# Hold-out set: every sale located in one of these suburbs.
test_suburb_ids = [
    'QLD1346', 'QLD627', 'QLD24',
    'QLD3231', 'QLD32', 'QLD510',
    'QLD545', 'QLD718', 'QLD389',
]
# isin() replaces nine chained equality tests; .copy() makes test_df an
# independent frame so later modifications never refer back to house_price_df.
test_df = house_price_df.loc[house_price_df['suburb_id_x'].isin(test_suburb_ids)].copy()
test_df

In [None]:
# Alternative random 80/20 split, kept for reference:
# train_df=house_price_df.sample(frac=0.8,random_state=200) #random state is a seed value
# test_df=house_price_df.drop(train_df.index)

# Training data = every sale outside the held-out suburbs. Selecting by suburb
# id rather than by index label keeps this cell correct when it is re-run after
# test_df's index has already been reset.
# NOTE: assumes test_df contains *all* rows of its suburbs (true for the
# suburb-based selection used in this notebook); go back to an index-based
# drop if a random split is used instead.
held_out_suburbs = test_df['suburb_id_x'].unique()
train_df = house_price_df.loc[~house_price_df['suburb_id_x'].isin(held_out_suburbs)]

# reset_index() returns a new frame, so the result has to be assigned;
# the bare calls used previously left both frames unchanged.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
test_df

In [None]:
# Renumber the training rows 0..n-1, discarding the previous index.
train_df = train_df.reset_index(drop=True)
train_df

In [None]:
# Renumber the hold-out rows 0..n-1, discarding the previous index.
test_df = test_df.reset_index(drop=True)
test_df

In [None]:
# Report (rows, columns) of the training frame, then of the hold-out frame.
for frame in (train_df, test_df):
    print(frame.shape)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Preview the first rows of the hold-out frame.
test_df.head()

In [None]:
# List every column available in the training frame.
print(train_df.columns)

In [None]:
# Summary statistics of the training columns.
train_df.describe()

In [None]:
# Count missing values per column in the training frame.
train_df.isnull().sum()

In [None]:
# Count missing values per column in the hold-out frame as well.
test_df.isnull().sum()

In [None]:
# Preprocessing steps passed as `procs=` when the TabularLists are built.
# NOTE(review): cat_names is an empty list in this notebook, so Categorify
# presumably has nothing to encode — confirm; the commented line is the
# variant without it.
procs = [FillMissing, Categorify, Normalize]
#procs = [FillMissing, Normalize]

In [None]:
# Column roles for the model: regression target, categorical inputs,
# continuous inputs.
dep_var = 'price_value'

# cat_names = ['suburb_id_x']
cat_names = []
cont_names = [
    'propertyFeatures_bedrooms_x',
    'propertyFeatures_bathrooms_x',
    'propertyFeatures_carparks_x',
    'landDetails_propertyArea_x',
    'school_dist',
    'hospital_dist',
    'uni_dist',
    'park_dist',
    'mall_dist',
    'CBD_dist',
    'First_year_transaction',
    'highway_dist',
    'bus_stop_dist',
    'ferry_terminal_dist',
    'railway_station_dist',
    'number_transaction',
    'crime_rate_2019',
    'state_school_percentage',
    'class_size_achievement_pct_2019',
    'Density',
    'Score2019',
]

# Echo both lists so the chosen configuration is visible in the output.
for names in (cat_names, cont_names):
    print(names)

In [None]:
# Wrap the hold-out frame with the same column roles and preprocessing list as
# the training data, so it can be attached to the DataBunch as its test set.
test = TabularList.from_df(df=test_df, cat_names=cat_names, cont_names=cont_names, procs=procs)

In [None]:
# Fixed seed so the random train/validation split is the same on every run.
SPLIT_SEED = 42

# Training pipeline: random validation split -> float labels -> attach the
# hold-out frame as the test set -> build the DataBunch.
data = (TabularList.from_df(df=train_df, cat_names=cat_names, cont_names=cont_names, procs=procs)
                   .split_by_rand_pct(valid_pct=0.2, seed=SPLIT_SEED)  # 0.2 is the library default that was used implicitly before
                   .label_from_df(cols=dep_var, label_cls=FloatList, log=True)  # log=True: presumably targets become log(price_value) — confirm
                   .add_test(test)
                   .databunch())

In [None]:
# Preview a sample batch (10 rows requested) to sanity-check the processed inputs.
data.show_batch(10)

In [None]:
# Build the tabular learner. layers=[] requests no hidden layers — presumably
# leaving only a linear output layer (confirm in the model printout).
# NOTE(review): labels were created with log=True, so rmse / r2_score are
# presumably reported on log(price_value) rather than raw prices — confirm.
learn = tabular_learner(data, layers=[], metrics=[rmse,r2_score])

In [None]:
# Show the complete summary of the model.
# summary is a method: without the call parentheses the cell only displays the
# bound-method object instead of the layer-by-layer summary.
learn.summary()

In [None]:
# Point the learner's model directory at this Drive folder
# (the path implies a mounted Google Drive — TODO confirm it exists before running).
learn.model_dir = '/content/drive/MyDrive/Fastai/'

In [None]:
# Run the learning-rate finder, then plot its recorded curve with the
# suggested learning rate marked.
learn.lr_find()
learn.recorder.plot(suggestion=True)

In [None]:
# The learning rate is fixed by hand at 1e-2; the trailing comment shows the
# finder's suggestion that could be used instead.
min_grad_lr =1e-2 # learn.recorder.min_grad_lr
# Train for 30 epochs with the one-cycle schedule, passing the rate as slice(lr).
learn.fit_one_cycle(30, slice(min_grad_lr))

In [None]:
# Plot the losses recorded during training.
learn.recorder.plot_losses()

In [None]:
# Spot-check predictions against targets on both splits.
# Display predictions on training data (5 rows requested)
learn.show_results(ds_type=DatasetType.Train,rows = 5)
# Display predictions on validation data (default number of rows)
learn.show_results(ds_type=DatasetType.Valid)

In [None]:
# Evaluate the trained learner on the training and validation loaders.
# Each result is sliced with [1:] when printed, i.e. everything after the
# first entry (presumably the loss) is reported as the metric values.
tr, va = (learn.validate(dl) for dl in (learn.data.train_dl, learn.data.valid_dl))
print("The Metrics used In Evaluating The Network:", str(learn.metrics))
for split_name, scores in (("Training", tr), ("Validation", va)):
    print(f"\nThe calculated RMSE & R-Squared For The {split_name} Set :", scores[1:])

In [None]:
# Training diagnostics recorded by the learner.
# Plot the learning-rate schedule together with momentum (show_moms=True)
learn.recorder.plot_lr(show_moms=True)
# Plot the evaluation metrics recorded during training
learn.recorder.plot_metrics()

In [None]:
# Inspect the model's embedding layers.
# NOTE(review): presumably one layer per entry in cat_names, so this is empty
# while cat_names is [] — confirm.
learn.model.embeds

In [None]:
# Categorical feature whose learned embedding will be inspected.
variable = 'suburb_id_x'
if variable not in cat_names:
    # Same exception type list.index() raises, but with an actionable message.
    raise ValueError(
        f"{variable!r} is not in cat_names ({cat_names}); add it to cat_names "
        "and retrain before inspecting its embedding"
    )
ix = cat_names.index(variable)

# No `df` is assigned anywhere in the visible notebook; the model is built
# from train_df, so the category values are read from that frame.
var_vals = list(train_df[variable].astype('category').cat.categories.values)
nval = len(var_vals)
print(f'Number of values: {nval}')
print(var_vals)

In [None]:
import altair as alt
import numpy as np  # explicit: np is used below and is otherwise only available via the fastai star import
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# First parameter tensor of the embedding layer chosen by `ix`, as a NumPy array.
emb_mx = to_np(next(learn.model.embeds[ix].parameters()))
# Project the embedding vectors to 2-D for plotting (t-SNE alternative kept for reference).
# X_emb = TSNE(n_components=2, perplexity=3).fit_transform(emb_mx)
X_emb = PCA(n_components=2).fit_transform(emb_mx)

# One label per embedding row. If the layer has exactly one more row than there
# are category values, label the extra row 'Other' and place it first —
# presumably index 0 is reserved for missing/unseen values (TODO confirm).
# Without this the labels and the projected points differ in length.
annotation = np.array(var_vals)
if emb_mx.shape[0] == len(annotation) + 1:
    annotation = np.append('Other', annotation)

In [None]:
# Show the labels that will be attached to the embedding points.
print(annotation)

In [None]:
# Plot table: the two projected coordinates plus a label column named after
# the inspected feature.
emb_df = pd.DataFrame(X_emb, columns=['Dim1', 'Dim2']).assign(**{variable: annotation})

In [None]:
# Scatter of the projected embedding, one circle per row, with a tooltip
# showing the category label.
points = (
    alt.Chart(emb_df)
    .mark_circle(size=60)
    .encode(x='Dim1', y='Dim2', tooltip=[variable])
)

# Same encoding drawn as text, nudged right of each circle.
text = points.mark_text(align='left', baseline='middle', dx=7).encode(text=variable)

# Layer the labels over the points.
points + text

In [None]:
# Predict on the attached test set and keep element 0 of the result (the predictions).
# NOTE(review): presumably still on the log scale, since the labels were built
# with log=True — confirm.
test_predictions = learn.get_preds(ds_type=DatasetType.Test)[0]

In [None]:
import numpy as np

# Turn the prediction tensor into a plain list, exponentiating each value
# (presumably undoing the log applied to the labels).
test_predictions = [np.exp(row[0].data.item()) for row in test_predictions]

# Actual vs. predicted price per listing, written to "submission.csv".
submission = (
    test_df[['id_combined', 'price_value']]
    .rename(columns={'id_combined': 'Id'})
    .assign(Predicted=test_predictions)
)
submission.to_csv('submission.csv', index=False)
submission.head()

In [None]:
import pandas
import matplotlib.pyplot as plt
import statsmodels.api
import statsmodels.formula.api as sm
import scipy.stats as stats

In [None]:
# Actual vs. predicted price scatter; the title is their correlation coefficient.
fig = plt.figure(figsize=[10, 10])
ax = fig.add_subplot(2, 2, 1)
actual, predicted = submission['price_value'], submission['Predicted']
ax.scatter(actual, predicted)
ax.set_xlabel('value')
ax.set_ylabel('predicted')
ax.set_title(str(predicted.corr(actual)))

In [None]:
import numpy as np
Y_true, pred = submission['price_value'], submission['Predicted']


# RMSLE: root of the mean squared difference between log(pred + 1) and log(actual + 1)
log_gap = np.log(pred + 1) - np.log(Y_true + 1)
error = np.square(log_gap).mean() ** 0.5
print(error)

# Reported score is simply 1 - RMSLE.
score = 1 - error
print("SCORE For test : ",score)



In [None]:
# Import under an alias: a plain `import r2_score` would rebind the name that
# the learner cell passes as a fastai metric, so re-running that cell afterwards
# would silently pick up scikit-learn's function instead.
from sklearn.metrics import r2_score as sk_r2_score

# R-squared of predicted vs. actual prices on the hold-out set.
sk_r2_score(Y_true,pred)

In [None]:
import math  # explicit: previously only available through the fastai star import

import sklearn
# Import the function directly; `import sklearn` alone does not guarantee that
# the sklearn.metrics submodule has been loaded.
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(Y_true, pred)

# Stored as test_rmse so the name `rmse` (passed as a metric when the learner
# is created) is not overwritten by a float.
test_rmse = math.sqrt(mse)

print(test_rmse)

In [None]:
# Work on a copy so test_df itself is not modified when results are added.
dfCopy = test_df.copy()
dfCopy

In [None]:
# Attach the model's price predictions as a 'Predicted' column.
dfCopy = dfCopy.assign(Predicted=test_predictions)
dfCopy

In [None]:
# Persist the hold-out rows with their predictions, once as CSV and once as a pickle.
results_csv = '/content/drive/MyDrive/QUTREProject/2020_Hedonic_Pricing_Test_results_experiment2_Tminus1.csv'
results_pkl = '/content/drive/MyDrive/QUTREProject/2020_Hedonic_Pricing_Test_results_experiment2_Tminus1.pkl'
dfCopy.to_csv(results_csv)
dfCopy.to_pickle(results_pkl)