# 07 Modelling (Business Model)

In this notebook, we test the novel business model without making use of the hybrid model architecture.

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

from xgboost import XGBRegressor
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier


In [2]:
df_train = pd.read_csv("data/df_train_app.csv", index_col=0).dropna(subset="population")
df_test = pd.read_csv("data/df_test_app.csv", index_col=0).dropna(subset="population")
df_val = pd.read_csv("data/df_val_app.csv", index_col=0).dropna(subset="population")

In [3]:
# some loss functions
def rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_pred - y_true) ** 2))

def smape(y_true, y_pred):
    return np.mean(200 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

def rmspe(y_true, y_pred):
    epsilon = 1e-6  # Small constant to avoid division by zero
    return np.sqrt(np.mean(((y_true - y_pred) / (y_true + epsilon)) ** 2))

In [4]:
df_train_2 = df_train.dropna(subset="population")
df_test_2 = df_test.dropna(subset="population")
df_val_2 = df_val.dropna(subset="population")

In [5]:
df = pd.concat([df_train_2, df_test_2, df_val_2])

In [6]:
df_select = df[(round(df.appreciation_time/366, 0) == 5) & (df.prior_year == 2017)].sample(1000, random_state=42)

In [7]:
df_train = df[df.prior_year < 2017]

### Prepare the data

In [8]:
X_train_prior = df_train.drop(columns=["appreciation", "price", "id", "prior_saledate", "prior_price"])
X_select_prior = df_select.drop(columns=["appreciation","price", "id", "prior_saledate", "prior_price"])

y_train = df_train.loc[:, "price"]
y_select = df_select.loc[:, "price"]

y_train_prior = df_train.loc[:, "prior_price"]
y_select_prior = df_select.loc[:, "prior_price"]

In [9]:
columns = ['city', 'yrblt', 'effyrblt', 'nbed', 'nbath', 'nhalfbath', 'livarea',
       'efflivarea', 'distance_aerodrome', 'distance_ferry_terminal',
       'distance_railway_station', 'distance_market', 'distance_hospital',
       'distance_hotel', 'distance_museum', 'n_reli_inst', 'n_edu_fac',
       'n_healthcare', 'n_emergency', 'n_animalcare', 'n_commu_venu',
       'n_commu_serv', 'n_food_drink', 'n_financial', 'n_transport',
       'n_entertainment', 'n_sports', 'n_utilities', 'n_accommodation',
       'n_government_civic', 'n_recreational', 'year', 'hpi',
       'household_income', 'new_housing', 'population', 'n_poverty',
       'n_poverty_young', 'unemployment_rate', 'n_employed', 'age', 'eff_age',
       'longitude', 'latitude', 'county_Fairfield', 'county_Litchfield',
       'cond_desc_Average', 'cond_desc_Fair', 'cond_desc_Good',
       'cond_desc_Poor']

In [10]:
X_train_prior = X_train_prior[columns]
X_select_prior = X_select_prior[columns]

In [11]:
pca = PCA(n_components=3)

In [12]:
X_train_prior_2 = pd.concat([X_train_prior, pd.DataFrame(pca.fit_transform(X_train_prior), index=X_train_prior.index)], axis=1)
X_select_prior_2 = pd.concat([X_select_prior, pd.DataFrame(pca.fit_transform(X_select_prior), index=X_select_prior.index)], axis=1)

In [13]:
columns = ['longitude', 'latitude', 'county_Fairfax', 'county_Fairfield',
       'county_Hartford', 'county_Litchfield', 'county_Middlesex',
       'county_New Haven', 'county_New London', 'county_Tolland',
       'county_Windham', 'state_Connecticut', 'state_Virginia',
       'cond_desc_Average', 'cond_desc_Average Plus', 'cond_desc_Fair',
       'cond_desc_Good', 'cond_desc_Poor']
columns = list(set(X_train_prior_2.columns) & set(columns))
X_train_prior_2 = X_train_prior_2.rename(columns={0:"pca_0", 1:"pca_1", 2:"pca_2"})
X_select_prior_2 = X_select_prior_2.rename(columns={0:"pca_0", 1:"pca_1", 2:"pca_2"})

scaler = StandardScaler()

X_train_scaled = X_train_prior_2.drop(columns=columns)
X_select_scaled = X_select_prior_2.drop(columns=columns)
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_scaled), columns=X_train_scaled.columns, index=X_train_scaled.index)
X_select_scaled = pd.DataFrame(scaler.transform(X_select_scaled), columns=X_select_scaled.columns, index=X_select_scaled.index)
X_train_scaled = pd.concat([X_train_prior.loc[:, columns], X_train_scaled], axis=1)
X_select_scaled = pd.concat([X_select_prior.loc[:, columns], X_select_scaled], axis=1)

In [14]:
k = 6 # You can choose the number of clusters based on your analysis
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(X_train_scaled)



In [15]:
X_train_prior_2 = pd.concat([X_train_prior_2, pd.DataFrame(kmeans.transform(X_train_scaled), index=X_train_prior_2.index)], axis=1)
X_select_prior_2 = pd.concat([X_select_prior_2, pd.DataFrame(kmeans.transform(X_select_scaled), index=X_select_prior_2.index)], axis=1)

### Predict Prior Price

In [16]:
# predict prior price
parameters = {"objective": 'reg:squarederror',
            "n_estimators":350,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":10,  # Maximum depth of a tree
            }

    
model_prior = XGBRegressor(**parameters)
model_prior.fit(X_train_prior_2, y_train_prior)
y_hat_select_prior = model_prior.predict(X_select_prior_2)

In [17]:
y_hat_train_prior = model_prior.predict(X_train_prior_2)
y_hat_select_prior = model_prior.predict(X_select_prior_2)

In [18]:
X_train_appr_2 = X_train_prior_2.copy()
X_select_appr_2 = X_select_prior_2.copy()

In [19]:
X_train_appr_2["pred_prior_price"] = y_hat_train_prior
X_select_appr_2["pred_prior_price"] = y_hat_select_prior

In [20]:
# predict future price
parameters = {"objective": 'reg:squarederror',
            "n_estimators":350,  # Number of boosting rounds
            "learning_rate":0.1,  # Step size shrinkage
            "max_depth":10,  # Maximum depth of a tree
            }

    
model = XGBRegressor(**parameters)
model.fit(X_train_appr_2, y_train)
y_hat_select = model.predict(X_select_appr_2)
y_hat_train = model.predict(X_train_appr_2)

In [21]:
y_hat_appr = (y_hat_select - y_hat_select_prior)/y_hat_select_prior
y_appr = (y_select - y_hat_select_prior)/y_hat_select_prior
y_appr_core = (y_select - y_select_prior)/y_select_prior

In [22]:
X_select_cl = X_select_appr_2.copy()

In [23]:
X_select_cl["pred_future_price"] = y_hat_select
X_select_cl["true_future_price"] = y_select
X_select_cl["true_prior_price"] = y_select_prior
X_select_cl["pred_appr"] = y_hat_appr
X_select_cl["true_appr_basis_pred"] = y_appr
X_select_cl["true_appr_basis_true"] = y_appr_core

In [24]:
X_select_cl = X_select_cl.sort_values(by="pred_appr", ascending=False)

In [25]:
X_select_cl["absolut_increase_basis_pred"] = X_select_cl.true_future_price - X_select_cl.pred_prior_price
X_select_cl["absolut_increase_basis_true"] = X_select_cl.true_future_price - X_select_cl.true_prior_price

In [26]:
X_select_cl["one_perc_pred_prior_price"] = X_select_cl.pred_prior_price * 0.01
X_select_cl["one_perc_true_prior_price"] = X_select_cl.true_prior_price * 0.01
X_select_cl["one_perc_increase_basis_pred"] = X_select_cl.absolut_increase_basis_pred * 0.01
X_select_cl["pos_one_perc_increase_basis_pred"] = np.where(X_select_cl.one_perc_increase_basis_pred < 0, 0, X_select_cl.one_perc_increase_basis_pred)
X_select_cl["one_perc_increase_basis_true"] = X_select_cl.absolut_increase_basis_true * 0.01
X_select_cl["pos_one_perc_increase_basis_true"] = np.where(X_select_cl.one_perc_increase_basis_true < 0, 0, X_select_cl.one_perc_increase_basis_true)
X_select_cl["payment_difference"] = X_select_cl.pos_one_perc_increase_basis_pred - X_select_cl.pos_one_perc_increase_basis_true

In [27]:
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [28]:
X_select_cl["cumsum_prior_price"] = X_select_cl.one_perc_pred_prior_price.cumsum()

### Calculate Revenue

In [29]:
X_select_cl[X_select_cl["cumsum_prior_price"] < 50000].one_perc_increase_basis_pred.sum() / 50000

1.81788241953125

In [30]:
# invest remaining budget (under 1 % of prior_price)
remaining_budget = 50000 - X_select_cl[X_select_cl["cumsum_prior_price"] < 50000].cumsum_prior_price.max()
additional_index = X_select_cl[X_select_cl["cumsum_prior_price"] < 50000].reset_index().index[-1] + 1
percentage = remaining_budget / X_select_cl.reset_index().loc[additional_index]["pred_prior_price"]
additional_revenue = X_select_cl.reset_index().loc[additional_index]["absolut_increase_basis_pred"] * percentage
additional_revenue = additional_revenue if additional_revenue >= 0 else 0

In [31]:
selection_predicted = X_select_cl.iloc[:additional_index].index

In [32]:
# combine (if negative increase is shared!!)
revenue = X_select_cl[X_select_cl["cumsum_prior_price"] < 50000].pos_one_perc_increase_basis_pred.sum() + additional_revenue

In [33]:
# yearly increase
pow((revenue + 50000) / 50000, 1/5) - 1

0.2421346772498656

#### True Appreciation with true prior price

In [34]:
X_select_cl["cumsum_true_prior_price"] = X_select_cl.one_perc_true_prior_price.cumsum()

In [35]:
X_select_cl[X_select_cl["cumsum_true_prior_price"] < 50000].one_perc_increase_basis_true.sum() / 50000

1.48629

In [36]:
# invest remaining budget (under 1 % of prior_price)
remaining_budget = 50000 - X_select_cl[X_select_cl["cumsum_true_prior_price"] < 50000].cumsum_prior_price.max()
additional_index_tmp = X_select_cl[X_select_cl["cumsum_true_prior_price"] < 50000].reset_index().index[-1] + 1
percentage = remaining_budget / X_select_cl.reset_index().loc[additional_index_tmp]["true_prior_price"]
additional_revenue = X_select_cl.reset_index().loc[additional_index_tmp]["absolut_increase_basis_true"] * percentage
additional_revenue = additional_revenue if additional_revenue >= 0 else 0

In [37]:
# combine (if negative increase is shared!!)
revenue = X_select_cl[X_select_cl["cumsum_true_prior_price"] < 50000].pos_one_perc_increase_basis_pred.sum() + additional_revenue

In [38]:
# yearly increase
pow((revenue + 50000) / 50000, 1/5) - 1

0.2190234725134319

#### Analyse payment difference

In [39]:
# lets see how much each party pays to much/ gets less
X_select_cl.iloc[:additional_index].payment_difference.mean()

385.17114728009267

In [40]:
# how much in percent does 
(X_select_cl.payment_difference/ X_select_cl.pos_one_perc_increase_basis_pred).iloc[:additional_index].mean()

0.23684903232635204

In [41]:
# in 13 cases the investor receaved less
(X_select_cl.iloc[:additional_index].payment_difference < 0).sum()

8

In [42]:
X_select_cl.iloc[:additional_index].loc[(X_select_cl.iloc[:additional_index].payment_difference < 0)].payment_difference.mean()

-1171.5260888671874

In [43]:
# in 21 cases the investor receaved more 
(X_select_cl.iloc[:additional_index].payment_difference > 0).sum()

19

In [44]:
X_select_cl.iloc[:additional_index].loc[(X_select_cl.iloc[:additional_index].payment_difference > 0)].payment_difference.mean()

1040.622615131579

In [45]:
# average payment
X_select_cl.iloc[:additional_index].pos_one_perc_increase_basis_pred.mean()

3366.4489250578704

### Calculate Revenue with Random Selection

In [46]:
X_select_random = X_select_cl.sample(frac=1, random_state=42)

In [47]:
X_select_random["cumsum_prior_price"] = X_select_random.one_perc_pred_prior_price.cumsum()

In [48]:
X_select_random[X_select_random["cumsum_prior_price"] < 50000].pos_one_perc_increase_basis_pred.sum() / 50000

0.35543552812499996

In [49]:
# invest remaining budget (under 1 % of prior_price)
remaining_budget = 50000 - X_select_random[X_select_random["cumsum_prior_price"] < 50000].cumsum_prior_price.max()
additional_index = X_select_random[X_select_random["cumsum_prior_price"] < 50000].reset_index().index[-1] + 1
percentage = remaining_budget / X_select_random.reset_index().loc[additional_index]["pred_prior_price"]
additional_revenue = X_select_random.reset_index().loc[additional_index]["absolut_increase_basis_pred"] * percentage
additional_revenue = additional_revenue if additional_revenue >= 0 else 0

In [50]:
# combine 
revenue = X_select_random[X_select_random["cumsum_prior_price"] < 50000].pos_one_perc_increase_basis_pred.sum() + additional_revenue

In [51]:
# yearly increase
pow((revenue + 50000) / 50000, 1/5) - 1

0.0640806888417862

#### True appreciation with true prior price

In [52]:
X_select_random["cumsum_true_prior_price"] = X_select_random.one_perc_true_prior_price.cumsum()

In [53]:
X_select_random[X_select_random["cumsum_true_prior_price"] < 50000].one_perc_increase_basis_true.sum() / 50000

0.3520718

In [54]:
# invest remaining budget (under 1 % of prior_price)
remaining_budget = 50000 - X_select_random[X_select_random["cumsum_true_prior_price"] < 50000].cumsum_prior_price.max()
additional_index_tmp = X_select_random[X_select_random["cumsum_true_prior_price"] < 50000].reset_index().index[-1] + 1
percentage = remaining_budget / X_select_random.reset_index().loc[additional_index_tmp]["true_prior_price"]
additional_revenue = X_select_random.reset_index().loc[additional_index_tmp]["absolut_increase_basis_true"] * percentage
additional_revenue = additional_revenue if additional_revenue >= 0 else 0

In [55]:
# combine (if negative increase is shared!!)
revenue = X_select_random[X_select_random["cumsum_true_prior_price"] < 50000].pos_one_perc_increase_basis_pred.sum() + additional_revenue

In [56]:
# yearly increase
pow((revenue + 50000) / 50000, 1/5) - 1

0.06389230159134729

### Calculate Revenue with Perfect Selection

In [57]:
X_select_perfect = X_select_cl.sort_values(by="true_appr_basis_pred", ascending=False)

In [58]:
X_select_perfect["cumsum_prior_price"] = X_select_perfect.one_perc_pred_prior_price.cumsum()

In [59]:
X_select_perfect[X_select_perfect["cumsum_prior_price"] < 50000].one_perc_increase_basis_pred.sum() / 50000

2.7925390546875

In [60]:
# invest remaining budget (under 1 % of prior_price)
remaining_budget = 50000 - X_select_perfect[X_select_perfect["cumsum_prior_price"] < 50000].cumsum_prior_price.max()
additional_index = X_select_perfect[X_select_perfect["cumsum_prior_price"] < 50000].reset_index().index[-1] + 1
percentage = remaining_budget / X_select_perfect.reset_index().loc[additional_index]["pred_prior_price"]
additional_revenue = X_select_perfect.reset_index().loc[additional_index]["absolut_increase_basis_pred"] * percentage
additional_revenue = additional_revenue if additional_revenue >= 0 else 0

In [61]:
selection_perfect = X_select_perfect.iloc[:additional_index].index

In [62]:
# combine 
revenue = X_select_perfect[X_select_perfect["cumsum_prior_price"] < 50000].pos_one_perc_increase_basis_pred.sum() + additional_revenue

In [63]:
# yearly increase
pow((revenue + 50000) / 50000, 1/5) - 1

0.3089577654350446

#### True appreciation with true prior price

In [64]:
X_select_perfect["cumsum_true_prior_price"] = X_select_perfect.one_perc_true_prior_price.cumsum()

In [65]:
X_select_perfect[X_select_perfect["cumsum_true_prior_price"] < 50000].one_perc_increase_basis_true.sum() / 50000

1.3316435999999998

In [66]:
# invest remaining budget (under 1 % of prior_price)
remaining_budget = 50000 - X_select_perfect[X_select_perfect["cumsum_true_prior_price"] < 50000].cumsum_prior_price.max()
additional_index_tmp = X_select_perfect[X_select_perfect["cumsum_true_prior_price"] < 50000].reset_index().index[-1] + 1
percentage = remaining_budget / X_select_perfect.reset_index().loc[additional_index_tmp]["true_prior_price"]
additional_revenue = X_select_perfect.reset_index().loc[additional_index_tmp]["absolut_increase_basis_true"] * percentage
additional_revenue = additional_revenue if additional_revenue >= 0 else 0

In [67]:
# combine (if negative increase is shared!!)
revenue = X_select_perfect[X_select_perfect["cumsum_true_prior_price"] < 50000].pos_one_perc_increase_basis_pred.sum() + additional_revenue

In [68]:
# yearly increase
pow((revenue + 50000) / 50000, 1/5) - 1

0.4879685820272186

#### How much from perfect selection is containe in our selection?

In [69]:
# around 70 % of the perfect selection is contained in our selection
len(set(selection_predicted) & set(selection_perfect)) / len(selection_perfect)

0.4827586206896552

### The True Perfect Appreciation

In [70]:
X_select_true_perfect = X_select_cl.sort_values(by="true_appr_basis_true", ascending=False)

In [71]:
X_select_true_perfect["cumsum_true_prior_price"] = X_select_true_perfect.one_perc_true_prior_price.cumsum()

In [72]:
X_select_true_perfect[X_select_true_perfect["cumsum_true_prior_price"] < 50000].one_perc_increase_basis_true.sum() / 50000

4.0775802

In [73]:
# invest remaining budget (under 1 % of prior_price)
remaining_budget = 50000 - X_select_true_perfect[X_select_true_perfect["cumsum_true_prior_price"] < 50000].cumsum_prior_price.max()
additional_index_tmp = X_select_true_perfect[X_select_true_perfect["cumsum_true_prior_price"] < 50000].reset_index().index[-1] + 1
percentage = remaining_budget / X_select_true_perfect.reset_index().loc[additional_index_tmp]["true_prior_price"]
additional_revenue = X_select_true_perfect.reset_index().loc[additional_index_tmp]["absolut_increase_basis_true"] * percentage
additional_revenue = additional_revenue if additional_revenue >= 0 else 0

In [74]:
selection_true_perfect = X_select_true_perfect.iloc[:additional_index].index

In [75]:
# combine (if negative increase is shared!!)
revenue = X_select_true_perfect[X_select_true_perfect["cumsum_true_prior_price"] < 50000].pos_one_perc_increase_basis_pred.sum() + additional_revenue

In [76]:
# yearly increase
pow((revenue + 50000) / 50000, 1/5) - 1

0.31543514585006727

#### How much from true perfect selection is containe in our selection?

In [77]:
# around 38 % of the perfect selection is contained in our selection
len(set(selection_predicted) & set(selection_true_perfect)) / len(selection_true_perfect)

0.2413793103448276