# 01 Modelling Prep (Neighborhood Prices)

In this notebook, we calculate an additional feature for each listing: the average price of past sales in its neighborhood.

In [None]:
import os
import time

import geopandas as gpd
import h3
import numpy as np
import pandas as pd
from shapely.geometry import Point
from shapely.geometry import Polygon
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

In [None]:
# Read the three data splits; the first CSV column is used as the row index.
df_train, df_test, df_val = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "data/df_train_2.csv",
        "data/df_test_2.csv",
        "data/df_val_2.csv",
    )
]

In [None]:
# Feature matrices: every column except the target ("price") and "id".
X_train, X_test, X_val = (
    frame.drop(columns=["price", "id"]) for frame in (df_train, df_test, df_val)
)

# Targets: the "price" column of each split.
y_train_2, y_test_2, y_val_2 = (
    frame.loc[:, "price"] for frame in (df_train, df_test, df_val)
)

In [None]:
# some loss functions
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between predictions and targets.

    Both arguments are combined element-wise, so they must support
    subtraction with each other (e.g. NumPy arrays of equal shape).
    """
    residuals = y_pred - y_true
    mean_squared_residual = np.mean(np.square(residuals))
    return np.sqrt(mean_squared_residual)

def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each element contributes ``200 * |y_pred - y_true| / (|y_true| + |y_pred|)``.
    When target and prediction are both zero the denominator is zero; such an
    element is a perfect prediction and contributes 0 instead of NaN.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, same shape as ``y_true``.

    Returns:
        The mean of the per-element percentages (0 to 200).

    Note:
        Inputs are converted with ``np.asarray``, so values are compared by
        position, not by pandas index label.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = 200 * np.abs(y_pred - y_true)
    denominator = np.abs(y_true) + np.abs(y_pred)

    # Only divide where the denominator is non-zero; the remaining entries
    # keep the 0.0 they were initialised with.
    percentages = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(percentages)

def rmspe(y_true, y_pred):
    """Return the root-mean-squared percentage error as a fraction.

    The error of each element is taken relative to ``y_true``; a small
    constant is added to the denominator to avoid division by zero.
    """
    eps = 1e-6
    relative_error = (y_true - y_pred) / (y_true + eps)
    return np.sqrt(np.mean(relative_error ** 2))

### Get past sale price of neighbours

In [None]:
# Columns kept for the neighbourhood price lookup.
columns = [
    "id",
    "city",
    "longitude",
    "latitude",
    "year",
    "month",
    "price",
]

In [None]:
# Take explicit copies of the column subsets: frames built from them receive
# new columns later on, and writing into a slice of the source frame would
# trigger pandas' chained-assignment warning.
df_train_f = df_train[columns].copy()
df_test_f = df_test[columns].copy()
df_val_f = df_val[columns].copy()

In [None]:
# Build point geometries from the longitude/latitude columns (declared as
# EPSG:4326) and reproject to EPSG:3857, whose coordinates are in metres.
# GeoDataFrame.to_crs updates the coordinates and the CRS metadata together,
# instead of overwriting the geometry column with a differently-projected series.
gdf_train = gpd.GeoDataFrame(
    df_train_f,
    geometry=gpd.points_from_xy(df_train_f.longitude, df_train_f.latitude),
    crs="EPSG:4326",
).to_crs(epsg=3857)


In [None]:
# Build point geometries from the longitude/latitude columns (declared as
# EPSG:4326) and reproject to EPSG:3857, whose coordinates are in metres.
# GeoDataFrame.to_crs updates the coordinates and the CRS metadata together,
# instead of overwriting the geometry column with a differently-projected series.
gdf_test = gpd.GeoDataFrame(
    df_test_f,
    geometry=gpd.points_from_xy(df_test_f.longitude, df_test_f.latitude),
    crs="EPSG:4326",
).to_crs(epsg=3857)


In [None]:
# Build point geometries from the longitude/latitude columns (declared as
# EPSG:4326) and reproject to EPSG:3857, whose coordinates are in metres.
# GeoDataFrame.to_crs updates the coordinates and the CRS metadata together,
# instead of overwriting the geometry column with a differently-projected series.
gdf_val = gpd.GeoDataFrame(
    df_val_f,
    geometry=gpd.points_from_xy(df_val_f.longitude, df_val_f.latitude),
    crs="EPSG:4326",
).to_crs(epsg=3857)


In [None]:
def neighbours_sale_price(df, x):
    """Mean price of the five nearest sales from the two years before ``x``.

    Args:
        df: GeoDataFrame of sales with ``id``, ``year``, ``city`` and
            ``price`` columns plus a geometry column.
        x: A single sale (row) with ``id``, ``year``, ``city`` and
            ``geometry`` attributes.

    Returns:
        The mean ``price`` of up to five candidate sales closest to ``x``;
        NaN when there is no candidate. Candidates are other sales
        (different ``id``) in the same city from ``x.year - 1`` or
        ``x.year - 2``. Ties in distance are broken by ascending year.
    """
    # Build one mask on ``df`` itself so the boolean index always lines up
    # with the frame it filters.
    is_candidate = (
        (df.id != x.id)
        & df.year.isin([x.year - 1, x.year - 2])
        & (df.city == x.city)
    )
    candidates = df.loc[is_candidate]

    # ``assign`` returns a new frame, so no column is written onto a slice.
    candidates = candidates.assign(
        distance=candidates.geometry.centroid.distance(x.geometry)
    )
    nearest = candidates.sort_values(by=["distance", "year"]).head(5)
    return nearest.price.mean()

In [None]:
def neighbours_sale_price(df, x):
    """Mean price of the five nearest sales from any year before ``x``.

    Args:
        df: GeoDataFrame of sales with ``id``, ``year``, ``city`` and
            ``price`` columns plus a geometry column.
        x: A single sale (row) with ``id``, ``year``, ``city`` and
            ``geometry`` attributes.

    Returns:
        The mean ``price`` of up to five candidate sales closest to ``x``,
        or ``0`` when there is no candidate. Candidates are other sales
        (different ``id``) in the same city with ``year < x.year``.
    """
    candidates = df.loc[(df.id != x.id) & (df.year < x.year) & (df.city == x.city)]

    if candidates.empty:
        return 0

    # Vectorised distance to the query point; ``assign`` returns a new frame,
    # so no column is written onto a slice of ``df``.
    candidates = candidates.assign(
        distance=candidates.geometry.centroid.distance(x.geometry)
    )
    nearest = candidates.nsmallest(5, columns=["distance", "year"])

    return nearest.price.mean()


In [None]:
def neighbours_sale_price_2(df, x, radius=1500, year_offsets=(1, 2, 3, 5, 6)):
    """Count earlier sales in the same city within ``radius`` of ``x``.

    Despite its name, this variant returns the number of matching sales,
    not a price.

    Args:
        df: GeoDataFrame of sales with ``id``, ``year`` and ``city`` columns
            plus a geometry column.
        x: A single sale (row) with ``id``, ``year``, ``city`` and
            ``geometry`` attributes.
        radius: Buffer distance around ``x.geometry``, in the units of the
            geometry's coordinate system.
        year_offsets: How many years back a sale may lie; a sale qualifies
            when its year equals ``x.year - offset`` for one of the offsets.
            The default reproduces the previously hard-coded list, which
            skips 4. NOTE(review): confirm whether that gap is deliberate.

    Returns:
        The number of candidate sales whose geometry intersects the buffer.
    """
    allowed_years = [x.year - offset for offset in year_offsets]

    # Build one mask on ``df`` itself so the boolean index always lines up
    # with the frame it filters.
    is_candidate = (
        (df.id != x.id)
        & df.year.isin(allowed_years)
        & (df.city == x.city)
    )
    candidates = df.loc[is_candidate]

    search_area = x.geometry.buffer(radius)
    return candidates.intersects(search_area).sum()

In [None]:
def neighbours_sale_price_2(df, x, radius=1500, year_offsets=(1, 2, 3, 5, 6)):
    """Mean price of earlier sales in the same city within ``radius`` of ``x``.

    Args:
        df: GeoDataFrame of sales with ``id``, ``year``, ``city`` and
            ``price`` columns plus a geometry column.
        x: A single sale (row) with ``id``, ``year``, ``city`` and
            ``geometry`` attributes.
        radius: Buffer distance around ``x.geometry``, in the units of the
            geometry's coordinate system.
        year_offsets: How many years back a sale may lie; a sale qualifies
            when its year equals ``x.year - offset`` for one of the offsets.
            The default reproduces the previously hard-coded list, which
            skips 4. NOTE(review): confirm whether that gap is deliberate.

    Returns:
        The mean ``price`` of the candidate sales whose geometry intersects
        the buffer; NaN when there is none.
    """
    allowed_years = [x.year - offset for offset in year_offsets]
    candidates = df[
        (df.id != x.id)
        & df.year.isin(allowed_years)
        & (df.city == x.city)
    ]

    search_area = x.geometry.buffer(radius)

    # Vectorised intersection test over the whole geometry column.
    inside = candidates.geometry.intersects(search_area)
    return candidates.loc[inside].price.mean()

### Uber H3

In [None]:
def geometry_to_h3(geometry, resolution):
    """Return the H3 cell index containing ``geometry`` at ``resolution``.

    ``geometry.y`` is passed as the latitude and ``geometry.x`` as the
    longitude, so the point is expected to be in lon/lat degrees.
    """
    latitude, longitude = geometry.y, geometry.x
    return h3.geo_to_h3(latitude, longitude, resolution=resolution)

In [None]:
def h3_to_geometry(h3_index):
    """Return the boundary of the H3 cell ``h3_index`` as a shapely Polygon."""
    boundary = h3.h3_to_geo_boundary(h3_index, geo_json=True)
    return Polygon(boundary)

In [None]:
def get_grouped_h3_df(df, feature, resolution, log=False, year=None):
    """Average ``feature`` per H3 cell and return the cells as polygons.

    Args:
        df: Frame with a point geometry column, the ``feature`` column and,
            when ``year`` is given, a ``year`` column. The input is not
            modified.
        feature: Name of the column to average within each cell.
        resolution: H3 resolution used to index the points.
        log: If true, replace the per-cell mean by its natural logarithm.
        year: If not ``None``, only rows with ``df["year"] == year`` are used.

    Returns:
        A GeoDataFrame (declared as EPSG:4326) with one row per H3 cell and
        the columns ``h3``, ``feature`` and ``geometry`` (the cell polygon).

    Note:
        The points are handed to ``geometry_to_h3`` as they are.
        NOTE(review): H3 indexes by latitude/longitude in degrees, while the
        ``gdf_*`` frames in this notebook are reprojected to EPSG:3857 -
        confirm the caller passes lon/lat geometry.
    """
    # ``is not None`` so that a falsy year value still filters.
    if year is not None:
        df = df[df["year"] == year]

    # ``assign`` returns a new frame: neither the caller's frame nor a
    # filtered slice of it is written to.
    df = df.assign(h3=df.geometry.map(lambda point: geometry_to_h3(point, resolution)))

    grouped = df.groupby(by="h3")[feature].mean().to_frame().reset_index()
    grouped["geometry"] = grouped.h3.map(h3_to_geometry)
    if log:
        grouped[feature] = np.log(grouped[feature])
    return gpd.GeoDataFrame(grouped, crs='EPSG:4326', geometry='geometry')

In [None]:
def get_past_sales_in_neigh(gdf_sales, gdf, res):
    """Look up, for every row of ``gdf``, the latest earlier-year mean price of its H3 cell.

    Sales are averaged per H3 cell and year. Each row of ``gdf`` is then
    matched to the most recent yearly mean of its own cell whose year is at
    most ``year - 1``.

    Args:
        gdf_sales: Sales used as the price source; needs ``id``, ``year``,
            ``price`` and a point geometry. Duplicate ``id`` rows are dropped.
            Not modified.
        gdf: Rows to enrich; needs ``id``, ``longitude``, ``year``, ``month``,
            ``price`` and a point geometry. Two columns are added in place:
            ``h3`` and ``prev_year``.
        res: H3 resolution used for the cells.

    Returns:
        A frame indexed and sorted like ``gdf``'s original index with the
        columns ``id``, ``longitude``, ``year``, ``month``, ``price``, ``h3``,
        ``neigh_price`` (matched mean price, NaN if no match) and
        ``year_right`` (year the matched mean comes from).

    Raises:
        ValueError: From ``pd.concat`` when ``gdf_sales`` contains no rows.

    Note:
        Assumes ``gdf``'s index is unnamed, so that ``reset_index`` produces
        a column called ``index``.
    """
    # H3 indexes points by latitude/longitude in degrees. If the input carries
    # a CRS, convert to EPSG:4326 first (a no-op when it already is).
    sales = gdf_sales.drop_duplicates(subset="id").copy()
    if getattr(sales, "crs", None) is not None:
        sales = sales.to_crs(epsg=4326)

    # Collect the yearly per-cell means and concatenate once, rather than
    # growing a frame inside the loop.
    yearly_means = []
    for year in sales.year.unique():
        grouped = get_grouped_h3_df(sales, "price", res, year=year)
        grouped["year"] = year
        yearly_means.append(grouped)
    df_grouped = pd.concat(yearly_means)

    query_points = gdf.geometry
    if getattr(query_points, "crs", None) is not None:
        query_points = query_points.to_crs(epsg=4326)
    gdf["h3"] = query_points.map(lambda x: geometry_to_h3(x, res))
    gdf["prev_year"] = gdf["year"] - 1

    # merge_asof requires both sides to be sorted by their "on" key.
    gdf_sorted = gdf.reset_index().sort_values(by=['prev_year'])
    df_grouped_sorted = df_grouped.sort_values(by=['year'])

    merged_df = pd.merge_asof(
        gdf_sorted,
        df_grouped_sorted,
        left_on='prev_year',
        right_on="year",
        left_by='h3',
        right_by='h3',
        direction='backward',
        # An exact match on prev_year is allowed; since prev_year = year - 1,
        # the matched sales year is always strictly before the row's own year.
        allow_exact_matches=True,
        suffixes=('', '_right')
    )

    result_columns = ["index", "id", "longitude", "year", "month", "price", "h3", "price_right", "year_right"]
    return (
        merged_df[result_columns]
        .rename(columns={"price_right": "neigh_price"})
        .set_index("index")
        .sort_index()
    )

In [None]:
# Sort every split by its index so rows line up by position with the
# index-sorted lookup results.
df_train, df_test, df_val = (
    frame.sort_index() for frame in (df_train, df_test, df_val)
)

In [None]:
# Trial run at H3 resolution 0, using the training sales for both arguments.
merge_df = get_past_sales_in_neigh(gdf_sales=gdf_train, gdf=gdf_train, res=0)

In [None]:
# Fill price_level for the training rows from past training sales, trying
# H3 resolutions 0..7 in turn and only filling rows that are still missing.
# NOTE(review): resolution 0 is the coarsest, so a value found there is never
# replaced by a finer cell - confirm that this order is intended.
df_train["price_level"] = np.nan
for h3_res in range(8):
    merge_df = get_past_sales_in_neigh(gdf_train, gdf_train, h3_res)
    # Discard matches that are more than 10 years older than the row.
    too_old = (merge_df["year"] - merge_df["year_right"]) > 10
    merge_df.loc[too_old, "neigh_price"] = np.nan
    still_missing = df_train["price_level"].isna()
    # Positional fill: both frames are sorted by index.
    df_train["price_level"] = np.where(
        still_missing, merge_df["neigh_price"], df_train["price_level"]
    )

In [None]:
# Fill price_level for the test rows from past training sales, trying
# H3 resolutions 0..7 in turn and only filling rows that are still missing.
# NOTE(review): resolution 0 is the coarsest, so a value found there is never
# replaced by a finer cell - confirm that this order is intended.
df_test["price_level"] = np.nan
for h3_res in range(8):
    merge_df = get_past_sales_in_neigh(gdf_train, gdf_test, h3_res)
    # Discard matches that are more than 10 years older than the row.
    too_old = (merge_df["year"] - merge_df["year_right"]) > 10
    merge_df.loc[too_old, "neigh_price"] = np.nan
    still_missing = df_test["price_level"].isna()
    # Positional fill: both frames are sorted by index.
    df_test["price_level"] = np.where(
        still_missing, merge_df["neigh_price"], df_test["price_level"]
    )

In [None]:
# Fill price_level for the validation rows from past training sales, trying
# H3 resolutions 0..7 in turn and only filling rows that are still missing.
# NOTE(review): resolution 0 is the coarsest, so a value found there is never
# replaced by a finer cell - confirm that this order is intended.
df_val["price_level"] = np.nan
for h3_res in range(8):
    merge_df = get_past_sales_in_neigh(gdf_train, gdf_val, h3_res)
    # Discard matches that are more than 10 years older than the row.
    too_old = (merge_df["year"] - merge_df["year_right"]) > 10
    merge_df.loc[too_old, "neigh_price"] = np.nan
    still_missing = df_val["price_level"].isna()
    # Positional fill: both frames are sorted by index.
    df_val["price_level"] = np.where(
        still_missing, merge_df["neigh_price"], df_val["price_level"]
    )

In [None]:
# DataFrame.to_csv does not create missing parent directories.
os.makedirs("data/past_sales", exist_ok=True)
df_train.to_csv("data/past_sales/df_train_2.csv")

In [None]:
# DataFrame.to_csv does not create missing parent directories.
os.makedirs("data/past_sales", exist_ok=True)
df_test.to_csv("data/past_sales/df_test_2.csv")

In [None]:
# DataFrame.to_csv does not create missing parent directories.
os.makedirs("data/past_sales", exist_ok=True)
df_val.to_csv("data/past_sales/df_val_2.csv")