In [16]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
%config InlineBackend.figure_format='retina'
import json
import itertools
import os

## KC sample creation

In [None]:
# 1. read the raw KC house table, keeping only the first row seen for each (lat, long) pair
df = pd.read_csv('kc_house_data.csv').drop_duplicates(subset=['lat', 'long'])

# 2. restrict data
def inBounds(lat,long):
    """Return whether the point (lat, long) lies inside the sampling region.

    The region is the open box 47.59 < lat < 47.65, -122.33 < long < -122.28,
    further restricted to points strictly above the line
    lat = -0.925*long - 65.547.
    """
    in_lat_band = 47.59 < lat < 47.65
    in_long_band = -122.33 < long < -122.28
    above_line = lat > -0.925*long - 65.547
    return in_lat_band and in_long_band and above_line
# keep only the rows inside the sampling region; an explicit boolean array is used
# because a row-wise apply on an empty frame does not produce a usable mask
in_region = np.array(
    [inBounds(lat, long) for lat, long in zip(df['lat'], df['long'])],
    dtype=bool,
)
df = df.loc[in_region]

# the raw file names its coordinate columns 'lat'/'long'; everything from here on
# (the center print below and the post-processing cell) reads 'latitude'/'longitude'
df = df.rename(columns={'lat': 'latitude', 'long': 'longitude'})

fig,ax=plt.subplots()
ax.set_title(f'n={len(df)}')
ax.set(xlabel='longitude', ylabel='latitude')
ax.scatter(df['longitude'],df['latitude'])

# 3. save data
df.to_csv('kc_house_sample.csv')
# printed as (mean longitude, mean latitude)
print(f"center={df['longitude'].mean(),df['latitude'].mean()}")

## NY sample creation

In [None]:
# 1. load data and drop duplicate locations
df = pd.read_csv('AB_NYC_2019.csv').drop_duplicates(subset=['latitude', 'longitude'])

# 2. restrict data to the Bronx
# .copy() makes the subset an independent frame, so the column assignment in
# step 3 modifies it directly instead of a slice of the full table
df = df[df['neighbourhood_group'] == 'Bronx'].copy()
fig,ax=plt.subplots()
ax.set_title(f'n={len(df)}')
ax.set(xlabel='longitude', ylabel='latitude')
ax.scatter(df['longitude'],df['latitude'])

# 3. replace NaN with 0 in reviews_per_month
# assign the result back: an inplace fillna on a selected column is a chained
# assignment and is not guaranteed to update df
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)

# 4. save data
df.to_csv('ny_airbnb_sample.csv')
# printed as (mean longitude, mean latitude)
print(f"center={df['longitude'].mean(),df['latitude'].mean()}")

## Post-processing

In [77]:
# 1. set names for coefficient files
# 1.1 build one "<spatial>_<ml>_<data>" name per combination; itertools.product
#     varies the last list fastest, so the dataset alternates first, then the
#     ML model, then the spatial model
modules_all_spatial = ["mgwr","smgwr"]
modules_all_ml = ["random_forest","neural_network","xgb"]
modules_all_data = ["kc", "ny"]
names = [
    "_".join(combination)
    for combination in itertools.product(
        modules_all_spatial, modules_all_ml, modules_all_data
    )
]
# 1.2 collect the raw coefficient files, sorted by file name
# NOTE(review): the pairing in 1.3 relies on the sorted file names lining up with
# the order of `names`, and zip() stops silently at the shorter of the two lists —
# confirm both before running.
directory = '/users/nicolaslee/agwr/trained_models'
files = sorted(
    entry for entry in os.listdir(directory) if entry.startswith('coefficients')
)

# 1.3 give every raw file its model-specific name
# once renamed, a file no longer starts with 'coefficients', so running this
# again finds nothing left to rename
for model_name, raw_file in zip(names, files):
    os.rename(
        os.path.join(directory, raw_file),
        os.path.join(directory, model_name + "_coefficients.pkl"),
    )

# 2. generate data for each name

# Labels paired positionally with the entries of each coefficient row.
KC_FEATURE_LABELS = [
    "intercept",
    "bedrooms",
    "bathrooms",
    "sqft_living",
    "sqft_lot",
    "floors",
]
NY_FEATURE_LABELS = [
    "intercept",
    "minimum_nights",
    "number_of_reviews",
    "reviews_per_month",
    "host_listings_count",
    "availability_365",
]


def to_plain_list(values):
    """Return `values` as a list of built-in Python scalars that json.dumps accepts."""
    return np.asarray(values).tolist()


def strip_coefficient_stats(parameters_text):
    """Remove every coefficient-statistics segment previously appended by step 2.4.

    A segment has the form
    ``coefficientMins:[...],coefficientMeds:[...],coefficientMaxes:[...],``.
    Text that does not contain a complete segment is returned unchanged.
    """
    start = parameters_text.find("coefficientMins:")
    if start == -1:
        return parameters_text
    last_key = parameters_text.find(",coefficientMaxes:", start)
    end = parameters_text.find("],", last_key) if last_key != -1 else -1
    if end == -1:
        # unrecognised layout: leave the text alone rather than guess
        return parameters_text
    # each pass removes one whole segment, so the recursion terminates
    return strip_coefficient_stats(
        parameters_text[:start] + parameters_text[end + len("],"):]
    )


def toJSON(df, coefficients, feature_labels, file):
    """Write one object literal per row of `df` to the open text handle `file`.

    The i-th row of `df` (in iteration order) is paired with `coefficients[i]`,
    whose entries are labelled positionally with `feature_labels`.
    `df` must provide the columns latitude, longitude, price and predicted.
    """
    for i, (_, row) in enumerate(df.iterrows()):
        coefficient_dict = dict(zip(feature_labels, to_plain_list(coefficients[i])))
        file.write(
            "{latitude:"
            + str(row["latitude"])
            + ", longitude:"
            + str(row["longitude"])
            + ",actual:"
            + str(row["price"])
            + ",predicted:"
            + str(int(row["predicted"]))
            + ",coefficients:"
            + json.dumps(coefficient_dict,indent=4)
            + "},"
        )


# 2.1 load datasets
kc = pd.read_csv('/users/nicolaslee/desktop/pyneapple-demo/server//kc_house_sample.csv')
ny = pd.read_csv("/users/nicolaslee/desktop/pyneapple-demo/server/ny_airbnb_sample.csv")
for name in names:
    is_kc = name.endswith("_kc")
    # 2.2 load coefficients
    # NOTE: pickle.load can execute arbitrary code; only load files produced by
    # your own training runs.
    # assumes `coefficients` is a 2-D array with one row per sample — TODO confirm
    filename = os.path.join(directory, name + "_coefficients.pkl")
    with open(filename, "rb") as file:
        coefficients = pickle.load(file)
    coefficient_min = coefficients.min(axis=0)
    coefficient_median = np.median(coefficients, axis=0)
    coefficient_max = coefficients.max(axis=0)
    # 2.3 load predictions
    predictions = pd.read_csv(os.path.join(directory,name+'_predictions.csv'))
    df = kc.copy() if is_kc else ny.copy()
    # positional assignment: raises if the prediction count differs from the row count
    df['predicted']=predictions['predicted'].values
    # 2.4 store the coefficient statistics at the end of the parameters file
    # Any statistics left by an earlier run are replaced rather than appended to,
    # so re-running the cell does not put duplicate properties into the .ts file.
    filename = os.path.join(directory, name + "_parameters.txt")
    try:
        with open(filename, "r") as file:
            parameters_text = strip_coefficient_stats(file.read())
    except FileNotFoundError:
        # matches the previous append mode, which created a missing file
        parameters_text = ""
    parameters_text += (
        "coefficientMins:"
        + json.dumps(to_plain_list(coefficient_min))
        + ",coefficientMeds:"
        + json.dumps(to_plain_list(coefficient_median))
        + ",coefficientMaxes:"
        + json.dumps(to_plain_list(coefficient_max))
        + ","
    )
    with open(filename, "w") as file:
        file.write(parameters_text)
    print(f'Coefficient statistics written to "{filename}"')
    # 2.5. write data as a TypeScript module
    filename = os.path.join(directory, name + ".ts")
    with open(filename, "w") as file:
        file.write('import { Dataset } from "data/data";')
        file.write("const DATASET:Dataset = {")
        file.write(f'name:"{name}",')
        file.write(
            f'center:{[47.61729305740989, -122.30174365821095] if is_kc else [40.844, -73.87]},'
        )
        file.write(f'zoom:{14 if is_kc else 13},')
        file.write(parameters_text)
        file.write("data: [")
        toJSON(df, coefficients, KC_FEATURE_LABELS if is_kc else NY_FEATURE_LABELS, file)
        file.write("]};export default DATASET;")
    print(f'Data written to "{filename}"')

Data written to "/users/nicolaslee/agwr/trained_models/mgwr_random_forest_kc.ts"
Data written to "/users/nicolaslee/agwr/trained_models/mgwr_random_forest_ny.ts"
Data written to "/users/nicolaslee/agwr/trained_models/mgwr_neural_network_kc.ts"
Data written to "/users/nicolaslee/agwr/trained_models/mgwr_neural_network_ny.ts"
Data written to "/users/nicolaslee/agwr/trained_models/mgwr_xgb_kc.ts"
Data written to "/users/nicolaslee/agwr/trained_models/mgwr_xgb_ny.ts"
Data written to "/users/nicolaslee/agwr/trained_models/smgwr_random_forest_kc.ts"
Data written to "/users/nicolaslee/agwr/trained_models/smgwr_random_forest_ny.ts"
Data written to "/users/nicolaslee/agwr/trained_models/smgwr_neural_network_kc.ts"
Data written to "/users/nicolaslee/agwr/trained_models/smgwr_neural_network_ny.ts"
Data written to "/users/nicolaslee/agwr/trained_models/smgwr_xgb_kc.ts"
Data written to "/users/nicolaslee/agwr/trained_models/smgwr_xgb_ny.ts"
