### Data Engineering

In [1]:
# Add all the imports
import pandas as pd
import numpy as np

In [2]:
def map_idx(unique_attribute_vals: list, start=100, step=0.001):
    """Assign every distinct attribute value its own numeric code.

    Codes are handed out in order of first appearance: ``start``,
    ``start + step``, ``start + 2 * step``, and so on. Each code is derived
    from the value's position rather than by repeatedly adding ``step`` to a
    running float, so codes stay equal to their decimal literals (for example
    ``100.734``) and reverse lookups by such literals succeed.

    Args:
        unique_attribute_vals: Iterable of hashable values to encode. A value
            that appears more than once is encoded only once and keeps the
            code of its first occurrence.
        start: Code given to the first value. Defaults to 100.
        step: Distance between consecutive codes. Defaults to 0.001.

    Returns:
        A ``(value_to_idx, idx_to_value)`` tuple of dictionaries: the first
        maps each value to its float code, the second maps each code back to
        its value.
    """
    # Imported locally so the helper does not rely on the notebook's import cell.
    from decimal import Decimal

    # Going through str() keeps the decimal digits the caller wrote
    # (0.001 -> Decimal("0.001")) instead of the binary approximation.
    first_code = Decimal(str(start))
    spacing = Decimal(str(step))

    # Initialize the variables
    value_to_idx = {}
    idx_to_value = {}

    # Loop through to map the values
    for value in unique_attribute_vals:
        # A repeated value must not consume a new code, otherwise
        # idx_to_value would hold entries that no longer round-trip.
        if value in value_to_idx:
            continue
        code = float(first_code + len(value_to_idx) * spacing)
        value_to_idx[value] = code
        idx_to_value[code] = value

    # Return the two dictionaries
    return (value_to_idx, idx_to_value)

In [3]:
# Load the raw listings and discard every row that has a missing value.
data_path = "input_data/NY-House-Dataset.csv"
all_data = pd.read_csv(data_path)
all_data = all_data.dropna()
# reset_index() without drop=True keeps the previous row labels as an
# "index" column.
all_data = all_data.reset_index()

In [4]:
# Columns left out of the unique-value report: the row id plus the numeric
# and coordinate columns.
excluded_columns = ("index", "PRICE", "BEDS", "BATH", "PROPERTYSQFT", "LATITUDE", "LONGITUDE")
features = [name for name in all_data.columns if name not in excluded_columns]

# Report how many distinct values each remaining column holds
for feature in features:
    column_values = all_data[feature].to_numpy()
    data = np.unique(column_values)
    print(f"{feature} # of unique values: {len(data)}")

BROKERTITLE # of unique values: 1036
TYPE # of unique values: 13
ADDRESS # of unique values: 4582
STATE # of unique values: 308
MAIN_ADDRESS # of unique values: 4583
ADMINISTRATIVE_AREA_LEVEL_2 # of unique values: 29
LOCALITY # of unique values: 11
SUBLOCALITY # of unique values: 21
STREET_NAME # of unique values: 174
LONG_NAME # of unique values: 2731
FORMATTED_ADDRESS # of unique values: 4550


In [5]:
# Summary statistics for the numeric columns.
# Rows, in order: min, median, mean, max, standard deviation
# (np.std with its default ddof=0, i.e. the population standard deviation).
stat_cols = ["PRICE", "BEDS", "BATH", "PROPERTYSQFT"]
stat_rows = ["Min", "Median", "Mean", "Max", "Std."]

# Loop over the columns and collect the stats for each one
stats_by_col = {}
for col in stat_cols:

    # Get the column data
    col_data = all_data[col].to_numpy()

    # Median, mean and std are rounded to 2 decimals; min and max are kept as-is
    stats_by_col[col] = {
        "Min": col_data.min(),
        "Median": round(np.median(col_data), 2),
        "Mean": round(np.average(col_data), 2),
        "Max": col_data.max(),
        "Std.": round(np.std(col_data), 2),
    }

# Build the table in one step as floats. Filling an integer placeholder frame
# with float values through .loc relies on silent upcasting, which newer
# pandas versions deprecate.
stats_df = pd.DataFrame(stats_by_col).reindex(index=stat_rows, columns=stat_cols).astype(float)

stats_df

Unnamed: 0,PRICE,BEDS,BATH,PROPERTYSQFT
Min,2494.0,1.0,0.0,230.0
Median,825000.0,3.0,2.0,2184.21
Mean,2356940.0,3.36,2.37,2184.21
Max,2147484000.0,50.0,50.0,65535.0
Std.,31351980.0,2.6,1.95,2376.89


In [6]:
# Keep only listings whose price lies within [10000, 1000000000] (both ends inclusive)
price_in_range = all_data["PRICE"].between(10000, 1000000000)
all_data = all_data.loc[price_in_range, :]

# Remove the address-like and coordinate columns
columns_to_drop = [
    "ADMINISTRATIVE_AREA_LEVEL_2",
    "MAIN_ADDRESS",
    "ADDRESS",
    "FORMATTED_ADDRESS",
    "LONGITUDE",
    "LATITUDE",
    "LONG_NAME",
]
all_data = all_data.drop(columns=columns_to_drop)

In [7]:
# ONE-HOT ENCODE
# https://stackabuse.com/one-hot-encoding-in-python-with-pandas-and-scikit-learn/
def one_hot(dataframe, col, pre):
    """Build the one-hot (dummy) columns for a single column of ``dataframe``.

    Args:
        dataframe: Frame holding ``col`` and an ``"index"`` column.
        col: Name of the column to encode.
        pre: Prefix handed to ``pd.get_dummies``.

    Returns:
        A new frame with one column per distinct value of ``col``, named
        ``"<col>_<pre>_<value>"``, plus a copy of the ``"index"`` column of
        ``dataframe``. ``dataframe`` itself is not modified.
    """
    dummies = pd.get_dummies(dataframe[col], prefix=pre)

    # Put the source column name in front of every dummy column in one rename
    renamed = {name: col + "_" + name for name in dummies.columns}
    dummies = dummies.rename(columns=renamed)

    # Carry the row id along so the caller can join the result back
    dummies['index'] = dataframe['index']
    return dummies

In [8]:
# Mapping
# Attributes named in one_hot_attribute_list get one-hot columns merged in;
# every other categorical attribute is replaced by the codes from map_idx,
# and both lookup dictionaries are kept in attribute_idx_mapping.
attribute_idx_mapping = {}

categorical_attribute_list = ["BROKERTITLE", "TYPE", "STATE", "LOCALITY", "SUBLOCALITY", "STREET_NAME"]
one_hot_attribute_list = ["TYPE", "LOCALITY", "SUBLOCALITY"]

for attribute in categorical_attribute_list:
    if attribute not in one_hot_attribute_list:
        # Replace the raw values by their numeric codes
        unique_attribute_values = all_data[attribute].unique()
        value_to_idx, idx_to_value = map_idx(unique_attribute_values)
        attribute_idx_mapping[attribute] = {
            "value_to_idx": value_to_idx,
            "idx_to_value": idx_to_value,
        }
        all_data[attribute] = all_data[attribute].map(value_to_idx)
        continue

    # Join the dummy columns back onto the rows through the shared "index" column
    encoded = one_hot(all_data, attribute, 'is')
    all_data = all_data.merge(encoded, on=["index"])

In [9]:
# Write the encoded frame to disk (without the row labels) and preview the first rows
output_path = "input_data/model_ready_data.csv"
all_data.to_csv(output_path, index=False)
all_data.head()

Unnamed: 0,index,BROKERTITLE,TYPE,PRICE,BEDS,BATH,PROPERTYSQFT,STATE,LOCALITY,SUBLOCALITY,...,SUBLOCALITY_is_New York,SUBLOCALITY_is_New York County,SUBLOCALITY_is_Queens,SUBLOCALITY_is_Queens County,SUBLOCALITY_is_Rego Park,SUBLOCALITY_is_Richmond County,SUBLOCALITY_is_Riverdale,SUBLOCALITY_is_Snyder Avenue,SUBLOCALITY_is_Staten Island,SUBLOCALITY_is_The Bronx
0,0,100.0,Condo for sale,315000,2,2.0,1400.0,100.0,New York,Manhattan,...,False,False,False,False,False,False,False,False,False,False
1,1,100.001,Condo for sale,195000000,7,10.0,17545.0,100.001,New York,New York County,...,False,True,False,False,False,False,False,False,False,False
2,2,100.002,House for sale,260000,4,2.0,2015.0,100.002,New York,Richmond County,...,False,False,False,False,False,True,False,False,False,False
3,3,100.003,Condo for sale,69000,3,1.0,445.0,100.003,New York,New York County,...,False,True,False,False,False,False,False,False,False,False
4,4,100.004,Townhouse for sale,55000000,7,2.373861,14175.0,100.004,New York,New York County,...,False,True,False,False,False,False,False,False,False,False
