### Data Engineering

In [1]:
# Add all the imports
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt

In [2]:
# Load the raw listings and discard rows with missing values.
# reset_index() (without drop=True) keeps the original row labels in an "index"
# column; later cells reference that column.
data_path = "input_data/NY-House-Dataset.csv"
all_data = pd.read_csv(data_path).dropna().reset_index()

# Listing categories that are removed from the data. A single isin() filter replaces
# the per-category in-place drops and avoids binding a notebook-level variable named
# `type`, which would shadow the builtin in every later cell.
excluded_types = ["Foreclosure", "Pending", "Coming Soon", "Land for sale", "Contingent", "For sale"]
all_data = all_data.loc[~all_data["TYPE"].isin(excluded_types)].copy()

# Derive a short category name: strip the trailing " for sale" and hyphenate the rest
# (e.g. "Multi-family home for sale" -> "Multi-family-home"). The suffix is anchored
# explicitly, so a value without it is kept intact instead of losing its last 9 characters.
type_save = (
    all_data["TYPE"]
    .str.replace(r" for sale$", "", regex=True)
    .str.replace(" ", "-", regex=False)
    .tolist()
)
all_data["Type"] = type_save

# Keep only listings whose price lies inside the inclusive [MIN_PRICE, MAX_PRICE] range.
MIN_PRICE = 10_000
MAX_PRICE = 1_000_000_000
all_data = all_data.loc[all_data["PRICE"].between(MIN_PRICE, MAX_PRICE), :]

# Drop the address/location/broker columns and the raw TYPE column (replaced by "Type").
all_data = all_data.drop(
    columns=[
        "ADMINISTRATIVE_AREA_LEVEL_2",
        "MAIN_ADDRESS",
        "ADDRESS",
        "FORMATTED_ADDRESS",
        "LONGITUDE",
        "LATITUDE",
        "LONG_NAME",
        "BROKERTITLE",
        "TYPE",
        "LOCALITY",
        "SUBLOCALITY",
        "STATE",
        "STREET_NAME",
    ]
)

In [None]:
# Summary statistics for the numeric columns, one table column per feature.
# row 1: min, row 2: median, row 3: mean, row 4: max, row 5: standard deviation
# (np.std with its default ddof=0, i.e. the population standard deviation).
stat_cols = ["PRICE", "BEDS", "BATH", "PROPERTYSQFT"]
stat_rows = ["Min", "Median", "Mean", "Max", "Std."]

# Collect the statistics per column and build the frame once. Writing floats
# cell-by-cell into an integer placeholder frame relies on silent dtype upcasting,
# which recent pandas versions deprecate.
stats_by_col = {}
for col in stat_cols:

    # Raw values of the current column
    col_data = all_data[col].to_numpy()

    # Min/Max are reported as-is; the other statistics are rounded to 2 decimals
    stats_dict = {
        "Min": col_data.min(),
        "Max": col_data.max(),
        "Median": round(np.median(col_data), 2),
        "Mean": round(np.average(col_data), 2),
        "Std.": round(np.std(col_data), 2),
    }

    # Order the values to match the row labels
    stats_by_col[col] = [stats_dict[row] for row in stat_rows]

stats_df = pd.DataFrame(stats_by_col, index=stat_rows)
stats_df

In [None]:
def map_idx(unique_attribute_vals: list):
    """Assign a numeric code to every value and return lookup tables for both directions.

    The first value receives the code 100; each following value receives the
    previous code plus 0.001 (computed by repeated addition, so every code after
    the first is a float).

    Args:
        unique_attribute_vals: Values to be coded, in the order the codes are assigned.

    Returns:
        A tuple ``(value_to_idx, idx_to_value)`` of dictionaries mapping
        value -> code and code -> value.
    """
    # Pair each value with its code in a single pass over the input
    coded_pairs = []
    next_code = 100
    for item in unique_attribute_vals:
        coded_pairs.append((item, next_code))
        next_code += 0.001

    # Build the forward and reverse lookups from the pairs
    value_to_idx = {item: code for item, code in coded_pairs}
    idx_to_value = {code: item for item, code in coded_pairs}

    return (value_to_idx, idx_to_value)

In [None]:
# ONE-HOT ENCODE
# https://stackabuse.com/one-hot-encoding-in-python-with-pandas-and-scikit-learn/
def one_hot(dataframe, col, pre):
    """One-hot encode a single column of ``dataframe``.

    Args:
        dataframe: Frame holding the column to encode. It must also contain an
            "index" column, which is copied into the result.
        col: Name of the column to encode.
        pre: Prefix handed to ``pd.get_dummies``.

    Returns:
        A new DataFrame with one indicator column per distinct value, named
        ``"<col>_<pre>_<value>"``, followed by the "index" column.
        ``dataframe`` itself is not modified.

    Raises:
        KeyError: If ``col`` or "index" is not a column of ``dataframe``.
    """
    # Prefix every dummy column in one pass. Renaming column-by-column in a loop
    # copies the frame each time and can produce duplicate names when a freshly
    # renamed column equals a name that has not been processed yet.
    encoded = pd.get_dummies(dataframe[col], prefix=pre).add_prefix(col + "_")

    # Carry the row identifier along (aligned on the frame's row labels)
    encoded["index"] = dataframe["index"]
    return encoded

In [None]:
# Replace the textual "Type" categories with the integer class labels produced by
# scikit-learn's LabelEncoder. The fitted encoder stays bound to `label_encoder`.
label_encoder = preprocessing.LabelEncoder()
all_data["Type"] = label_encoder.fit_transform(all_data["Type"].to_numpy())

In [None]:
# Disabled alternative: one-hot encode only the Type column (the code below is commented out).
# encoded = one_hot(all_data, "Type", 'is')
# all_data = pd.merge(all_data, encoded, on=["index"])
# all_data = all_data.drop(columns=["Type", "index"])

In [None]:
# Remove the helper "index" column (created by reset_index() when the data was loaded)
# and write the model-ready table to disk. errors="ignore" keeps the cell re-runnable:
# without it, a second execution raises KeyError because the column is already gone.
all_data = all_data.drop(columns=["index"], errors="ignore")
all_data.to_csv("input_data/model_ready_data.csv", index=False)
all_data