# 03b Feature Engineering (Merge Data)

In this notebook we merge our transaction data with our geo features and economic features (for our house price appreciation prediction model).

In [1]:
import pandas as pd
import numpy as np

We want to merge:
1. Transaction Data
2. Geo Features
3. Economic Features

In [2]:
# Read every cleaned input; the first CSV column is used as the index in each file.

# Geo features (joined to the transactions on addr / city / county below).
df_geo = pd.read_csv(
    "../data/cleaned/df_geo_features.csv",
    index_col=0,
)

# Economic features, at monthly and yearly granularity.
df_econ_monthly = pd.read_csv(
    "../data/cleaned/monthly_econ_data.csv",
    index_col=0,
)
df_econ_yearly = pd.read_csv(
    "../data/cleaned/yearly_econ_data.csv",
    index_col=0,
)

# Transaction data, one file per region.
df_fairfax_cleaned = pd.read_csv(
    "../data/cleaned/df_fairfax_appreciation_cleaned.csv",
    index_col=0,
)
df_connecticut_cleaned = pd.read_csv(
    "../data/cleaned/df_connecticut_appreciation_cleaned.csv",
    index_col=0,
)

In [3]:
# Title-case the address and city text of both transaction tables before they
# are used as join keys against the geo features.
# NOTE(review): presumably df_geo stores addr/city in title case too - confirm.
for transactions in (df_fairfax_cleaned, df_connecticut_cleaned):
    for text_col in ("addr", "city"):
        transactions[text_col] = transactions[text_col].str.title()

In [4]:
# Attach the geo features to each transaction table. The join is an inner
# join, so transactions without a matching (addr, city, county) row in
# df_geo are dropped.
df_merge_fairfax = pd.merge(
    df_fairfax_cleaned,
    df_geo,
    how="inner",
    on=["addr", "city", "county"],
)
df_merge_connecticut = pd.merge(
    df_connecticut_cleaned,
    df_geo,
    how="inner",
    on=["addr", "city", "county"],
)

In [5]:
# Parse both Fairfax date columns, then round-trip them through a
# "%Y-%m-%d" string so that only the calendar date survives.
for date_col in ("saledate", "prior_saledate"):
    df_merge_fairfax[date_col] = pd.to_datetime(df_merge_fairfax[date_col])
    date_only_text = df_merge_fairfax[date_col].dt.strftime("%Y-%m-%d")
    df_merge_fairfax[date_col] = pd.to_datetime(date_only_text)

In [6]:
# Connecticut sale dates are still strings. Entries longer than the 10
# characters of "YYYY-MM-DD" (presumably because they carry a time part -
# TODO confirm) are first rewritten as plain dates, so the whole column
# shares one format before the final datetime conversion.
is_long_saledate = df_merge_connecticut["saledate"].str.len() > 10
df_merge_connecticut.loc[is_long_saledate, "saledate"] = pd.to_datetime(
    df_merge_connecticut.loc[is_long_saledate, "saledate"]
).dt.strftime("%Y-%m-%d")
df_merge_connecticut["saledate"] = pd.to_datetime(df_merge_connecticut["saledate"])

In [7]:
# Same clean-up for the prior sale date: string values longer than
# "YYYY-MM-DD" (10 characters) are reduced to a plain date first, then the
# whole column is converted to datetime.
is_long_prior_saledate = df_merge_connecticut["prior_saledate"].str.len() > 10
df_merge_connecticut.loc[is_long_prior_saledate, "prior_saledate"] = pd.to_datetime(
    df_merge_connecticut.loc[is_long_prior_saledate, "prior_saledate"]
).dt.strftime("%Y-%m-%d")
df_merge_connecticut["prior_saledate"] = pd.to_datetime(
    df_merge_connecticut["prior_saledate"]
)

In [8]:
# Stack the Connecticut and Fairfax transactions into a single table.
# Each input is the result of a merge and so carries its own 0..n-1 index;
# ignore_index=True renumbers the rows so the combined frame has unique
# index labels instead of two overlapping ranges.
df_merge = pd.concat(
    [df_merge_connecticut, df_merge_fairfax],
    ignore_index=True,
)

#### Merge Economic Data

In [9]:
# Convert the date column of both economic tables to datetime so the
# year / month parts can be extracted below.
for econ_frame in (df_econ_monthly, df_econ_yearly):
    econ_frame["date"] = pd.to_datetime(econ_frame["date"])

In [10]:
# Build join keys that point one month ahead of each monthly observation:
# a row dated month M gets (lag_year, lag_month) = M + 1, with December
# rolling over to January of the following year. Joining on these keys
# pairs a month with the economic data of the month before it.
following_month = df_econ_monthly["date"].dt.month + 1
rolls_into_next_year = following_month == 13
df_econ_monthly["lag_month"] = following_month.replace(13, 1)
df_econ_monthly["lag_year"] = np.where(
    rolls_into_next_year,
    df_econ_monthly["date"].dt.year + 1,
    df_econ_monthly["date"].dt.year,
)

In [11]:
# Yearly join key points one year ahead: data dated year Y gets lag_year Y + 1.
df_econ_yearly["lag_year"] = df_econ_yearly["date"].dt.year + 1

In [12]:
# Title-case the county names of the yearly table; county is a join key below.
df_econ_yearly["county"] = df_econ_yearly["county"].str.title()

In [13]:
# Title-case the county names of the monthly table; county is a join key below.
df_econ_monthly["county"] = df_econ_monthly["county"].str.title()

In [14]:
# Calendar year and month of the sale.
df_merge = df_merge.assign(
    year=df_merge["saledate"].dt.year,
    month=df_merge["saledate"].dt.month,
)

In [15]:
# Calendar year and month of the prior sale; these are the left-hand keys
# of the economic-data joins below.
df_merge = df_merge.assign(
    prior_year=df_merge["prior_saledate"].dt.year,
    prior_month=df_merge["prior_saledate"].dt.month,
)

In [16]:
# Left-join the yearly economic features on the prior sale year and county.
# The right-hand key is lag_year (data year + 1), so each prior sale is
# matched with the economic data of the year before it. Transactions with
# no match are kept, with missing values in the economic columns.
df_merge = pd.merge(
    df_merge,
    df_econ_yearly,
    how="left",
    left_on=["prior_year", "county"],
    right_on=["lag_year", "county"],
)

In [17]:
# Left-join the monthly economic features on the prior sale's year, month
# and county. The right-hand keys are the one-month-ahead lag keys, so each
# prior sale is matched with the economic data of the preceding month.
df_merge = pd.merge(
    df_merge,
    df_econ_monthly,
    how="left",
    left_on=["prior_year", "prior_month", "county"],
    right_on=["lag_year", "lag_month", "county"],
)

#### Merge Coordinates

In [18]:
# Coordinates table; the first CSV column is used as the index.
coordinates = pd.read_csv(
    "../data/cleaned/final_match.csv",
    index_col=0,
)

In [20]:
# Left-join the coordinates on the address keys; rows without a match are
# kept, with missing values in the coordinate columns.
df_merge = pd.merge(
    df_merge,
    coordinates,
    how="left",
    on=["addr", "city", "county"],
)

In [22]:
# Persist the fully merged table (the index is written as the first column).
df_merge.to_csv(path_or_buf="../data/cleaned/df_appreciation_final.csv")