# Capstone 3 Modeling

**The Data Science Method**  

1.   Problem Identification 

2.   Data Wrangling 
  * Data Collection 
  * Data Organization
  * Data Definition 
  * Data Cleaning
 
3.   Exploratory Data Analysis
  * Build data profile tables and plots
    - Outliers & Anomalies
  * Explore data relationships
  * Identification and creation of features

4.   Pre-processing and Training Data Development
  * Create dummy or indicator features for categorical variables
  * Standardize the magnitude of numeric features
  * Split into testing and training datasets
  * Apply scaler to the testing set
  
5.   **Modeling**
  * Fit Models with Training Data Set
  * Review Model Outcomes — Iterate over additional models as needed.
  * Identify the Final Model

6.   Documentation
  * Review the Results
  * Present and share your findings - storytelling
  * Finalize Code 
  * Finalize Documentation

## Data Collection

In [1]:
#load python packages
import os
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.4f' % x) #get rid of scientific notations
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
import time
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error
from statsmodels.tsa.api import VAR
from IPython.display import Image
%matplotlib inline

In [2]:
# Make the processed-data folder the working directory so the CSV files
# used below can be referenced by bare file name, then echo the result.
processed_dir = 'C:\\Users\\tc18f\\Desktop\\springboard\\Capstone Three\\data\\processed\\'
os.chdir(processed_dir)
os.getcwd()

'C:\\Users\\tc18f\\Desktop\\springboard\\Capstone Three\\data\\processed'

In [None]:
# Read the combined dataset from the working directory and preview the
# first rows.
combined_csv = 'combined.csv'
df = pd.read_csv(combined_csv)
df.head()

In [None]:
# Parse the Date column into datetimes (so later sorting and filtering are
# chronological), then print the column summary.
df['Date'] = pd.to_datetime(df['Date'])
df.info()

In [None]:
# Split the combined frame into one frame per bedroom count (1-4), each
# ordered by Date and then Zipcode. No columns are dropped here.
df1, df2, df3, df4 = (
    df.loc[df['Bedrooms'] == bedrooms].sort_values(['Date', 'Zipcode'])
    for bedrooms in (1, 2, 3, 4)
)
# Before Date was parsed as datetime, sorting by it left 2019 rows in the
# tail; the tail is shown to check the ordering.
df1.tail()

# Preprocess for VARMAX model
The values need to be differenced first and then cube-rooted. Note that `data_trans` below applies only the differencing step.

In [None]:
def data_trans(df, val_name, start_date='1996-02-29'):
    """Return the first differences of ``Value`` per zipcode in long format.

    Parameters
    ----------
    df : DataFrame
        Must contain ``Zipcode`` and ``Value`` columns. Rows of each zipcode
        are taken in their existing order, and every zipcode must have the
        same number of rows.
    val_name : str
        Name given to the differenced value column in the result.
    start_date : str or Timestamp, default '1996-02-29'
        Month-end date assigned to the first differenced row (differencing
        loses the first observation).

    Returns
    -------
    DataFrame
        Indexed by ``Date``, sorted by Date then Zipcode, with columns
        ``Zipcode`` and ``val_name`` (float64).
    """
    # Build the wide frame (one column per zipcode) in a single pass instead
    # of inserting columns one at a time.
    wide = pd.DataFrame(
        {
            zipcode: df.loc[df['Zipcode'] == zipcode, 'Value'].to_list()
            for zipcode in df['Zipcode'].unique()
        }
    )
    # Difference the data; the first row becomes NaN and is dropped.
    # NOTE: dropna() also removes rows made NaN by missing input values,
    # in which case the regenerated dates below no longer line up.
    diffed = wide.diff().dropna()
    # Size the date range from the data rather than a hard-coded 292. The
    # MonthEnd offset object is used because the 'M' alias is deprecated in
    # recent pandas releases.
    diffed['Date'] = pd.date_range(
        start_date, periods=len(diffed), freq=pd.offsets.MonthEnd()
    )
    # Melt back to long format and sort.
    long_df = pd.melt(diffed, id_vars=['Date'], var_name='Zipcode', value_name=val_name)
    long_df = long_df.sort_values(['Date', 'Zipcode'])
    # Keep the differenced values as plain floats.
    long_df[val_name] = long_df[val_name].astype('float64')
    # Set the Date as index.
    long_df.set_index('Date', inplace=True)
    return long_df

In [None]:
# Difference each bedroom-count frame, then gather the four value columns
# into one frame. dft is the same object as df1t, so the added columns
# appear on df1t as well.
df1t = data_trans(df1, 'OneBR')
df2t = data_trans(df2, 'TwoBR')
df3t = data_trans(df3, 'ThreeBR')
df4t = data_trans(df4, 'FourBR')
dft = df1t
dft['TwoBR'] = df2t['TwoBR']
dft['ThreeBR'] = df3t['ThreeBR']
dft['FourBR'] = df4t['FourBR']
dft

# Modeling

In [None]:
# Training data (endog): the differenced observations between the two
# dates below, both inclusive.
train_start, train_end = '1996-2-29', '2015-05-31'
endog = dft.loc[train_start:train_end]
endog.tail()

In [None]:
# Inspect the index of a single-zipcode subset. Earlier runs produced
# "no frequency" warnings, so the frequency has to be set on each subset
# of endog before modelling.
endog.loc[endog['Zipcode'] == 90004].index

In [None]:
# Fit one VARMAX model per zipcode and store the 60-month forecasts,
# one frame per bedroom count (columns: Date, then one column per zipcode).
pred_df1 = pd.DataFrame(
    {'Date': pd.date_range('2015-06-30', periods=60, freq=pd.offsets.MonthEnd())}
)
# Each bedroom count needs its OWN frame: plain assignment would make all
# four names point at one DataFrame, so every zipcode column would be
# overwritten four times and only the FourBR forecast would survive.
pred_df2 = pred_df1.copy()
pred_df3 = pred_df1.copy()
pred_df4 = pred_df1.copy()
# The modelled columns are every column of dft except the first (Zipcode);
# this list does not change inside the loop.
value_columns = list(dft.columns)[1:]
# order=(1,0) and trend='n' are used since the data is stationary
start = time.time()
for zipcode in list(dft.Zipcode.unique()):
    # subset the training data to this zipcode and the value columns
    endog_sub = endog[endog['Zipcode']==zipcode][value_columns]
    # set the datetime frequency on the subset's index (month-end)
    endog_sub.index.freq = pd.offsets.MonthEnd()
    # train/fit the model
    model = sm.tsa.VARMAX(endog_sub, order=(1,0), trend='n')
    result = model.fit(maxiter=1000, disp=False)
    # forecast the 60 months that follow the training window
    pred = result.predict(start='2015-06-30', end='2020-05-31')
    # store this zipcode's forecast in the frame of each bedroom count
    pred_df1[zipcode] = list(pred.OneBR)
    pred_df2[zipcode] = list(pred.TwoBR)
    pred_df3[zipcode] = list(pred.ThreeBR)
    pred_df4[zipcode] = list(pred.FourBR)
# stop timer
end = time.time()
fit_time = (end-start)
# check the fit time in whole minutes
int(fit_time/60)

In [None]:
# Show the first two and last two rows of the one-bedroom forecast frame.
display(pred_df1.iloc[:2])
display(pred_df1.iloc[-2:])

# Transform predictions back to original format

In [None]:
# create a function to inverse the diff by having the first value
def diff_inv(series_diff, first_value):
    """Undo a first difference.

    Prepends ``first_value`` to ``series_diff`` and returns the running sum
    as a float64 NumPy array, which is one element longer than
    ``series_diff``.
    """
    with_anchor = np.r_[first_value, series_diff]
    return np.cumsum(with_anchor).astype('float64')

In [None]:
# Sanity check for diff_inv: difference a known series, then rebuild it
# from its first value (3).
x = pd.DataFrame({'test':[3,7,2,6,8,0,5,0,4,9,5]})
x['diff'] = x['test'].diff()
x_diff = x['diff'].dropna()  # a Series holding the differences without the leading NaN
display(diff_inv(x_diff, 3))

In [None]:
# The inverse transform needs the Value on 2015-05-31; check that this is
# the 61st row from the end of one zipcode's series.
test = df1.loc[df1['Zipcode'] == 90004]
test.iloc[-61]

In [None]:
def pred_trans(pred_df, original_df, val_name):
    """Turn differenced forecasts back into value levels, in long format.

    Parameters
    ----------
    pred_df : DataFrame
        Forecast frame whose first column holds the forecast dates and whose
        remaining columns hold one differenced forecast per zipcode.
    original_df : DataFrame
        Undifferenced data with ``Zipcode`` and ``Value`` columns. For each
        zipcode, the row ``len(pred_df) + 1`` from the end is used as the
        starting level, so the data is expected to extend exactly as far
        as the forecast.
    val_name : str
        Name of the restored value column in the result.

    Returns
    -------
    DataFrame
        Columns ``Date``, ``Zipcode`` and ``val_name`` (int64), sorted by
        Date then Zipcode. The first date is the month end before the first
        forecast date and carries the observed starting level.
    """
    horizon = len(pred_df)
    month_end = pd.offsets.MonthEnd()
    # The restored series starts one month-end before the first forecast.
    anchor_date = pd.Timestamp(pred_df.iloc[0, 0]) - month_end
    columns = {
        'Date': pd.date_range(anchor_date, periods=horizon + 1, freq=month_end)
    }
    for col in list(pred_df.columns)[1:]:
        original_df_sub = original_df[original_df['Zipcode'] == col]
        # Last observed level before the forecast window.
        first_value = original_df_sub.iloc[-(horizon + 1)].Value
        # Invert the diff() using the forecasts themselves. Previously this
        # read from an empty scratch frame and raised KeyError.
        columns[col] = diff_inv(pred_df[col], first_value)
    restored = pd.DataFrame(columns)
    # melt
    temp_melt = pd.melt(restored, id_vars=['Date'], var_name='Zipcode', value_name=val_name)
    # make sure date is datetime
    temp_melt['Date'] = pd.to_datetime(temp_melt.Date)
    # sort
    temp_sort = temp_melt.sort_values(['Date', 'Zipcode'])
    # store the values as int so they are easier to read and compare with
    # the original values (the fractional part is truncated)
    temp_sort[val_name] = temp_sort[val_name].astype('int64')
    return temp_sort

In [None]:
# Restore each bedroom count's forecasts to value levels, then gather the
# four value columns in pred_dft.
pred_dft = pred_trans(pred_df1, df1, 'OneBR')
pred_dft2 = pred_trans(pred_df2, df2, 'TwoBR')
pred_dft3 = pred_trans(pred_df3, df3, 'ThreeBR')
pred_dft4 = pred_trans(pred_df4, df4, 'FourBR')
pred_dft['TwoBR'] = pred_dft2['TwoBR']
pred_dft['ThreeBR'] = pred_dft3['ThreeBR']
pred_dft['FourBR'] = pred_dft4['FourBR']
pred_dft.head()

In [None]:
# Write the restored predictions to a CSV file, without the row index.
pred_dft.to_csv(path_or_buf='varmax_pred5Dn.csv', index=False)

In [None]:
# Reload the restored predictions; the CSV stores Date as text, so convert
# it back to datetime.
pred_csv = 'varmax_pred5Dn.csv'
pred_dft = pd.read_csv(pred_csv)
pred_dft['Date'] = pd.to_datetime(pred_dft['Date'])
pred_dft.head()

In [None]:
# let's try plotting the mean/medians of test and pred, we will need to define functions
# define a function that takes in the prediction dataframe and return the medians
def median_calc(data, nobs, val_column):
    """Return the per-date median of ``val_column`` for the first ``nobs`` dates.

    Parameters
    ----------
    data : DataFrame
        Must contain a ``Date`` column and ``val_column``.
    nobs : int
        Number of dates (in ascending order) to return medians for.
    val_column : str
        Column whose median is taken within each date.

    Returns
    -------
    list of int
        One median per date, truncated to int. For an odd number of rows
        per date this is the middle value; for an even number it is the
        mean of the two middle values, truncated.
    """
    # Group once by date instead of re-sorting the whole frame on every
    # iteration and indexing with a hard-coded 481 rows per date.
    medians_by_date = data.groupby('Date', sort=True)[val_column].median()
    return [int(median) for median in medians_by_date.iloc[:nobs]]
# define a function that takes in the prediction dataframe and return the means
def mean_calc(data, val_column):
    """Return the per-date mean of ``val_column``, truncated to int.

    Dates are taken in the order they first appear in ``data.Date``; the
    result has one entry per distinct date.
    """
    return [
        int(data.loc[data['Date'] == date, val_column].mean())
        for date in data.Date.unique()
    ]

In [None]:
# Load Zillow's own averages and convert Date to datetime.
z_avg_csv = 'z_avg.csv'
z_avg = pd.read_csv(z_avg_csv)
z_avg['Date'] = pd.to_datetime(z_avg['Date'])
z_avg.head()

In [None]:
# Collect Zillow's medians and means in one wide frame for plotting
# (293 month-end dates starting 1996-01-31).
z_df = pd.DataFrame({'Date':pd.date_range('1996-01-31', periods=293, freq='M')}).assign(
    z_median_OneBR=z_avg['median1'],
    z_median_TwoBR=z_avg['median2'],
    z_median_ThreeBR=z_avg['median3'],
    z_median_FourBR=z_avg['median4'],
    z_mean_OneBR=z_avg['mean1'],
    z_mean_TwoBR=z_avg['mean2'],
    z_mean_ThreeBR=z_avg['mean3'],
    z_mean_FourBR=z_avg['mean4'],
)
# Build the matching frame for the predictions: one row per prediction
# date, starting at 2015-05-31.
pred_date_num = pred_dft.Date.nunique()
p_df = pd.DataFrame({'Date':pd.date_range('2015-05-31', periods=pred_date_num, freq='M')}).assign(
    p_median_OneBR=median_calc(pred_dft, pred_date_num, 'OneBR'),
    p_median_TwoBR=median_calc(pred_dft, pred_date_num, 'TwoBR'),
    p_median_ThreeBR=median_calc(pred_dft, pred_date_num, 'ThreeBR'),
    p_median_FourBR=median_calc(pred_dft, pred_date_num, 'FourBR'),
    p_mean_OneBR=mean_calc(pred_dft, 'OneBR'),
    p_mean_TwoBR=mean_calc(pred_dft, 'TwoBR'),
    p_mean_ThreeBR=mean_calc(pred_dft, 'ThreeBR'),
    p_mean_FourBR=mean_calc(pred_dft, 'FourBR'),
)

In [None]:
# Reshape both wide frames to long format (Date, variable, value) and stack
# them. The columns seaborn still needs (measure_type, Bedrooms, Source)
# are derived from `variable` afterwards.
z_melt = z_df.melt(id_vars=['Date'])
p_melt = p_df.melt(id_vars=['Date'])
sea_df = pd.concat([z_melt, p_melt])
sea_df.head()

In [None]:
# Derive three label columns from each row's `variable` name.
# measure type: 'median' if the name contains it, otherwise 'mean'
measure_list = ['median' if 'median' in name else 'mean' for name in sea_df.variable]
# number of bedrooms: one entry for every bedroom word found in the name
BR_list = [
    bedrooms
    for name in sea_df.variable
    for word, bedrooms in (('One', 1), ('Two', 2), ('Three', 3), ('Four', 4))
    if word in name
]
# source: names containing 'z_m' come from Zillow, the rest from the model
source_list = ['zillow' if 'z_m' in name else 'varmax' for name in sea_df.variable]
# add the columns to sea_df
sea_df['measure_type'] = measure_list
sea_df['Bedrooms'] = BR_list
sea_df['Source'] = source_list
display(sea_df.head(3))
sea_df.tail(3)

In [None]:
# Line plot of the mean values: colour separates Zillow from VARMAX, line
# style separates the bedroom counts. The figure is saved as a PNG.
sns.set_theme(style="darkgrid")
sea_mean = sea_df.loc[sea_df['measure_type'] == 'mean']  # rows holding means only
fig, ax = plt.subplots(figsize=(16, 8))  # figure size and the axes to draw on
sea = sns.lineplot(x="Date", y="value", hue="Source", style="Bedrooms", data=sea_mean, ax=ax)
sea.set_title("Zillow vs VARMAX means", fontsize=18)
sea.set_xlabel("Date", fontsize=14)
sea.set_ylabel("Value (mil)", fontsize=14)
plt.savefig('C:\\Users\\tc18f\\Desktop\\springboard\\Capstone Three\\figures\\Zillow_vs_VARMAX_meansDn.png')

In [None]:
# Line plot of the median values: colour separates Zillow from VARMAX,
# line style separates the bedroom counts. The figure is saved as a PNG.
sns.set_theme(style="darkgrid")
# NOTE: despite its name, sea_mean holds the median rows in this cell.
sea_mean = sea_df.loc[sea_df['measure_type'] == 'median']
fig, ax = plt.subplots(figsize=(16, 8))  # figure size and the axes to draw on
sea = sns.lineplot(x="Date", y="value", hue="Source", style="Bedrooms", data=sea_mean, ax=ax)
sea.set_title("Zillow vs VARMAX medians", fontsize=18)
sea.set_xlabel("Date", fontsize=14)
sea.set_ylabel("Value (mil)", fontsize=14)
plt.savefig('C:\\Users\\tc18f\\Desktop\\springboard\\Capstone Three\\figures\\Zillow_vs_VARMAX_mediansDn.png')

The predictions for the first 2 years seem okay, so we will compare the model scores over each 12-month window of the data.

In [None]:
# Preview the first two rows of the restored predictions.
pred_dft.iloc[:2]

In [None]:
# let's make a test dataframe
def test_data_form(df, BR_num, val_name):
    temp=pd.DataFrame({})
    for zipcode in pred_dft.Zipcode.unique():
        df_sub = df[(df['Zipcode']==zipcode) & (df['Bedrooms']==BR_num)] #subset by zipcode and num of bedroom
        df_sub = df_sub[df_sub['Date']>='2015-05-31'] # get corresponding date range
        temp['Date'] = pd.date_range('2015-05-31', periods=len(df_sub), freq='M')
        temp[zipcode]=list(df_sub.Value)
    # melt
    temp_melt = pd.melt(temp, id_vars=['Date'], var_name='Zipcode', value_name=val_name)
    # make sure date is datetime
    temp_melt['Date'] = pd.to_datetime(temp_melt.Date)
    # sort
    temp_sort = temp_melt.sort_values(['Date','Zipcode'])
    # have the Values in int so it's easier to read and compared to original values
    temp_sort[val_name] = temp_sort[val_name].astype('int64')
    return temp_sort

In [None]:
# Build the actual-value frame for each bedroom count, then gather the four
# value columns in `test`.
test = test_data_form(df, 1, 'OneBR')
test2 = test_data_form(df, 2, 'TwoBR')
test3 = test_data_form(df, 3, 'ThreeBR')
test4 = test_data_form(df, 4, 'FourBR')
test['TwoBR'] = test2['TwoBR']
test['ThreeBR'] = test3['ThreeBR']
test['FourBR'] = test4['FourBR']
test.head()

In [None]:
# Index both frames by Date, in place. Running this cell a second time
# fails because the Date column is gone after the first run.
pred_dft.set_index(keys='Date', inplace=True)
test.set_index(keys='Date', inplace=True)

In [None]:
# Calculate the MAE of each 12-month forecast cycle (five cycles in total).
# Both frames are expected to be sorted by Date then Zipcode, with one row
# per zipcode per date, so row positions line up between them.
value_columns = ['OneBR', 'TwoBR', 'ThreeBR', 'FourBR']
zip_count = test['Zipcode'].nunique()  # rows per date
rows_per_cycle = zip_count * 12
model_list=[]
MAE_list=[]
for i in range(0,5):
    # Skip the first date (2015-05-31): pred_dft holds the observed starting
    # value there, so its error is always zero. Cycles then cover
    # non-overlapping 12-month windows of actual forecasts.
    cycle_start = zip_count + i * rows_per_cycle
    cycle_stop = cycle_start + rows_per_cycle
    # Score only the value columns; Zipcode is identical in both frames and
    # would pull the averaged MAE down.
    test_sub = test.iloc[cycle_start:cycle_stop][value_columns]
    pred_sub = pred_dft.iloc[cycle_start:cycle_stop][value_columns]
    model_list.append('VARMAX5')
    MAE = mean_absolute_error(test_sub, pred_sub)
    MAE_list.append(MAE)

In [None]:
# Summarise the scores: one row per 12-month cycle with the model label
# and its MAE.
cycle_labels = ['15to16', '16to17', '17to18','18to19','19to20']
model_score = pd.DataFrame(
    {'Model': model_list, 'Cycle': cycle_labels, 'MAE': MAE_list}
)
model_score