# Capstone 3: Finalizing data for Tableau

In [1]:
#load python packages
import os
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.4f' % x) #get rid of scientific notations
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
import time
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error
from statsmodels.tsa.api import VAR
from IPython.display import Image
%matplotlib inline

In [2]:
# switch to processed data folder
# The folder can be overridden through the CAPSTONE_PROCESSED_DIR environment
# variable so the notebook is not tied to one machine's directory layout;
# when the variable is unset the original location is used unchanged.
PROCESSED_DIR = os.environ.get(
    'CAPSTONE_PROCESSED_DIR',
    'C:\\Users\\tc18f\\Desktop\\springboard\\Capstone Three\\data\\processed\\',
)
os.chdir(PROCESSED_DIR)
os.getcwd()

'C:\\Users\\tc18f\\Desktop\\springboard\\Capstone Three\\data\\processed'

In [3]:
# read the combined csv file and convert its Date column to datetimes
df = (
    pd.read_csv('combined.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)
df.head()

Unnamed: 0,Zipcode,County,Date,Value,Bedrooms
0,94109,San Francisco County,1996-01-31,263374,1
1,90250,Los Angeles County,1996-01-31,184141,1
2,90046,Los Angeles County,1996-01-31,119677,1
3,94501,Alameda County,1996-01-31,119742,1
4,94110,San Francisco County,1996-01-31,221428,1


In [4]:
# read the prediction csv file and convert its Date column to datetimes
pred = (
    pd.read_csv('varmax_pred5D5R.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)
pred.head()

Unnamed: 0,Date,Zipcode,OneBR,TwoBR,ThreeBR,FourBR
0,2015-05-31,90004,503131,657457,676793,989347
1,2015-05-31,90007,411202,411318,477961,537158
2,2015-05-31,90012,440790,463270,565995,586078
3,2015-05-31,90016,350886,452026,528033,613194
4,2015-05-31,90018,392959,442982,473918,549588


In [5]:
# read the forecast csv file and convert its Date column to datetimes
fc = (
    pd.read_csv('varmax_fc.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)
fc.head()

Unnamed: 0,Date,Zipcode,OneBR,TwoBR,ThreeBR,FourBR
0,2020-05-31,90004,661912,960478,1335291,2193740
1,2020-05-31,90007,556480,659846,759842,876513
2,2020-05-31,90012,517031,662261,767330,852181
3,2020-05-31,90016,529954,756320,910316,977537
4,2020-05-31,90018,578417,741959,798073,872098


In [6]:
# The pred/fc frames are known to be ordered by Date then Zipcode, so order
# the 1-bedroom rows of df the same way; its County column can then be used
# to add the county to pred/fc.
is_one_bedroom = df['Bedrooms'].eq(1)
df1 = df.loc[is_one_bedroom].sort_values(by=['Date', 'Zipcode'])
df1.head()

Unnamed: 0,Zipcode,County,Date,Value,Bedrooms
22,90004,Los Angeles County,1996-01-31,139486,1
201,90007,Los Angeles County,1996-01-31,96265,1
203,90012,Los Angeles County,1996-01-31,108276,1
111,90016,Los Angeles County,1996-01-31,103716,1
113,90018,Los Angeles County,1996-01-31,167823,1


We want the predictions/forecasts in the same format as `df` so we can concat them and compare:

- break each dataframe into 4 subsets based on the number of bedrooms

- add the Bedrooms column to each subset

- add the County

- concat the subsets with the added columns

In [7]:
# make a function that takes a wide pred/fc df and the name of one of its value columns, and returns it in the same format as df for the respective number of bedrooms
def DZB(df, val_col, num_BR, county_source=None):
    """Reshape one bedroom column of a wide pred/forecast frame into long format.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame containing 'Date', 'Zipcode' and the column named by
        ``val_col``.
    val_col : str
        Name of the value column to keep (e.g. 'OneBR'); it is renamed to
        'Value' so the per-bedroom subsets can be concatenated later.
    num_BR : int
        Bedroom count written to the new 'Bedrooms' column.
    county_source : pandas.DataFrame, optional
        Frame with 'Zipcode' and 'County' columns used to look up the county
        of each row. Defaults to the notebook-level ``df1``.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns Date, Zipcode, Value, Bedrooms, County and
        the same index as ``df``. The input ``df`` is left untouched. Zipcodes
        that do not appear in ``county_source`` get a missing (NaN) County.
    """
    if county_source is None:
        county_source = df1  # notebook-level frame built in an earlier cell

    # Look the county up by Zipcode instead of copying the first len(df)
    # County values by position: the positional copy is only right when df is
    # a balanced panel sorted exactly like county_source, and silently
    # mislabels rows otherwise.
    county_by_zip = (
        county_source[['Zipcode', 'County']]
        .drop_duplicates('Zipcode')  # one county per zipcode (first seen wins)
        .set_index('Zipcode')['County']
    )

    # rename/assign return new frames, so nothing is written onto a slice of
    # the caller's data.
    return (
        df[['Date', 'Zipcode', val_col]]
        .rename(columns={val_col: 'Value'})
        .assign(
            Bedrooms=num_BR,
            County=lambda long_df: long_df['Zipcode'].map(county_by_zip),
        )
    )

In [8]:
# make a function that takes a wide pred/fc df and returns it transformed into the same format as the very first df
def df_trans(df):
    """Stack the four bedroom columns of a wide frame into one long frame.

    Calls ``DZB`` once per bedroom column ('OneBR' through 'FourBR', tagged
    with bedroom counts 1 through 4) and concatenates the four results in
    that order. The row index of each piece is kept as-is, so index labels
    repeat across the pieces.
    """
    bedroom_columns = (
        ('OneBR', 1),
        ('TwoBR', 2),
        ('ThreeBR', 3),
        ('FourBR', 4),
    )
    per_bedroom = [DZB(df, column, count) for column, count in bedroom_columns]
    return pd.concat(per_bedroom)

In [9]:
# reshape the prediction and forecast frames, then show the prediction result
pred_t, fct = df_trans(pred), df_trans(fc)
pred_t

Unnamed: 0,Date,Zipcode,Value,Bedrooms,County
0,2015-05-31,90004,503131,1,Los Angeles County
1,2015-05-31,90007,411202,1,Los Angeles County
2,2015-05-31,90012,440790,1,Los Angeles County
3,2015-05-31,90016,350886,1,Los Angeles County
4,2015-05-31,90018,392959,1,Los Angeles County
...,...,...,...,...,...
29336,2020-05-31,95821,421204,4,Sacramento County
29337,2020-05-31,95822,386475,4,Sacramento County
29338,2020-05-31,95825,304851,4,Sacramento County
29339,2020-05-31,96150,565641,4,El Dorado County


In [10]:
# show the transformed forecast frame
fct

Unnamed: 0,Date,Zipcode,Value,Bedrooms,County
0,2020-05-31,90004,661912,1,Los Angeles County
1,2020-05-31,90007,556480,1,Los Angeles County
2,2020-05-31,90012,517031,1,Los Angeles County
3,2020-05-31,90016,529954,1,Los Angeles County
4,2020-05-31,90018,578417,1,Los Angeles County
...,...,...,...,...,...
29336,2025-05-31,95821,467547,4,Sacramento County
29337,2025-05-31,95822,403489,4,Sacramento County
29338,2025-05-31,95825,426795,4,Sacramento County
29339,2025-05-31,96150,664502,4,El Dorado County


In [11]:
# save the transformed dfs to xlsx so they can be used in tableau, and also to csv so they are faster to read/edit in python
# write each transformed frame as xlsx then csv (index omitted): pred_t first, then fct
for frame, xlsx_name, csv_name in (
    (pred_t, 'pred_t.xlsx', 'pred_t.csv'),
    (fct, 'forecast_t.xlsx', 'forecast_t.csv'),
):
    frame.to_excel(xlsx_name, index=False)
    frame.to_csv(csv_name, index=False)

In [12]:
# tag every frame with where its rows come from (zillow data, prediction, forecast)
for frame, label in ((df, 'zillow'), (pred_t, 'prediction'), (fct, 'forecast')):
    frame['Source'] = label
# stack the three tagged frames into one
combined_df = pd.concat([df, pred_t, fct])
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 798460 entries, 0 to 29340
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   Zipcode   798460 non-null  int64         
 1   County    798460 non-null  object        
 2   Date      798460 non-null  datetime64[ns]
 3   Value     798460 non-null  int64         
 4   Bedrooms  798460 non-null  int64         
 5   Source    798460 non-null  object        
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 42.6+ MB


In [13]:
# save the combined frame to xlsx so it can be used in tableau, and also to csv so it's faster to read/edit in python
# index=False keeps the row index out of both files
combined_df.to_excel('zillow_pred_forecast.xlsx', index=False)
combined_df.to_csv('zillow_pred_forecast.csv', index=False)