# Capstone 3 Modeling

**The Data Science Method**  

1.   Problem Identification 

2.   Data Wrangling 
  * Data Collection 
   * Data Organization
  * Data Definition 
  * Data Cleaning
 
3.   Exploratory Data Analysis
 * Build data profile tables and plots
        - Outliers & Anomalies
 * Explore data relationships
 * Identification and creation of features

4.   Pre-processing and Training Data Development
  * Create dummy or indicator features for categorical variables
  * Standardize the magnitude of numeric features
  * Split into testing and training datasets
  * Apply scaler to the testing set
  
5.   **Modeling**
  * Fit Models with Training Data Set
  * Review Model Outcomes — Iterate over additional models as needed.
  * Identify the Final Model

6.   Documentation
  * Review the Results
  * Present and share your findings - storytelling
  * Finalize Code 
  * Finalize Documentation

## Data Collection

In [1]:
#load python packages
import os
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.4f' % x) #get rid of scientific notations
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
import time
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error
from statsmodels.tsa.api import VAR
from IPython.display import Image
%matplotlib inline

In [2]:
# switch to processed data folder
os.chdir('C:\\Users\\tc18f\\Desktop\\springboard\\Capstone Three\\data\\processed\\')
os.getcwd()

'C:\\Users\\tc18f\\Desktop\\springboard\\Capstone Three\\data\\processed'

In [3]:
# load the combined csv file
df = pd.read_csv('combined.csv')
df['Date'] = pd.to_datetime(df.Date)
df.head()

Unnamed: 0,Zipcode,County,Date,Value,Bedrooms
0,94109,San Francisco County,1996-01-31,263374,1
1,90250,Los Angeles County,1996-01-31,184141,1
2,90046,Los Angeles County,1996-01-31,119677,1
3,94501,Alameda County,1996-01-31,119742,1
4,94110,San Francisco County,1996-01-31,221428,1


In [4]:
# check info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 563732 entries, 0 to 563731
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   Zipcode   563732 non-null  int64         
 1   County    563732 non-null  object        
 2   Date      563732 non-null  datetime64[ns]
 3   Value     563732 non-null  int64         
 4   Bedrooms  563732 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 21.5+ MB


In [5]:
# break the df into 4 of them indicating the number of bedrooms and drop the county and bedrooms column
df1 = df.loc[df['Bedrooms']==1]
df1 = df1.sort_values(['Date','Zipcode'])
df2 = df.loc[df['Bedrooms']==2]
df2 = df2.sort_values(['Date','Zipcode'])
df3 = df.loc[df['Bedrooms']==3]
df3 = df3.sort_values(['Date','Zipcode'])
df4 = df.loc[df['Bedrooms']==4]
df4 = df4.sort_values(['Date','Zipcode'])
df1.tail() # previous when the date wasn't in date time and sort by date will see 2019 in tail()

Unnamed: 0,Zipcode,County,Date,Value,Bedrooms
140604,95821,Sacramento County,2020-05-31,159237,1
140568,95822,Sacramento County,2020-05-31,301542,1
140598,95825,Sacramento County,2020-05-31,169082,1
140808,96150,El Dorado County,2020-05-31,321712,1
140837,96161,Nevada County,2020-05-31,346110,1


# Preprocess for VAR model
need the values to be differenced first then 5th root it

In [6]:
# define a function that takes in the df, and the number of diff value and root value (needs to be odd numbers)
def data_trans(df, val_name):
    temp = pd.DataFrame({})
    # add Zipcode as columns and its values to temp
    for zipcode in list(df.Zipcode.unique()):
        temp[zipcode] = list(df[df['Zipcode']==zipcode].Value)
    # difference the data
    temp = temp.diff().dropna()
    # get make the dataset cube root
    for zipcode in list(df.Zipcode.unique()):
        temp[zipcode] = temp[zipcode].apply(lambda x: (x**(1/5)))
    # add Date column to so we can melt it, starting date is 1996-2-29 periods=293 since it's differenced and lost 1 month
    temp['Date'] = pd.date_range('1996-02-29', periods=292, freq='M')
    # melt and sort
    temp_melt = pd.melt(temp, id_vars=['Date'], var_name='Zipcode', value_name=val_name)
    temp_sort = temp_melt.sort_values(['Date','Zipcode'])
    # have the Values in float since it has imaginary number
    temp_sort[val_name] = temp_sort[val_name].astype('float64')
    # set the Date as index
    temp_sort.set_index('Date', inplace=True)
    return temp_sort

In [7]:
df1t = data_trans(df1, 'OneBR')
df2t = data_trans(df2, 'TwoBR')
df3t = data_trans(df3, 'ThreeBR')
df4t = data_trans(df4, 'FourBR')
dft = df1t
dft['TwoBR'] = df2t.TwoBR
dft['ThreeBR'] = df3t.ThreeBR
dft['FourBR'] = df4t.FourBR
dft

Unnamed: 0_level_0,Zipcode,OneBR,TwoBR,ThreeBR,FourBR
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1996-02-29,90004,3.1354,2.7693,3.4461,3.5714
1996-02-29,90007,2.4601,2.9041,2.4769,2.5660
1996-02-29,90012,4.6568,4.4153,3.0385,2.5317
1996-02-29,90016,2.8302,2.4271,2.3639,2.8742
1996-02-29,90018,3.1029,2.1482,2.3793,2.9118
...,...,...,...,...,...
2020-05-31,95821,3.7342,2.2526,2.8560,3.2543
2020-05-31,95822,3.3227,3.1518,3.3145,2.9247
2020-05-31,95825,3.2103,1.5522,2.8972,3.6416
2020-05-31,96150,3.4668,2.8083,3.4158,3.6028


# Modeling

In [8]:
# make an empty list to store vectors
data = []
for i in range(len(dft)):
    vi = dft.iloc[i, 1:5]
    v1 = vi[0]
    v2 = vi[1]
    v3 = vi[2]
    v4 = vi[3]
    row = [v1, v2, v3, v4]
    data.append(row)
data

[[3.1353680298988067, 2.769345978400522, 3.446095064991105, 3.571373127421423],
 [2.4600974689967763,
  2.904068770293588,
  2.4768978751793362,
  2.566011811151251],
 [4.656813260660721,
  4.4152814358578105,
  3.0385049823742345,
  2.531667508315967],
 [2.830244153807066,
  2.4270509831248424,
  2.363924301434955,
  2.8742261136118117],
 [3.10293917928743, 2.148188715180851, 2.3792615198800977, 2.9118234034813653],
 [3.570142891811664,
  2.2901720489235826,
  2.6307168652587087,
  1.9236247802355624],
 [3.3548861445089018,
  2.161053040498814,
  4.994226682909546,
  2.4428896557373942],
 [3.0176237148721055,
  2.5875669644085733,
  2.734880068516037,
  2.151433728038076],
 [3.5063492669203193,
  3.647930063200498,
  3.2525222540806205,
  2.752647584164982],
 [3.3964692493056643,
  2.3412988519361133,
  2.852938178386769,
  3.6833750226130317],
 [2.557967728630044,
  2.7330537939940136,
  2.8681063431328075,
  1.5157165665103982],
 [2.6712084610391402,
  3.440403712589018,
  3.9118220

In [9]:
# fit model
model = VAR(data)
model_fit = model.fit(12)
# make prediction
pred = model_fit.forecast(model_fit.y, steps=60*481)
len(pred)

28860

In [10]:
pred[0]

array([3.70794071, 3.50569951, 3.66575176, 3.89417352])

In [11]:
pred[0][0]

3.707940713098164

In [12]:
# make the prediciton into a dataframe
BR1=[]
BR2=[]
BR3=[]
BR4=[]
for i in range(481*60):
    BR1.append(pred[i][0])
    BR2.append(pred[i][1])
    BR3.append(pred[i][2])
    BR4.append(pred[i][3])
pred_df = pd.DataFrame({
#    'Date': list(df.Date)[-(481*60):],
    'Zipcode':list(dft.Zipcode)[:481*60],
    'OneBR':BR1,
    'TwoBR':BR2,
    'ThreeBR':BR3,
    'FourBR':BR4,
})
pred_df.head()

Unnamed: 0,Zipcode,OneBR,TwoBR,ThreeBR,FourBR
0,90004,3.7079,3.5057,3.6658,3.8942
1,90007,3.6277,3.4884,3.635,3.837
2,90012,3.6384,3.5326,3.6742,3.8559
3,90016,3.6139,3.5101,3.6771,3.8512
4,90018,3.6154,3.537,3.6961,3.9133


# Transform predictions back to original format

In [13]:
# inverse the 1/5 root
pred_df5 =pred_df.apply(lambda x: (x**5))
pred_df5['Zipcode'] = list(dft.Zipcode)[:481*60]
pred_df5.head()

Unnamed: 0,Zipcode,OneBR,TwoBR,ThreeBR,FourBR
0,90004,700.9127,529.5091,661.9349,895.5225
1,90007,628.292,516.5646,634.6188,831.7075
2,90012,637.6479,550.1277,669.6194,852.3299
3,90016,616.4645,532.8187,672.2398,847.207
4,90018,617.6853,553.5786,689.8036,917.7269


In [14]:
# create a function to inverse the diff by having the first value
def diff_inv(series_diff, first_value):
    series = np.r_[first_value, series_diff].cumsum().astype('float64')
    return series

In [15]:
# inverse the diff()
Res1=pd.DataFrame({})
Res2=Res1
Res3=Res1
Res4=Res1
for zipcode in list(pred_df5.Zipcode.unique()):
    sub = pred_df5[pred_df5['Zipcode']==zipcode]
    Res1[zipcode] = diff_inv(sub.OneBR, list(df1[df1['Zipcode']==zipcode].Value)[-61])
    Res2[zipcode] = diff_inv(sub.TwoBR, list(df2[df2['Zipcode']==zipcode].Value)[-61])
    Res3[zipcode] = diff_inv(sub.ThreeBR, list(df3[df3['Zipcode']==zipcode].Value)[-61])
    Res4[zipcode] = diff_inv(sub.FourBR, list(df4[df4['Zipcode']==zipcode].Value)[-61])

In [21]:
diff_inv(pred_df5[pred_df5['Zipcode']==90004].OneBR, list(df1[df1['Zipcode']==90004].Value))[-61]

116758721.0

In [16]:
Res1

Unnamed: 0,90004,90007,90012,90016,90018,90019,90020,90022,90023,90024,...,95709,95722,95726,95818,95819,95821,95822,95825,96150,96161
0,989347.0000,537158.0000,586078.0000,613194.0000,549588.0000,926055.0000,1327119.0000,410150.0000,375076.0000,2247671.0000,...,452045.0000,582435.0000,349172.0000,421196.0000,406748.0000,351642.0000,265556.0000,285573.0000,552845.0000,889668.0000
1,990242.5225,537989.7075,586930.3299,614041.2070,550505.7269,927007.1187,1328038.9379,411074.7906,375981.8978,2248598.7144,...,454971.5209,585361.5430,352098.5648,424122.5862,409674.6073,354568.6281,268482.6484,288499.6685,555771.6882,892594.7076
2,993169.2492,540916.4529,589857.0938,616967.9890,553432.5268,929933.9362,1330965.7726,414001.6423,378908.7662,2251525.5993,...,457899.3856,588289.4078,355026.4296,427050.4510,412602.4721,357496.4928,271410.5132,291427.5333,558699.5530,895522.5724
3,996097.1140,543844.3177,592784.9586,619895.8538,556360.3916,932861.8010,1333893.6374,416929.5072,381836.6311,2254453.4641,...,460827.2508,591217.2730,357954.2948,429978.3162,415530.3373,360424.3580,274338.3784,294355.3985,561627.4182,898450.4376
4,999024.9792,546772.1829,595712.8238,622823.7190,559288.2568,935789.6662,1336821.5026,419857.3724,384764.4963,2257381.3293,...,463755.1160,594145.1382,360882.1600,432906.1814,418458.2025,363352.2232,277266.2436,297283.2637,564555.2834,901378.3028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,1151273.9696,699021.1733,747961.8142,775072.7094,711537.2472,1088038.6566,1489070.4930,572106.3627,537013.4866,2409630.3197,...,616004.1064,746394.1286,513131.1504,585155.1718,570707.1929,515601.2136,429515.2340,449532.2541,716804.2738,1053627.2932
57,1154201.8348,701949.0385,750889.6794,778000.5746,714465.1124,1090966.5218,1491998.3582,575034.2279,539941.3518,2412558.1849,...,618931.9716,749321.9938,516059.0156,588083.0370,573635.0581,518529.0788,432443.0992,452460.1193,719732.1390,1056555.1584
58,1157129.7000,704876.9037,753817.5446,780928.4398,717392.9776,1093894.3870,1494926.2234,577962.0931,542869.2170,2415486.0501,...,621859.8368,752249.8590,518986.8808,591010.9022,576562.9233,521456.9440,435370.9644,455387.9845,722660.0042,1059483.0236
59,1160057.5652,707804.7689,756745.4098,783856.3050,720320.8428,1096822.2522,1497854.0886,580889.9583,545797.0822,2418413.9153,...,624787.7020,755177.7242,521914.7460,593938.7674,579490.7885,524384.8092,438298.8296,458315.8497,725587.8694,1062410.8888


In [18]:
df1[df1['Zipcode']==90004].tail()

Unnamed: 0,Zipcode,County,Date,Value,Bedrooms
138550,90004,Los Angeles County,2020-01-31,651501,1
139031,90004,Los Angeles County,2020-02-29,654033,1
139512,90004,Los Angeles County,2020-03-31,657105,1
139993,90004,Los Angeles County,2020-04-30,660008,1
140474,90004,Los Angeles County,2020-05-31,661912,1


Obviously the results are way off