In [118]:
# Python 2 & 3 Compatibility
from __future__ import print_function, division

# Necessary imports
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
import os
import pickle
%matplotlib inline

In [119]:
def is_nan(x):
    """Tell whether ``x`` is a NaN value.

    ``x`` is treated as NaN when it is the ``np.nan`` object itself, or when
    it compares unequal to itself.

    Returns ``True`` for the ``np.nan`` object; otherwise returns the result
    of ``x != x``.
    """
    if x is np.nan:
        return True
    # A NaN float compares unequal to itself, so this catches NaNs that are
    # not the ``np.nan`` singleton.
    return x != x

In [120]:
# Bedroom-size labels; each one is substituted into the rental-price CSV file
# name when the data is loaded.
# NOTE(review): '2Bedroom' is not in this list -- confirm that is intentional.
bed_types = [
    'Studio',
    '1Bedroom',
    '3Bedroom',
    '4Bedroom',
    '5BedroomOrMore',
]

# Empty long-format table that the reshaping step fills in.
attr_df = pd.DataFrame(
    columns=[
        'RegionName',
        'City',
        'State',
        'Metro',
        'CountyName',
        'SizeRank',
        'BedSize',
        'Year',
        'Month',
        'Amount',
    ]
)
attr_df.head()


Unnamed: 0,RegionName,City,State,Metro,CountyName,SizeRank,BedSize,Year,Month,Amount


In [125]:
# Reshape every wide Zillow rental file (one column per 'YYYY-MM' month) into
# long-format records, then build ``attr_df`` from them in a single step.
# Collecting plain dicts and constructing the frame once avoids growing a
# DataFrame row by row (quadratic, and ``DataFrame.append`` no longer exists in
# pandas 2.x). Because ``attr_df`` is rebuilt rather than appended to, re-running
# this cell does not duplicate rows.
rental_records = []  # one dict per (row, month) observation that has a value
rental_labels = []   # source-row index label for each record, reused as the index
for term in bed_types:
    rentals_wide = pd.read_csv('data/Apartment/Zip_MedianRentalPrice_{0}.csv'.format(term))
    rentals_wide = rentals_wide[rentals_wide['City'] == 'Seattle']
    print(rentals_wide)
    # Month columns are recognised by the dash in their header (e.g. '2017-05').
    month_cols = [col for col in rentals_wide.columns if col.find("-") > -1]
    for row_label, row in rentals_wide.iterrows():
        # Copy the six leading identifier fields into a plain dict so that
        # building a record never writes into the source row.
        region_fields = row.iloc[0:6].to_dict()
        for col in month_cols:
            amount = row[col]
            if is_nan(amount):
                continue  # missing value for this month: emit no record
            split_str = col.split('-')
            record = dict(region_fields)
            record['Year'] = split_str[0]    # kept as the header's string text
            record['Month'] = split_str[1]
            record['Amount'] = amount
            record['BedSize'] = term
            rental_records.append(record)
            rental_labels.append(row_label)

# Keep the column order declared for ``attr_df``; any identifier column the CSVs
# carry beyond that list is placed after it.
rentals_long = pd.DataFrame(rental_records, index=rental_labels)
attr_columns = list(attr_df.columns)
attr_columns += [name for name in rentals_long.columns if name not in attr_columns]
attr_df = rentals_long.reindex(columns=attr_columns)

     RegionName     City State    Metro CountyName  SizeRank  2010-03  \
70        98103  Seattle    WA  Seattle       King        71      NaN   
156       98122  Seattle    WA  Seattle       King       157      NaN   
251       98109  Seattle    WA  Seattle       King       252      NaN   
268       98105  Seattle    WA  Seattle       King       269      NaN   
327       98102  Seattle    WA  Seattle       King       328      NaN   
445       98107  Seattle    WA  Seattle       King       446      NaN   
513       98121  Seattle    WA  Seattle       King       514      NaN   
663       98101  Seattle    WA  Seattle       King       664      NaN   
733       98104  Seattle    WA  Seattle       King       734      NaN   

     2010-04  2010-05  2010-06   ...     2017-05  2017-06  2017-07  2017-08  \
70       NaN      NaN      NaN   ...      1595.0   1660.0   1395.0   1395.0   
156      NaN      NaN      NaN   ...      1495.0   1627.5   1627.5   1612.5   
251      NaN      NaN      NaN  

In [127]:
# Preview the first rows of the long-format rental table.
attr_df.head()

Unnamed: 0,RegionName,City,State,Metro,CountyName,SizeRank,BedSize,Year,Month,Amount
70,98103,Seattle,WA,Seattle,King,71,Studio,2017,1,1457.0
70,98103,Seattle,WA,Seattle,King,71,Studio,2017,2,1482.5
70,98103,Seattle,WA,Seattle,King,71,Studio,2017,3,1275.0
70,98103,Seattle,WA,Seattle,King,71,Studio,2017,4,1385.0
70,98103,Seattle,WA,Seattle,King,71,Studio,2017,5,1595.0


In [123]:
# Persist the rental table to disk so it can be reloaded without re-parsing
# the CSV files.
with open('apartment_sale.pkl', 'wb') as pkl_out:
    pickle.dump(attr_df, pkl_out)

In [126]:
# Read the pickle back and preview it.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("apartment_sale.pkl", 'rb') as pkl_in:
    my_old_data = pickle.load(pkl_in)
my_old_data.head()

Unnamed: 0,RegionName,City,State,Metro,CountyName,SizeRank,BedSize,Year,Month,Amount
70,98103,Seattle,WA,Seattle,King,71,Studio,2017,1,1457.0
70,98103,Seattle,WA,Seattle,King,71,Studio,2017,2,1482.5
70,98103,Seattle,WA,Seattle,King,71,Studio,2017,3,1275.0
70,98103,Seattle,WA,Seattle,King,71,Studio,2017,4,1385.0
70,98103,Seattle,WA,Seattle,King,71,Studio,2017,5,1595.0


# Processing Housing Sale Data From Zillow

In [130]:
# Load the raw city-level sale-price table and list its column labels.
housing_temp_df = pd.read_csv(
    'data/House/Zillow_Sale_Prices_City.csv'
)
housing_temp_df.columns

Index(['RegionID', 'RegionName', 'StateName', 'SizeRank', '2008-03', '2008-04',
       '2008-05', '2008-06', '2008-07', '2008-08',
       ...
       '2017-05', '2017-06', '2017-07', '2017-08', '2017-09', '2017-10',
       '2017-11', '2017-12', '2018-01', '2018-02'],
      dtype='object', length=124)

In [134]:
def ProcessingHousingData(csv_path='data/House/Zillow_Sale_Prices_City.csv',
                          region_name='Seattle', verbose=True):
    """Reshape a wide sale-price CSV into one row per region and month.

    The CSV is expected to start with four identifier columns followed by
    one column per month whose header contains a dash (``'YYYY-MM'``).
    Rows whose ``RegionName`` equals ``region_name`` are kept, and every
    non-missing monthly value becomes one output row.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read. Defaults to the file the notebook
        has always used.
    region_name : str
        Value of the ``RegionName`` column to keep. Defaults to ``'Seattle'``.
    verbose : bool
        When true (the default), print the filtered wide table.

    Returns
    -------
    pandas.DataFrame
        Columns ``RegionID, RegionName, StateName, SizeRank, Year, Month,
        Amount`` (plus any of the four leading CSV columns not in that list).
        ``Year`` and ``Month`` are the string pieces of the column header,
        e.g. ``'2008'`` and ``'03'``. The index repeats the source row's
        label. The frame is empty when nothing matches.
    """
    out_columns = ['RegionID', 'RegionName', 'StateName', 'SizeRank',
                   'Year', 'Month', 'Amount']

    wide_df = pd.read_csv(csv_path)
    wide_df = wide_df[wide_df['RegionName'] == region_name]
    if verbose:
        print(wide_df)

    # Month columns are recognised by the dash in their header.
    month_cols = [col for col in wide_df.columns if col.find("-") > -1]
    # Identifier columns outside the declared list are kept, after it.
    out_columns += [name for name in wide_df.columns[0:4]
                    if name not in out_columns]

    # Collect plain dicts and build the frame once: growing a DataFrame row by
    # row is quadratic, and DataFrame.append no longer exists in pandas 2.x.
    records = []
    labels = []
    for row_label, row in wide_df.iterrows():
        # Copy the identifier fields so building a record never writes into
        # the source row.
        id_fields = row.iloc[0:4].to_dict()
        for col in month_cols:
            amount = row[col]
            if pd.isna(amount):
                continue  # missing value for this month: emit no record
            split_str = col.split('-')
            record = dict(id_fields)
            record['Year'] = split_str[0]
            record['Month'] = split_str[1]
            record['Amount'] = amount
            records.append(record)
            labels.append(row_label)

    return pd.DataFrame(records, index=labels, columns=out_columns)

In [143]:
# Run the reshaping helper and preview the first rows of its result.
zillow_housing_df = ProcessingHousingData()
zillow_housing_df.head()

    RegionID RegionName   StateName  SizeRank   2008-03   2008-04   2008-05  \
17   16037.0    Seattle  Washington        18  421300.0  424000.0  426000.0   

     2008-06   2008-07   2008-08    ...      2017-05   2017-06   2017-07  \
17  422900.0  414400.0  407800.0    ...     621800.0  637900.0  654100.0   

     2017-08   2017-09   2017-10   2017-11   2017-12   2018-01   2018-02  
17  667000.0  674400.0  671400.0  676000.0  677500.0  697000.0  717800.0  

[1 rows x 124 columns]


Unnamed: 0,RegionID,RegionName,StateName,SizeRank,Year,Month,Amount
17,16037.0,Seattle,Washington,18,2008,3,421300.0
17,16037.0,Seattle,Washington,18,2008,4,424000.0
17,16037.0,Seattle,Washington,18,2008,5,426000.0
17,16037.0,Seattle,Washington,18,2008,6,422900.0
17,16037.0,Seattle,Washington,18,2008,7,414400.0


In [138]:
# Persist the reshaped sale-price table to disk for later reuse.
with open('zillow_housing.pkl', 'wb') as pkl_out:
    pickle.dump(zillow_housing_df, pkl_out)

In [139]:
# Read the pickle back and preview it.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("zillow_housing.pkl", 'rb') as pkl_in:
    my_old_zillow_housing_df = pickle.load(pkl_in)
my_old_zillow_housing_df.head()

Unnamed: 0,RegionID,RegionName,StateName,SizeRank,Year,Month,Amount
17,16037.0,Seattle,Washington,18,2008,3,421300.0
17,16037.0,Seattle,Washington,18,2008,4,424000.0
17,16037.0,Seattle,Washington,18,2008,5,426000.0
17,16037.0,Seattle,Washington,18,2008,6,422900.0
17,16037.0,Seattle,Washington,18,2008,7,414400.0
