In [4]:
import requests
import json
from datetime import datetime
import pandas as pd
import warnings
import os

# Suppress every warning for the remainder of the session.
# NOTE(review): this also hides pandas diagnostics (e.g. SettingWithCopyWarning,
# DtypeWarning) raised by the cells below - consider narrowing the filter.
warnings.filterwarnings('ignore')

In [5]:
# National zip-code-level real estate market data published by Redfin.
# Dataset is linked from https://www.redfin.com/news/data-center/
# Duration period: 90
url = 'https://redfin-public-data.s3.us-west-2.amazonaws.com/redfin_market_tracker/zip_code_market_tracker.tsv000.gz'

# The download is a gzipped, tab-separated file; lines that cannot be parsed are skipped.
df = pd.read_csv(
    url,
    sep='\t',
    compression='gzip',
    on_bad_lines='skip',
)

# Report the size of the loaded frame.
row_count, col_count = df.shape
print('Num of rows:', row_count)
print('Num of cols:', col_count)

# Show the first few records.
df.head()

Num of rows: 6680997
Num of cols: 58


Unnamed: 0,period_begin,period_end,period_duration,region_type,region_type_id,table_id,is_seasonally_adjusted,region,city,state,...,sold_above_list_yoy,price_drops,price_drops_mom,price_drops_yoy,off_market_in_two_weeks,off_market_in_two_weeks_mom,off_market_in_two_weeks_yoy,parent_metro_region,parent_metro_region_metro_code,last_updated
0,2018-11-01,2019-01-31,90,zip code,2,18999,f,Zip Code: 45226,,Ohio,...,0.0,0.5,0.071429,0.166667,0.0,0.0,0.0,"Cincinnati, OH",17140,2022-11-20 14:36:43
1,2020-12-01,2021-02-28,90,zip code,2,3606,f,Zip Code: 11003,,New York,...,0.183093,0.339286,0.028941,-0.026099,0.428571,0.193277,0.345238,"Nassau County, NY",35004,2022-11-20 14:36:43
2,2015-07-01,2015-09-30,90,zip code,2,2151,f,Zip Code: 06107,,Connecticut,...,0.107143,,,,0.5,,0.5,"Hartford, CT",25540,2022-11-20 14:36:43
3,2017-03-01,2017-05-31,90,zip code,2,38575,f,Zip Code: 93063,,California,...,-0.008333,0.529412,0.029412,0.003096,0.444444,0.019444,-0.007937,"Oxnard, CA",37100,2022-11-20 14:36:43
4,2013-03-01,2013-05-31,90,zip code,2,39921,f,Zip Code: 95938,,California,...,-0.282828,,,,0.25,-0.083333,0.25,"Chico, CA",17020,2022-11-20 14:36:43


In [6]:
# Read the US zip code reference file and keep only the Texas rows.
# source: https://github.com/zauberware/postal-codes-json-xml-csv/tree/master/data
# Updated 10 months ago

# Build the path with os.path.join instead of the literal 'data\zipcodes.csv':
# '\z' is an invalid escape sequence, and a hard-coded backslash separator only
# resolves on Windows.
zip_path = os.path.join('data', 'zipcodes.csv')
zip_df = pd.read_csv(zip_path)

# .copy() gives an independent frame, so later edits to texas_zip can never be
# applied to (or warned about as) a slice of zip_df.
texas_zip = zip_df.loc[zip_df['state'] == 'Texas'].copy()
print('Num of rows: ', len(texas_zip))
print('Num of columns: ', len(texas_zip.columns))

texas_zip.head()

Num of rows:  2600
Num of columns:  11


Unnamed: 0,country_code,zipcode,place,state,state_code,province,province_code,community,community_code,latitude,longitude
33846,US,75763,Frankston,Texas,TX,Anderson,1.0,,,32.0535,-95.5163
33847,US,75779,Neches,Texas,TX,Anderson,1.0,,,31.8668,-95.4958
33848,US,75801,Palestine,Texas,TX,Anderson,1.0,,,31.7588,-95.6342
33849,US,75802,Palestine,Texas,TX,Anderson,1.0,,,31.7621,-95.6308
33850,US,75803,Palestine,Texas,TX,Anderson,1.0,,,31.7571,-95.6545


In [7]:
#Filter data and clean-up

#filter real estate dataset on Texas
#.copy() makes texas_df an independent frame, so the drop and the new column
#below modify texas_df itself instead of a slice of df (SettingWithCopyWarning)
texas_df = df.loc[df['state'] == 'Texas'].copy()

#drop columns we don't need (selected by their position in the Redfin file)
texas_df = texas_df.drop(columns=texas_df.columns[[2, 3, 4, 5, 6, 8, 37, 38, 39]])

#region column looks like 'Zip Code: 77067'; isolate the zip code into a new
#integer column 'zipcode' (vectorized string ops instead of a row-wise apply)
texas_df['zipcode'] = (
    texas_df['region']
    .str.split(':')
    .str[1]
    .str.strip()
    .astype(int)
)

#the inner merge below silently drops rows whose zip code is missing from the
#reference file, so report how many rows that affects before merging
unmatched_rows = ~texas_df['zipcode'].isin(texas_zip['zipcode'])
print('Num of rows without a zip code match:', int(unmatched_rows.sum()))

#merge zip code data to add zipcode, city, county fields to original Redfin Real estate data.
rename_cols = {'place': 'city',
        'province': 'county'}

texas_merge_df = pd.merge(
    texas_df,
    texas_zip[['zipcode','place','province']],
    how='inner',
    on=['zipcode']
).rename(columns=rename_cols)

#preview
print('Num of rows:', len(texas_merge_df))
texas_merge_df.head()

Num of rows: 354096


Unnamed: 0,period_begin,period_end,region,state,state_code,property_type,property_type_id,median_sale_price,median_sale_price_mom,median_sale_price_yoy,...,price_drops_yoy,off_market_in_two_weeks,off_market_in_two_weeks_mom,off_market_in_two_weeks_yoy,parent_metro_region,parent_metro_region_metro_code,last_updated,zipcode,city,county
0,2016-08-01,2016-10-31,Zip Code: 77067,Texas,TX,All Residential,-1,130000.0,0.027668,0.141352,...,-0.435606,0.833333,0.033333,0.333333,"Houston, TX",26420,2022-11-20 14:36:43,77067,Houston,Harris
1,2015-09-01,2015-11-30,Zip Code: 77067,Texas,TX,All Residential,-1,118000.0,0.035996,0.092593,...,-0.019608,0.5,0.0,-0.045455,"Houston, TX",26420,2022-11-20 14:36:43,77067,Houston,Harris
2,2012-04-01,2012-06-30,Zip Code: 77067,Texas,TX,All Residential,-1,61375.0,0.006148,0.062311,...,,0.4375,0.346591,0.0375,"Houston, TX",26420,2022-11-20 14:36:43,77067,Houston,Harris
3,2018-12-01,2019-02-28,Zip Code: 77067,Texas,TX,All Residential,-1,145950.0,-0.006467,-0.026351,...,,0.375,-0.125,0.175,"Houston, TX",26420,2022-11-20 14:36:43,77067,Houston,Harris
4,2015-11-01,2016-01-31,Zip Code: 77067,Texas,TX,All Residential,-1,125000.0,-0.003984,0.210654,...,0.127155,0.375,0.193182,0.208333,"Houston, TX",26420,2022-11-20 14:36:43,77067,Houston,Harris


In [45]:
# Print the per-column summary (non-null count and dtype) of the Texas frame.
texas_merge_df.info()
# Summary statistics for the numeric columns; as the cell's last expression the
# result is rendered as a table below the info() printout.
texas_merge_df.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 355634 entries, 72 to 6680992
Data columns (total 50 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   period_begin                    355634 non-null  object 
 1   period_end                      355634 non-null  object 
 2   region                          355634 non-null  object 
 3   state                           355634 non-null  object 
 4   state_code                      355634 non-null  object 
 5   property_type                   355634 non-null  object 
 6   property_type_id                355634 non-null  int64  
 7   median_sale_price               355634 non-null  float64
 8   median_sale_price_mom           339045 non-null  float64
 9   median_sale_price_yoy           308643 non-null  float64
 10  median_list_price               333142 non-null  float64
 11  median_list_price_mom           316512 non-null  float64
 12  median_list_pr

Unnamed: 0,property_type_id,median_sale_price,median_sale_price_mom,median_sale_price_yoy,median_list_price,median_list_price_mom,median_list_price_yoy,median_ppsf,median_ppsf_mom,median_ppsf_yoy,...,sold_above_list,sold_above_list_mom,sold_above_list_yoy,price_drops,price_drops_mom,price_drops_yoy,off_market_in_two_weeks,off_market_in_two_weeks_mom,off_market_in_two_weeks_yoy,parent_metro_region_metro_code
count,355634.0,355634.0,339045.0,308643.0,333142.0,316512.0,293745.0,355443.0,338866.0,308506.0,...,355634.0,339045.0,308643.0,240881.0,222011.0,198020.0,275829.0,243853.0,235542.0,355634.0
mean,3.853937,237999.1,0.034234,0.189473,255272.2,0.059348,0.201837,228.9038,0.251553,0.164725,...,0.19539,0.000908,0.025916,0.551358,0.008151,0.012698,0.340675,-0.000158,0.020618,25065.305325
std,4.415965,190700.7,0.652228,1.463891,194791.0,3.34699,4.372476,34959.47,128.810741,5.53705,...,0.225558,0.122052,0.231852,0.221536,0.170467,0.230267,0.276738,0.298809,0.304436,9366.077954
min,-1.0,700.0,-0.996642,-0.996526,550.0,-0.998206,-0.998741,0.3731579,-0.999699,-0.999028,...,0.0,-1.0,-1.0,0.002674,-0.925,-0.969697,0.0,-1.333333,-2.0,10180.0
25%,-1.0,132000.0,-0.022105,-0.011628,144750.0,-0.026846,-0.011351,80.48938,-0.012462,0.010552,...,0.0,-0.018433,-0.039382,0.4,-0.071429,-0.111111,0.121212,-0.116667,-0.105907,19124.0
50%,4.0,197250.0,0.0,0.089474,212450.0,0.0,0.080432,110.7078,0.0,0.084252,...,0.142857,0.0,0.0,0.53125,0.003415,0.010251,0.333333,0.0,0.0,23104.0
75%,6.0,293000.0,0.039179,0.222824,312400.0,0.042802,0.198462,149.7586,0.028302,0.189486,...,0.279052,0.020816,0.090909,0.692308,0.089831,0.138235,0.5,0.115385,0.156566,26420.0
max,13.0,12375000.0,189.0,263.623955,9438000.0,1179.904523,759.0,12033000.0,74950.076824,2779.607143,...,1.0,1.0,1.0,1.0,0.95082,0.983607,2.0,1.333333,1.333333,49820.0


In [None]:
#Do some discovery of data here

#Why isn't City populated?
#Perhaps we can pull a list of zip codes for Dallas, Austin, and Houston, filter on those zip codes, and populate those cities.


#Might be because we're pulling in quarterly data; try exporting monthly supply from the site and importing it through CSV?

#Supply columns have no data; we should drop them:
#months_of_supply
#months_of_supply_mom
#months_of_supply_yoy

#Last updated = 11/20/2022 
#can likely pull in data through end of november if we update it




In [None]:
# Write the merged Texas frame (index included, the to_csv default) to a CSV
# file relative to the current working directory.
export_path = 'redfin_texas_export_new.csv'
texas_merge_df.to_csv(export_path)