In [1]:
import pandas as pd
import nasdaqdatalink as nasdaq
import os

In [2]:
# NOTE: despite its name, `api_key` holds the *path* of the text file that
# stores the Nasdaq Data Link key, not the key itself.
api_key = os.path.join(
    'Resources',
    'nasdaq_api_key.txt',
)

# Hand that file path to the nasdaqdatalink client so it can read the key.
nasdaq.read_key(api_key)

In [3]:
# Reference: https://www.zillow.com/research/data/

# Pull the ZILLOW/INDICATORS table (one row per indicator definition).
zillow_indicators = nasdaq.get_table("ZILLOW/INDICATORS")

# Boolean mask selecting rows whose category is exactly "Inventory and sales".
is_inventory_and_sales = zillow_indicators["category"].eq("Inventory and sales")
indicator_inventory_sales_df = zillow_indicators.loc[is_inventory_and_sales]

indicator_inventory_sales_df

Unnamed: 0_level_0,indicator_id,indicator,category
None,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10,SSSW,"Median Sale Price (Smooth, SFR only, Weekly View)",Inventory and sales
11,SSSM,"Median Sale Price (Smooth, SFR only, Monthly)",Inventory and sales
12,SSAW,"Median Sale Price (Smooth, All Homes, Weekly V...",Inventory and sales
13,SSAM,"Median Sale Price (Smooth, All Homes, Monthly)",Inventory and sales
14,SRSW,"Median Sale Price (Raw, SFR only, Weekly View)",Inventory and sales
15,SRSM,"Median Sale Price (Raw, SFR only, Monthly)",Inventory and sales
16,SRAW,"Median Sale Price (Raw, All Homes, Weekly View)",Inventory and sales
17,SRAM,"Median Sale Price (Raw, All Homes, Monthly)",Inventory and sales
18,SASW,Median Sale Price (Smooth & Seasonally Adjuste...,Inventory and sales
19,SASM,Median Sale Price (Smooth & Seasonally Adjuste...,Inventory and sales


In [4]:
# Download the ZILLOW/REGIONS lookup table. paginate=True is passed so the
# client is allowed to fetch beyond the first page of results
# (see the nasdaqdatalink get_table documentation).
zillow_regions = nasdaq.get_table(
    "ZILLOW/REGIONS",
    paginate=True,
)

# Preview the first five rows only.
zillow_regions.head(n=5)

Unnamed: 0_level_0,region_id,region_type,region
None,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,99999,zip,"98847;WA;Wenatchee, WA;Leavenworth;Chelan County"
1,99998,zip,98846;WA;nan;Pateros;Okanogan County
2,99997,zip,98845; WA; Wenatchee; Douglas County; Palisades
3,99996,zip,98844;WA;nan;Oroville;Okanogan County
4,99995,zip,"98843;WA;Wenatchee, WA;Orondo;Douglas County"


In [6]:
# Download every ZILLOW/DATA row for the 'IRAW' indicator; paginate=True is
# passed so the result is not restricted to a single page.
zillow_iraw = nasdaq.get_table(
    "ZILLOW/DATA",
    paginate=True,
    indicator_id='IRAW',
)

zillow_iraw

Unnamed: 0_level_0,indicator_id,region_id,date,value
None,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,IRAW,845172,2023-10-07,138.0
1,IRAW,845172,2023-09-30,141.0
2,IRAW,845172,2023-09-23,140.0
3,IRAW,845172,2023-09-16,145.0
4,IRAW,845172,2023-09-09,144.0
...,...,...,...,...
266886,IRAW,102001,2017-11-04,1308880.0
266887,IRAW,102001,2017-10-28,1314298.0
266888,IRAW,102001,2017-10-21,1330402.0
266889,IRAW,102001,2017-10-14,1335509.0


In [7]:
# Left merge on 'region_id': every row of the left DataFrame (zillow_iraw) is
# kept, and the remaining zillow_regions columns (region_type, region) are
# attached to it.
# A zillow_iraw row whose 'region_id' is absent from zillow_regions is NOT
# dropped; it simply gets NaN in the attached columns.
merged_df = zillow_iraw.merge(zillow_regions, how="left", on="region_id")

merged_df

Unnamed: 0,indicator_id,region_id,date,value,region_type,region
0,IRAW,845172,2023-10-07,138.0,metro,"Winfield, KS"
1,IRAW,845172,2023-09-30,141.0,metro,"Winfield, KS"
2,IRAW,845172,2023-09-23,140.0,metro,"Winfield, KS"
3,IRAW,845172,2023-09-16,145.0,metro,"Winfield, KS"
4,IRAW,845172,2023-09-09,144.0,metro,"Winfield, KS"
...,...,...,...,...,...,...
266886,IRAW,102001,2017-11-04,1308880.0,metro,United States
266887,IRAW,102001,2017-10-28,1314298.0,metro,United States
266888,IRAW,102001,2017-10-21,1330402.0,metro,United States
266889,IRAW,102001,2017-10-14,1335509.0,metro,United States


In [8]:
# Split "City, State" region labels into two columns.
# rsplit with n=1 splits only on the LAST ", ", so the result never has more
# than two columns: a label containing an extra comma keeps the extra text in
# City instead of making the two-column assignment raise. Labels without any
# ", " (e.g. "United States") get a missing State.
merged_df[['City', 'State']] = merged_df['region'].str.rsplit(', ', n=1, expand=True)

merged_df

Unnamed: 0,indicator_id,region_id,date,value,region_type,region,City,State
0,IRAW,845172,2023-10-07,138.0,metro,"Winfield, KS",Winfield,KS
1,IRAW,845172,2023-09-30,141.0,metro,"Winfield, KS",Winfield,KS
2,IRAW,845172,2023-09-23,140.0,metro,"Winfield, KS",Winfield,KS
3,IRAW,845172,2023-09-16,145.0,metro,"Winfield, KS",Winfield,KS
4,IRAW,845172,2023-09-09,144.0,metro,"Winfield, KS",Winfield,KS
...,...,...,...,...,...,...,...,...
266886,IRAW,102001,2017-11-04,1308880.0,metro,United States,United States,
266887,IRAW,102001,2017-10-28,1314298.0,metro,United States,United States,
266888,IRAW,102001,2017-10-21,1330402.0,metro,United States,United States,
266889,IRAW,102001,2017-10-14,1335509.0,metro,United States,United States,


In [9]:
def print_unique_states(dataframe):
    """Print how many distinct states a DataFrame contains, then list them.

    Missing values (None / NaN) in the "State" column are ignored, so the
    number in the header line always equals the number of lines printed
    after it.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that has a "State" column.

    Returns
    -------
    None
        The function only prints to standard output.

    Raises
    ------
    KeyError
        If `dataframe` has no "State" column.
    """
    # Drop missing entries once, so the count and the listing come from the
    # same array; unique() keeps values in order of first appearance.
    unique_states = dataframe["State"].dropna().unique()

    print(f"There are {len(unique_states)} States found in the Merged DataFrame:")

    for state in unique_states:
        print(state)


# Show the distinct values found in the freshly created 'State' column of
# merged_df, to check what the split produced before any cleanup.
print_unique_states(merged_df)

There are 66 States found in the Merged DataFrame:
KS
SC
IN
TX
IL
WV
LA
NH
AL
AZ
NY
OH
MS
PA
NM
IA
AR
MN
KY
TN
WI
NV
OK
MA
HI
OR
UT; UT
CO
CA
ID
OH; OH
NC
NC; NC
WA
FL
MI
WV; WV
KS; KS
GA
MI; MI
MA; MA
MD
VA
SD
ND
MO
DC
NJ
UT
AL; AL
CT
WY
NE
VT
RI
AZ; AZ
ME
IL; IL
IA; IA
MT
WI; WI
TX; TX
AK
DE
NH; NH
LA; LA
None


In [10]:
# Some State values carry a repeated suffix such as "UT; UT". Keep only the
# text before the first ';'. The vectorized .str accessor does this for the
# whole column at once (instead of a Python-level iterrows() loop over every
# row) and leaves missing values missing rather than failing on them.
merged_df["State"] = merged_df["State"].str.split(';', n=1).str[0]

# Discard rows that have no State at all (labels such as "United States",
# which contain no ", <state>" part).
merged_df = merged_df.dropna(subset = ["State"])

print(f"There are {len(merged_df)} rows in the DataFrame.\n")

print_unique_states(merged_df)

There are 266577 rows in the DataFrame.

There are 51 States found in the Merged DataFrame:
KS
SC
IN
TX
IL
WV
LA
NH
AL
AZ
NY
OH
MS
PA
NM
IA
AR
MN
KY
TN
WI
NV
OK
MA
HI
OR
UT
CO
CA
ID
NC
WA
FL
MI
GA
MD
VA
SD
ND
MO
DC
NJ
CT
WY
NE
VT
RI
ME
MT
AK
DE


In [11]:
# Per-column count of missing values in the cleaned frame.
columns_with_nan_values = merged_df.isna().sum()

# Print the counts followed by a single blank line.
print(columns_with_nan_values, end="\n\n")

merged_df

indicator_id    0
region_id       0
date            0
value           0
region_type     0
region          0
City            0
State           0
dtype: int64



Unnamed: 0,indicator_id,region_id,date,value,region_type,region,City,State
0,IRAW,845172,2023-10-07,138.0,metro,"Winfield, KS",Winfield,KS
1,IRAW,845172,2023-09-30,141.0,metro,"Winfield, KS",Winfield,KS
2,IRAW,845172,2023-09-23,140.0,metro,"Winfield, KS",Winfield,KS
3,IRAW,845172,2023-09-16,145.0,metro,"Winfield, KS",Winfield,KS
4,IRAW,845172,2023-09-09,144.0,metro,"Winfield, KS",Winfield,KS
...,...,...,...,...,...,...,...,...
266572,IRAW,394297,2018-02-10,160.0,metro,"Aberdeen, SD",Aberdeen,SD
266573,IRAW,394297,2018-02-03,157.0,metro,"Aberdeen, SD",Aberdeen,SD
266574,IRAW,394297,2018-01-27,155.0,metro,"Aberdeen, SD",Aberdeen,SD
266575,IRAW,394297,2018-01-20,159.0,metro,"Aberdeen, SD",Aberdeen,SD


In [12]:
# Write the cleaned frame to CSV; index=False leaves the DataFrame index out
# of the file.
iraw_csv_path = './Resources/iraw_data.csv'
merged_df.to_csv(iraw_csv_path, index=False)