**Group Members:**
Zixuan Zhou, Xinyue (Yolanda) Pan, Abe, Yining Song

**Code written by:** Zixuan Zhou, Yolanda Pan

In [1]:
import json
import os
import time

import pandas as pd
import requests

In [6]:
# Endpoint and request headers for the Zillow search API hosted on RapidAPI.
url = "https://zillow56.p.rapidapi.com/search"

# The API key is a credential and must not live in the notebook source: it is
# read from the RAPIDAPI_KEY environment variable instead. A key that was ever
# committed in this cell should be treated as exposed and rotated.
headers = {
    "x-rapidapi-key": os.environ.get("RAPIDAPI_KEY", ""),
    "x-rapidapi-host": "zillow56.p.rapidapi.com"
}

# Surface a missing key right away rather than on the first request.
if not headers["x-rapidapi-key"]:
    print("Warning: the RAPIDAPI_KEY environment variable is not set; API requests will be sent without a key.")

# Retrieve rental listings data

In [21]:
def get_all_pages_for_zipcode(zipcode: str, timeout: float = 30.0, max_pages: "int | None" = None) -> list:
    '''
    Retrieve the Zillow rental listings the search API returns for one zipcode.

    Pages are requested in sequence starting at page 1, with a 0.5 second pause
    before every page after the first. The loop ends when the API returns a page
    without results, when ``max_pages`` pages have been requested, or when a
    request or processing error occurs. Errors are printed rather than raised,
    and whatever was collected before the error is still returned.

    Listings are not deduplicated here: every result of every page is kept, so
    the same ``zpid`` can occur more than once in the returned list.

    Parameters
    ----------
    zipcode : str
        Zipcode sent to the API as the ``location`` search parameter.
    timeout : float, optional
        Seconds to wait for each HTTP response before ``requests`` gives up.
        Without a timeout a stalled connection would block the crawl forever.
    max_pages : int or None, optional
        Upper bound on the number of pages requested. ``None`` (the default)
        means no bound: keep paging until the API returns an empty page.

    Returns
    -------
    list
        One dict per result with the keys ``zpid``, ``longitude``, ``latitude``,
        ``bathrooms``, ``bedrooms``, ``address``, ``price``, ``zipcode``,
        ``living_area``, ``home_type``, ``rent_zestimate``, ``unit`` and
        ``time_on_zillow``. Fields missing from a result are ``None``.
    '''
    all_properties = []
    page = 1

    while True:
        # Optional safety valve against unbounded pagination.
        if max_pages is not None and page > max_pages:
            print(f"Page limit ({max_pages}) reached for {zipcode}.")
            break

        params = {
            "location": zipcode,
            "output": "json",
            "status": "forRent",
            "doz": "any",
            "page": str(page),
            "sortSelection": "days",
        }

        try:
            # Pause between consecutive page requests for the same zipcode.
            if page > 1:
                time.sleep(0.5)

            print(f"Retriving zipcode: {zipcode}, page {page} ...")
            response = requests.get(url, headers=headers, params=params, timeout=timeout)
            response.raise_for_status()

            data = response.json()

            results = data.get("results")

            # A missing or empty "results" entry marks the end of the data.
            if not results:
                print(f"All data by {zipcode} retrieved. Total page number retrieved: {page-1}.")
                break

            # Keep only the fields of interest, renamed to snake_case.
            for result in results:
                variables = {
                    "zpid": result.get("zpid"),
                    "longitude": result.get("longitude"),
                    "latitude": result.get("latitude"),
                    "bathrooms": result.get("bathrooms"),
                    "bedrooms": result.get("bedrooms"),
                    "address": result.get("streetAddress"),
                    "price": result.get("price"),
                    "zipcode": result.get("zipcode"),
                    "living_area": result.get("livingArea"),
                    "home_type": result.get("homeType"),
                    "rent_zestimate": result.get("rentZestimate"),
                    "unit": result.get("unit"),
                    "time_on_zillow": result.get("timeOnZillow")
                }
                all_properties.append(variables)

            print(f"Results retrieved: {len(results)}. Total number of results: {len(all_properties)}.")

            page += 1

        except requests.exceptions.RequestException as e:
            # Network failure, timeout or HTTP error status: stop paging and
            # return what has been collected so far.
            print(f"Cannot retrieve data: {e}")
            break

        except Exception as e:
            # Unexpected payload shape (e.g. invalid JSON): same best-effort stop.
            print(f"Error processing data: {e}")
            break

    return all_properties


In [4]:
# test the function
all_listing = get_all_pages_for_zipcode("60610")

Retriving zipcode: 60610, page 1 ...
Results retrieved: 11. Total number of results: 11.
Retriving zipcode: 60610, page 2 ...
Results retrieved: 21. Total number of results: 32.
Retriving zipcode: 60610, page 3 ...
Results retrieved: 24. Total number of results: 56.
Retriving zipcode: 60610, page 4 ...
Results retrieved: 6. Total number of results: 62.
Retriving zipcode: 60610, page 5 ...
Results retrieved: 11. Total number of results: 73.
Retriving zipcode: 60610, page 6 ...
Results retrieved: 11. Total number of results: 84.
Retriving zipcode: 60610, page 7 ...
Results retrieved: 11. Total number of results: 95.
Retriving zipcode: 60610, page 8 ...
Results retrieved: 11. Total number of results: 106.
Retriving zipcode: 60610, page 9 ...
Results retrieved: 11. Total number of results: 117.
Retriving zipcode: 60610, page 10 ...
Results retrieved: 11. Total number of results: 128.
Retriving zipcode: 60610, page 11 ...
Results retrieved: 11. Total number of results: 139.
Retriving zipcod

In [19]:
# Chicago zipcodes, gathered from the USPS lookup tool:
# https://tools.usps.com/zip-code-lookup.htm?bycitystate#page-1

# Codes listed individually; they lie outside the 60601-60691 run added below.
chicago_zipcode = [
    "60693", "60694", "60695", "60696", "60697", "60699", "60701",
    "60706", "60707", "60803", "60804", "60805", "60827",
]

# Append the consecutive run 60601..60691 as strings (the range end is exclusive).
chicago_zipcode += [str(code) for code in range(60601, 60692)]

# Codes from that run that are taken back out of the list.
to_remove = [
    "60627", "60635", "60648", "60650", "60658", "60662", "60663",
    "60665", "60667", "60671", "60672", "60676", "60679", "60683",
]
chicago_zipcode = [code for code in chicago_zipcode if code not in to_remove]

# Show a sorted copy; chicago_zipcode itself keeps its insertion order.
sorted(chicago_zipcode)


['60601',
 '60602',
 '60603',
 '60604',
 '60605',
 '60606',
 '60607',
 '60608',
 '60609',
 '60610',
 '60611',
 '60612',
 '60613',
 '60614',
 '60615',
 '60616',
 '60617',
 '60618',
 '60619',
 '60620',
 '60621',
 '60622',
 '60623',
 '60624',
 '60625',
 '60626',
 '60628',
 '60629',
 '60630',
 '60631',
 '60632',
 '60633',
 '60634',
 '60636',
 '60637',
 '60638',
 '60639',
 '60640',
 '60641',
 '60642',
 '60643',
 '60644',
 '60645',
 '60646',
 '60647',
 '60649',
 '60651',
 '60652',
 '60653',
 '60654',
 '60655',
 '60656',
 '60657',
 '60659',
 '60660',
 '60661',
 '60664',
 '60666',
 '60668',
 '60669',
 '60670',
 '60673',
 '60674',
 '60675',
 '60677',
 '60678',
 '60680',
 '60681',
 '60682',
 '60684',
 '60685',
 '60686',
 '60687',
 '60688',
 '60689',
 '60690',
 '60691',
 '60693',
 '60694',
 '60695',
 '60696',
 '60697',
 '60699',
 '60701',
 '60706',
 '60707',
 '60803',
 '60804',
 '60805',
 '60827']

In [20]:
len(chicago_zipcode)

90

In [21]:
import random

In [25]:
# Crawl every Chicago zipcode and pool the rental listings into a single list.
all_listing_info = []

for zipcode in chicago_zipcode:
    all_listing_info += get_all_pages_for_zipcode(zipcode)
    print(f"Updated; total number of listings: {len(all_listing_info)}.")
    # Short random pause (at most 0.3 s) before moving on to the next zipcode.
    time.sleep(random.uniform(0, 0.3))

Retriving zipcode: 60693, page 1 ...
Results retrieved: 2. Total number of results: 2.
Retriving zipcode: 60693, page 2 ...
Results retrieved: 2. Total number of results: 4.
Retriving zipcode: 60693, page 3 ...
Results retrieved: 12. Total number of results: 16.
Retriving zipcode: 60693, page 4 ...
Results retrieved: 14. Total number of results: 30.
Retriving zipcode: 60693, page 5 ...
Results retrieved: 18. Total number of results: 48.
Retriving zipcode: 60693, page 6 ...
Results retrieved: 18. Total number of results: 66.
Retriving zipcode: 60693, page 7 ...
Results retrieved: 25. Total number of results: 91.
Retriving zipcode: 60693, page 8 ...
Results retrieved: 33. Total number of results: 124.
Retriving zipcode: 60693, page 9 ...
Results retrieved: 19. Total number of results: 143.
Retriving zipcode: 60693, page 10 ...
Results retrieved: 2. Total number of results: 145.
Retriving zipcode: 60693, page 11 ...
Results retrieved: 2. Total number of results: 147.
Retriving zipcode: 60

In [26]:
all_listing_info

[{'zpid': 65552033,
  'longitude': -87.62754,
  'latitude': 41.86114,
  'bathrooms': 1.0,
  'bedrooms': 1.0,
  'address': '1530 S State St APT 415',
  'price': 2800.0,
  'zipcode': '60605',
  'living_area': 1065.0,
  'home_type': 'CONDO',
  'rent_zestimate': 2248,
  'unit': 'Apt 415',
  'time_on_zillow': 5625000},
 {'zpid': 60270579,
  'longitude': -87.64906,
  'latitude': 41.880234,
  'bathrooms': 2.0,
  'bedrooms': 2.0,
  'address': '843 W Monroe St APT 2F',
  'price': 3300.0,
  'zipcode': '60607',
  'living_area': 1000.0,
  'home_type': 'APARTMENT',
  'rent_zestimate': 3657,
  'unit': 'Apt 2F',
  'time_on_zillow': 46056000},
 {'zpid': 446735593,
  'longitude': -87.61685,
  'latitude': 41.88554,
  'bathrooms': 1.0,
  'bedrooms': 1.0,
  'address': '450 E Benton Pl #2308',
  'price': 2534.0,
  'zipcode': '60601',
  'living_area': 690.0,
  'home_type': 'APARTMENT',
  'rent_zestimate': None,
  'unit': '# 2308',
  'time_on_zillow': 90383000},
 {'zpid': 89908245,
  'longitude': -87.64045,


In [27]:
len(all_listing_info)

40303

In [30]:
print(type(all_listing_info))

<class 'list'>


In [32]:
import pandas as pd

In [33]:
df_zillow = pd.DataFrame(all_listing_info)
df_zillow

Unnamed: 0,zpid,longitude,latitude,bathrooms,bedrooms,address,price,zipcode,living_area,home_type,rent_zestimate,unit,time_on_zillow
0,65552033,-87.627540,41.861140,1.0,1.0,1530 S State St APT 415,2800.0,60605,1065.0,CONDO,2248.0,Apt 415,5625000
1,60270579,-87.649060,41.880234,2.0,2.0,843 W Monroe St APT 2F,3300.0,60607,1000.0,APARTMENT,3657.0,Apt 2F,46056000
2,446735593,-87.616850,41.885540,1.0,1.0,450 E Benton Pl #2308,2534.0,60601,690.0,APARTMENT,,# 2308,90383000
3,89908245,-87.640450,41.894466,1.0,1.0,451 W Huron St UNIT 1209,3200.0,60654,850.0,APARTMENT,2643.0,Unit 1209,92258000
4,2112016331,-87.628944,41.874878,1.0,0.0,525 S Dearborn St APT 605,1695.0,60605,550.0,APARTMENT,2752.0,Apt 605,129830000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
40298,101462137,-87.654780,41.885292,2.0,2.0,1101 W Lake St STE 3C,4000.0,60607,1431.0,APARTMENT,3865.0,Ste 3C,12837000
40299,60270579,-87.649060,41.880234,2.0,2.0,843 W Monroe St APT 2F,3300.0,60607,1000.0,APARTMENT,3657.0,Apt 2F,48361000
40300,65552033,-87.627540,41.861140,1.0,1.0,1530 S State St APT 415,2800.0,60605,1065.0,CONDO,2248.0,Apt 415,7931000
40301,101462137,-87.654780,41.885292,2.0,2.0,1101 W Lake St STE 3C,4000.0,60607,1431.0,APARTMENT,3865.0,Ste 3C,12838000


In [38]:
df_zillow.shape

(40303, 13)

In [39]:
# save raw data
df_zillow.to_csv("zillow_rental_listings_raw.csv", index=False)

In [42]:
# process duplicates
df_deduped_zillow = df_zillow.drop_duplicates(subset=["zpid"], keep="first")

In [43]:
df_deduped_zillow

Unnamed: 0,zpid,longitude,latitude,bathrooms,bedrooms,address,price,zipcode,living_area,home_type,rent_zestimate,unit,time_on_zillow
0,65552033,-87.627540,41.861140,1.0,1.0,1530 S State St APT 415,2800.0,60605,1065.0,CONDO,2248.0,Apt 415,5625000
1,60270579,-87.649060,41.880234,2.0,2.0,843 W Monroe St APT 2F,3300.0,60607,1000.0,APARTMENT,3657.0,Apt 2F,46056000
2,446735593,-87.616850,41.885540,1.0,1.0,450 E Benton Pl #2308,2534.0,60601,690.0,APARTMENT,,# 2308,90383000
3,89908245,-87.640450,41.894466,1.0,1.0,451 W Huron St UNIT 1209,3200.0,60654,850.0,APARTMENT,2643.0,Unit 1209,92258000
4,2112016331,-87.628944,41.874878,1.0,0.0,525 S Dearborn St APT 605,1695.0,60605,550.0,APARTMENT,2752.0,Apt 605,129830000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
36565,2107163413,,,1.0,0.0,(undisclosed Address),1250.0,60660,,APARTMENT,1572.0,,27480451000
36566,345039747,,,1.0,1.0,(undisclosed Address),1350.0,60660,,APARTMENT,,,27550515000
36567,344944878,-87.665470,41.991070,1.0,1.0,6200-48 N Clark St W #1600-24-16554796,1500.0,60660,950.0,APARTMENT,1472.0,# 1600-24-16554796,27757383000
36568,2089847542,-87.658070,41.992477,1.0,2.0,6101 N Winthrop Ave #1,2200.0,60660,850.0,APARTMENT,2208.0,# 1,28091277000


In [44]:
df_deduped_zillow.to_csv("zillow_chicago.csv", index=False)

# Retrieve house selling price data

In [16]:
def get_selling_data_for_zipcode(zipcode: str, timeout: float = 30.0, max_pages: "int | None" = None) -> list:
    '''
    Retrieve the Zillow for-sale listings the search API returns for one zipcode.

    Pages are requested in sequence starting at page 1, with a 0.5 second pause
    before every page after the first. The loop ends when the API returns a page
    without results, when ``max_pages`` pages have been requested, or when a
    request or processing error occurs. Errors are printed rather than raised,
    and whatever was collected before the error is still returned.

    A listing whose ``zpid`` was already collected during this call is skipped,
    so each ``zpid`` appears at most once in the returned list.

    Parameters
    ----------
    zipcode : str
        Zipcode sent to the API as the ``location`` search parameter.
    timeout : float, optional
        Seconds to wait for each HTTP response before ``requests`` gives up.
        Without a timeout a stalled connection would block the crawl forever.
    max_pages : int or None, optional
        Upper bound on the number of pages requested. ``None`` (the default)
        means no bound: keep paging until the API returns an empty page.

    Returns
    -------
    list
        One dict per unique listing, in first-seen order, with the keys
        ``zpid``, ``longitude``, ``latitude``, ``bathrooms``, ``bedrooms``,
        ``address``, ``zipcode``, ``living_area``, ``lot_area``,
        ``lot_area_unit``, ``home_type``, ``price``, ``rent_zestimate``,
        ``sale_zestimate`` and ``time_on_zillow``. Fields missing from a result
        are ``None``.
    '''
    all_properties = []

    # zpids collected so far; a set keeps the duplicate check O(1) per listing.
    seen_zpids = set()

    page = 1

    while True:
        # Optional safety valve against unbounded pagination.
        if max_pages is not None and page > max_pages:
            print(f"Page limit ({max_pages}) reached for {zipcode}.")
            break

        params = {
            "location": zipcode,
            "output": "json",
            "status": "forSale",
            "doz": "any",
            "page": str(page)
        }

        try:
            # Pause between consecutive page requests for the same zipcode.
            if page > 1:
                time.sleep(0.5)

            print(f"Retriving zipcode: {zipcode}, page {page} ...")
            response = requests.get(url, headers=headers, params=params, timeout=timeout)
            response.raise_for_status()

            data = response.json()

            results = data.get("results")

            # A missing or empty "results" entry marks the end of the data.
            if not results:
                print(f"All data by {zipcode} retrieved. Total page number retrieved: {page-1}.")
                break

            # Keep only the fields of interest, renamed to snake_case.
            for result in results:
                zpid = result.get("zpid")

                # Skip listings already collected on an earlier page.
                if zpid in seen_zpids:
                    continue
                seen_zpids.add(zpid)

                variables = {
                    "zpid": zpid,
                    "longitude": result.get("longitude"),
                    "latitude": result.get("latitude"),
                    "bathrooms": result.get("bathrooms"),
                    "bedrooms": result.get("bedrooms"),
                    "address": result.get("streetAddress"),
                    "zipcode": result.get("zipcode"),
                    "living_area": result.get("livingArea"),
                    "lot_area": result.get("lotAreaValue"),
                    "lot_area_unit": result.get("lotAreaUnit"),
                    "home_type": result.get("homeType"),
                    "price": result.get("price"),
                    "rent_zestimate": result.get("rentZestimate"),
                    "sale_zestimate": result.get("zestimate"),
                    "time_on_zillow": result.get("timeOnZillow")
                }
                all_properties.append(variables)

            print(f"Results retrieved: {len(results)}. Total number of results: {len(all_properties)}.")

            page += 1

        except requests.exceptions.RequestException as e:
            # Network failure, timeout or HTTP error status: stop paging and
            # return what has been collected so far.
            print(f"Cannot retrieve data: {e}")
            break

        except Exception as e:
            # Unexpected payload shape (e.g. invalid JSON): same best-effort stop.
            print(f"Error processing data: {e}")
            break

    return all_properties

In [17]:
# test the function
test_sale_listings = get_selling_data_for_zipcode("60610")

Retriving zipcode: 60610, page 1 ...
Results retrieved: 41. Total number of results: 41.
Retriving zipcode: 60610, page 2 ...
Results retrieved: 41. Total number of results: 82.
Retriving zipcode: 60610, page 3 ...
Results retrieved: 41. Total number of results: 123.
Retriving zipcode: 60610, page 4 ...
Results retrieved: 41. Total number of results: 164.
Retriving zipcode: 60610, page 5 ...
Results retrieved: 29. Total number of results: 193.
Retriving zipcode: 60610, page 6 ...
Results retrieved: 41. Total number of results: 193.
Retriving zipcode: 60610, page 7 ...
Results retrieved: 41. Total number of results: 193.
Retriving zipcode: 60610, page 8 ...
Results retrieved: 41. Total number of results: 193.
Retriving zipcode: 60610, page 9 ...
Results retrieved: 41. Total number of results: 193.
Retriving zipcode: 60610, page 10 ...
Results retrieved: 41. Total number of results: 193.
Retriving zipcode: 60610, page 11 ...
Results retrieved: 41. Total number of results: 193.
Retriving 

In [9]:
test_sale_listings

[{'zpid': 89896012,
  'longitude': -87.643326,
  'latitude': 41.89839,
  'bathrooms': 1.0,
  'bedrooms': 1.0,
  'address': '845 N Kingsbury St UNIT 313',
  'zipcode': '60610',
  'living_area': 909.0,
  'lot_area': None,
  'lot_area_unit': None,
  'home_type': 'CONDO',
  'price': 345000.0,
  'rent_zestimate': None,
  'sale_zestimate': None,
  'time_on_zillow': 21115000},
 {'zpid': 80852891,
  'longitude': -87.62737,
  'latitude': 41.908016,
  'bathrooms': 7.0,
  'bedrooms': 5.0,
  'address': '1401 N Astor St',
  'zipcode': '60610',
  'living_area': 5500.0,
  'lot_area': 2835.756,
  'lot_area_unit': 'sqft',
  'home_type': 'SINGLE_FAMILY',
  'price': 3395000.0,
  'rent_zestimate': 8089,
  'sale_zestimate': 3163900,
  'time_on_zillow': 38504000},
 {'zpid': 2075494749,
  'longitude': -87.63124,
  'latitude': 41.909683,
  'bathrooms': 1.0,
  'bedrooms': 1.0,
  'address': '70 W Burton Pl APT 1801',
  'zipcode': '60610',
  'living_area': 745.0,
  'lot_area': None,
  'lot_area_unit': None,
  'h

In [22]:
# Crawl every Chicago zipcode and pool the for-sale listings into a single list.
all_sales_listing_info = []

for zipcode in chicago_zipcode:
    all_sales_listing_info += get_selling_data_for_zipcode(zipcode)
    print(f"Updated; total number of listings: {len(all_sales_listing_info)}.")
    # Short random pause (at most 0.3 s) before moving on to the next zipcode.
    time.sleep(random.uniform(0, 0.3))

Retriving zipcode: 60693, page 1 ...
Results retrieved: 41. Total number of results: 41.
Retriving zipcode: 60693, page 2 ...
Results retrieved: 41. Total number of results: 82.
Retriving zipcode: 60693, page 3 ...
Results retrieved: 41. Total number of results: 123.
Retriving zipcode: 60693, page 4 ...
Results retrieved: 41. Total number of results: 164.
Retriving zipcode: 60693, page 5 ...
Results retrieved: 41. Total number of results: 205.
Retriving zipcode: 60693, page 6 ...
Results retrieved: 41. Total number of results: 246.
Retriving zipcode: 60693, page 7 ...
Results retrieved: 41. Total number of results: 287.
Retriving zipcode: 60693, page 8 ...
Results retrieved: 41. Total number of results: 328.
Retriving zipcode: 60693, page 9 ...
Results retrieved: 41. Total number of results: 369.
Retriving zipcode: 60693, page 10 ...
Results retrieved: 41. Total number of results: 410.
Retriving zipcode: 60693, page 11 ...
Results retrieved: 41. Total number of results: 451.
Retriving 

In [23]:
all_sales_listing_info

[{'zpid': 80849564,
  'longitude': -87.62102,
  'latitude': 41.88652,
  'bathrooms': 1.0,
  'bedrooms': 1.0,
  'address': '222 N Columbus Dr APT 1901',
  'zipcode': '60601',
  'living_area': 835.0,
  'lot_area': None,
  'lot_area_unit': None,
  'home_type': 'CONDO',
  'price': 300000.0,
  'rent_zestimate': None,
  'sale_zestimate': None,
  'time_on_zillow': 6551000},
 {'zpid': 3876625,
  'longitude': -87.62145,
  'latitude': 41.862095,
  'bathrooms': 3.0,
  'bedrooms': 3.0,
  'address': '1502 S Prairie Ave UNIT A',
  'zipcode': '60605',
  'living_area': 2100.0,
  'lot_area': 0.0,
  'lot_area_unit': 'sqft',
  'home_type': 'TOWNHOUSE',
  'price': 650000.0,
  'rent_zestimate': None,
  'sale_zestimate': None,
  'time_on_zillow': 20379000},
 {'zpid': 80834255,
  'longitude': -87.62742,
  'latitude': 41.89358,
  'bathrooms': 1.0,
  'bedrooms': 1.0,
  'address': '10 E Ontario St APT 1201',
  'zipcode': '60611',
  'living_area': 765.0,
  'lot_area': None,
  'lot_area_unit': None,
  'home_type'

In [24]:
df_sales = pd.DataFrame(all_sales_listing_info)
df_sales.head()

Unnamed: 0,zpid,longitude,latitude,bathrooms,bedrooms,address,zipcode,living_area,lot_area,lot_area_unit,home_type,price,rent_zestimate,sale_zestimate,time_on_zillow
0,80849564,-87.62102,41.88652,1.0,1.0,222 N Columbus Dr APT 1901,60601,835.0,,,CONDO,300000.0,,,6551000
1,3876625,-87.62145,41.862095,3.0,3.0,1502 S Prairie Ave UNIT A,60605,2100.0,0.0,sqft,TOWNHOUSE,650000.0,,,20379000
2,80834255,-87.62742,41.89358,1.0,1.0,10 E Ontario St APT 1201,60611,765.0,,,CONDO,235000.0,,,22474000
3,101346658,-87.61461,41.892853,4.0,3.0,600 N Lake Shore Dr APT 3705,60611,2540.0,,,CONDO,1950000.0,7466.0,1805200.0,28175000
4,113958256,-87.62642,41.888954,3.0,2.0,401 N Wabash Ave #60F,60601,1837.0,,,CONDO,1495000.0,,,30543000


In [28]:
df_sales.to_csv("zillow_chicago_forsale_raw.csv", index=False)

In [25]:
df_sales.shape

(8680, 15)

In [27]:
df_sales_deduped = df_sales.drop_duplicates(subset=["zpid"], keep="first")
df_sales_deduped.shape

(4721, 15)

In [29]:
df_sales_deduped.to_csv("zillow_chicago_forsale.csv", index=False)