In [None]:
 ================================================================
                         DATA GENERATION 
 =================================================================

In [10]:
import requests
import json
import os
from dotenv import load_dotenv
from datetime import datetime
import pandas as pd

In [11]:
# Load environment variables
load_dotenv()

# Get the directory where this particular script is located
BASE_DIR = os.path.dirname(os.path.abspath("extract.ipynb"))

# Creation of environment variables to protect sensitive data
url = os.getenv("url")
API_KEY = os.getenv("API_KEY")
API_HOST = os.getenv("API_HOST")

In [12]:
# Calling API
url = url

querystring = {"location":"Houston, TX",
               "status":"forSale",
               "sort":"relevance",
               "sortType":"asc",
               "priceType":"listPrice",
               "listingType":"agent"}

headers = {
	"x-rapidapi-key": API_KEY,
	"x-rapidapi-host": API_HOST
}

In [13]:
# DATA COLLECTION

# make request
response = requests.get(url, headers=headers, params=querystring)

# error handling using try and except
try:
    data = response.json()
    print("SUCCESS! - Data has been pulled from Zillow API (page 1)")
except ValueError:
    print("Error: non JSON received")
    print(response.text)
    exit(1)

SUCCESS! - Data has been pulled from Zillow API (page 1)


In [14]:
# Get total pages from the metadata
all_records = []
total_pages_to_extract = 5

# Add records from page 1
if "data" in data:
    all_records.extend(data["data"])
else:
    print("Page 1 returned no 'data' field. Full response:")
    print(data)
    exit(1)

# Loop through page 2 -10 and extract just 20 pages
for page in range(2, total_pages_to_extract + 1):
    querystring['page'] = page
    response = requests.get(url, headers=headers, params=querystring)

    try:
        page_data = response.json()
    except ValueError:
       print(f"Page {page} returned invalid JSON. Skipping.")
       continue

 # Check if “data” exists
    if "data" not in page_data:
        print(f"\n⚠️ PAGE {page} returned no data. Response:")
        print(page_data)
        continue

    all_records.extend(page_data["data"])
    print(f"Page {page} collected successfully.")

print(f"\nTOTAL RECORDS COLLECTED: {len(all_records)} from {total_pages_to_extract} pages")

Page 1 returned no 'data' field. Full response:
{'message': 'You have exceeded the MONTHLY quota for Requests on your current plan, BASIC. Upgrade your plan at https://rapidapi.com/ntd119/api/zillow-com4'}

⚠️ PAGE 2 returned no data. Response:
{'message': 'You have exceeded the MONTHLY quota for Requests on your current plan, BASIC. Upgrade your plan at https://rapidapi.com/ntd119/api/zillow-com4'}

⚠️ PAGE 3 returned no data. Response:
{'message': 'You have exceeded the MONTHLY quota for Requests on your current plan, BASIC. Upgrade your plan at https://rapidapi.com/ntd119/api/zillow-com4'}

⚠️ PAGE 4 returned no data. Response:
{'message': 'You have exceeded the MONTHLY quota for Requests on your current plan, BASIC. Upgrade your plan at https://rapidapi.com/ntd119/api/zillow-com4'}

⚠️ PAGE 5 returned no data. Response:
{'message': 'You have exceeded the MONTHLY quota for Requests on your current plan, BASIC. Upgrade your plan at https://rapidapi.com/ntd119/api/zillow-com4'}

TOTAL

In [None]:
# ===================================================================
#                           DATA STORAGE
# ===================================================================

In [None]:
# Creating a storage folder in local machine
today_date = datetime.now().strftime("%Y-%m-%d")  # Getting today's date
folder_path = os.path.join(BASE_DIR, "data", "raw_data") # File path creation
os.makedirs(folder_path, exist_ok=True)

# Creating the file path 
file_path = f"{folder_path}/{today_date}.json"

# Dumping into the filepath the API response
with open(file_path, "w") as file:
    json.dump(all_records, file, indent=4)