In [1]:
import requests 
import json 
import pandas as pd 

In [2]:
def send_request(page_number: int, offset_parameter: int):
    url = "https://www.realtor.com/api/v1/hulk?client_id=rdc-x&schema=vesta"
    headers = {"content-type": "application/json"}

    body = r'{"query":"\n\nquery ConsumerSearchMainQuery($query: HomeSearchCriteria!, $limit: Int, $offset: Int, $sort: [SearchAPISort], $sort_type: SearchSortType, $client_data: JSON, $geoSupportedSlug: String!, $bucket: SearchAPIBucket, $by_prop_type: [String])\n{\n  home_search: home_search(query: $query,\n    sort: $sort,\n    limit: $limit,\n    offset: $offset,\n    sort_type: $sort_type,\n    client_data: $client_data,\n    bucket: $bucket,\n  ){\n    count\n    total\n    results {\n      property_id\n      list_price\n      primary_photo (https: true){\n        href\n      }\n      source {\n        id\n        agents{\n          office_name\n        }\n        type\n        spec_id\n        plan_id\n      }\n      community {\n        property_id\n        description {\n          name\n        }\n        advertisers{\n          office{\n            hours\n            phones {\n              type\n              number\n            }\n          }\n          builder {\n            fulfillment_id\n          }\n        }\n      }\n      products {\n        brand_name\n        products\n      }\n      listing_id\n      matterport\n      virtual_tours{\n        href\n        type\n      }\n      status\n      permalink\n      price_reduced_amount\n      other_listings{rdc {\n      listing_id\n      status\n      listing_key\n      primary\n    }}\n      description{\n        beds\n        baths\n        baths_full\n        baths_half\n        baths_1qtr\n        baths_3qtr\n        garage\n        stories\n        type\n        sub_type\n        lot_sqft\n        sqft\n        year_built\n        sold_price\n        sold_date\n        name\n      }\n      location{\n        street_view_url\n        address{\n          line\n          postal_code\n          state\n          state_code\n          city\n          coordinate {\n            lat\n            lon\n          }\n        }\n        county {\n          name\n          fips_code\n        }\n      }\n      tax_record {\n        public_record_id\n      }\n      lead_attributes {\n        show_contact_an_agent\n        opcity_lead_attributes {\n          cashback_enabled\n          flip_the_market_enabled\n        }\n        lead_type\n      }\n      open_houses {\n        start_date\n        end_date\n        description\n        methods\n        time_zone\n        dst\n      }\n      flags{\n        is_coming_soon\n        is_pending\n        is_foreclosure\n        is_contingent\n        is_new_construction\n        is_new_listing (days: 14)\n        is_price_reduced (days: 30)\n        is_plan\n        is_subdivision\n      }\n      list_date\n      last_update_date\n      coming_soon_date\n      photos(limit: 2, https: true){\n        href\n      }\n      tags\n      branding {\n        type\n        photo\n        name\n      }\n    }\n  }\n  geo(slug_id: $geoSupportedSlug) {\n    parents {\n      geo_type\n      slug_id\n      name\n    }\n    geo_statistics(group_by: property_type) {\n      housing_market {\n        by_prop_type(type: $by_prop_type){\n          type\n           attributes{\n            median_listing_price\n            median_lot_size\n            median_sold_price\n            median_price_per_sqft\n            median_days_on_market\n          }\n        }\n        listing_count\n        median_listing_price\n        median_rent_price\n        median_price_per_sqft\n        median_days_on_market\n        median_sold_price\n        month_to_month {\n          active_listing_count_percent_change\n          median_days_on_market_percent_change\n          median_listing_price_percent_change\n          median_listing_price_sqft_percent_change\n        }\n      }\n    }\n    recommended_cities: recommended(query: {geo_search_type: city, limit: 20}) {\n      geos {\n        ... on City {\n          city\n          state_code\n          geo_type\n          slug_id\n        }\n        geo_statistics(group_by: property_type) {\n          housing_market {\n            by_prop_type(type: [\"home\"]) {\n              type\n              attributes {\n                median_listing_price\n              }\n            }\n            median_listing_price\n          }\n        }\n      }\n    }\n    recommended_neighborhoods: recommended(query: {geo_search_type: neighborhood, limit: 20}) {\n      geos {\n        ... on Neighborhood {\n          neighborhood\n          city\n          state_code\n          geo_type\n          slug_id\n        }\n        geo_statistics(group_by: property_type) {\n          housing_market {\n            by_prop_type(type: [\"home\"]) {\n              type\n              attributes {\n                median_listing_price\n              }\n            }\n            median_listing_price\n          }\n        }\n      }\n    }\n    recommended_counties: recommended(query: {geo_search_type: county, limit: 20}) {\n      geos {\n        ... on HomeCounty {\n          county\n          state_code\n          geo_type\n          slug_id\n        }\n        geo_statistics(group_by: property_type) {\n          housing_market {\n            by_prop_type(type: [\"home\"]) {\n              type\n              attributes {\n                median_listing_price\n              }\n            }\n            median_listing_price\n          }\n        }\n      }\n    }\n    recommended_zips: recommended(query: {geo_search_type: postal_code, limit: 20}) {\n      geos {\n        ... on PostalCode {\n          postal_code\n          geo_type\n          slug_id\n        }\n        geo_statistics(group_by: property_type) {\n          housing_market {\n            by_prop_type(type: [\"home\"]) {\n              type\n              attributes {\n                median_listing_price\n              }\n            }\n            median_listing_price\n          }\n        }\n      }\n    }\n  }\n}","variables":{"query":{"status":["for_sale","ready_to_build"],"primary":true,"state_code":"NY"},"client_data":{"device_data":{"device_type":"web"},"user_data":{"last_view_timestamp":-1}},"limit":42,"offset":42,"zohoQuery":{"silo":"search_result_page","location":"New York","property_status":"for_sale","filters":{},"page_index":"2"},"sort_type":"relevant","geoSupportedSlug":"","by_prop_type":["home"]},"operationName":"ConsumerSearchMainQuery","callfrom":"SRP","nrQueryType":"MAIN_SRP","visitor_id":"eff16470-ceb5-4926-8c0b-6d1779772842","isClient":true,"seoPayload":{"asPath":"/realestateandhomes-search/New-York/pg-2","pageType":{"silo":"search_result_page","status":"for_sale"},"county_needed_for_uniq":false}}'
    json_body = json.loads(body)

    json_body["variables"]["page_index"] = page_number
    json_body["seoPayload"] = page_number
    json_body["variables"]["offset"] = offset_parameter

    r = requests.post(url=url, json=json_body, headers=headers)
    json_data = r.json()
    return json_data

In [3]:
offset_parameter = 0

json_data_list = []

for page_number in range(1, 207):
    json_data = send_request(page_number=page_number, offset_parameter=offset_parameter)
    json_data_list.append(json_data)
    offset_parameter +=42

In [4]:
def extract_features(entry: dict):
    feature_dict = {
        "id": entry["property_id"],
        "price": entry["list_price"],
        "beds": entry["description"]["beds"],
        "baths": entry["description"]["baths"],
        "garage": entry["description"]["garage"],
        "stories": entry["description"]["stories"],
        "house_type": entry["description"]["type"],
        "lot_sqft": entry["description"]["lot_sqft"],
        "sqft": entry["description"]["sqft"],
        "year_built": entry["description"]["year_built"],
        "address": entry["location"]["address"]["line"],
        "postal_code": entry["location"]["address"]["postal_code"],
        "state": entry["location"]["address"]["state_code"],
        "city": entry["location"]["address"]["city"],
    }
    
    if entry["location"]["address"]["coordinate"]:
        feature_dict.update({"lat": entry["location"]["address"]["coordinate"]["lat"]})
        feature_dict.update({"lon": entry["location"]["address"]["coordinate"]["lon"]})
    if entry["location"]["county"]:
        feature_dict.update({"county": entry["location"]["county"]["name"]})
    
    return feature_dict

In [5]:
feature_dict_list = []

for data in json_data_list:
    for entry in data["data"]["home_search"]["results"]:
        feature_dict = extract_features(entry=entry)
        feature_dict_list.append(feature_dict)

df = pd.DataFrame(feature_dict_list)


Unnamed: 0,id,price,beds,baths,garage,stories,house_type,lot_sqft,sqft,year_built,address,postal_code,state,city,lat,lon,county
0,3106570269,139900,3.0,2.0,,,single_family,12632.0,1190.0,1920.0,154 Maple Ave,12053,NY,Delanson,42.734360,-74.185005,Schenectady
1,3887939011,395000,4.0,3.0,2.0,2.0,single_family,30056.0,2987.0,1982.0,1169 Hidden Valley Trl,14580,NY,Webster,43.243062,-77.440707,Monroe
2,4954163177,185000,4.0,2.0,1.0,1.0,single_family,7501.0,1863.0,1965.0,7869 Oneida Trl,13030,NY,Bridgeport,43.163020,-75.982109,Onondaga
3,3348454727,309900,3.0,2.0,3.0,,single_family,5227.0,2080.0,1929.0,44 Van Schoick Ave,12208,NY,Albany,42.653406,-73.798521,Albany
4,4495247096,440000,4.0,3.0,2.0,2.0,single_family,17860.0,1940.0,1965.0,16 Brookland Farms Rd,12601,NY,Poughkeepsie,41.635654,-73.910101,Dutchess
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8647,3698584093,1049000,6.0,2.0,1.0,,multi_family,2896.0,,1920.0,74-15 88th Ave,11421,NY,Woodhaven,40.689428,-73.867066,Queens
8648,9761946482,1495000,1.0,1.0,,16.0,coop,,,1925.0,45 5th Ave Apt 17C,10003,NY,New York City,40.734161,-73.994557,New York
8649,3490807723,1395000,3.0,1.0,,5.0,condos,,,1890.0,705 Carroll St Apt 4R,11215,NY,New York City,40.673940,-73.977606,Kings
8650,4568201155,4500000,6.0,4.0,,4.0,townhomes,,,,608 3rd St,11215,NY,New York City,40.668649,-73.975270,Kings


Unnamed: 0,price,beds,baths,garage,stories,house_type,lot_sqft,sqft,year_built,address,state,city,county
0,464900,4.0,3.0,2.0,2.0,single_family,387684.0,2092.0,1992.0,1865 Albany Rd,NY,Frankfort,Herkimer
1,189000,4.0,2.0,2.0,2.0,single_family,46304.0,1696.0,1903.0,3802 Piffard Cir E,NY,Piffard,Livingston
2,324800,3.0,2.0,2.0,,single_family,44431.0,1758.0,1987.0,318 Carrolls Grove Rd,NY,Troy,Rensselaer
3,263000,3.0,2.0,,,single_family,3049.0,1728.0,1904.0,74 Cherry St,NY,Glens Falls,Warren
4,295900,2.0,1.0,1.0,1.0,single_family,18774.0,896.0,1960.0,10511 Fanta Ln Unit 57B,NY,Lyndonville,Orleans
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8647,998000,3.0,2.0,1.0,2.0,single_family,6000.0,1748.0,1940.0,80-15 236th St,NY,Queens Village,Queens
8648,1950000,,4.0,,,multi_family,1650.0,2970.0,,246 Lexington Ave,NY,Brooklyn,Kings
8649,1900000,4.0,3.0,2.0,3.0,single_family,22216.0,2858.0,1953.0,124 Woodhill Ln,NY,Manhasset,Nassau
8650,438400,0.0,1.0,,8.0,condos,,410.0,2019.0,567 Ocean Ave Apt A504,NY,New York City,Kings


In [114]:
#cheking if there are missing values
df.isnull().sum()




price            0
beds           148
baths           94
garage        4189
stories       2882
house_type       0
lot_sqft      1872
sqft          1885
year_built     291
address         19
state            0
city             4
county          20
dtype: int64

In [115]:
df=df.dropna(subset=['beds', 'baths', 'sqft','lot_sqft', 'address', 'city', 'county'])
df['year_built'] = df['year_built'].fillna(df['year_built'].median())
df['garage'] = df['garage'].fillna(0)
df['stories'] = df['stories'].fillna(0)
df = df[df.beds != 0]
df = df[df.baths != 0]




df.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year_built'] = df['year_built'].fillna(df['year_built'].median())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['garage'] = df['garage'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['stories'] = df['stories'].fillna(0)


price         0
beds          0
baths         0
garage        0
stories       0
house_type    0
lot_sqft      0
sqft          0
year_built    0
address       0
state         0
city          0
county        0
dtype: int64

In [116]:
df.shape

(5401, 13)

In [117]:
# sorting data by Price 
df=df.sort_values(['price'], ascending = [False]) 
df.reset_index(drop=True, inplace=True)
df





Unnamed: 0,price,beds,baths,garage,stories,house_type,lot_sqft,sqft,year_built,address,state,city,county
0,31500000,5.0,6.0,0.0,0.0,single_family,240887.0,6188.0,1975.0,359 Meadow Ln,NY,Southampton,Suffolk
1,26250000,3.0,5.0,0.0,96.0,condos,34470.0,4019.0,2015.0,432 Park Ave Apt 62B,NY,New York City,New York
2,25000000,5.0,6.0,0.0,25.0,condos,4937.0,4089.0,2022.0,555 W 22nd St Unit 11AW,NY,New York City,New York
3,25000000,5.0,8.0,0.0,17.0,coop,11715.0,7000.0,1916.0,550 Park Ave Unit 10TH,NY,New York City,New York
4,24950000,9.0,12.0,5.0,0.0,single_family,130680.0,15200.0,1920.0,275 Ox Pasture Rd,NY,Southampton,Suffolk
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5396,19900,1.0,1.0,0.0,1.0,single_family,2801.0,374.0,1935.0,4211 State Route 3,NY,Fine,St. Lawrence
5397,19900,3.0,1.0,0.0,0.0,single_family,3485.0,700.0,1980.0,483 1st Ave,NY,Troy,Rensselaer
5398,19900,3.0,2.0,1.0,2.0,single_family,3485.0,1200.0,1880.0,4213 State Highway 3,NY,Star Lake,St. Lawrence
5399,10000,2.0,1.0,1.0,1.0,mobile,39639600.0,910.0,1974.0,20 Quarry Hill Est,NY,Akron,Erie


In [118]:
df = df.reset_index(drop=True)
df.dtypes

price           int64
beds          float64
baths         float64
garage        float64
stories       float64
house_type     object
lot_sqft      float64
sqft          float64
year_built    float64
address        object
state          object
city           object
county         object
dtype: object

In [119]:
#cheking if we have duplicated data
print("There is {} duplicated values in data frame".format(df.duplicated().sum()))



There is 5 duplicated values in data frame


In [120]:
#deleiting duplicated values
df.drop_duplicates(keep=False,inplace=True)

print("There is {} duplicated values in data frame".format(df.duplicated().sum()))


There is 0 duplicated values in data frame


In [121]:
#saving to csv file
df.to_csv('RealEstateNewYork.csv', index=False)
