In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
import random
from random import randint

In [2]:
zipcode_dict = dict() #dictionary of Zip Codes by Neighborhood in Manhattan
# Reference: https://www.health.ny.gov/statistics/cancer/registry/appendix/neighborhoods.htm
zipcode_dict['Chelsea and Clinton'] = ['10001', '10011', '10018', '10019', '10020', '10036']
zipcode_dict['Central Harlem'] = ['10026', '10027', '10030', '10037', '10039']
zipcode_dict['East Harlem'] = ['10029', '10035']
zipcode_dict['Gramercy Park and Murray Hill'] = ['10010', '10016', '10017', '10022']
zipcode_dict['Greenwich Village and Soho'] = ['10012', '10013', '10014']
zipcode_dict['Lower Manhattan'] =['10004', '10005', '10006', '10007', '10038', '10280']
zipcode_dict['Lower East Side'] =['10002', '10003', '10009']
zipcode_dict['Upper East Side'] =['10021', '10028', '10044', '10065', '10075', '10128']
zipcode_dict['Upper West Side'] =['10023', '10024', '10025']
zipcode_dict['Inwood and Washington Heights'] =['10031', '10032', '10033', '10034', '10040']
zipcode_dict['Roosevelt Island'] =['10044']

In [3]:
def get_links(zipcode_list): # get link of each home on the Zillow market
    link_list = list()
    req_headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.8',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    }
    
    with requests.Session() as s:
        for zipcode in zipcode_list:     
            for i in range(1, 10):
                try:
                    page_url = 'https://www.zillow.com/homes/for_sale/'+zipcode+'_rb/'+str(i)+'_p/'
                    print(page_url)
                    r = s.get(page_url, headers=req_headers)
                    soup = BeautifulSoup(r.content, 'lxml')
                    tags = soup.find_all('div', {'class': 'zsg-photo-card-content zsg-aspect-ratio-content'})
                    for tag in tags:
                        link_list.append(tag.find('a').get('href')) 
                except:
                    # no next page
                    break
        time.sleep(5)
    return link_list

**Need to scrape each neighborhood separately, using for loop would experience IP ban even if we use time.sleep()**

In [29]:
links = get_links(zipcode_dict['Chelsea and Clinton'])[0]

https://www.zillow.com/homes/for_sale/10001_rb/1_p/
https://www.zillow.com/homes/for_sale/10001_rb/2_p/
https://www.zillow.com/homes/for_sale/10001_rb/3_p/
https://www.zillow.com/homes/for_sale/10001_rb/4_p/
https://www.zillow.com/homes/for_sale/10001_rb/5_p/
https://www.zillow.com/homes/for_sale/10001_rb/6_p/
https://www.zillow.com/homes/for_sale/10001_rb/7_p/
https://www.zillow.com/homes/for_sale/10001_rb/8_p/
https://www.zillow.com/homes/for_sale/10001_rb/9_p/
https://www.zillow.com/homes/for_sale/10011_rb/1_p/
https://www.zillow.com/homes/for_sale/10011_rb/2_p/
https://www.zillow.com/homes/for_sale/10011_rb/3_p/
https://www.zillow.com/homes/for_sale/10011_rb/4_p/
https://www.zillow.com/homes/for_sale/10011_rb/5_p/
https://www.zillow.com/homes/for_sale/10011_rb/6_p/
https://www.zillow.com/homes/for_sale/10011_rb/7_p/
https://www.zillow.com/homes/for_sale/10011_rb/8_p/
https://www.zillow.com/homes/for_sale/10011_rb/9_p/
https://www.zillow.com/homes/for_sale/10018_rb/1_p/
https://www.

In [30]:
links[]

['/homedetails/520-W-28th-St-New-York-NY-10001/2086472572_zpid/',
 '/homedetails/536-W-29th-St-New-York-NY-10001/2086334517_zpid/',
 '/homedetails/11-W-30th-St-14FL-New-York-NY-10001/2086521150_zpid/',
 '/homedetails/522-W-29th-St-9A-New-York-NY-10001/2096425505_zpid/',
 '/homedetails/1200-Broadway-PH-E-New-York-NY-10001/2087023103_zpid/',
 '/homedetails/35-Hudson-Yards-8601-New-York-NY-10001/2085060290_zpid/',
 '/homedetails/284-5th-Ave-3E-New-York-NY-10001/123762025_zpid/',
 '/homedetails/520-W-28th-St-PH37-New-York-NY-10001/2098508808_zpid/',
 '/homedetails/252-W-30th-St-APT-10A-New-York-NY-10001/31503372_zpid/',
 '/homedetails/550-W-29th-St-PHC-New-York-NY-10001/2086457634_zpid/',
 '/homedetails/110-W-26th-St-New-York-NY-10001/2106473188_zpid/',
 '/homedetails/448-W-25th-St-New-York-NY-10001/31502269_zpid/',
 '/homedetails/522-W-29th-St-10B-New-York-NY-10001/2106456199_zpid/',
 '/homedetails/502-W-27th-St-New-York-NY-10001/31502012_zpid/',
 '/homedetails/315-W-29th-St-New-York-NY-1

In [32]:
df = pd.DataFrame(links)
df = df.drop_duplicates()
df['region'] = 'Chelsea and Clinton'

In [34]:
df = df.drop(columns=['zip_code'])

In [35]:
df.head()

Unnamed: 0,0,region
0,/homedetails/520-W-28th-St-New-York-NY-10001/2...,Chelsea and Clinton
1,/homedetails/536-W-29th-St-New-York-NY-10001/2...,Chelsea and Clinton
2,/homedetails/11-W-30th-St-14FL-New-York-NY-100...,Chelsea and Clinton
3,/homedetails/522-W-29th-St-9A-New-York-NY-1000...,Chelsea and Clinton
4,/homedetails/1200-Broadway-PH-E-New-York-NY-10...,Chelsea and Clinton


In [36]:
df.to_csv('Chelsea and Clinton.csv')

In [4]:
links = get_links(zipcode_dict['Upper West Side'])

https://www.zillow.com/homes/for_sale/10023_rb/1_p/
https://www.zillow.com/homes/for_sale/10023_rb/2_p/
https://www.zillow.com/homes/for_sale/10023_rb/3_p/
https://www.zillow.com/homes/for_sale/10023_rb/4_p/
https://www.zillow.com/homes/for_sale/10023_rb/5_p/
https://www.zillow.com/homes/for_sale/10023_rb/6_p/
https://www.zillow.com/homes/for_sale/10023_rb/7_p/
https://www.zillow.com/homes/for_sale/10023_rb/8_p/
https://www.zillow.com/homes/for_sale/10023_rb/9_p/
https://www.zillow.com/homes/for_sale/10024_rb/1_p/
https://www.zillow.com/homes/for_sale/10024_rb/2_p/
https://www.zillow.com/homes/for_sale/10024_rb/3_p/
https://www.zillow.com/homes/for_sale/10024_rb/4_p/
https://www.zillow.com/homes/for_sale/10024_rb/5_p/
https://www.zillow.com/homes/for_sale/10024_rb/6_p/
https://www.zillow.com/homes/for_sale/10024_rb/7_p/
https://www.zillow.com/homes/for_sale/10024_rb/8_p/
https://www.zillow.com/homes/for_sale/10024_rb/9_p/
https://www.zillow.com/homes/for_sale/10025_rb/1_p/
https://www.

In [6]:
df_2 = pd.DataFrame(links)
df_2 = df_2.drop_duplicates()
df_2['region'] = 'Upper West Side'
df_2.columns = ['link','region']
df_2.to_csv('Upper West Side.csv')

In [7]:
links = get_links(zipcode_dict['Upper East Side'])

https://www.zillow.com/homes/for_sale/10021_rb/1_p/
https://www.zillow.com/homes/for_sale/10021_rb/2_p/
https://www.zillow.com/homes/for_sale/10021_rb/3_p/
https://www.zillow.com/homes/for_sale/10021_rb/4_p/
https://www.zillow.com/homes/for_sale/10021_rb/5_p/
https://www.zillow.com/homes/for_sale/10021_rb/6_p/
https://www.zillow.com/homes/for_sale/10021_rb/7_p/
https://www.zillow.com/homes/for_sale/10021_rb/8_p/
https://www.zillow.com/homes/for_sale/10021_rb/9_p/
https://www.zillow.com/homes/for_sale/10028_rb/1_p/
https://www.zillow.com/homes/for_sale/10028_rb/2_p/
https://www.zillow.com/homes/for_sale/10028_rb/3_p/
https://www.zillow.com/homes/for_sale/10028_rb/4_p/
https://www.zillow.com/homes/for_sale/10028_rb/5_p/
https://www.zillow.com/homes/for_sale/10028_rb/6_p/
https://www.zillow.com/homes/for_sale/10028_rb/7_p/
https://www.zillow.com/homes/for_sale/10028_rb/8_p/
https://www.zillow.com/homes/for_sale/10028_rb/9_p/
https://www.zillow.com/homes/for_sale/10044_rb/1_p/
https://www.

In [8]:
df_3 = pd.DataFrame(links)
df_3 = df_3.drop_duplicates()
df_3['region'] = 'Upper East Side'
df_3.columns = ['link','region']
df_3.to_csv('Upper East Side.csv')

In [17]:
links = get_links(zipcode_dict['Lower East Side'])

https://www.zillow.com/homes/for_sale/10002_rb/1_p/
https://www.zillow.com/homes/for_sale/10002_rb/2_p/
https://www.zillow.com/homes/for_sale/10002_rb/3_p/
https://www.zillow.com/homes/for_sale/10002_rb/4_p/
https://www.zillow.com/homes/for_sale/10002_rb/5_p/
https://www.zillow.com/homes/for_sale/10002_rb/6_p/
https://www.zillow.com/homes/for_sale/10002_rb/7_p/
https://www.zillow.com/homes/for_sale/10002_rb/8_p/
https://www.zillow.com/homes/for_sale/10002_rb/9_p/
https://www.zillow.com/homes/for_sale/10003_rb/1_p/
https://www.zillow.com/homes/for_sale/10003_rb/2_p/
https://www.zillow.com/homes/for_sale/10003_rb/3_p/
https://www.zillow.com/homes/for_sale/10003_rb/4_p/
https://www.zillow.com/homes/for_sale/10003_rb/5_p/
https://www.zillow.com/homes/for_sale/10003_rb/6_p/
https://www.zillow.com/homes/for_sale/10003_rb/7_p/
https://www.zillow.com/homes/for_sale/10003_rb/8_p/
https://www.zillow.com/homes/for_sale/10003_rb/9_p/
https://www.zillow.com/homes/for_sale/10009_rb/1_p/
https://www.

In [19]:
df_4 = pd.DataFrame(links)
df_4 = df_4.drop_duplicates()
df_4['region'] = 'Lower East Side'
df_4.columns = ['link','region']
df_4.to_csv('Lower East Side.csv')

In [20]:
links = get_links(zipcode_dict['Greenwich Village and Soho'] )

https://www.zillow.com/homes/for_sale/10012_rb/1_p/
https://www.zillow.com/homes/for_sale/10012_rb/2_p/
https://www.zillow.com/homes/for_sale/10012_rb/3_p/
https://www.zillow.com/homes/for_sale/10012_rb/4_p/
https://www.zillow.com/homes/for_sale/10012_rb/5_p/
https://www.zillow.com/homes/for_sale/10012_rb/6_p/
https://www.zillow.com/homes/for_sale/10012_rb/7_p/
https://www.zillow.com/homes/for_sale/10012_rb/8_p/
https://www.zillow.com/homes/for_sale/10012_rb/9_p/
https://www.zillow.com/homes/for_sale/10013_rb/1_p/
https://www.zillow.com/homes/for_sale/10013_rb/2_p/
https://www.zillow.com/homes/for_sale/10013_rb/3_p/
https://www.zillow.com/homes/for_sale/10013_rb/4_p/
https://www.zillow.com/homes/for_sale/10013_rb/5_p/
https://www.zillow.com/homes/for_sale/10013_rb/6_p/
https://www.zillow.com/homes/for_sale/10013_rb/7_p/
https://www.zillow.com/homes/for_sale/10013_rb/8_p/
https://www.zillow.com/homes/for_sale/10013_rb/9_p/
https://www.zillow.com/homes/for_sale/10014_rb/1_p/
https://www.

In [22]:
df_5 = pd.DataFrame(links)
df_5 = df_5.drop_duplicates()
df_5['region'] = 'Greenwich Village and Soho'
df_5.columns = ['link','region']
df_5.to_csv('Greenwich Village and Soho.csv')

In [23]:
links = get_links(zipcode_dict['Gramercy Park and Murray Hill'] )

https://www.zillow.com/homes/for_sale/10010_rb/1_p/
https://www.zillow.com/homes/for_sale/10010_rb/2_p/
https://www.zillow.com/homes/for_sale/10010_rb/3_p/
https://www.zillow.com/homes/for_sale/10010_rb/4_p/
https://www.zillow.com/homes/for_sale/10010_rb/5_p/
https://www.zillow.com/homes/for_sale/10010_rb/6_p/
https://www.zillow.com/homes/for_sale/10010_rb/7_p/
https://www.zillow.com/homes/for_sale/10010_rb/8_p/
https://www.zillow.com/homes/for_sale/10010_rb/9_p/
https://www.zillow.com/homes/for_sale/10016_rb/1_p/
https://www.zillow.com/homes/for_sale/10016_rb/2_p/
https://www.zillow.com/homes/for_sale/10016_rb/3_p/
https://www.zillow.com/homes/for_sale/10016_rb/4_p/
https://www.zillow.com/homes/for_sale/10016_rb/5_p/
https://www.zillow.com/homes/for_sale/10016_rb/6_p/
https://www.zillow.com/homes/for_sale/10016_rb/7_p/
https://www.zillow.com/homes/for_sale/10016_rb/8_p/
https://www.zillow.com/homes/for_sale/10016_rb/9_p/
https://www.zillow.com/homes/for_sale/10017_rb/1_p/
https://www.

In [25]:
df_6 = pd.DataFrame(links)
df_6 = df_6.drop_duplicates()
df_6['region'] = 'Gramercy Park and Murray Hill'
df_6.columns = ['link','region']
df_6.to_csv('Gramercy Park and Murray Hill.csv')

In [26]:
links = get_links(zipcode_dict['Central Harlem'] )

https://www.zillow.com/homes/for_sale/10026_rb/1_p/
https://www.zillow.com/homes/for_sale/10026_rb/2_p/
https://www.zillow.com/homes/for_sale/10026_rb/3_p/
https://www.zillow.com/homes/for_sale/10026_rb/4_p/
https://www.zillow.com/homes/for_sale/10026_rb/5_p/
https://www.zillow.com/homes/for_sale/10026_rb/6_p/
https://www.zillow.com/homes/for_sale/10026_rb/7_p/
https://www.zillow.com/homes/for_sale/10026_rb/8_p/
https://www.zillow.com/homes/for_sale/10026_rb/9_p/
https://www.zillow.com/homes/for_sale/10027_rb/1_p/
https://www.zillow.com/homes/for_sale/10027_rb/2_p/
https://www.zillow.com/homes/for_sale/10027_rb/3_p/
https://www.zillow.com/homes/for_sale/10027_rb/4_p/
https://www.zillow.com/homes/for_sale/10027_rb/5_p/
https://www.zillow.com/homes/for_sale/10027_rb/6_p/
https://www.zillow.com/homes/for_sale/10027_rb/7_p/
https://www.zillow.com/homes/for_sale/10027_rb/8_p/
https://www.zillow.com/homes/for_sale/10027_rb/9_p/
https://www.zillow.com/homes/for_sale/10030_rb/1_p/
https://www.

In [28]:
df_7 = pd.DataFrame(links)
df_7 = df_7.drop_duplicates()
df_7['region'] = 'Central Harlem'
df_7.columns = ['link','region']
df_7.to_csv('Central Harlem.csv')

ValueError: Length mismatch: Expected axis has 1 elements, new values have 2 elements

In [None]:
links = get_links(zipcode_dict['East Harlem'] )

In [None]:
df_8 = pd.DataFrame(links)
df_8 = df_8.drop_duplicates()
df_8['region'] = 'East Harlem'
df_8.to_csv('East Harlem.csv')

In [None]:
links = get_links(zipcode_dict['Inwood and Washington Heights'])

In [None]:
df_9 = pd.DataFrame(links)
df_9 = df_9.drop_duplicates()
df_9['region'] = 'Inwood and Washington Heights'
df_9.to_csv('Inwood and Washington Heights.csv')

In [None]:
links = get_links(zipcode_dict['Roosevelt Island'])

In [None]:
df_10 = pd.DataFrame(links)
df_10 = df_10.drop_duplicates()
df_10['region'] = 'Roosevelt Island'
df_10.to_csv('Roosevelt Island.csv')

In [None]:
frames = [df,df_2,df_3,df_4,df_5,df_6,df_7,df_8,df_9,df_10]

In [None]:
df_all_links = pd.concat(frames) 

In [None]:
df_all_links.columns = ['link','region']

In [None]:
df_all_links = df_all_links.reindex()

In [None]:
df_all_links = df_all_links.drop_duplicates(subset = ['link'])

In [None]:
len(df_all_links)

In [None]:
df_all_links.to_csv('all_links.csv')

In [None]:


<span class="ds-vertical-divider ds-bed-bath-living-area"><span>2</span><span class="ds-summary-row-label-secondary"> bd</span></span>
