
@ author : Ying Liu

# Scrape data from Renthop

# Features:

- Amenities

- price
- listing_id
- hopscore
- street_address
- neighborhoods
- expert
- bedroom
- bathroom
- description
- median_price
- num_transportation
- photo
- No Fee
- transportation




In [7]:
import requests
import urllib
from bs4 import BeautifulSoup
import time
import numpy as np
import re
import pandas as pd
import json

In [8]:
def house_rent(url_link):
    """Build the RentHop NYC search-results URL for one results page.

    Args:
        url_link: Page number to request, given as a string or an integer.

    Returns:
        The search URL with ``url_link`` appended as the ``page`` query value.
    """
    # str() lets callers pass the page number as an int; a str is unchanged.
    return 'https://www.renthop.com/search/nyc?page=' + str(url_link)

In [9]:
# return each house's details

def getHouseDetail(url):
    """Fetch one listing's detail page and extract its page-level fields.

    Args:
        url: URL of the listing detail page to download.

    Returns:
        A dict with the keys:
            "Description": text of the first description ``div``.
            "Median_price": stripped text of the first median-price ``div``,
                or ``0`` when the page has no such element.
            "num_transportation": number of ``subway_nyc`` ``div`` elements.
            "photo_link": list of ``href`` values of the photo anchors.
            "Amenities": list of stripped amenity cell texts.
            "trans": list of stripped ``subway_nyc`` ``div`` texts.

    Raises:
        IndexError: If the page contains no description ``div``.
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")

    # Description: the element is identified only by its inline style.
    description = soup.find_all(
        "div", {"style": "font-size: 0.90em; line-height: 140%;"}
    )[0].text

    # Photos: each house has its own list of photo links.
    photo_link = [
        anchor['href']
        for anchor in soup.find_all("a", {"data-lightbox": "listing-photos-set"})
    ]

    # Amenities: query the document once instead of once per loop iteration.
    Amenities = [
        cell.text.strip()
        for cell in soup.find_all(
            'td', {'style': 'vertical-align: top; padding-left: 8px;'}
        )
    ]

    # Median price: fall back to 0 when the page does not show one.
    median_price_div = soup.find_all(
        'div', {'style': 'text-align: center; font-size: 1.00em;'}
    )
    if median_price_div:
        median_price_info = median_price_div[0].text.strip()
    else:
        median_price_info = 0

    # Transportation: one query feeds both the names and the count.
    subway = soup.find_all('div', {'class': 'subway_nyc'})
    trans = [stop.text.strip() for stop in subway]
    num_subway = len(subway)

    return {
        "Description": description,
        "Median_price": median_price_info,
        "num_transportation": num_subway,
        "photo_link": photo_link,
        "Amenities": Amenities,
        "trans": trans,
    }


In [10]:
# Scrape every search-results page and collect the fields of each listing.
# The lists below are filled in parallel: every listing appends exactly one
# entry to each of them, so index i refers to the same listing everywhere.
price = []
listing_id = []
hopscore = []
street_address = []
neighborhoods = []
expert = []
bedroom = []
bathroom = []
description = []
median_price = []
num_transportation = []
Amenities = []
photo_link = []
trans = []
# Latitude/longitude are not collected here.
# NOTE(review): an earlier, commented-out geocoding attempt embedded a Google
# API key in this file; treat that key as exposed and rotate it. If geocoding
# is reinstated, read the key from configuration instead of hard-coding it.

# Pages 1 to 200 inclusive (20 listings per page, about 4000 samples).
for x in range(1, 201):
    url_page = house_rent(str(x))
    r_page = requests.get(url_page)
    soup = BeautifulSoup(r_page.content, "html.parser")
    house_data = soup.find_all("div", {"class": "search-listing"})  # 20 house listings

    for house in house_data:
        # Each house has its own listing_id; the ids of its child elements
        # are derived from it ('listing-<id>-price', 'listing-<id>-title', ...).
        house_id = house.get('listing_id')

        # URL of the listing's own page, then the fields scraped from it.
        house_url = house.find_all(
            'a', {'class': 'font-size-90 listing-title-link'}, href=True
        )[0]['href']
        house_detail_info = getHouseDetail(house_url)  # returns a dictionary

        description.append(house_detail_info["Description"])
        median_price.append(house_detail_info["Median_price"])
        Amenities.append(house_detail_info["Amenities"])
        photo_link.append(house_detail_info["photo_link"])
        trans.append(house_detail_info["trans"])
        num_transportation.append(house_detail_info["num_transportation"])

        # price
        price_id = 'listing-{}-price'.format(house_id)
        price_div = house.find_all('div', {'id': price_id})[0]
        price.append(price_div.text.strip())

        # listing_id
        listing_id.append(int(house_id))

        # hopscore
        hopscore_id = 'listing-{}-hopscore'.format(house_id)
        hopscore_div = house.find_all('div', {'id': hopscore_id})[0]
        hopscore.append(float(hopscore_div.text.strip()))

        # street_address: everything after the first 'at' in the title.
        # NOTE(review): this matches the first 'at' anywhere in the title,
        # including inside a word - confirm the title format guarantees the
        # separator comes first.
        title_id = 'listing-{}-title'.format(house_id)
        title_a = house.find_all('a', {'id': title_id})[0]
        title = title_a.text.strip()
        text = 'at'
        address_name = title[title.find(text) + len(text) + 1:]
        street_address.append(address_name)

        # bedroom: the single character right before 'BR', else default to 1.
        # NOTE(review): this keeps one character (as a str) while the default
        # is the int 1; multi-character counts would be truncated - confirm
        # against real titles.
        if 'BR' in title:
            bedroom.append(title[title.find('BR') - 1])
        else:
            bedroom.append(1)

        # bathroom: same rule as bedroom, keyed on 'BA'.
        if 'BA' in title:
            bathroom.append(title[title.find('BA') - 1])
        else:
            bathroom.append(1)

        # neighborhoods
        neighbor_id = 'listing-{}-neighborhoods'.format(house_id)
        neighbor = house.find_all('div', {'id': neighbor_id})[0].text.strip()
        neighborhoods.append(neighbor)

        # expert: 0 when the listing carries no expert badge.
        expert_div = house.find_all('div', {'class': 'font-size-80 overflow-clip'})
        if len(expert_div) > 0:
            expert.append(expert_div[0].text.strip())
        else:
            expert.append(0)

# no fee: 1 when a listing's amenities contain the exact entry "No Fee".
# Computed once after scraping instead of being rebuilt from the whole
# Amenities list for every listing; this also defines noFee when nothing
# was scraped.
noFee = [1 if "No Fee" in a else 0 for a in Amenities]

In [11]:
# Assemble the per-listing lists into one table, one column per feature.
# Latitude and longitude are deliberately left out: something is wrong with
# those values, so they are not part of the frame.
data = {
    'price': price,
    'listing_id': listing_id,
    'hopscore': hopscore,
    'street_address': street_address,
    'neighborhoods': neighborhoods,
    'expert': expert,
    'bedroom': bedroom,
    'bathroom': bathroom,
    'description': description,
    'median_price': median_price,
    'num_transportation': num_transportation,
    'photo': photo_link,
    'Amenities': Amenities,
    'No Fee': noFee,
    'transportation': trans,
}
Rent_house = pd.DataFrame.from_dict(data)
# Bare expression: displays the frame when run as a notebook cell.
Rent_house

Unnamed: 0,Amenities,No Fee,bathroom,bedroom,description,expert,hopscore,listing_id,median_price,neighborhoods,num_transportation,photo,price,street_address,transportation
0,"[No Fee, Featured, Roof Deck, Dining Room, Doo...",1,2,3,\nAMAZING MURRAY HILL SPACIOUS 2 BEDROOM + 2 B...,0,100.0,9004417,"$4,875","Murray Hill, Midtown Manhattan, Manhattan",21,[https://photos.renthop.com/7/9004417_47637c17...,"$4,800",East 39th Street,"[4, 5, 6, 7, S, 6, B, D, F, M, 7, E, 6, M, N, ..."
1,"[No Fee, Featured, Elevator, Laundry In Buildi...",1,1,2,\nMASSIVE & FULLY RENOVATED *TRUE* 2 BEDROOM 1...,Upper East Side Expert,100.0,8135722,"$3,000","Upper East Side, Upper Manhattan, Manhattan",14,[https://photos.renthop.com/7/7581252_49dfa883...,"$3,425",East 78th ESt,"[N, Q, T, 6, 4, 5, 6, F, 4, 5, 6, N, Q, R]"
2,"[Featured, Roof Deck, Doorman, Elevator, Laund...",0,1,1,\n// AMAZING DEAL // LUXURIOUS // MURRAY HILL ...,0,100.0,9024451,"$3,475","Kips Bay, Midtown Manhattan, Manhattan",16,[https://photos.renthop.com/7/9024451_66efbbb8...,"$3,200",East 29th Street,"[6, N, R, 4, 5, 6, 7, S, L, 4, 5, 6, N, Q, R, L]"
3,"[No Fee, Featured, Doorman, Elevator, Pre-War,...",1,2,1,\nNet Rent advertised. Based on 1 Month Free o...,0,100.0,8821026,"$4,150","Koreatown, Midtown Manhattan, Manhattan",26,[https://photos.renthop.com/7/8821026_22d59d29...,"$4,426",50 West 34th Street,"[N, Q, R, B, D, F, M, N, R, 6, B, D, F, M, 7, ..."
4,"[No Fee, Featured, Doorman, Elevator, Pre-War,...",1,2,1,\nNet Rent advertised. Based on 1 Month Free o...,0,100.0,8971416,"$4,150","Koreatown, Midtown Manhattan, Manhattan",26,[https://photos.renthop.com/7/8971416_f538953c...,"$4,315",50 West 34th Street,"[N, Q, R, B, D, F, M, N, R, 6, B, D, F, M, 7, ..."
5,"[No Fee, Featured, elevator, patio, High Ceili...",1,1,3,\n** NO BROKER'S FEE **\n\nThe apartment is in...,0,100.0,8852792,"$3,825","East Williamsburg, Williamsburg, Northern Broo...",9,[https://photos.renthop.com/7/8852792_3f5676ce...,"$3,590","679 Grand Street, Apt 4B","[L, G, L, G, M, J, M, J, Z]"
6,"[Featured, Doorman, Elevator, Pre-War, Dishwas...",0,1,1,\nNet Rent advertised. Based on 1 Month Free o...,0,100.0,8971424,"$3,850","Koreatown, Midtown Manhattan, Manhattan",26,[https://photos.renthop.com/7/8971424_94586bba...,"$3,369",50 West 34th Street,"[N, Q, R, B, D, F, M, N, R, 6, B, D, F, M, 7, ..."
7,"[No Fee, Featured, Exclusive, Doorman, Elevato...",1,1,1,\n\n On the corner of 33rd Street and 1s...,0,100.0,8849569,"$3,475","Kips Bay, Midtown Manhattan, Manhattan",20,[https://photos.renthop.com/7/8849569_b0ded64b...,"$4,000","377 East 33rd Street, Apt 21E","[6, 4, 5, 6, 7, S, N, R, B, D, F, M, 7, N, Q, ..."
8,"[Featured, Doorman, Elevator, Pre-War, Laundry...",0,1,1,\nNewly renovated studio with hardwood floors ...,0,100.0,8929098,"$3,250","Brooklyn Heights, Northwestern Brooklyn, Brooklyn",20,[https://photos.renthop.com/7/8929098_2c7c3705...,"$2,675",Clark Street,"[2, 3, 2, 3, A, C, 4, 5, J, Z, 2, 3, 4, 5, R, ..."
9,"[Featured, Balcony, Elevator, Hardwood Floors]",0,1,1,\nBEAUTIFUL STUDIO located on the Upper East S...,0,100.0,8881581,"$2,100","Yorkville, Upper East Side, Upper Manhattan, M...",4,[https://photos.renthop.com/7/8881581_525cd4fc...,"$2,150",East 84th Street,"[N, Q, T, 6]"


In [6]:
# Write the scraped listings table to a CSV file on the local machine.
Rent_house.to_csv(path_or_buf='/Users/admin/Documents/renthop_house.csv')