In [None]:
!pip install slimit
import requests
from bs4 import BeautifulSoup
import json
import re
from slimit import ast
from slimit.parser import Parser
from slimit.visitors import nodevisitor
import pandas as pd
import time

In [2]:
def get_num_pages(url, timeout = 20):
  """Return the number of result pages of an apartmentfinder search.

  Args:
    url: search-results URL to download.
    timeout: seconds to wait for the server before requests gives up.
      Defaults to 20, the value get_links uses for the same site; without a
      timeout a stalled connection would block the notebook indefinitely.

  Returns:
    int: the last whitespace-separated token of the first <span> inside the
    first <div id="pagingTotal"> of the page.

  Raises:
    IndexError: the page has no pagingTotal div, or that div has no span.
    ValueError: the last token of the span text is not an integer.
  """
  header = {'Accept': 'text/html',
            "User-Agent":'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
  response = requests.get(url, headers = header, timeout = timeout)
  root = BeautifulSoup(response.content, 'html.parser')
  paging_total = root.find_all("div", id = "pagingTotal")[0]
  paging_text = paging_total.find_all("span")[0].text
  return int(paging_text.split()[-1])


In [18]:
def get_links(num_pages, url):
  """Collect the listing links from every result page of a search.

  Args:
    num_pages: how many result pages to walk through.
    url: URL of the first result page; page k (k >= 2) is fetched from
      url + "/Page" + str(k).

  Returns:
    list: the "@id" value of every entry under the "about" key of each page's
    structuredSchemaBreadcrumb JSON-LD script, in page order.

  The zero-based page index is printed as progress, and the function pauses
  0.5 seconds after each page.
  """
  request_headers = {'Accept': 'text/html',
                     "User-Agent":'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
  collected = []
  for page_index in range(num_pages):
    print(page_index)
    # The first page lives at the bare URL; later pages get a "/PageN" suffix.
    page_url = url if page_index == 0 else url + "/Page" + str(page_index + 1)
    reply = requests.get(page_url, headers = request_headers, timeout = 20)
    soup = BeautifulSoup(reply.content, 'html.parser')
    schema_scripts = soup.find_all("script", type = "application/ld+json", id = "structuredSchemaBreadcrumb")
    listing_schema = json.loads(schema_scripts[0].text)
    collected.extend(entry["@id"] for entry in listing_schema["about"])
    time.sleep(0.5)
  return collected

In [38]:
def get_apt_root(link):
  """Download one apartment page and parse it.

  Args:
    link: URL of the apartment page.

  Returns:
    BeautifulSoup: the response body parsed with the 'html.parser' backend.

  The request is sent with a browser-like User-Agent and a 10 second timeout.
  """
  request_headers = {'Accept': 'text/html',
                     "User-Agent":'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
  page = requests.get(link, headers = request_headers, timeout = 10)
  return BeautifulSoup(page.content, 'html.parser')

def get_js(apt_root):
  """Extract the assignments of the page's last JavaScript block as a dict.

  Args:
    apt_root: BeautifulSoup of an apartment page.

  Returns:
    dict mapping the 'value' attribute of each assignment's left-hand node to
    the 'value' attribute of its right-hand node ('' is used for either side
    when the node has no 'value'), or None when the page has no
    <script type="text/javascript"> element.
  """
  scripts = apt_root.find_all("script", type = "text/javascript")
  if not scripts:
    return None
  # Only the last script on the page is parsed.
  syntax_tree = Parser().parse(scripts[-1].text)
  fields = {}
  for node in nodevisitor.visit(syntax_tree):
    if not isinstance(node, ast.Assign):
      continue
    key = getattr(node.left, 'value', '')
    fields[key] = getattr(node.right, 'value', '')
  return fields

def get_proptype(fields):
  """Translate the page's property type into a numeric code.

  Args:
    fields: dict returned by get_js; must contain "propertyType".

  Returns:
    int: 1 for "House", 2 for "Condo", 0 for anything else.
  """
  # Drop the first and last character of the raw value
  # (presumably the quotes of the JavaScript string literal).
  kind = fields["propertyType"][1:-1]
  if kind == "House":
    return 1
  if kind == "Condo":
    return 2
  return 0

def get_description(fields, apt_root):
  """Build the listing description, followed by its special features.

  Args:
    fields: dict returned by get_js; must contain "listingDescription".
    apt_root: BeautifulSoup of the apartment page.

  Returns:
    str: the description with its first and last character removed. When the
    page has a <section id="special-features">, a space and the comma-separated
    text of that section's <li> items are appended (the space is appended even
    when the section has no items).
  """
  description = fields["listingDescription"][1:-1]
  feature_sections = apt_root.find_all("section", id = "special-features")
  if not feature_sections:
    return description
  amenity_texts = [item.text for item in feature_sections[0].find_all("li")]
  return description + " " + ", ".join(amenity_texts)

def get_address(apt_root):
  """Return the mailing address shown on an apartment page.

  Args:
    apt_root: BeautifulSoup of the apartment page.

  Returns:
    str: text of the first <span class="mailing-address">, with every run of
    spaces collapsed to a single space.

  Raises:
    IndexError: the page has no such span.
  """
  address_spans = apt_root.find_all("span", class_ = "mailing-address")
  raw_address = address_spans[0].text
  return re.sub(" +", " ", raw_address)

def get_parking(apt_root):
  """Determine whether a listing mentions parking and what it costs.

  Args:
    apt_root: BeautifulSoup of the apartment page.

  Returns:
    tuple (ParkingPrice, ParkingType):
      ParkingType is 1 when "Parking"/"parking" appears in a community
      feature, a special feature (other than the first one) or an expense
      line, and 0 otherwise.
      ParkingPrice is the integer dollar amount found on the expense line
      immediately before the one mentioning parking, or 0 when there is no
      such line or it shows no "$<digits>" amount.
  """
  parking_pattern = re.compile("[Pp]arking")
  ParkingPrice = 0
  ParkingType = 0

  community_sections = apt_root.find_all("section", id = "community-features")
  if community_sections:
    for feature in community_sections[0].find_all("li"):
      if parking_pattern.search(feature.text):
        ParkingType = 1

  amenity_sections = apt_root.find_all("section", id = "special-features")
  if amenity_sections:
    # NOTE(review): the first <li> is skipped, as in the original code, although
    # get_description treats it as a regular amenity - confirm this is intended.
    for feature in amenity_sections[0].find_all("li")[1:]:
      if parking_pattern.search(feature.text):
        ParkingType = 1

  expenses = apt_root.find_all("li", class_ = "expense-line")
  for index, expense in enumerate(expenses):
    if parking_pattern.search(expense.text) is None:
      continue
    ParkingType = 1
    if index == 0:
      # There is no preceding line; expenses[-1] would silently read the LAST line.
      continue
    amounts = re.findall(r"\$[0-9]+", expenses[index - 1].text)
    if amounts:
      # Drop the leading "$" before converting.
      ParkingPrice = int(amounts[0][1:])
  return ParkingPrice, ParkingType

def get_laundry(apt_root):
  """Classify the laundry situation of a listing.

  Args:
    apt_root: BeautifulSoup of the apartment page.

  Returns:
    int: 3 when a special feature other than the first one mentions a washer,
    dryer or laundry; otherwise 1 when the community features contain an item
    whose text is exactly "Laundry Facilities"; otherwise 0.
  """
  laundry_code = 0

  community_sections = apt_root.find_all("section", id = "community-features")
  if community_sections:
    community_items = community_sections[0].find_all("li")
    if any(item.text == "Laundry Facilities" for item in community_items):
      laundry_code = 1

  amenity_sections = apt_root.find_all("section", id = "special-features")
  if amenity_sections:
    # The first <li> of the section is deliberately left out of the scan.
    later_items = amenity_sections[0].find_all("li")[1:]
    keywords = ("[Ww]asher", "[Dd]ryer", "[Ll]aundry")
    for item in later_items:
      if any(re.search(keyword, item.text) is not None for keyword in keywords):
        laundry_code = 3
        break
  return laundry_code

def get_cooling(apt_root):
  """Tell whether a listing advertises air conditioning.

  Args:
    apt_root: BeautifulSoup of the apartment page.

  Returns:
    bool: True when the first <section id="floorplan-amenities"> contains an
    <li> whose text is exactly "Air Conditioning"; False otherwise, including
    when the page has no such section.
  """
  sections = apt_root.find_all("section", id = "floorplan-amenities")
  if not sections:
    return False
  return any(item.text == "Air Conditioning" for item in sections[0].find_all("li"))

def get_yearbuilt(apt_root):
  """Extract the construction year from a listing's description block.

  Args:
    apt_root: BeautifulSoup of the apartment page.

  Returns:
    int: the four digits that directly follow the first "built in " in the
    text of the <div class="row description-container section-content">.
    None when the page has no such div, the text never says "built in ", or
    the first occurrence is not followed by exactly four ASCII digits.
  """
  container = apt_root.find("div", class_ = "row description-container section-content")
  if container is None:
    # Without this guard a page lacking the description block raised AttributeError.
    return None
  # The optional group pins the search to the FIRST "built in ", as the original
  # slice did, and stays unmatched when four digits do not follow it.
  match = re.search("built in ([0-9]{4})?", container.text)
  if match is None or match.group(1) is None:
    return None
  return int(match.group(1))

def get_ws_root(Address, timeout = 10):
  """Download and parse the walkscore.com page for an address.

  Args:
    Address: street address string.
    timeout: seconds to wait for walkscore.com before requests gives up
      (default 10).

  Returns:
    BeautifulSoup: the page at https://www.walkscore.com/score/<slug>, parsed
    with the 'html.parser' backend, where <slug> is the lower-cased address
    with commas removed and each run of spaces replaced by one hyphen.
  """
  # The headers must be defined here: the original read a name `header` that
  # exists only as a local of other functions, so a fresh kernel raised NameError.
  header = {'Accept': 'text/html',
            "User-Agent":'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
  address_string = re.sub(",", "", Address.lower())
  address_string = re.sub(" +", "-", address_string)
  walkscore_link = "https://www.walkscore.com/score/" + address_string
  ws_response = requests.get(walkscore_link, headers = header, timeout = timeout)
  return BeautifulSoup(ws_response.content, "html.parser")

def get_walkscore(ws_root):
  """Read the Walk Score from a parsed walkscore.com page.

  Args:
    ws_root: BeautifulSoup of the walkscore page.

  Returns:
    int: the digits following "Walk Score of " in the first
    <span id="score-description-sentence">.
    None when the document is empty, the span is missing, or the sentence
    contains no "Walk Score of <digits>".
  """
  if len(ws_root) == 0:
    return None
  sentences = ws_root.find_all("span", id = "score-description-sentence")
  if not sentences:
    # e.g. walkscore.com did not recognise the address; indexing [0] used to raise IndexError.
    return None
  # Capturing the digits directly also copes with a score that ends the
  # sentence or is followed by punctuation rather than a space.
  match = re.search("Walk Score of ([0-9]+)", sentences[0].text)
  if match is None:
    return None
  return int(match.group(1))

def get_transitscore(ws_root):
  """Read the Transit Score from a parsed walkscore.com page.

  The score is taken from the file name of the transit badge image, i.e. the
  first run of digits after "//pp.walk.sc/badge/transit/score/" that is
  directly followed by ".png".

  Args:
    ws_root: BeautifulSoup of the walkscore page.

  Returns:
    int: the score, or None when the document is empty or contains no such
    badge URL.
  """
  if len(ws_root) == 0:
    return None
  # One raw-string pattern replaces three separate searches whose results were
  # dereferenced without a None check. [^0-9"']* tolerates extra non-digit path
  # text before the number, but does not run past the end of a quoted URL.
  match = re.search(r"""//pp\.walk\.sc/badge/transit/score/[^0-9"']*([0-9]+)\.png""", str(ws_root))
  if match is None:
    return None
  return int(match.group(1))

def _parse_sqft(sqft_display):
  """Return the integer in front of " Sq Ft" (commas removed), or None if it cannot be read."""
  if sqft_display is None:
    return None
  marker = re.search(" Sq Ft", sqft_display)
  if marker is None:
    return None
  digits = re.sub(",", "", sqft_display[:marker.start()])
  try:
    return int(digits)
  except ValueError:
    return None


def get_info(apt_root, PropertyType, Description, Address, ParkingPrice, ParkingType, Laundry, Cooling, YearBuilt, WalkScore, TransitScore):
  """Build one data row per floor-plan layout offered by a listing.

  The rental list is read from the page's last <script type="text/javascript">:
  the JSON text between "rentals: " and the following "fees: " (minus its
  trailing separator character). Rentals whose DateAvailableDisplay is
  "Not Available" or whose MinRent contains no digit are ignored. For each
  (beds, baths) combination, the first rental that carries the lowest MinRent
  of that combination AND a readable square footage produces a row.

  Args:
    apt_root: BeautifulSoup of the apartment page.
    PropertyType, Description, Address, ParkingPrice, ParkingType, Laundry,
    Cooling, YearBuilt, WalkScore, TransitScore: listing-level values that are
      copied unchanged into every row.

  Returns:
    list of rows [Price, Address, PropertyType, Beds, Baths, Sqft, YearBuilt,
    WalkScore, TransitScore, ParkingPrice, ParkingType, Cooling, Laundry,
    Description] (possibly empty), or None when the page has no JavaScript
    block or that block lacks the "rentals: " / "fees: " markers.

  Raises:
    ValueError: the rentals JSON is malformed, or Beds / Baths / MinRent of a
      considered rental is not numeric.
  """
  scripts = apt_root.find_all("script", type = "text/javascript")
  if len(scripts) == 0:
    return None
  apt_info = scripts[-1].text

  rentals_marker = "rentals: "
  rentals_start = apt_info.find(rentals_marker)
  if rentals_start == -1:
    return None
  payload_start = rentals_start + len(rentals_marker)
  # Look for "fees: " only after the rentals payload begins.
  rentals_end = apt_info.find("fees: ", payload_start)
  if rentals_end == -1:
    return None
  # [:-1] drops the separator that follows the JSON array before "fees: ".
  rentals_info = json.loads(apt_info[payload_start:rentals_end].strip()[:-1])

  # Available rentals with a numeric rent, as ((beds, baths), price, rental).
  offers = []
  for rental in rentals_info:
    if rental["DateAvailableDisplay"] == "Not Available":
      continue
    if re.search("[0-9]", rental["MinRent"]) is None:
      continue
    layout = (int(rental["Beds"]), float(rental["Baths"]))
    offers.append((layout, int(rental["MinRent"]), rental))

  # Lowest rent per layout.
  cheapest = {}
  for layout, price, _ in offers:
    if layout not in cheapest or price < cheapest[layout]:
      cheapest[layout] = price

  data = []
  emitted = set()
  for layout, price, rental in offers:
    if price != cheapest[layout] or layout in emitted:
      continue
    sqft = _parse_sqft(rental["SquareFootDisplay"])
    if sqft is None:
      # Unreadable size: leave the layout open so a later rental at the same price can fill it.
      continue
    beds, baths = layout
    data.append([price, Address, PropertyType, beds, baths, sqft, YearBuilt, WalkScore, TransitScore,
                 ParkingPrice, ParkingType, Cooling, Laundry, Description])
    emitted.add(layout)
  return data


In [41]:
def get_data(links):
  """Scrape every listing in `links` and gather the resulting rows.

  For each link the page is downloaded, its attributes are extracted with the
  get_* helpers, the matching walkscore page is fetched for the listing's
  address, and get_info turns everything into rows. Links whose page yields no
  JavaScript fields (get_js returns None) or no rows (get_info returns None)
  contribute nothing. The function pauses 0.5 seconds after every link.

  Args:
    links: iterable of apartment page URLs.

  Returns:
    list: all rows returned by get_info, in link order.
  """
  rows = []
  for link in links:
    apt_root = get_apt_root(link)
    fields = get_js(apt_root)
    if fields is not None:
      property_type = get_proptype(fields)
      description = get_description(fields, apt_root)
      address = get_address(apt_root)
      parking_price, parking_type = get_parking(apt_root)
      laundry = get_laundry(apt_root)
      cooling = get_cooling(apt_root)
      year_built = get_yearbuilt(apt_root)
      # Both scores come from the same walkscore page, so it is fetched once.
      ws_root = get_ws_root(address)
      walk_score = get_walkscore(ws_root)
      transit_score = get_transitscore(ws_root)
      listing_rows = get_info(apt_root, property_type, description, address, parking_price, parking_type,
                              laundry, cooling, year_built, walk_score, transit_score)
      if listing_rows is not None:
        rows.extend(listing_rows)
    time.sleep(0.5)
  return rows


In [None]:
# Driver cell: scrape the Pittsburgh search results and load them into a DataFrame.
# This performs live HTTP requests: one per results page, then one apartmentfinder
# request and one walkscore request per listing, each loop pausing 0.5 s per item.
main_url = "https://www.apartmentfinder.com/Pennsylvania/Pittsburgh-Apartments"
num_pages = get_num_pages(main_url)
links = get_links(num_pages, main_url)
data = get_data(links)
# The column names follow the order in which get_info assembles each row.
df = pd.DataFrame(data, columns = ["Price", "Address", "PropertyType", "Beds", "Baths", "Sqft", "YearBuilt", "WalkScore", 
                                   "TransitScore", "ParkingPrice", "ParkingType", "Cooling", "Laundry", "Description"])

# Quick sanity check of the first rows.
print(df.head())

In [None]:
# Export cell: write the scraped rows to a CSV file on the mounted Google Drive.
# google.colab is imported here, so this cell presumably only runs inside a Colab runtime.
from google.colab import drive
drive.mount('/gdrive')
# Mode 'a' appends to an existing file, and header = False / index = False write
# neither the column names nor the row index, so repeated runs accumulate rows and
# this cell never writes a header line (not even when the file is new).
with open('/gdrive/My Drive/apartmentfinder.csv', 'a') as f:
  df.to_csv(f, header = False, index = False)