# Real Estate Listings on RE/MAX and Walk Score

In [1]:
import getpass
import os
import re
import time
import warnings
from itertools import zip_longest
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from splinter import Browser
from sqlalchemy import create_engine
warnings.filterwarnings('ignore')
print('Libraries imported!')

Libraries imported!


# Toronto

In [26]:
# Raw text of every listing card, one entry per listing. The two lists are
# concatenated column-wise later, so they must always have the same length.
house_address = []
house_details = []

base_url = 'https://www.remax.ca/on/toronto-real-estate?page='
urls = [base_url + str(x) for x in range(1,301)]

for url in urls:
    # Pause before every request to keep the crawl rate low.
    time.sleep(5)

    try:
        # A timeout stops one stalled connection from hanging the whole crawl;
        # raise_for_status keeps HTTP error pages from being parsed as listings.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Skip the page rather than lose everything collected so far.
        print(f'Skipping {url}: {exc}')
        continue

    # Parse HTML with Beautiful Soup
    soup = BeautifulSoup(response.text, 'html.parser')
    addresses = soup.find_all('div', class_='left-content flex-one')
    details = soup.find_all('div', class_='property-details')

    # Append to both lists in lockstep. If a page yields more of one element
    # than the other, the shorter side is padded with NaN, so the mismatch
    # stays inside that page instead of shifting every later row.
    for address, detail in zip_longest(addresses, details):
        house_address.append(address.text if address is not None else np.nan)
        house_details.append(detail.text if detail is not None else np.nan)

In [27]:
address_df = pd.DataFrame(house_address)

# Split at the first two spaces only: piece 1 carries the price and piece 2
# the remainder of the line (street, city, province, postal code). Piece 0
# is discarded below. `n` must be passed by keyword: pandas 2.x no longer
# accepts it positionally.
new_df = address_df[0].str.split(' ', n=2, expand=True)

# regex=False makes "$" and "," literal characters on every pandas version
# ("$" is an end-of-string anchor when interpreted as a regex).
new_df["price"] = pd.to_numeric(
    new_df[1]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
)

# Keep only the address remainder (column 2) and the numeric price.
del new_df[0]
del new_df[1]
new_df.head()

Unnamed: 0,2,price
0,"1206 - 170 BAYVIEW AVE, Toronto, ON, M5A 0M4",399999
1,"406 - 155 ST CLAIR AVE W, Toronto, ON, M4V 0A1",899900
2,"1911 - 55 ANN O'REILLY RD, Toronto, ON, M2J 0E1",2300
3,"64 ORTON PARK RD, Toronto, ON, M1G 3G6",610888
4,"318 - 825 CHURCH ST, Toronto, ON, M4W 3Z4",2280


In [28]:
# Cut each address line at the city/province marker: the text before it
# lands in column 0 and the text after it in column 1.
city_marker = ', Toronto, ON, '
final_df = new_df[2].str.split(city_marker, expand=True)
final_df.head()

Unnamed: 0,0,1
0,1206 - 170 BAYVIEW AVE,M5A 0M4
1,406 - 155 ST CLAIR AVE W,M4V 0A1
2,1911 - 55 ANN O'REILLY RD,M2J 0E1
3,64 ORTON PARK RD,M1G 3G6
4,318 - 825 CHURCH ST,M4W 3Z4


In [29]:
# Place the price beside the split address pieces, then drop the unsplit
# address line (column label 2), which is now redundant.
df_add = pd.concat([new_df, final_df], axis=1).drop(columns=2)
df_add.columns = ["price", "address", "postal_code"]
df_add.head()

Unnamed: 0,price,address,postal_code
0,399999,1206 - 170 BAYVIEW AVE,M5A 0M4
1,899900,406 - 155 ST CLAIR AVE W,M4V 0A1
2,2300,1911 - 55 ANN O'REILLY RD,M2J 0E1
3,610888,64 ORTON PARK RD,M1G 3G6
4,2280,318 - 825 CHURCH ST,M4W 3Z4


In [30]:
# One raw details string per listing, in a single column labelled 0.
details = pd.DataFrame(house_details)

# The pieces of each string are separated by '|'; expand them into columns.
details_df_temp = details.loc[:, 0].str.split(pat='|', expand=True)

details_df_temp.head()

Unnamed: 0,0,1,2,3
0,1 bed,1 bath,N/A sqft,condo
1,2 bed,1 bath,N/A sqft,condo
2,2 bed,2 bath,N/A sqft,
3,4 bed,3 bath,N/A sqft,house
4,1 + 1 bed,1 bath,N/A sqft,


In [31]:
# Remove the unit label from the first three detail columns so that only the
# value text (or 'N/A') remains.
details_df_bed, details_df_bath, details_df_area = (
    details_df_temp[column].str.replace(unit, '')
    for column, unit in ((0, ' bed'), (1, ' bath'), (2, ' sqft'))
)

In [32]:
# Values such as "1 + 1" are split at '+': the part before it goes to
# column 0 and the part after it to column 1.
details_df_bath_all = details_df_bath.str.split('+', expand=True)

# When no listing in the scrape contains a '+', the expanded frame has no
# column 1 and the lookup below would raise KeyError. Add any missing
# column as all-NaN so the following cells behave the same on any data.
for part in (0, 1):
    if part not in details_df_bath_all.columns:
        details_df_bath_all[part] = np.nan

details_df_bath_full = details_df_bath_all[0]
details_df_bath_half = details_df_bath_all[1]

In [33]:
# Values such as "1 + 1" are split at '+': the part before it goes to
# column 0 and the part after it to column 1.
details_df_bed_all = details_df_bed.str.split('+', expand=True)

# When no listing in the scrape contains a '+', the expanded frame has no
# column 1 and the lookup below would raise KeyError. Add any missing
# column as all-NaN so the following cells behave the same on any data.
for part in (0, 1):
    if part not in details_df_bed_all.columns:
        details_df_bed_all[part] = np.nan

details_df_bed_full = details_df_bed_all[0]
details_df_bed_half = details_df_bed_all[1]

In [34]:
def _na_to_numeric(column):
    """Replace the literal 'N/A' with NaN, then parse the column as numbers."""
    return pd.to_numeric(column.replace('N/A', np.nan))


# Same conversion for every count/area column, in the original order.
details_df_bed_full = _na_to_numeric(details_df_bed_full)
details_df_bed_half = _na_to_numeric(details_df_bed_half)
details_df_area = _na_to_numeric(details_df_area)
details_df_bath_full = _na_to_numeric(details_df_bath_full)
details_df_bath_half = _na_to_numeric(details_df_bath_half)

In [35]:
# Column name -> cleaned Series. The property type is taken unchanged from
# the fourth piece of the raw details string.
data = dict(
    full_bed=details_df_bed_full,
    half_bed=details_df_bed_half,
    full_bath=details_df_bath_full,
    half_bath=details_df_bath_half,
    property_area=details_df_area,
    property_type=details_df_temp[3],
)

In [36]:
# Assemble the cleaned columns into one frame; column order follows `data`.
details_df = pd.DataFrame(data=data)
details_df.head()

Unnamed: 0,full_bed,half_bed,full_bath,half_bath,property_area,property_type
0,1.0,,1.0,,,condo
1,2.0,,1.0,,,condo
2,2.0,,2.0,,,
3,4.0,,3.0,,,house
4,1.0,1.0,1.0,,,


In [37]:
# Join price/address columns with the detail columns (aligned on the row
# index), then remove rows that are identical in every column.
toronto_df_dup = pd.concat(objs=[df_add, details_df], axis='columns')
toronto_df = toronto_df_dup.drop_duplicates()
toronto_df.head()

Unnamed: 0,price,address,postal_code,full_bed,half_bed,full_bath,half_bath,property_area,property_type
0,399999,1206 - 170 BAYVIEW AVE,M5A 0M4,1.0,,1.0,,,condo
1,899900,406 - 155 ST CLAIR AVE W,M4V 0A1,2.0,,1.0,,,condo
2,2300,1911 - 55 ANN O'REILLY RD,M2J 0E1,2.0,,2.0,,,
3,610888,64 ORTON PARK RD,M1G 3G6,4.0,,3.0,,,house
4,2280,318 - 825 CHURCH ST,M4W 3Z4,1.0,1.0,1.0,,,


In [38]:
# Persist the cleaned listings without the index column.
toronto_df.to_csv(path_or_buf='toronto_df.csv', index=False)

----------------

### Walk Score

In [39]:
# Reload the saved listings so this section can run without re-scraping.
toronto_df = pd.read_csv(filepath_or_buffer='toronto_df.csv')
toronto_df.head()

Unnamed: 0,price,address,postal_code,full_bed,half_bed,full_bath,half_bath,property_area,property_type
0,399999,1206 - 170 BAYVIEW AVE,M5A 0M4,1.0,,1.0,,,condo
1,899900,406 - 155 ST CLAIR AVE W,M4V 0A1,2.0,,1.0,,,condo
2,2300,1911 - 55 ANN O'REILLY RD,M2J 0E1,2.0,,2.0,,,
3,610888,64 ORTON PARK RD,M1G 3G6,4.0,,3.0,,,house
4,2280,318 - 825 CHURCH ST,M4W 3Z4,1.0,1.0,1.0,,,


In [40]:
# Postal codes to look up, in first-seen order. Each distinct code is kept
# only once: every lookup costs a request plus a 5-second pause, and the
# score table is de-duplicated after scraping anyway, so repeating a code
# that several listings share adds time but no information.
post_code_list = toronto_df["postal_code"].drop_duplicates().tolist()

In [41]:
# One entry per postal code in post_code_list, in the same order. Each entry
# is the score as a digit string, or 'N/A' when no score could be read.
scores_walk = []
scores_bike = []
scores_transit = []


def _badge_score(page_html, kind):
    """Return the digits that follow the `kind` badge URL in page_html.

    kind is 'walk', 'bike' or 'transit'. Returns the score as a string of
    one to three digits, or 'N/A' when the page has no such badge. Capturing
    the digits with a regex (instead of slicing two characters) keeps a
    score of 100 from being truncated to '10'.
    """
    match = re.search(r'pp\.walk\.sc/badge/' + kind + r'/score/(\d{1,3})', page_html)
    return match.group(1) if match else 'N/A'


for i in post_code_list:
    # Start from 'N/A' so each list gets exactly one entry per postal code,
    # whatever goes wrong below.
    ws = bs = ts = 'N/A'

    try:
        postal_code = i.replace(" ", "%20")
        url_score = "https://www.walkscore.com/score/" + str(postal_code)
        time.sleep(5)

        # Parse HTML with Beautiful Soup
        response = requests.get(url_score, timeout=30)
        code_soup = BeautifulSoup(response.text, 'html.parser')
        page_html = str(code_soup)  # serialise once, search three times

        ws = _badge_score(page_html, 'walk')
        bs = _badge_score(page_html, 'bike')
        ts = _badge_score(page_html, 'transit')
    except Exception:
        # Best effort: a missing postal code (NaN has no .replace) or a failed
        # request leaves all three scores at 'N/A'. Unlike a bare `except:`,
        # this still lets KeyboardInterrupt stop the long-running loop.
        ws = bs = ts = 'N/A'

    scores_walk.append(ws)
    scores_bike.append(bs)
    scores_transit.append(ts)

In [42]:
# Pair each column name with the list collected for it.
score_df_trans = dict(zip(
    ('postal_code', 'walk_score', 'bike_score', 'transit_score'),
    (post_code_list, scores_walk, scores_bike, scores_transit),
))

# Build the table, then drop rows that are identical in every column.
score_df_dup = pd.DataFrame(data=score_df_trans)
score_df = score_df_dup.drop_duplicates()
score_df.head()

Unnamed: 0,postal_code,walk_score,bike_score,transit_score
0,M5A 0M4,87,10,10
1,M4V 0A1,81,81,83
2,M2J 0E1,54,50,65
3,M1G 3G6,29,49,56
4,M4W 3Z4,99,85,90


In [43]:
# Persist the score table without the index column.
score_df.to_csv(path_or_buf='score_df_tor.csv', index=False)

-------------------

# PostgreSQL

In [44]:
# Reload both saved tables so the database sections can run on their own.
toronto_df, score_df = (
    pd.read_csv(csv_name) for csv_name in ('toronto_df.csv', 'score_df_tor.csv')
)

In [45]:
# Credentials are taken from the environment instead of being written into
# the notebook. PGUSER defaults to "postgres"; if PGPASSWORD is not set the
# password is requested interactively.
pg_user = os.environ.get("PGUSER", "postgres")
pg_password = os.environ.get("PGPASSWORD") or getpass.getpass("PostgreSQL password: ")

# quote_plus escapes characters such as '@', ':' or '/' that would otherwise
# be read as URL delimiters.
rds_connection_string = (
    f"{quote_plus(pg_user)}:{quote_plus(pg_password)}@localhost:5432/realestate_db"
)
engine = create_engine(f'postgresql://{rds_connection_string}')

# "toronto" is rebuilt on every run, while "score" is appended to.
# NOTE(review): re-running this cell therefore duplicates the score rows;
# confirm that appending is intended.
toronto_df.to_sql(name= "toronto", con=engine, if_exists="replace", index=False)
score_df.to_sql(name= "score", con=engine, if_exists="append", index=False)

In [18]:
# import pymongo
# from pymongo import MongoClient

In [19]:
# toronto_df_html = toronto_df.to_html()
# score_df_tor_html = score_df_tor.to_html()

# MongoDB

In [46]:
import pymongo
from pymongo import MongoClient

# Connect to the MongoDB server on localhost and select the database.
conn = 'mongodb://localhost:27017'
client = MongoClient(conn)
db = client['realestate_db']

# Insert one document per listing row.
collection = db['toronto']
toronto_dict = toronto_df.to_dict(orient="records")
collection.insert_many(toronto_dict)

# Insert one document per score row; the result of this last call is the
# cell's displayed output.
collection = db['score']
score_dict = score_df.to_dict(orient="records")
collection.insert_many(score_dict)

<pymongo.results.InsertManyResult at 0x7ff52b139b40>

# MySQL

In [47]:
# Credentials are taken from the environment instead of being written into
# the notebook. MYSQL_USER defaults to "root"; if MYSQL_PASSWORD is not set
# the password is requested interactively.
mysql_user = os.environ.get("MYSQL_USER", "root")
mysql_password = os.environ.get("MYSQL_PASSWORD") or getpass.getpass("MySQL password: ")

# The password must be URL-escaped: an unescaped '@' inside it is taken as
# the separator between credentials and host when the URL is parsed.
engine = create_engine(
    f'mysql+pymysql://{quote_plus(mysql_user)}:{quote_plus(mysql_password)}'
    '@localhost/realestate_db',
    pool_recycle=3600,
)

# "toronto" is rebuilt on every run, while "score" is appended to.
# NOTE(review): re-running this cell therefore duplicates the score rows;
# confirm that appending is intended.
toronto_df.to_sql(name="toronto", con=engine, if_exists="replace", index=False)
score_df.to_sql(name="score", con=engine, if_exists="append", index=False)