## -----------------------------------------------------------------------------------------------------------

<h2 align = 'center'> Toronto Craiglist Rental Condo Web Scraping Project </h2>
<h4 align = 'center'> Group Members: </h4>
<h3 align = 'center'> Jessica Zhong, Barbara Chen, Vaughn Shideler, Karan Teckwani, Clover Guo, Rebecca Lam </h3>

## -----------------------------------------------------------------------------------------------------------

### Import Libraries

In [1]:
# import libraries

import urllib.request as urlrequest # Requesting the Website URL
from urllib.parse import urlparse # Parse the URL
from bs4 import BeautifulSoup # HTML parsing
import re # Regular expressions
from time import sleep # To prevent overwhelming the server between connections
import numpy as np # Stacking the lists into a dataframe
import pandas as pd # Creating a DataFrame from the stacked results
import matplotlib.pyplot as plt # Data Visualization
import seaborn as sns # Data Visualization
%matplotlib inline

### Build search criteria

In [2]:
#Build search criteria

search_query = 'condo'
initial_url = 'https://toronto.craigslist.ca/'
list_of_query_and_url = ['https://toronto.craigslist.ca/search/hhh?query=', search_query, '&sort=rel']

# Join all of our strings together 

final_search_url = ''.join(list_of_query_and_url)
print("Final search url: %s" % (final_search_url))

Final search url: https://toronto.craigslist.ca/search/hhh?query=condo&sort=rel


### Build listing extractor function

In [3]:
def listing_extractor(website):
    '''
    Inputs: a URL from the listing on the search page
            to investigate the listing and 
    Outputs: soup object
    '''
    
    try:
        req = urlrequest.Request(website)
        response = urlrequest.urlopen(req)
        site = response.read()
    except: 
        return   # Need this in case the website isn't there anymore or some other weird connection problem 
    
    soup_obj = BeautifulSoup(site, 'html.parser') # Get the html from the site
    
    area = soup_obj.find(class_ = 'page-container') # Locating the page-container from the soub object
    
    return area

### Parse the final search url

In [4]:
final_search_url_split_pages = urlparse(final_search_url) # Parse the URL

### Creating empty containers for regex Url's

In [5]:
rental_listing = []      # Regex seperation of all the rental listing
sublet_listing = []      # Regex seperation of all the sublet listing
shared_listing = []      # Regex seperation of all the shared listing
sale_listing = []        # Regex seperation of all the sale listing
other_listing = []       # Regex seperation of all the other listing

### Web scraping all the listing url's from Craiglist condo query search

In [6]:
page_num = 21
for page in range(1, min(page_num+1,999)): # Loop through all of our search result pages
    print('Getting page', page)            # Print the current page number being scraped
    if page == 1:                          # Loop to go to the next page upon scraping the first page
        start_num = ''    
    else:
        start_num = str((page-1)*120)
    
    # search page url for each page
    current_page = ''.join([final_search_url_split_pages.scheme,'://',
                            final_search_url_split_pages.netloc ,
                            final_search_url_split_pages.path ,'?','s=', 
                            start_num, '&', 
                            final_search_url_split_pages.query])  
    
    # Requesting the contents of each page and reading the page
    req_each_page = urlrequest.Request(current_page)
    response_each_page = urlrequest.urlopen(req_each_page)
    read_each_page = response_each_page.read()
        
    # Creating the BeautifulSoup object of each page and finding the URL of each listing on that particular page    
    soup_obj_of_each_page = BeautifulSoup(read_each_page, 'html.parser')
    locating_each_url = soup_obj_of_each_page.findAll("li", {"class" : "result-row"})
    result_of_each_url = [result_row.find('a') for result_row in locating_each_url]
    list_of_url = [str(result_row.get('href')) for result_row in result_of_each_url]
    
    # Regex search on the list of url to seperate the listing into different categories for refined data analysis
    for eachurl in list_of_url:
        rentals = re.search(r'https://toronto.craigslist.ca/\w\w\w/apa/*', eachurl)
        sublets = re.search(r'https://toronto.craigslist.ca/\w\w\w/sub/*', eachurl)
        shared = re.search(r'https://toronto.craigslist.ca/\w\w\w/roo/*', eachurl)
        sale = re.search(r'https://toronto.craigslist.ca/\w\w\w/reb|o/*', eachurl)
        if rentals:
            rental_listing.append(eachurl)
        elif sublets:
            sublet_listing.append(eachurl)
        elif shared:
            shared_listing.append(eachurl)
        elif sale:
            sale_listing.append(eachurl)
        else:
            other_listing.append(eachurl)

    sleep(1) # So that we don't be jerks.              

print('Done collecting the craiglist condo postings!')
print('There were %s results found on each page.' % (len(result_of_each_url)))
print('There were total of %s listing url\'s found on craiglist condo search.' % (len(list_of_url)))
print('There were total of %s rental listing url\'s found on craiglist condo search.' % (len(rental_listing)))
print('There were total of %s sublet listing url\'s found on craiglist condo search.' % (len(sublet_listing)))
print('There were total of %s shared listing url\'s found on craiglist condo search.' % (len(shared_listing)))
print('There were total of %s sale listing url\'s found on craiglist condo search.' % (len(sale_listing)))
print('There were total of %s other listing url\'s found on craiglist condo search.' % (len(other_listing)))

Getting page 1
Getting page 2
Getting page 3
Getting page 4
Getting page 5
Getting page 6
Getting page 7
Getting page 8
Getting page 9
Getting page 10
Getting page 11
Getting page 12
Getting page 13
Getting page 14
Getting page 15
Getting page 16
Getting page 17
Getting page 18
Getting page 19
Getting page 20
Getting page 21
Done collecting the craiglist condo postings!
There were 0 results found on each page.
There were total of 0 listing url's found on craiglist condo search.
There were total of 1665 rental listing url's found on craiglist condo search.
There were total of 56 sublet listing url's found on craiglist condo search.
There were total of 17 shared listing url's found on craiglist condo search.
There were total of 662 sale listing url's found on craiglist condo search.
There were total of 0 other listing url's found on craiglist condo search.


### Creating empty containers for rental listing url's

In [7]:
rental_url = []                 # List of all the Rental Url scraped from the website
rental_listing_id = []          # List of all the craiglist listing id's scraped from each rental url
rental_price = []               # List of all the price information scraped from each rental url
rental_layout = []              # List of all the layout information scraped from each rental url
rental_bedroom = []             # List of all the bedroom information scraped from each rental url
rental_bathroom = []            # List of all the bathroom information scraped from each rental url
rental_sqft = []                # List of all the square feet information scraped from each rental url
rental_description = []         # List of all the description information scraped from each rental url
rental_latitude = []            # List of all the latitude information scraped from each rental url
rental_longitude = []           # List of all the longitude information scraped from each rental url
rental_map_text = []            # List of all the map text information scraped from each rental url
rental_google_map_url = []      # List of all the google map url information scraped from each rental url

### Web scraping all the rental listing url's from rental_listing

In [8]:
# Scraping the content of each url on the search page from the rental listing   
for rentalurl in range(len(rental_listing)):
    information = listing_extractor(rental_listing[rentalurl])
    rental_url.append(rental_listing[rentalurl])
    
    # Extracting the craiglist listing ID of each individual listing
    try:
        posting_info = information.find("div", {"class":"postinginfos"})
        locate_id = posting_info.find("p",{"class":"postinginfo"})
        id_only = re.findall(r'\d+', locate_id.text)
        result = '%s' % "','".join(id_only)
        if result:
            rental_listing_id.append(result)
    except:
        no_id = 'Missing_Data' 
        rental_listing_id.append(no_id)
    
    # Extracting the price of each individual listing
    try:
        price_info = information.find("span", {"class":"price"}).string
        only_price = re.findall(r'\d+', price_info)
        if only_price:
            rental_price.append(only_price[0])
    except:
        no_price = 'Missing_Data'
        rental_price.append(no_price)
            
    # Extracting the layout of each individual listing
    try:
        layout_info = information.findAll("span", {"class":"shared-line-bubble"})
        only_layout = layout_info[0].text
        if only_layout:
            rental_layout.append(only_layout)
    except:
        no_layout = 'Missing_Data'
        rental_layout.append(no_layout)
    
    # Extracting the bedroom of each individual listing
    try:
        layout_info = information.findAll("span", {"class":"shared-line-bubble"})
        only_layout = layout_info[0].text
        regex_layout = re.findall(r'\d+', only_layout)
        if regex_layout:
            bedroom_info = regex_layout[0]
            rental_bedroom.append(bedroom_info)
    except:
        no_bedroom = 'Missing_Data'
        rental_bedroom.append(no_bedroom)
        
    # Extracting the bathroom of each individual listing
    try:
        layout_info = information.findAll("span", {"class":"shared-line-bubble"})
        only_layout = layout_info[0].text
        regex_layout = re.findall(r'\d+', only_layout)
        if regex_layout:
            bathroom_info = regex_layout[1]
            rental_bathroom.append(bathroom_info)
    except:
        no_bathroom = 'Missing_Data'
        rental_bathroom.append(no_bathroom)
    
    # Extracting the square feet of each individual listing 
    try:
        layout_info = information.findAll("span", {"class":"shared-line-bubble"})
        squarefeet_info = layout_info[1].text
        regex_sqft = re.findall(r'\d{3,4}ft\d', squarefeet_info)
        if regex_sqft:
            only_sqft = re.findall(r'\d+', regex_sqft[0])
            if only_sqft:
                rental_sqft.append(only_sqft[0])
        else:
            try:
                description_information = information.find("section",
                                                           {"id" :"postingbody"}).findAll(text=True,
                                                                                          recursive=False)                          
                description_info = ''.join(description_information)
                initial_regex_search = re.findall(r'(\d{3,4}|\d+[\,]?\d+)[\s]?[S|s][q|f]', description_info)
                if initial_regex_search:
                    rental_sqft.append(initial_regex_search[0])
                else:
                    second_regex_search = re.findall(r'\w+[\s]?\w+[\s]?:[\s]?\d+[-]?\d+', description_info)
                    if second_regex_search:
                        regex_digits_only = re.findall(r'\d+[-]?\d+', second_regex_search)
                        if regex_digits_only:
                            rental_sqft.append(regex_digits_only[0])
                        else:
                            no_regex_digits = "Missing_Data"
                            rental_sqft.append(no_regex_digits)
                    else:
                        no_second_regex_search = "Missing_Data"
                        rental_sqft.append(no_second_regex_search)
            except:
                no_description = 'Missing_Data'
                rental_sqft.append(no_description)
    except:
        no_listing = 'Missing_Data'
        rental_sqft.append(no_listing)
        
    # Extracting the latitude of each individual listing
    try:
        location_info = information.find("div",{"class": "viewposting"})
        latitude_info = location_info.get("data-latitude")
        if latitude_info:
            rental_latitude.append(latitude_info)
    except:
        no_latitude_info = 'Missing_Data'
        rental_latitude.append(no_latitude_info)
                
    # Extracting the longitude of each individual listing    
    try:
        longitude_info = location_info.get("data-longitude")
        if longitude_info:
            rental_longitude.append(longitude_info)
    except:
        no_longitude_info = 'Missing_Data'
        rental_longitude.append(no_longitude_info)
            
    # Extracting the text below map of each individual listing
    try:
        map_text_info = information.find("div",{"class":"mapaddress"})
        only_map_text = map_text_info.text
        if only_map_text:
            rental_map_text.append(only_map_text)
    except:
        no_map_text = "Missing_Data"
        rental_map_text.append(no_map_text)
                
    # Extracting the text below map of each individual listing
    try:
        google_map_url_info = information.find("p",{"class":"mapaddress"})
        only_google_map_url = google_map_url_info.find("a").get("href")
        if only_google_map_url:
            rental_google_map_url.append(only_google_map_url)
    except:
        no_google_map_url = 'Missing_Data'
        rental_google_map_url.append(no_google_map_url)
    
    # Extracting the description of each individual listing 
    try:
        description_information = information.find("section",{"id" :"postingbody"}).findAll(text=True, recursive=False)                          
        description_info = ''.join(description_information)
        if description_info:
            rental_description.append(description_info)
    except:
        no_description = "Missing_Data"
        rental_description.append(no_description)
    
    sleep(2)

print('There were %s rental url\'s scraped.' % (len(rental_url)))
print('There were %s rental listing id\'s scraped.' % (len(rental_listing_id)))   
print('There were %s rental price scraped.' % (len(rental_price)))
print('There were %s rental layout scraped.' % (len(rental_layout)))
print('There were %s rental bedroom scraped.' % (len(rental_bedroom))) 
print('There were %s rental bathroom scraped.' % (len(rental_bathroom)))
print('There were %s rental square feet scraped.' % (len(rental_sqft)))
print('There were %s rental description scraped.' % (len(rental_description)))
print('There were %s rental latitude scraped.' % (len(rental_latitude)))
print('There were %s rental longitude scraped.' % (len(rental_longitude))) 
print('There were %s rental map text scraped.' % (len(rental_map_text))) 
print('There were %s rental google map url scraped.' % (len(rental_google_map_url))) 

There were 1665 rental url's scraped.
There were 1665 rental listing id's scraped.
There were 1665 rental price scraped.
There were 1665 rental layout scraped.
There were 1665 rental bedroom scraped.
There were 1665 rental bathroom scraped.
There were 1665 rental square feet scraped.
There were 1665 rental description scraped.
There were 1665 rental latitude scraped.
There were 1665 rental longitude scraped.
There were 1665 rental map text scraped.
There were 1665 rental google map url scraped.


### Creating the Dataframe using pandas library

In [9]:
import pandas as pd
import numpy as np
craiglist_rental_dataset = pd.DataFrame(np.column_stack([rental_listing_id, rental_price, rental_layout, 
                                                         rental_bedroom, rental_bathroom, rental_sqft, 
                                                         rental_latitude, rental_longitude, rental_url, 
                                                         rental_map_text, rental_google_map_url, 
                                                         rental_description]),
                                        columns = ['rental_listing_id','rental_price', 'rental_layout',
                                                   'rental_bedroom', 'rental_bathroom', 'rental_sqft', 
                                                   'rental_latitude', 'rental_longitude', 'rental_url', 
                                                   'rental_map_text', 'rental_google_map_url', 'description'])

In [10]:
craiglist_rental_dataset.head()

Unnamed: 0,rental_listing_id,rental_price,rental_layout,rental_bedroom,rental_bathroom,rental_sqft,rental_latitude,rental_longitude,rental_url,rental_map_text,rental_google_map_url,description
0,6466154515,1048,1BR / 1Ba,1,1,Missing_Data,Missing_Data,Missing_Data,https://toronto.craigslist.ca/drh/apa/d/great-...,Missing_Data,Missing_Data,"\n\nI have a nice 1 bedroom, 1 bath condo for ..."
1,6466158349,1075,2BR / 1Ba,2,1,Missing_Data,Missing_Data,Missing_Data,https://toronto.craigslist.ca/mss/apa/d/huge-2...,Missing_Data,Missing_Data,"\n\nI have a nice 2 bedroom, 1 bath condo apar..."
2,6466159554,1275,3BR / 2Ba,3,2,Missing_Data,Missing_Data,Missing_Data,https://toronto.craigslist.ca/tor/apa/d/3-bedr...,Missing_Data,Missing_Data,"\n\nI have a nice 3 bedroom, 1.5 bath apartmen..."
3,6466161138,895,2BR / 1Ba,2,1,Missing_Data,Missing_Data,Missing_Data,https://toronto.craigslist.ca/tor/apa/d/great-...,Missing_Data,Missing_Data,"\n\nI have a nice 2 bedroom, 1 bath condo+den...."
4,6466163942,1125,2BR / 1Ba,2,1,Missing_Data,Missing_Data,Missing_Data,https://toronto.craigslist.ca/bra/apa/d/nice-2...,Missing_Data,Missing_Data,"\n\nI have a nice 2 bedroom, 1 bath Condo in b..."


In [11]:
craiglist_rental_dataset.tail()

Unnamed: 0,rental_listing_id,rental_price,rental_layout,rental_bedroom,rental_bathroom,rental_sqft,rental_latitude,rental_longitude,rental_url,rental_map_text,rental_google_map_url,description
1660,6460406155,2400,1BR / 1Ba,1,1,550,43.641638,-79.380944,https://toronto.craigslist.ca/tor/apa/d/fully-...,12 York St,https://maps.google.com/?q=loc%3A+%31%32+York+...,\n\nFully Furnished Executive 1 Bedroom + Stud...
1661,6460385370,2280,1BR / 1Ba,1,1,550,43.641093,-79.380324,https://toronto.craigslist.ca/tor/apa/d/brand-...,88 Harbour St,https://maps.google.com/?q=loc%3A+%38%38+Harbo...,\n\nBrand New Open Concept 1 B! One Of The Bes...
1662,6460379347,1900,1BR / 1Ba,1,1,Missing_Data,43.8227,-79.3946,https://toronto.craigslist.ca/yrk/apa/d/fully-...,7171 yonge st,https://maps.google.com/?q=loc%3A+%37%31%37%31...,"\n\nFully furnished 1+den condo, high floor, p..."
1663,6460381895,2250,2BR / 2Ba,2,2,850,43.624511,-79.489637,https://toronto.craigslist.ca/tor/apa/d/gorgeo...,165 Legion Rd N,https://maps.google.com/?q=loc%3A+%31%36%35+Le...,\n\nGorgeous Two Bedroom 2 Bath Condo (The Cal...
1664,6460378210,2200,1BR / 1Ba,1,1,750,43.667778,-79.391005,https://toronto.craigslist.ca/tor/apa/d/beauti...,110 Charles St,https://maps.google.com/?q=loc%3A+%31%31%30+Ch...,\n\nBeautifully Maintained One Bedroom + Den S...


### Exporting the DataFrame into Excel file to store the results

In [12]:
craiglist_rental_dataset.to_excel("Final_Toronto_Craiglist_Condo_Rental_dataset_Jan_20.xlsx")

### Creating empty containers for sublet listing url's

In [13]:
sublet_url = []                 # List of all the sublet Url scraped from the website
sublet_listing_id = []          # List of all the craiglist listing id's scraped from each sublet url
sublet_price = []               # List of all the price information scraped from each sublet url
sublet_layout = []              # List of all the layout information scraped from each sublet url
sublet_bedroom = []             # List of all the bedroom information scraped from each sublet url
sublet_bathroom = []            # List of all the bathroom information scraped from each sublet url
sublet_sqft = []                # List of all the square feet information scraped from each sublet url
sublet_description = []         # List of all the description information scraped from each sublet url
sublet_latitude = []            # List of all the latitude information scraped from each sublet url
sublet_longitude = []           # List of all the longitude information scraped from each sublet url
sublet_map_text = []            # List of all the map text information scraped from each sublet url
sublet_google_map_url = []      # List of all the google map url information scraped from each sublet url

### Web scraping all the sublet listing url's from sublet_listing

In [14]:
# Scraping the content of each url on the search page from the sublet listing   
for subleturl in range(len(sublet_listing)):
    information = listing_extractor(sublet_listing[subleturl])
    sublet_url.append(sublet_listing[subleturl])
    
    # Extracting the craiglist listing ID of each individual listing
    try:
        posting_info = information.find("div", {"class":"postinginfos"})
        locate_id = posting_info.find("p",{"class":"postinginfo"})
        id_only = re.findall(r'\d+', locate_id.text)
        result = '%s' % "','".join(id_only)
        if result:
            sublet_listing_id.append(result)
    except:
        no_id = 'Missing_Data' 
        sublet_listing_id.append(no_id)
    
    # Extracting the price of each individual listing
    try:
        price_info = information.find("span", {"class":"price"}).string
        only_price = re.findall(r'\d+', price_info)
        if only_price:
            sublet_price.append(only_price[0])
    except:
        no_price = 'Missing_Data'
        sublet_price.append(no_price)
            
    # Extracting the layout of each individual listing
    try:
        layout_info = information.findAll("span", {"class":"shared-line-bubble"})
        only_layout = layout_info[0].text
        if only_layout:
            sublet_layout.append(only_layout)
    except:
        no_layout = 'Missing_Data'
        sublet_layout.append(no_layout)
    
    # Extracting the bedroom of each individual listing
    try:
        layout_info = information.findAll("span", {"class":"shared-line-bubble"})
        only_layout = layout_info[0].text
        regex_layout = re.findall(r'\d+', only_layout)
        if regex_layout:
            bedroom_info = regex_layout[0]
            sublet_bedroom.append(bedroom_info)
    except:
        no_bedroom = 'Missing_Data'
        sublet_bedroom.append(no_bedroom)
        
    # Extracting the bathroom of each individual listing
    try:
        layout_info = information.findAll("span", {"class":"shared-line-bubble"})
        only_layout = layout_info[0].text
        regex_layout = re.findall(r'\d+', only_layout)
        if regex_layout:
            bathroom_info = regex_layout[1]
            sublet_bathroom.append(bathroom_info)
    except:
        no_bathroom = 'Missing_Data'
        sublet_bathroom.append(no_bathroom)
    
    # Extracting the square feet of each individual listing 
    try:
        layout_info = information.findAll("span", {"class":"shared-line-bubble"})
        squarefeet_info = layout_info[1].text
        regex_sqft = re.findall(r'\d{3,4}ft\d', squarefeet_info)
        if regex_sqft:
            only_sqft = re.findall(r'\d+', regex_sqft[0])
            if only_sqft:
                sublet_sqft.append(only_sqft[0])
        else:
            try:
                description_information = information.find("section",
                                                           {"id" :"postingbody"}).findAll(text=True,
                                                                                          recursive=False)                          
                description_info = ''.join(description_information)
                initial_regex_search = re.findall(r'(\d{3,4}|\d+[\,]?\d+)[\s]?[S|s][q|f]', description_info)
                if initial_regex_search:
                    sublet_sqft.append(initial_regex_search[0])
                else:
                    second_regex_search = re.findall(r'\w+[\s]?\w+[\s]?:[\s]?\d+[-]?\d+', description_info)
                    if second_regex_search:
                        regex_digits_only = re.findall(r'\d+[-]?\d+', second_regex_search)
                        if regex_digits_only:
                            sublet_sqft.append(regex_digits_only[0])
                        else:
                            no_regex_digits = "Missing_Data"
                            sublet_sqft.append(no_regex_digits)
                    else:
                        no_second_regex_search = "Missing_Data"
                        sublet_sqft.append(no_second_regex_search)
            except:
                no_description = 'Missing_Data'
                sublet_sqft.append(no_description)
    except:
        no_listing = 'Missing_Data'
        sublet_sqft.append(no_listing)
        
    # Extracting the latitude of each individual listing
    try:
        location_info = information.find("div",{"class": "viewposting"})
        latitude_info = location_info.get("data-latitude")
        if latitude_info:
            sublet_latitude.append(latitude_info)
    except:
        no_latitude_info = 'Missing_Data'
        sublet_latitude.append(no_latitude_info)
                
    # Extracting the longitude of each individual listing    
    try:
        longitude_info = location_info.get("data-longitude")
        if longitude_info:
            sublet_longitude.append(longitude_info)
    except:
        no_longitude_info = 'Missing_Data'
        sublet_longitude.append(no_longitude_info)
            
    # Extracting the text below map of each individual listing
    try:
        map_text_info = information.find("div",{"class":"mapaddress"})
        only_map_text = map_text_info.text
        if only_map_text:
            sublet_map_text.append(only_map_text)
    except:
        no_map_text = "Missing_Data"
        sublet_map_text.append(no_map_text)
                
    # Extracting the text below map of each individual listing
    try:
        google_map_url_info = information.find("p",{"class":"mapaddress"})
        only_google_map_url = google_map_url_info.find("a").get("href")
        if only_google_map_url:
            sublet_google_map_url.append(only_google_map_url)
    except:
        no_google_map_url = 'Missing_Data'
        sublet_google_map_url.append(no_google_map_url)
    
    # Extracting the description of each individual listing 
    try:
        description_information = information.find("section",{"id" :"postingbody"}).findAll(text=True, recursive=False)                          
        description_info = ''.join(description_information)
        if description_info:
            sublet_description.append(description_info)
    except:
        no_description = "Missing_Data"
        sublet_description.append(no_description)
    
    sleep(2)

print('There were %s sublet url\'s scraped.' % (len(sublet_url)))
print('There were %s sublet listing id\'s scraped.' % (len(sublet_listing_id)))   
print('There were %s sublet price scraped.' % (len(sublet_price)))
print('There were %s sublet layout scraped.' % (len(sublet_layout)))
print('There were %s sublet bedroom scraped.' % (len(sublet_bedroom))) 
print('There were %s sublet bathroom scraped.' % (len(sublet_bathroom)))
print('There were %s sublet square feet scraped.' % (len(sublet_sqft)))
print('There were %s sublet description scraped.' % (len(sublet_description)))
print('There were %s sublet latitude scraped.' % (len(sublet_latitude)))
print('There were %s sublet longitude scraped.' % (len(sublet_longitude))) 
print('There were %s sublet map text scraped.' % (len(sublet_map_text))) 
print('There were %s sublet google map url scraped.' % (len(sublet_google_map_url))) 

There were 56 sublet url's scraped.
There were 56 sublet listing id's scraped.
There were 56 sublet price scraped.
There were 56 sublet layout scraped.
There were 56 sublet bedroom scraped.
There were 56 sublet bathroom scraped.
There were 56 sublet square feet scraped.
There were 56 sublet description scraped.
There were 56 sublet latitude scraped.
There were 56 sublet longitude scraped.
There were 56 sublet map text scraped.
There were 56 sublet google map url scraped.


### Creating the Dataframe using pandas library

In [15]:
craiglist_sublet_dataset = pd.DataFrame(np.column_stack([sublet_listing_id, sublet_price, sublet_layout, 
                                                         sublet_bedroom, sublet_bathroom, sublet_sqft, 
                                                         sublet_latitude, sublet_longitude, sublet_url, 
                                                         sublet_map_text, sublet_google_map_url, 
                                                         sublet_description]),
                                        columns = ['sublet_listing_id','sublet_price', 'sublet_layout',
                                                   'sublet_bedroom', 'sublet_bathroom', 'sublet_sqft', 
                                                   'sublet_latitude', 'sublet_longitude', 'sublet_url', 
                                                   'sublet_map_text', 'sublet_google_map_url', 'sublet_description'])

### Exporting the DataFrame into Excel file to store the results

In [16]:
craiglist_sublet_dataset.to_excel("Final_Toronto_Craiglist_Condo_Sublet_dataset_Jan_20.xlsx")

In [17]:
craiglist_sublet_dataset.head()

Unnamed: 0,sublet_listing_id,sublet_price,sublet_layout,sublet_bedroom,sublet_bathroom,sublet_sqft,sublet_latitude,sublet_longitude,sublet_url,sublet_map_text,sublet_google_map_url,sublet_description
0,6465968265,2375,1BR / 1Ba,1,1,600,43.671813,-79.388104,https://toronto.craigslist.ca/tor/sub/d/yorkvi...,18 Yorkville Ave,https://maps.google.com/?q=loc%3A+%31%38+Yorkv...,\n\nLOCATION: \nGorgeous Condo In An Exclusive...
1,6465949142,80,available jan 19,19,Missing_Data,Missing_Data,43.7223,-79.4504,https://toronto.craigslist.ca/tor/sub/d/execut...,Missing_Data,https://maps.google.com/maps/preview/@43.72230...,\n\nThis is a Much Better Alternative to Hotel...
2,6465963479,50,available jan 19,19,Missing_Data,Missing_Data,43.7223,-79.4504,https://toronto.craigslist.ca/tor/sub/d/small-...,Missing_Data,https://maps.google.com/maps/preview/@43.72230...,\n\nThis is a much better alternative to hotel...
3,6465959712,60,available jan 19,19,Missing_Data,Missing_Data,43.7223,-79.4504,https://toronto.craigslist.ca/tor/sub/d/clean-...,Missing_Data,https://maps.google.com/maps/preview/@43.72230...,\n\nThis is a Much Better Alternative to Hotel...
4,6465953687,170,available jan 19,19,Missing_Data,Missing_Data,43.7223,-79.4504,https://toronto.craigslist.ca/tor/sub/d/furnis...,Missing_Data,https://maps.google.com/maps/preview/@43.72230...,\n\nThe Entire Suite is fully furnished. \nThi...


### Creating empty containers for shared listing url's

In [18]:
shared_url = []                 # List of all the shared Url scraped from the website
shared_listing_id = []          # List of all the craiglist listing id's scraped from each shared url
shared_price = []               # List of all the price information scraped from each shared url
shared_layout = []              # List of all the layout information scraped from each shared url
shared_bedroom = []             # List of all the bedroom information scraped from each shared url
shared_bathroom = []            # List of all the bathroom information scraped from each shared url
shared_sqft = []                # List of all the square feet information scraped from each shared url
shared_description = []         # List of all the description information scraped from each shared url
shared_latitude = []            # List of all the latitude information scraped from each shared url
shared_longitude = []           # List of all the longitude information scraped from each shared url
shared_map_text = []            # List of all the map text information scraped from each shared url
shared_google_map_url = []      # List of all the google map url information scraped from each shared url

### Web scraping all the shared listing url's from shared_listing

In [19]:
# Scraping the content of each url on the search page from the shared listing   
for sharedurl in range(len(shared_listing)):
    information = listing_extractor(shared_listing[sharedurl])
    shared_url.append(shared_listing[sharedurl])
    
    # Extracting the craiglist listing ID of each individual listing
    try:
        posting_info = information.find("div", {"class":"postinginfos"})
        locate_id = posting_info.find("p",{"class":"postinginfo"})
        id_only = re.findall(r'\d+', locate_id.text)
        result = '%s' % "','".join(id_only)
        if result:
            shared_listing_id.append(result)
    except:
        no_id = 'Missing_Data' 
        shared_listing_id.append(no_id)
    
    # Extracting the price of each individual listing
    try:
        price_info = information.find("span", {"class":"price"}).string
        only_price = re.findall(r'\d+', price_info)
        if only_price:
            shared_price.append(only_price[0])
    except:
        no_price = 'Missing_Data'
        shared_price.append(no_price)
            
    # Extracting the layout of each individual listing
    try:
        layout_info = information.findAll("span", {"class":"shared-line-bubble"})
        only_layout = layout_info[0].text
        if only_layout:
            shared_layout.append(only_layout)
    except:
        no_layout = 'Missing_Data'
        shared_layout.append(no_layout)
    
    # Extracting the bedroom of each individual listing
    try:
        layout_info = information.findAll("span", {"class":"shared-line-bubble"})
        only_layout = layout_info[0].text
        regex_layout = re.findall(r'\d+', only_layout)
        if regex_layout:
            bedroom_info = regex_layout[0]
            shared_bedroom.append(bedroom_info)
    except:
        no_bedroom = 'Missing_Data'
        shared_bedroom.append(no_bedroom)
        
    # Extracting the bathroom of each individual listing
    try:
        layout_info = information.findAll("span", {"class":"shared-line-bubble"})
        only_layout = layout_info[0].text
        regex_layout = re.findall(r'\d+', only_layout)
        if regex_layout:
            bathroom_info = regex_layout[1]
            shared_bathroom.append(bathroom_info)
    except:
        no_bathroom = 'Missing_Data'
        shared_bathroom.append(no_bathroom)
    
    # Extracting the square feet of each individual listing 
    try:
        layout_info = information.findAll("span", {"class":"shared-line-bubble"})
        squarefeet_info = layout_info[1].text
        regex_sqft = re.findall(r'\d{3,4}ft\d', squarefeet_info)
        if regex_sqft:
            only_sqft = re.findall(r'\d+', regex_sqft[0])
            if only_sqft:
                shared_sqft.append(only_sqft[0])
        else:
            try:
                description_information = information.find("section",
                                                           {"id" :"postingbody"}).findAll(text=True,
                                                                                          recursive=False)                          
                description_info = ''.join(description_information)
                initial_regex_search = re.findall(r'(\d{3,4}|\d+[\,]?\d+)[\s]?[S|s][q|f]', description_info)
                if initial_regex_search:
                    shared_sqft.append(initial_regex_search[0])
                else:
                    second_regex_search = re.findall(r'\w+[\s]?\w+[\s]?:[\s]?\d+[-]?\d+', description_info)
                    if second_regex_search:
                        regex_digits_only = re.findall(r'\d+[-]?\d+', second_regex_search)
                        if regex_digits_only:
                            shared_sqft.append(regex_digits_only[0])
                        else:
                            no_regex_digits = "Missing_Data"
                            shared_sqft.append(no_regex_digits)
                    else:
                        no_second_regex_search = "Missing_Data"
                        shared_sqft.append(no_second_regex_search)
            except:
                no_description = 'Missing_Data'
                shared_sqft.append(no_description)
    except:
        no_listing = 'Missing_Data'
        shared_sqft.append(no_listing)
        
    # Extracting the latitude of each individual listing
    try:
        location_info = information.find("div",{"class": "viewposting"})
        latitude_info = location_info.get("data-latitude")
        if latitude_info:
            shared_latitude.append(latitude_info)
    except:
        no_latitude_info = 'Missing_Data'
        shared_latitude.append(no_latitude_info)
                
    # Extracting the longitude of each individual listing    
    try:
        longitude_info = location_info.get("data-longitude")
        if longitude_info:
            shared_longitude.append(longitude_info)
    except:
        no_longitude_info = 'Missing_Data'
        shared_longitude.append(no_longitude_info)
            
    # Extracting the text below map of each individual listing
    try:
        map_text_info = information.find("div",{"class":"mapaddress"})
        only_map_text = map_text_info.text
        if only_map_text:
            shared_map_text.append(only_map_text)
    except:
        no_map_text = "Missing_Data"
        shared_map_text.append(no_map_text)
                
    # Extracting the text below map of each individual listing
    try:
        google_map_url_info = information.find("p",{"class":"mapaddress"})
        only_google_map_url = google_map_url_info.find("a").get("href")
        if only_google_map_url:
            shared_google_map_url.append(only_google_map_url)
    except:
        no_google_map_url = 'Missing_Data'
        shared_google_map_url.append(no_google_map_url)
    
    # Extracting the description of each individual listing 
    try:
        description_information = information.find("section",{"id" :"postingbody"}).findAll(text=True, recursive=False)                          
        description_info = ''.join(description_information)
        if description_info:
            shared_description.append(description_info)
    except:
        no_description = "Missing_Data"
        shared_description.append(no_description)
    
    sleep(2)

print('There were %s shared url\'s scraped.' % (len(shared_url)))
print('There were %s shared listing id\'s scraped.' % (len(shared_listing_id)))   
print('There were %s shared price scraped.' % (len(shared_price)))
print('There were %s shared layout scraped.' % (len(shared_layout)))
print('There were %s shared bedroom scraped.' % (len(shared_bedroom))) 
print('There were %s shared bathroom scraped.' % (len(shared_bathroom)))
print('There were %s shared square feet scraped.' % (len(shared_sqft)))
print('There were %s shared description scraped.' % (len(shared_description)))
print('There were %s shared latitude scraped.' % (len(shared_latitude)))
print('There were %s shared longitude scraped.' % (len(shared_longitude))) 
print('There were %s shared map text scraped.' % (len(shared_map_text))) 
print('There were %s shared google map url scraped.' % (len(shared_google_map_url))) 

There were 17 shared url's scraped.
There were 17 shared listing id's scraped.
There were 17 shared price scraped.
There were 17 shared layout scraped.
There were 17 shared bedroom scraped.
There were 17 shared bathroom scraped.
There were 17 shared square feet scraped.
There were 17 shared description scraped.
There were 17 shared latitude scraped.
There were 17 shared longitude scraped.
There were 17 shared map text scraped.
There were 17 shared google map url scraped.


### Creating the Dataframe using pandas library

In [20]:
craiglist_shared_dataset = pd.DataFrame(np.column_stack([shared_listing_id, shared_price, shared_layout, 
                                                         shared_bedroom, shared_bathroom, shared_sqft, 
                                                         shared_latitude, shared_longitude, shared_url, 
                                                         shared_map_text, shared_google_map_url, 
                                                         shared_description]),
                                        columns = ['shared_listing_id','shared_price', 'shared_layout',
                                                   'shared_bedroom', 'shared_bathroom', 'shared_sqft', 
                                                   'shared_latitude', 'shared_longitude', 'shared_url', 
                                                   'shared_map_text', 'shared_google_map_url', 'shared_description'])

In [21]:
craiglist_shared_dataset.head()

Unnamed: 0,shared_listing_id,shared_price,shared_layout,shared_bedroom,shared_bathroom,shared_sqft,shared_latitude,shared_longitude,shared_url,shared_map_text,shared_google_map_url,shared_description
0,6466160211,595,available feb 1,1,Missing_Data,Missing_Data,Missing_Data,Missing_Data,https://toronto.craigslist.ca/tor/roo/d/roomma...,Missing_Data,Missing_Data,\n\nGreat apartment in Toronto. Its a 2 bedroo...
1,6465918192,500,available jan 19,19,Missing_Data,Missing_Data,43.665906,-79.385854,https://toronto.craigslist.ca/tor/roo/d/lookin...,Missing_Data,https://maps.google.com/maps/preview/@43.66590...,"\n\nI'm 35/m, looking for a female room mate t..."
2,6465974317,675,available feb 1,1,Missing_Data,Missing_Data,43.642946,-79.407386,https://toronto.craigslist.ca/tor/roo/d/room-f...,801 king st west,https://maps.google.com/?q=loc%3A+%38%30%31+ki...,"\n\nRoom Available for rent, Room is cozy and ..."
3,6459430804,1500,available jan 14,14,Missing_Data,Missing_Data,43.644649,-79.523574,https://toronto.craigslist.ca/tor/roo/d/amazin...,Missing_Data,https://maps.google.com/maps/preview/@43.64464...,\n\nLocation location location!!\n\nI am looki...
4,6458490025,750,available jan 13,13,Missing_Data,Missing_Data,43.670371,-79.405301,https://toronto.craigslist.ca/tor/roo/d/furnis...,Missing_Data,https://maps.google.com/maps/preview/@43.67037...,\n\nShared accommodation available immediately...


### Exporting the DataFrame into Excel file to store the results

In [22]:
craiglist_shared_dataset.to_excel("Final_Toronto_Craiglist_Condo_Shared_dataset_Jan_20.xlsx")

### Creating empty containers for sale listing url's

In [23]:
sale_url = []                 # List of all the sale Url scraped from the website
sale_listing_id = []          # List of all the craiglist listing id's scraped from each sale url
sale_price = []               # List of all the price information scraped from each sale url
sale_layout = []              # List of all the layout information scraped from each sale url
sale_bedroom = []             # List of all the bedroom information scraped from each sale url
sale_bathroom = []            # List of all the bathroom information scraped from each sale url
sale_sqft = []                # List of all the square feet information scraped from each sale url
sale_description = []         # List of all the description information scraped from each sale url
sale_latitude = []            # List of all the latitude information scraped from each sale url
sale_longitude = []           # List of all the longitude information scraped from each sale url
sale_map_text = []            # List of all the map text information scraped from each sale url
sale_google_map_url = []      # List of all the google map url information scraped from each sale url

### Web scraping all the sale listing url's from sale_listing

In [24]:
# Scraping the content of each url on the search page from the sale listing   
for saleurl in range(len(sale_listing)):
    information = listing_extractor(sale_listing[saleurl])
    sale_url.append(sale_listing[saleurl])
    
    # Extracting the craiglist listing ID of each individual listing
    try:
        posting_info = information.find("div", {"class":"postinginfos"})
        locate_id = posting_info.find("p",{"class":"postinginfo"})
        id_only = re.findall(r'\d+', locate_id.text)
        result = '%s' % "','".join(id_only)
        if result:
            sale_listing_id.append(result)
    except:
        no_id = 'Missing_Data' 
        sale_listing_id.append(no_id)
    
    # Extracting the price of each individual listing
    try:
        price_info = information.find("span", {"class":"price"}).string
        only_price = re.findall(r'\d+', price_info)
        if only_price:
            sale_price.append(only_price[0])
    except:
        no_price = 'Missing_Data'
        sale_price.append(no_price)
            
    # Extracting the layout of each individual listing
    try:
        layout_info = information.findAll("span", {"class":"shared-line-bubble"})
        only_layout = layout_info[0].text
        if only_layout:
            sale_layout.append(only_layout)
    except:
        no_layout = 'Missing_Data'
        sale_layout.append(no_layout)
    
    # Extracting the bedroom of each individual listing
    try:
        layout_info = information.findAll("span", {"class":"shared-line-bubble"})
        only_layout = layout_info[0].text
        regex_layout = re.findall(r'\d+', only_layout)
        if regex_layout:
            bedroom_info = regex_layout[0]
            sale_bedroom.append(bedroom_info)
    except:
        no_bedroom = 'Missing_Data'
        sale_bedroom.append(no_bedroom)
        
    # Extracting the bathroom of each individual listing
    try:
        layout_info = information.findAll("span", {"class":"shared-line-bubble"})
        only_layout = layout_info[0].text
        regex_layout = re.findall(r'\d+', only_layout)
        if regex_layout:
            bathroom_info = regex_layout[1]
            sale_bathroom.append(bathroom_info)
    except:
        no_bathroom = 'Missing_Data'
        sale_bathroom.append(no_bathroom)
    
    # Extracting the square feet of each individual listing 
    try:
        layout_info = information.findAll("span", {"class":"shared-line-bubble"})
        squarefeet_info = layout_info[1].text
        regex_sqft = re.findall(r'\d{3,4}ft\d', squarefeet_info)
        if regex_sqft:
            only_sqft = re.findall(r'\d+', regex_sqft[0])
            if only_sqft:
                sale_sqft.append(only_sqft[0])
        else:
            try:
                description_information = information.find("section",
                                                           {"id" :"postingbody"}).findAll(text=True,
                                                                                          recursive=False)                          
                description_info = ''.join(description_information)
                initial_regex_search = re.findall(r'(\d{3,4}|\d+[\,]?\d+)[\s]?[S|s][q|f]', description_info)
                if initial_regex_search:
                    sale_sqft.append(initial_regex_search[0])
                else:
                    second_regex_search = re.findall(r'\w+[\s]?\w+[\s]?:[\s]?\d+[-]?\d+', description_info)
                    if second_regex_search:
                        regex_digits_only = re.findall(r'\d+[-]?\d+', second_regex_search)
                        if regex_digits_only:
                            sale_sqft.append(regex_digits_only[0])
                        else:
                            no_regex_digits = "Missing_Data"
                            sale_sqft.append(no_regex_digits)
                    else:
                        no_second_regex_search = "Missing_Data"
                        sale_sqft.append(no_second_regex_search)
            except:
                no_description = 'Missing_Data'
                sale_sqft.append(no_description)
    except:
        no_listing = 'Missing_Data'
        sale_sqft.append(no_listing)
        
    # Extracting the latitude of each individual listing
    try:
        location_info = information.find("div",{"class": "viewposting"})
        latitude_info = location_info.get("data-latitude")
        if latitude_info:
            sale_latitude.append(latitude_info)
    except:
        no_latitude_info = 'Missing_Data'
        sale_latitude.append(no_latitude_info)
                
    # Extracting the longitude of each individual listing    
    try:
        longitude_info = location_info.get("data-longitude")
        if longitude_info:
            sale_longitude.append(longitude_info)
    except:
        no_longitude_info = 'Missing_Data'
        sale_longitude.append(no_longitude_info)
            
    # Extracting the text below map of each individual listing
    try:
        map_text_info = information.find("div",{"class":"mapaddress"})
        only_map_text = map_text_info.text
        if only_map_text:
            sale_map_text.append(only_map_text)
    except:
        no_map_text = "Missing_Data"
        sale_map_text.append(no_map_text)
                
    # Extracting the text below map of each individual listing
    try:
        google_map_url_info = information.find("p",{"class":"mapaddress"})
        only_google_map_url = google_map_url_info.find("a").get("href")
        if only_google_map_url:
            sale_google_map_url.append(only_google_map_url)
    except:
        no_google_map_url = 'Missing_Data'
        sale_google_map_url.append(no_google_map_url)
    
    # Extracting the description of each individual listing 
    try:
        description_information = information.find("section",{"id" :"postingbody"}).findAll(text=True, recursive=False)                          
        description_info = ''.join(description_information)
        if description_info:
            sale_description.append(description_info)
    except:
        no_description = "Missing_Data"
        sale_description.append(no_description)
    
    sleep(2)

print('There were %s sale url\'s scraped.' % (len(sale_url)))
print('There were %s sale listing id\'s scraped.' % (len(sale_listing_id)))   
print('There were %s sale price scraped.' % (len(sale_price)))
print('There were %s sale layout scraped.' % (len(sale_layout)))
print('There were %s sale bedroom scraped.' % (len(sale_bedroom))) 
print('There were %s sale bathroom scraped.' % (len(sale_bathroom)))
print('There were %s sale square feet scraped.' % (len(sale_sqft)))
print('There were %s sale description scraped.' % (len(sale_description)))
print('There were %s sale latitude scraped.' % (len(sale_latitude)))
print('There were %s sale longitude scraped.' % (len(sale_longitude))) 
print('There were %s sale map text scraped.' % (len(sale_map_text))) 
print('There were %s sale google map url scraped.' % (len(sale_google_map_url))) 

There were 662 sale url's scraped.
There were 662 sale listing id's scraped.
There were 662 sale price scraped.
There were 662 sale layout scraped.
There were 662 sale bedroom scraped.
There were 662 sale bathroom scraped.
There were 662 sale square feet scraped.
There were 662 sale description scraped.
There were 662 sale latitude scraped.
There were 662 sale longitude scraped.
There were 662 sale map text scraped.
There were 662 sale google map url scraped.


### Creating the Dataframe using pandas library

In [25]:
craiglist_sale_dataset = pd.DataFrame(np.column_stack([sale_listing_id, sale_price, sale_layout, 
                                                         sale_bedroom, sale_bathroom, sale_sqft, 
                                                         sale_latitude, sale_longitude, sale_url, 
                                                         sale_map_text, sale_google_map_url, 
                                                         sale_description]),
                                        columns = ['sale_listing_id','sale_price', 'sale_layout',
                                                   'sale_bedroom', 'sale_bathroom', 'sale_sqft', 
                                                   'sale_latitude', 'sale_longitude', 'sale_url', 
                                                   'sale_map_text', 'sale_google_map_url', 'sale_description'])

In [26]:
craiglist_sale_dataset.head()

Unnamed: 0,sale_listing_id,sale_price,sale_layout,sale_bedroom,sale_bathroom,sale_sqft,sale_latitude,sale_longitude,sale_url,sale_map_text,sale_google_map_url,sale_description
0,6466167559,Missing_Data,2BR / 2Ba,2,2,Missing_Data,43.711164,-79.726024,https://toronto.craigslist.ca/bra/reo/d/condo-...,8 Lisa st.#701,https://maps.google.com/?q=loc%3A+%38+Lisa+st%...,\n\n2 bed 2 full bath 1 park and locker
1,6461524938,Missing_Data,1BR / 1Ba,1,1,500,43.642707,-79.382305,https://toronto.craigslist.ca/tor/reb/d/downto...,65 Bremner Blvd.,https://maps.google.com/?q=loc%3A+%36%35+Bremn...,\n\nDIRECTLY CONNECTED to the AIR CANADA CENTR...
2,6462079074,369000,1BR / 1Ba,1,1,Missing_Data,43.613579,-79.560476,https://toronto.craigslist.ca/tor/reb/d/high-p...,105 The Queensway,https://maps.google.com/?q=loc%3A+%31%30%35+Th...,\n\nGreat High Park First Time Buyer or Invest...
3,6462122777,339900,2BR / 2Ba,2,2,1150,43.697993,-79.511595,https://toronto.craigslist.ca/tor/reb/d/beauti...,3 Hickory Tree Rd.,https://maps.google.com/?q=loc%3A+%33+Hickory+...,\n\nGorgeous Spacious Condo Suitable for Downs...
4,6462900930,335000,1BR / 1Ba,1,1,720,43.582317,-79.62092,https://toronto.craigslist.ca/mss/reb/d/missis...,115 Hillcrest Ave.,https://maps.google.com/?q=loc%3A+%31%31%35+Hi...,\n\nMississauga Great Investment Condo \nOne ...


### Exporting the DataFrame into Excel file to store the results

In [27]:
craiglist_sale_dataset.to_excel("Final_Toronto_Craiglist_Condo_Sale_dataset_Jan_20.xlsx")