In [55]:
# Scrape property-management listings for Toronto from Yellow Pages (yellowpages.ca)
from bs4 import BeautifulSoup
import pandas as pd
import requests
import urllib.parse

# Root of the Yellow Pages search endpoint; the page number and query are appended per request
base_url = 'https://www.yellowpages.ca/search/si'

# One accumulator list per output column (five independent, initially empty lists).
# Each scraped listing appends exactly one value to every list.
company_names, addresses, phone_numbers, website_urls, company_titles = (
    [], [], [], [], []
)

def _text_or_na(tag):
    """Return the stripped text of a BeautifulSoup tag, or 'N/A' when the tag was not found."""
    return tag.get_text(strip=True) if tag else 'N/A'


def _website_or_na(website_tag):
    """Return the external website URL behind a listing's website item, or 'N/A'.

    The anchor's href carries the real target in its 'redirect' query
    parameter; that value is extracted and decoded. 'N/A' is returned when the
    item, the anchor, its href, or the 'redirect' parameter is missing.
    """
    if not website_tag:
        return 'N/A'
    a_tag = website_tag.find('a', class_='mlr__item__cta')
    if not (a_tag and 'href' in a_tag.attrs):
        return 'N/A'
    query = urllib.parse.urlparse(a_tag['href']).query
    redirect_url = urllib.parse.parse_qs(query).get('redirect', [''])[0]
    # NOTE(review): parse_qs already percent-decodes values; the extra unquote
    # is kept from the original in case the site double-encodes - confirm.
    decoded_url = urllib.parse.unquote(redirect_url)
    # An href without a usable 'redirect' parameter gets the same placeholder
    # as every other missing field instead of an empty string.
    return decoded_url or 'N/A'


# Function to parse a single page
def parse_page(url):
    """Fetch one search-results page and append its listings to the module-level lists.

    For every listing container found on the page, exactly one value is
    appended to each of company_names, addresses, phone_numbers, website_urls
    and company_titles ('N/A' when a field is absent), so the five lists stay
    aligned row by row.

    Args:
        url: Full URL of the results page to download.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx HTTP status (previously an error page was parsed silently
            as "zero listings").
    """
    # A timeout prevents one stalled connection from hanging the whole crawl.
    response = requests.get(url, timeout=30)
    # Surface HTTP errors (blocked, not found, server error) to the caller.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    for listing in soup.find_all('div', class_='listing__content__wrap--flexed'):
        name_tag = listing.find('a', class_='listing__name--link listing__link jsListingName')
        address_tag = listing.find('span', class_='listing__address--full')
        phone_tag = listing.find('a', class_='mlr__item__cta jsMlrMenu')
        website_tag = listing.find('li', class_='mlr__item mlr__item--website')
        title_tag = listing.find('div', class_='listing__headings')

        company_names.append(_text_or_na(name_tag))
        addresses.append(_text_or_na(address_tag))

        # The phone number is read from the 'data-phone' attribute, not the link text.
        if phone_tag and 'data-phone' in phone_tag.attrs:
            phone_numbers.append(phone_tag['data-phone'])
        else:
            phone_numbers.append('N/A')

        website_urls.append(_website_or_na(website_tag))
        company_titles.append(_text_or_na(title_tag))

# Walk the paginated search results, stopping at the first page that fails
for page_num in range(1, 67):  # Assuming there are 66 pages
    url = f'{base_url}/{page_num}/Property-Management/Toronto+ON'
    try:
        parse_page(url)
        print(f"Processed page {page_num}")
    except Exception as exc:
        print(f"Failed to process page {page_num}: {exc}")
        break

# (debug label, output column, accumulated values) for each scraped field
fields = [
    ("Company Names", 'Company Name', company_names),
    ("Addresses", 'Address', addresses),
    ("Phone Numbers", 'Phone Number', phone_numbers),
    ("Website URLs", 'Website URL', website_urls),
    ("Company Titles", 'Company Title', company_titles),
]

# Debugging output: all five counts must match for the DataFrame to build
for debug_label, column_key, values in fields:
    print(f"{debug_label}: {len(values)}")

# Assemble the columns into a DataFrame
data = {column: column_values for _label, column, column_values in fields}
df = pd.DataFrame(data)

# Write the result to an Excel workbook (no index column)
output_file = 'property_management_companies.xlsx'
df.to_excel(output_file, index=False)
print(f"Data saved to '{output_file}'")


Processed page 1
Processed page 2
Processed page 3
Processed page 4
Processed page 5
Processed page 6
Processed page 7
Processed page 8
Processed page 9
Processed page 10
Processed page 11
Processed page 12
Processed page 13
Processed page 14
Processed page 15
Processed page 16
Processed page 17
Processed page 18
Processed page 19
Processed page 20
Processed page 21
Processed page 22
Processed page 23
Processed page 24
Processed page 25
Processed page 26
Processed page 27
Processed page 28
Processed page 29
Processed page 30
Processed page 31
Processed page 32
Processed page 33
Processed page 34
Processed page 35
Processed page 36
Processed page 37
Processed page 38
Processed page 39
Processed page 40
Processed page 41
Processed page 42
Processed page 43
Processed page 44
Processed page 45
Processed page 46
Processed page 47
Processed page 48
Processed page 49
Processed page 50
Processed page 51
Processed page 52
Processed page 53
Processed page 54
Processed page 55
Processed page 56
P

In [5]:
# Check the current working directory, i.e. where the relative file paths
# used by this notebook (such as the saved .xlsx files) are resolved
import os
print(os.getcwd())


C:\Users\tangn\python data


In [7]:
# List the entries of the current working directory; as the cell's last
# expression, the returned list is displayed by the notebook
os.listdir()


['.ipynb_checkpoints',
 'property_management_companies.csv',
 'property_management_companies.xlsx',
 'property_management_companies_updated.xlsx',
 'scraper yellow page.ipynb',
 'yellowpages.html']

In [13]:
# Import the Excel file and print its column names
import pandas as pd

# Load the workbook into a DataFrame
file_path = 'property_management_companies.xlsx'
df = pd.read_excel(file_path)

# Print the column names to verify them before the cleaning steps use them
print(df.columns)


Index(['Company Name', 'Address', 'Phone Number', 'Website URL',
       'Company Title'],
      dtype='object')


In [15]:
# Remove rows that repeat an earlier row in every listed column
# (per the author's note, this step is no longer needed)
dedupe_columns = [
    'Company Name',
    'Address',
    'Phone Number',
    'Website URL',
    'Company Title',
]
df_cleaned = df.drop_duplicates(subset=dedupe_columns, keep='first')

# Write the de-duplicated rows to a separate workbook
cleaned_file_path = 'property_management_companies_cleaned.xlsx'
df_cleaned.to_excel(cleaned_file_path, index=False)

print(f"Cleaned data saved to {cleaned_file_path}")


Cleaned data saved to property_management_companies_cleaned.xlsx


In [17]:
# find unique website links
import pandas as pd

# Load the file
file_path = 'property_management_companies.xlsx'
df = pd.read_excel(file_path)

# Print the column names to verify
print(df.columns)

# Keep the first row for each distinct website URL.
# Cells holding the scraper's 'N/A' placeholder are read back by read_excel as
# missing values (NaN), and duplicate detection treats all NaN as equal. A plain
# drop_duplicates on this column would therefore collapse every company without
# a website into a single arbitrary row. Only rows that actually have a URL are
# de-duplicated; rows without one are all kept.
has_website = df['Website URL'].notna()
is_repeat = df.duplicated(subset=['Website URL'], keep='first')
unique_web_df = df[~(has_website & is_repeat)]

# Save the DataFrame with unique website URLs to a new file
unique_web_file_path = 'property_management_companies_unique_web.xlsx'
unique_web_df.to_excel(unique_web_file_path, index=False)

print(f"Data with unique website URLs saved to {unique_web_file_path}")


Index(['Company Name', 'Address', 'Phone Number', 'Website URL',
       'Company Title'],
      dtype='object')
Data with unique website URLs saved to property_management_companies_unique_web.xlsx


In [23]:
# Starting from the unique-website rows, remove entries whose website is a Facebook page
import pandas as pd

# Read the workbook that holds one row per website
file_path = 'property_management_companies_unique_web.xlsx'
df = pd.read_excel(file_path)

# Flag rows whose 'Website URL' contains 'facebook' in any letter case;
# missing URLs count as non-matches (na=False) and are therefore kept
is_facebook = df['Website URL'].str.contains('facebook', case=False, na=False)
filtered_df = df[~is_facebook]

# Write the remaining rows to a new workbook
filtered_file_path = 'property_management_companies_no_facebook.xlsx'
filtered_df.to_excel(filtered_file_path, index=False)

print(f"Data without 'facebook' URLs saved to {filtered_file_path}")



Data without 'facebook' URLs saved to property_management_companies_no_facebook.xlsx


In [27]:
# Extract each website's domain name, without the leading 'www.'
import pandas as pd
from urllib.parse import urlparse

# Function to extract and clean the domain name from URL
def extract_domain(url):
    """Return the bare host name of a URL, lower-cased and without a leading 'www.'.

    Args:
        url: A website URL. Non-string values (e.g. the NaN that pandas uses
            for empty cells, or None) are returned unchanged instead of
            raising.

    Returns:
        The host name, e.g. 'example.com' for 'https://www.Example.com:8080/x'.
        URLs written without a scheme ('www.example.com/about') are handled
        too. If no host can be determined, the input is returned as-is.
    """
    # Empty spreadsheet cells arrive as float NaN; urlparse would raise on them.
    if not isinstance(url, str):
        return url
    candidate = url.strip()
    if not candidate:
        return url
    try:
        parsed_url = urlparse(candidate)
        if parsed_url.netloc:
            # .hostname is lower-cased and excludes any port or credentials
            domain = parsed_url.hostname or ''
        else:
            # Without '//' urlparse puts everything in .path; re-parse the text
            # as a network location so scheme-less URLs still yield a host.
            domain = urlparse('//' + candidate).hostname or ''
            # Guard against placeholders such as 'N/A' being mistaken for hosts.
            if '.' not in domain:
                return url
    except ValueError:
        # Malformed network location (e.g. unbalanced IPv6 brackets)
        return url
    # Remove 'www.' if present
    if domain.startswith('www.'):
        domain = domain[4:]
    return domain if domain else url

# Load the Excel file
# NOTE(review): no cell shown here writes this '..._all_cleaned_407.xlsx' file;
# presumably it was prepared outside the visible cells - confirm its origin.
file_path = 'property_management_companies_all_cleaned_407.xlsx'
df = pd.read_excel(file_path)

# Add a 'Domain' column by applying extract_domain to every 'Website URL' value
df['Domain'] = df['Website URL'].apply(extract_domain)

# Save the DataFrame, now including the 'Domain' column, to a new file
cleaned_file_path = 'property_management_companies_domains.xlsx'
df.to_excel(cleaned_file_path, index=False)

print(f"Data saved to '{cleaned_file_path}'")


Data saved to 'property_management_companies_domains.xlsx'


In [33]:
# Clean up domains: keep only one row per unique domain name
import pandas as pd
from urllib.parse import urlparse

# Function to extract and clean the domain name from URL
def extract_domain(url):
    """Return the bare host name of a URL, lower-cased and without a leading 'www.'.

    Args:
        url: A website URL. Non-string values (e.g. the NaN that pandas uses
            for empty cells, or None) are returned unchanged instead of
            raising.

    Returns:
        The host name, e.g. 'example.com' for 'https://www.Example.com:8080/x'.
        URLs written without a scheme ('www.example.com/about') are handled
        too. If no host can be determined, the input is returned as-is.
    """
    # Empty spreadsheet cells arrive as float NaN; urlparse would raise on them.
    if not isinstance(url, str):
        return url
    candidate = url.strip()
    if not candidate:
        return url
    try:
        parsed_url = urlparse(candidate)
        if parsed_url.netloc:
            # .hostname is lower-cased and excludes any port or credentials
            domain = parsed_url.hostname or ''
        else:
            # Without '//' urlparse puts everything in .path; re-parse the text
            # as a network location so scheme-less URLs still yield a host.
            domain = urlparse('//' + candidate).hostname or ''
            # Guard against placeholders such as 'N/A' being mistaken for hosts.
            if '.' not in domain:
                return url
    except ValueError:
        # Malformed network location (e.g. unbalanced IPv6 brackets)
        return url
    # Remove 'www.' if present
    if domain.startswith('www.'):
        domain = domain[4:]
    return domain if domain else url

# Load the Excel file that already contains a 'Domain' column
file_path = 'property_management_companies_domains.xlsx'
df = pd.read_excel(file_path)

# The 'Domain' column is read from the file above, so it is not recomputed here:
#df['Domain'] = df['Website URL'].apply(extract_domain)

# Remove duplicate rows based on the Domain column (the first row per domain is kept)
df_unique = df.drop_duplicates(subset=['Domain'])

# Save the one-row-per-domain DataFrame to a new file
cleaned_file_path = 'property_management_companies_unique_domains.xlsx'
df_unique.to_excel(cleaned_file_path, index=False)

print(f"Data saved to '{cleaned_file_path}'")


Data saved to 'property_management_companies_unique_domains.xlsx'
