In [57]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Open the CMRAO public registry, fill in the search form, wait for the user to
# solve the CAPTCHA, and submit the search. Leaves `driver` on the results page.

# Imported here (rather than relying on another cell) so this cell still runs
# on a fresh kernel.
from selenium.common.exceptions import TimeoutException, WebDriverException

# Initialize the WebDriver
options = webdriver.ChromeOptions()
options.add_experimental_option("detach", True)  # This option keeps the browser open
driver = webdriver.Chrome(options=options)

# Open the URL
url = "https://cmrao.ca/consumer-protection/public-registry#company___ltd"
driver.get(url)

# Close popup if it appears
try:
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CLASS_NAME, "ctct-popup-close"))
    ).click()
except TimeoutException:
    # The close button never became clickable within 10 seconds.
    print("No popup appeared")
except WebDriverException as exc:
    # Dismissing the popup is best-effort: report the problem and carry on,
    # but do not swallow KeyboardInterrupt or programming errors.
    print(f"Could not close the popup: {exc!r}")

# Wait for the page to load
time.sleep(5)

# Enter search text
search_input = driver.find_element(By.ID, "search")
search_input.clear()  # Clear any existing text in the search input
search_input.send_keys("ltd")

# Select category
category_label = "Condominium Management Provider (Company)"
category_select = driver.find_element(By.ID, "category")
for option in category_select.find_elements(By.TAG_NAME, "option"):
    if option.text == category_label:
        option.click()
        break
else:
    # Make a label mismatch visible instead of silently searching with
    # whatever category happens to be selected.
    print(f"Warning: category option {category_label!r} not found; keeping the current selection")

# Wait for manual CAPTCHA solving
print("Please solve the CAPTCHA manually...")
WebDriverWait(driver, 120).until(
    EC.presence_of_element_located((By.CLASS_NAME, "g-recaptcha-response"))
)
# The response field exists as soon as the widget is rendered, so its presence
# alone does not show the challenge was solved; the field only receives a
# value (the token) once the user passes the challenge.
try:
    WebDriverWait(driver, 120).until(
        lambda drv: any(
            field.get_attribute("value")
            for field in drv.find_elements(By.CLASS_NAME, "g-recaptcha-response")
        )
    )
except TimeoutException:
    # Best-effort: keep the original behaviour of proceeding to the search.
    print("CAPTCHA token not detected after 120 seconds; attempting the search anyway")

# Click the search button using the class
search_button = driver.find_element(By.CLASS_NAME, "js-to-search-registry")
search_button.click()

# Wait for 5 seconds after clicking the search button
time.sleep(5)

# Collect the result rows shown on the currently active results page
def scrape_data_from_page():
    """Return one dict per result row of the active results page.

    Reads the module-level ``driver``. Each dict maps the five text columns
    ("Legal Name", "Licensed As", "Status", "Licence Number", "Licence Type")
    to the text of the row's first five ``td`` cells, and "View Link" to the
    ``href`` of the ``btn`` element found in the row's last cell.

    The first matched row is always skipped, and rows without any ``td`` cell
    are ignored.
    """
    row_selector = ".individual-results__page.js-results-page.active tbody tr"
    text_fields = ("Legal Name", "Licensed As", "Status", "Licence Number", "Licence Type")

    records = []
    # NOTE(review): the first matched row is treated as a header; confirm the
    # page really puts its header row inside <tbody>.
    for table_row in driver.find_elements(By.CSS_SELECTOR, row_selector)[1:]:
        cells = table_row.find_elements(By.TAG_NAME, "td")
        if not cells:
            continue
        href = cells[-1].find_element(By.CLASS_NAME, "btn").get_attribute("href")
        record = {}
        for position, field in enumerate(text_fields):
            record[field] = cells[position].text
        record["View Link"] = href
        records.append(record)
    return records

# Walk through every results page, collect the rows, then write them to CSV.

# Imported here (rather than relying on another cell) so this section still
# runs on a fresh kernel.
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Initialize an empty list to store all data
all_data = []

# Loop through all pages and scrape data
page = 1
try:
    while True:
        try:
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".individual-results__page.js-results-page.active tbody tr"))
            )
        except TimeoutException:
            print(f"No result rows appeared on page {page}; stopping.")
            break

        data = scrape_data_from_page()
        all_data.extend(data)
        print(f"Scraped data from page {page}")

        # A missing link for page + 1 is the normal end-of-results signal.
        try:
            next_page_button = driver.find_element(By.CSS_SELECTOR, f"a[data-page='{page + 1}']")
        except NoSuchElementException:
            print("No more pages to scrape.")
            break

        next_page_button.click()
        page += 1
        # Wait for the next page to load
        time.sleep(10)  # Increased wait time for next page
except Exception as exc:
    # Keep whatever was collected so far, but say what went wrong instead of
    # reporting a failure as "no more pages". KeyboardInterrupt still propagates.
    print(f"Stopping early on page {page} because of an error: {exc!r}")
finally:
    # Close the WebDriver even when the loop is interrupted.
    driver.quit()

# Save the data to a CSV file. The columns are named explicitly so the header
# row is written even when nothing was scraped (an empty, header-less file
# cannot be read back with pd.read_csv).
df_data = pd.DataFrame(
    all_data,
    columns=["Legal Name", "Licensed As", "Status", "Licence Number", "Licence Type", "View Link"],
)
csv_file_path = "Condo_Management_Data2.csv"
df_data.to_csv(csv_file_path, index=False)

print(f"Data saved to {csv_file_path}")


Please solve the CAPTCHA manually...
Scraped data from page 1
Scraped data from page 2
Scraped data from page 3
Scraped data from page 4
Scraped data from page 5
Scraped data from page 6
Scraped data from page 7
Scraped data from page 8
Scraped data from page 9
No more pages to scrape or an error occurred.
Data saved to Condo_Management_Data2.csv


In [59]:
import pandas as pd

# Keep only the registry rows whose "Status" column is exactly "Active".

# Input (all scraped rows) and output (active rows only)
csv_file_path = "Condo_Management_Data2.csv"
active_csv_file_path = "Active_Condo_Management_Data2.csv"

df = pd.read_csv(csv_file_path)

# Boolean mask: True where the status equals "Active"
is_active = df['Status'].eq('Active')
active_rows = df.loc[is_active]

# Write the filtered rows without the pandas index column
active_rows.to_csv(active_csv_file_path, index=False)

print(f"Active status data saved to {active_csv_file_path}")


Active status data saved to Active_Condo_Management_Data2.csv


In [61]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Visit each company's "View Link" page and collect its contact details.

# Imported here (rather than relying on another cell) so this cell still runs
# on a fresh kernel.
from selenium.common.exceptions import NoSuchElementException

# Input: the file written by the "Active" filter cell. The spelling matches that
# cell exactly, because file names are case-sensitive on Linux.
active_csv_file_path = 'Active_Condo_Management_Data2.csv'
# Output: one row of contact details per company.
scraped_csv_file_path = 'scraped_condo_management_data2.csv'

# Seconds to let each detail page render before reading it.
PAGE_LOAD_SECONDS = 5  # Adjust the sleep time as needed

# Heading text on the detail page -> XPath step, relative to that <h5> heading,
# that locates the element holding the value. The heading text is also used as
# the output column name.
DETAIL_FIELDS = {
    'Principal Condominium Manager': 'p/a',
    'Business Address': 'p',
    'Business Email Address': 'a',
    'Business Phone Number': 'a',
}


def _detail_text(browser, heading, value_step):
    """Return the text found under ``heading`` on the current page, or 'N/A'.

    Args:
        browser: WebDriver positioned on a detail page.
        heading: exact text of the ``h5`` heading inside the
            ``individual-record__info`` block.
        value_step: XPath step, relative to the heading's following siblings,
            selecting the element whose text is wanted (e.g. ``'p/a'``).

    Returns:
        The element's text, or ``'N/A'`` when no matching element exists.
    """
    xpath = (
        "//div[@class='individual-record__info']"
        f"//h5[text()='{heading}']/following-sibling::{value_step}"
    )
    try:
        return browser.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return 'N/A'


# Load the CSV file with URLs (before starting the browser, so a missing file
# does not leave a browser window behind)
df = pd.read_csv(active_csv_file_path)

# Initialize a list to store the scraped data
scraped_data = []

# Initialize the WebDriver
driver = webdriver.Chrome()  # You can replace Chrome with Firefox or any other browser
try:
    # Iterate over each URL in the CSV
    for _, row in df.iterrows():
        url = row['View Link']
        record = {
            'Licensed As': row['Licensed As'],
            'Licence Number': row['Licence Number'],
        }

        if pd.isna(url):
            # No link was captured for this row: nothing to visit.
            record.update(dict.fromkeys(DETAIL_FIELDS, 'N/A'))
        else:
            driver.get(url)
            # Wait for the page to load
            time.sleep(PAGE_LOAD_SECONDS)
            for heading, value_step in DETAIL_FIELDS.items():
                record[heading] = _detail_text(driver, heading, value_step)

        record['View Link'] = url
        scraped_data.append(record)
finally:
    # Close the WebDriver even if a page fails to load.
    driver.quit()

# Convert the list to a DataFrame
scraped_df = pd.DataFrame(scraped_data)

# Save the DataFrame to a new CSV file
scraped_df.to_csv(scraped_csv_file_path, index=False)

# Report the file that was actually written.
print(f'Scraping complete. Data saved to {scraped_csv_file_path}')


Scraping complete. Data saved to scraped_condo_management_data.csv


In [63]:
import pandas as pd

# Order the contact sheet alphabetically by its "Licensed As" column.
# NOTE(review): no cell visible in this notebook excerpt writes
# "Active_condo_management_contact.csv" — confirm where that file comes from.
contacts_csv_path = "Active_condo_management_contact.csv"
sorted_csv_path = "sorted_condo_management_data.csv"

# Read the input, sort ascending on the single key column
df = pd.read_csv(contacts_csv_path)
df_sorted = df.sort_values("Licensed As")

# Write the sorted rows without the pandas index column
df_sorted.to_csv(sorted_csv_path, index=False)

done_message = "The rows have been sorted alphabetically based on the 'Licensed As' column."
print(done_message)


The rows have been sorted alphabetically based on the 'Licensed As' column.
