# Collect Incident Links

In [1]:
from selenium import webdriver 
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException
import time

# Walk every page of the paginated incident list and collect the link of each
# incident into `incident_links`.
url = "https://www.torontomu.ca/community-safety-security/security-incidents/list-of-security-incidents/"
# NOTE(review): the driver path is passed positionally here; newer Selenium
# releases expect a Service object instead - confirm against the installed version.
driver = webdriver.Chrome(ChromeDriverManager().install())

page_num = 1
incident_links = []

# try/finally guarantees the browser is shut down even if a page fails to
# load or an element lookup raises part-way through the crawl.
try:
    driver.get(url)

    while True:
        # Brief pause before reading the links of the current page.
        time.sleep(1)

        links = driver.find_elements(By.CLASS_NAME, 'title')
        # Elements without an href yield None; skip them so that every entry
        # in incident_links is a usable URL.
        page_links = [
            href
            for href in (link.get_attribute('href') for link in links)
            if href
        ]
        incident_links.extend(page_links)

        print(f"Page: {page_num}, Page Links: {len(page_links)}, Total Links: {len(incident_links)}")
        page_num += 1

        try:
            next_page = WebDriverWait(driver, 20).until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, '[rel="next"]'))
            )
        except TimeoutException:
            # No visible "next" control within 20 s: stop paginating.
            break

        # get_attribute returns None when the element has no class attribute,
        # so fall back to an empty string before the substring test.
        if 'disabled' in (next_page.get_attribute('class') or ''):
            break

        next_page.click()

        try:
            # Wait until the clicked control is detached from the DOM before
            # reading links again; give up on the crawl if that never happens.
            WebDriverWait(driver, 20).until(EC.staleness_of(next_page))
        except TimeoutException:
            break
finally:
    driver.quit()

Page: 1, Page Links: 10, Total Links: 10
Page: 2, Page Links: 10, Total Links: 20
Page: 3, Page Links: 10, Total Links: 30
Page: 4, Page Links: 10, Total Links: 40
Page: 5, Page Links: 10, Total Links: 50
Page: 6, Page Links: 10, Total Links: 60
Page: 7, Page Links: 10, Total Links: 70
Page: 8, Page Links: 10, Total Links: 80
Page: 9, Page Links: 10, Total Links: 90
Page: 10, Page Links: 10, Total Links: 100
Page: 11, Page Links: 10, Total Links: 110
Page: 12, Page Links: 10, Total Links: 120
Page: 13, Page Links: 10, Total Links: 130
Page: 14, Page Links: 10, Total Links: 140
Page: 15, Page Links: 10, Total Links: 150
Page: 16, Page Links: 10, Total Links: 160
Page: 17, Page Links: 10, Total Links: 170
Page: 18, Page Links: 10, Total Links: 180
Page: 19, Page Links: 10, Total Links: 190
Page: 20, Page Links: 10, Total Links: 200
Page: 21, Page Links: 10, Total Links: 210
Page: 22, Page Links: 10, Total Links: 220
Page: 23, Page Links: 10, Total Links: 230
Page: 24, Page Links: 10, Tot

# Scrape Data From Incidents

In [2]:
import requests
from bs4 import BeautifulSoup

def _stripped_text(element):
    """Return the whitespace-stripped text of *element*, or None if it is missing."""
    return element.text.strip() if element is not None else None


def _text_after_label(element, label):
    """Return the text that follows *label* inside *element*.

    Returns None when *element* is missing. When the label does not occur in
    the element's text, the whole stripped text is returned instead of
    raising IndexError.
    """
    if element is None:
        return None
    parts = element.text.split(label)
    if len(parts) < 2:
        return element.text.strip()
    return parts[1].strip()


def scrape_incident(incident_url, timeout=30):
    """Download one incident page and extract its fields.

    Args:
        incident_url: URL of the incident page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot block the run indefinitely.

    Returns:
        A tuple ``(incident_info, descriptions)``:

        * ``incident_info`` is the list ``[incident_url, title, date,
          location, date_of_incident, date_reported, incident_details]``;
          a field is None when its element is not found on the page.
        * ``descriptions`` is a non-empty list of ``[incident_url,
          paragraph_num, text]`` rows, one per ``<p>`` in the description
          block, numbered from 1. When there is no description block, or it
          has no paragraphs, a single ``[incident_url, 1, None]`` row is
          returned.

        ``(None, None)`` is returned, after printing a "Failed" line, when
        the request raises or the status code is not 200.
    """
    try:
        # The context manager closes the response even if reading it raises.
        with requests.get(incident_url, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed: {response.status_code} - {incident_url}")
                return None, None
            soup = BeautifulSoup(response.text, 'html.parser')
    except requests.RequestException as exc:
        # Report network errors the same way as a bad status code so that a
        # single unreachable page does not abort the caller's loop.
        print(f"Failed: {exc} - {incident_url}")
        return None, None

    title = _stripped_text(soup.find('h1', class_='title'))
    date = _stripped_text(soup.find('div', class_='date'))

    # These blocks carry a heading inside their text; keep only what follows it.
    location = _text_after_label(
        soup.find('div', class_='location'), 'Location of incident'
    )
    date_of_incident = _text_after_label(
        soup.find('div', class_='dateofincident'), 'Date and time of incident'
    )
    date_reported = _text_after_label(
        soup.find('div', class_='datereported'), 'Date and time reported'
    )

    # Only the first paragraph of the details block is kept; the block may be
    # absent or may contain no <p> at all.
    details_e = soup.find('div', class_='incidentdetails')
    incident_details = _stripped_text(
        details_e.find('p') if details_e is not None else None
    )

    incident_info = [incident_url, title, date, location, date_of_incident, date_reported, incident_details]

    description_e = soup.find('div', class_='description')
    paragraphs = description_e.find_all('p') if description_e is not None else []
    descriptions = [
        [incident_url, number, paragraph.text.strip()]
        for number, paragraph in enumerate(paragraphs, start=1)
    ]
    if not descriptions:
        # Always return at least one row so an incident without description
        # paragraphs is not mistaken for a failed scrape.
        descriptions = [[incident_url, 1, None]]

    return incident_info, descriptions

In [3]:
from tqdm.notebook import tqdm
import pandas as pd

# Scrape every collected incident URL and accumulate the rows of two tables:
# ii_data gets one row per incident, desc_data one row per description paragraph.
ii_data = []
desc_data = []

for url in tqdm(incident_links):
    incident_info, descriptions = scrape_incident(url)

    # scrape_incident signals a failed page with (None, None); skip those.
    if incident_info is None:
        continue

    # Keep the incident even when it has no description rows; testing the
    # truthiness of `descriptions` would silently drop such an incident.
    ii_data.append(incident_info)
    desc_data.extend(descriptions or [])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=796.0), HTML(value='')))




In [4]:
# One row per incident, holding the fields scraped from its page.
incident_info_df = pd.DataFrame(
    ii_data,
    columns=[
        'incident_url',
        'title',
        'date',
        'location',
        'date_of_incident',
        'date_reported',
        'incident_details',
    ],
)

# One row per description paragraph, keyed by the incident URL.
descriptions_df = pd.DataFrame(
    desc_data,
    columns=['incident_url', 'paragraph_num', 'description'],
)

# Write both tables to CSV without the DataFrame index column.
for frame, csv_path in (
    (incident_info_df, 'incident_info.csv'),
    (descriptions_df, 'descriptions.csv'),
):
    frame.to_csv(csv_path, index=False)