#### Code to scrape data from the Washington Post's Fact Check database

This script uses Selenium to drive the page's interactive features and BeautifulSoup to scrape the necessary data.

In [None]:
import pandas as pd 
from random import randint
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from bs4 import BeautifulSoup
import re

# Launch an incognito Chrome window controlled through a local chromedriver binary.
chromedriver = '/Users/vchau76/Desktop/Graduate School/FSB/STAT5006/Final Project/chromedriver'

option = webdriver.ChromeOptions()
option.add_argument('-incognito')

driver = webdriver.Chrome(
    options=option,
    executable_path=chromedriver,
)

In [None]:
# Open URL of page to scrape: navigate the Selenium-controlled browser to the
# claims database so later cells can interact with the loaded page.

url = 'https://www.washingtonpost.com/graphics/politics/trump-claims-database/'
driver.get(url)

In [None]:
# Use Selenium for clicking button to load dynamic content

from IPython.core.display import clear_output
from time import sleep,time

# Reference point for the request-frequency readout printed inside the loop.
# (Previously assigned to a misspelt name, which made the loop fail with a
# NameError on `start_time` after the first click.)
start_time = time()

# Loop through and click 'Load more claims' button using selenium - loads 50 new claims each time (total of 13435 claims)

requests = 0
while True:
    try:
        # find_element(By.CSS_SELECTOR, ...) is supported by both Selenium 3 and 4,
        # unlike the find_element_by_* helpers.
        driver.find_element(By.CSS_SELECTOR, "button.pg-button").click()

        requests += 1
        # Set random time to wait before clicking button again
        sleep(randint(5,15))
        current_time = time()
        elapsed_time = current_time - start_time

        # print out each request and frequency
        print('Request: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        # wait=True defers clearing until new output arrives, so the line above
        # stays visible until the next status line replaces it
        clear_output(wait = True)

        # Stop once the button no longer offers to load more claims
        button = driver.find_element(By.CSS_SELECTOR, "button.pg-button").text
        if 'Load more claims' not in button:
            print("There are no more claims.")
            break
    except NoSuchElementException as error:
        # The button is missing from the page: report it and stop clicking
        print(error)
        break

In [None]:
# Use BeautifulSoup for webscraping data

html_soup = BeautifulSoup(driver.page_source, 'lxml')


def _strip_label(text, label):
    """Return `text` with a leading `label` prefix and surrounding whitespace removed.

    `str.lstrip(label)` strips any run of the label's *characters*, which can
    eat the start of the value itself; this removes only the exact prefix.
    """
    text = text.strip()
    if text.startswith(label):
        text = text[len(label):]
    return text.strip()


# List to append all data values
lies = []

# Container for each lie with all associated data values we are trying to scrape
claims_container = html_soup.find_all('div', class_ ='claim-row')

# Loop through each container to grab data values for each lie
for container in claims_container:

    # Reset every optional value for this row. Without this, a row that lacks an
    # element would silently reuse the previous row's value (or raise NameError
    # on the first row).
    lies_elem = None
    repeated_elem = 0
    repeated_dates = 0
    topic_elem = None
    source_elem = None

    dates_elem = container.find('span',class_='label').text # date of lie

    analysis_elem = container.find('div',class_='analysis').find('p',class_='pg-bodyCopy').text # Washington Post analysis
    fc_rating_elem = len(container.find_all('span',{'class': 'pinocchio'})) # count number of pinocchios

    # flags for IF statements
    repeated_elem_flag = container.find('span', class_='repeated-total') # present when the lie was repeated
    repeated_dates_flag = container.find('div', class_='repeats') # present when repeat dates are listed
    no_repeat_flag = container.find('div',class_="details not-expanded") # details block holding topic and source
    lies_elem_flag = container.find('p', class_='pg-bodyCopy has-apos') # lie

    if lies_elem_flag:
        lies_elem = lies_elem_flag.text.strip('“”') # lie, without the surrounding curly quotes

    # checks for repeated instances of lie
    if repeated_elem_flag:
        # rstrip('times') strips a character set; that is safe here because the
        # remaining count consists of digits, none of which are in the set
        repeated_elem = container.find('span', class_='underline--green').text.rstrip('times').strip() # number of times lies repeated

    if repeated_dates_flag:
        rp_dates = container.find_all('span','repeat pg-highlight')
        repeated_dates = ', '.join(dates.text for dates in rp_dates)

    if no_repeat_flag:
        topic_elem = _strip_label(no_repeat_flag.select_one('p:nth-of-type(1)').text, 'Topic:')
        source_elem = _strip_label(no_repeat_flag.select_one('p:nth-of-type(2)').text, 'Source:')

    new = ((dates_elem,int(repeated_elem),repeated_dates,topic_elem,source_elem,lies_elem,analysis_elem,fc_rating_elem))
    lies.append(new)


df = pd.DataFrame(lies, columns=['date','times repeated','dates repeated','topic','source','lies','analysis','fact check rating'])

In [None]:
# Display the scraped claims DataFrame for a quick visual check
df

In [None]:
# Count how many claims fall under each topic
df['topic'].value_counts()

In [None]:
# Count how many claims come from each source
df['source'].value_counts()

In [None]:
# Convert date to datetime format
# The format string expects an abbreviated month name, a day and a four-digit
# year separated by spaces (e.g. 'Jan 20 2017').
df['date'] = pd.to_datetime(df['date'], format='%b %d %Y')

In [None]:
# Export to CSV file (written to 'DT_lies.csv' without the index column)
# NOTE: to_csv returns None when it is given a file path, so df_export holds no data.
df_export = df.to_csv('DT_lies.csv',index=False)

In [None]:
# End the Selenium browser session
# quit() closes every window and shuts down the WebDriver session; close()
# would only close the current window and leave the session open.
driver.quit()