# Download Articles
We want to build a dataset of articles about ultimate, so we will scrape **[Ultiworld](https://ultiworld.com/)** and save the text of each article we find.



In [1]:
import json
import os
import re
import time

import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Load login info from `secrets.json`. The scraping loop below reads
# secrets["ultiworld"]["username"] and secrets["ultiworld"]["password"].
with open("secrets.json") as f:
    secrets = json.load(f)
# Display only the top-level keys of the loaded JSON as a sanity check.
secrets.keys()

dict_keys(['ultiworld'])

In [2]:
class Browser():
    """Thin convenience wrapper around a Selenium Chrome driver.

    Every action that changes the page is followed by a fixed sleep so the
    page has time to react before the next action is issued.
    """

    def __init__(self, service, options, short_delay: float = 1, long_delay: float = 3):
        """Start a Chrome session.

        Args:
            service: Selenium `Service` object passed to `webdriver.Chrome`.
            options: Selenium `ChromeOptions` object passed to `webdriver.Chrome`.
            short_delay: Seconds to wait after typing or clicking.
            long_delay: Seconds to wait after loading a page or opening a tab.
        """
        self.browser = webdriver.Chrome(service=service, options=options)
        self.short_delay = short_delay
        self.long_delay = long_delay

    def open_page(self, url: str):
        """Load `url` in the current tab and wait `long_delay` seconds."""
        self.browser.get(url)
        time.sleep(self.long_delay)

    def close_browser(self):
        """Close the current tab/window only.

        The driver session stays alive; callers must switch to another
        window handle afterwards (see `go_to_tab`) before issuing further
        commands. Use `quit_browser` to end the whole session.
        """
        self.browser.close()

    def quit_browser(self):
        """Close every window and shut down the driver session."""
        self.browser.quit()

    def add_input(self, by: str, value: str, text: str):
        """Type `text` into the element located by (`by`, `value`).

        `by` is a locator strategy such as `By.ID` (these are plain strings).
        """
        field = self.browser.find_element(by=by, value=value)
        field.send_keys(text)
        time.sleep(self.short_delay)

    def click_button(self, by: str, value: str):
        """Click the element located by (`by`, `value`).

        Any Selenium exception raised while locating or clicking the
        element propagates to the caller.
        """
        button = self.browser.find_element(by=by, value=value)
        button.click()
        time.sleep(self.short_delay)

    def login_ultiworld(self, username: str, password: str):
        """Fill in and submit the login form on the current page.

        The form fields are located by the ids `user_login`, `user_pass`
        and `wp-submit`.
        """
        self.add_input(by=By.ID, value="user_login", text=username)
        self.add_input(by=By.ID, value="user_pass", text=password)
        self.click_button(by=By.ID, value="wp-submit")

    def get_page_source(self):
        """Return the HTML source of the currently focused tab."""
        return self.browser.page_source

    def open_new_tab(self, url: str):
        """Open `url` in a new tab and wait `long_delay` seconds.

        Focus stays on the current tab; call `go_to_tab(-1)` to move to the
        new one.
        """
        # Pass the URL as a script argument instead of interpolating it into
        # the JavaScript source: a URL containing a quote would otherwise
        # break the script (or run arbitrary code taken from a scraped href).
        self.browser.execute_script("window.open(arguments[0]);", url)
        time.sleep(self.long_delay)

    def go_to_tab(self, tab_index: int):
        """Switch focus to the tab at `tab_index` in `window_handles`.

        Negative indices are allowed, e.g. -1 for the last handle.
        """
        self.browser.switch_to.window(window_name=self.browser.window_handles[tab_index])
    

In [3]:
def extract_text(page_source):
    """Extract the article body from a page's HTML as plain text.

    Collects the text of every `<p>` element whose markup contains no
    `class=` attribute (on the paragraph itself or on any element nested in
    it), stopping at the first paragraph whose text contains "TAGGED".
    Curly quotes and apostrophes are replaced by their ASCII equivalents.

    Args:
        page_source: HTML source of the article page.

    Returns:
        The kept paragraphs joined with newlines.
    """
    soup = BeautifulSoup(page_source, "html.parser")

    clean_paragraphs = []
    for paragraph in soup.find_all("p"):
        # `"TAGGED" in paragraph` would test membership in the tag's list of
        # children, not its text, so the marker has to be looked up in the
        # rendered text. It is checked for every paragraph so that nothing
        # from the marker paragraph onwards is collected.
        if "TAGGED" in paragraph.get_text():
            break
        if 'class=' not in str(paragraph):
            clean_paragraphs.append(paragraph.text)
    text = "\n".join(clean_paragraphs)

    # Normalise typographic quotes in a single pass.
    quote_table = str.maketrans({'“': '"', '”': '"', "’": "'"})
    return text.translate(quote_table)

In [8]:
# Scrape the Ultiworld search results for each search term and save the text
# of every article that is not on disk yet as `<article_folder>/<slug>.txt`.
url = "https://ultiworld.com/wordpress/wp-login.php?redirect_to=https%3A%2F%2Fultiworld.com%2F"
search_terms = ["strategy", "analysis", "throwing", "cutting", "handling"]
article_folder = "articles"
max_articles_per_search = 200

# Make sure the output folder exists before it is checked for existing files.
os.makedirs(article_folder, exist_ok=True)

for search_term in search_terms:

    # articles downloaded for this search term (one dict per article)
    items = []

    # Instantiate browser object
    browser = Browser(service = Service(), options = webdriver.ChromeOptions())

    try:
        # Open browser and log in to ultiworld
        browser.open_page(url)
        browser.login_ultiworld(secrets["ultiworld"]["username"], secrets["ultiworld"]["password"])

        # search for articles on `search_term`
        browser.click_button(by = By.ID, value = "search-button")
        browser.add_input(by = By.ID, value = "unified-input", text = search_term+Keys.ENTER)
        browser.click_button(by = By.CLASS_NAME, value = "ais-Panel-headerButton")
        browser.click_button(by = By.CLASS_NAME, value = "ais-RefinementList-checkbox")
        time.sleep(5)

        while len(items) < max_articles_per_search:
            time.sleep(np.random.randint(3,6))  # delay to allow page to load

            # get articles and their links on page
            page_source = browser.get_page_source()
            soup = BeautifulSoup(page_source, "html.parser")
            article_links = soup.find_all("a", class_="uni-Hit-inner")

            page_items = [
                {
                    "date": article_link.find("span", class_="uni-Hit-date").text,
                    "title": article_link.find("span", class_="uni-Hit-title").text,
                    "url": article_link.get("href"),
                }
                for article_link in article_links
            ]

            for item in page_items:
                # stop in the middle of a page once the limit is reached
                if len(items) >= max_articles_per_search:
                    break

                # check if article is in folder already and skip if present
                article_filename = re.sub(r'[^\w\s]', '', item['title']).lower().replace(" ","-") + ".txt"
                print(article_filename, end="\r")
                if os.path.exists(os.path.join(article_folder, article_filename)):
                    continue

                # Open link in new tab, move into new tab
                browser.open_new_tab(item['url'])
                browser.go_to_tab(tab_index = -1)

                try:
                    # extract article text and save to disk
                    page_source = browser.get_page_source()
                    text = extract_text(page_source)

                    with open(f"{article_folder}/{article_filename}", "w") as f:
                        f.write(text)
                finally:
                    # Close current tab and return to original tab, even if
                    # extraction or writing failed, so tabs do not pile up
                    browser.close_browser()
                    browser.go_to_tab(tab_index = 0)

                # `append`, not `extend`: extending with a dict would add its
                # three keys and triple the article count
                items.append(item)

            try:
                # go to next search results page
                browser.click_button(by = By.XPATH, value = '//*[@title="Next page"]')
            except WebDriverException:
                # no clickable "Next page" element; a bare `except` here would
                # also swallow KeyboardInterrupt
                print("We have reached the end of the articles")
                break
    finally:
        # End the whole driver session (all windows and the driver process),
        # including when the scrape is interrupted or fails part-way.
        browser.browser.quit()

    print(f"{len(items)} {search_term} articles downloaded")

In [10]:
# browser.go_to_tab(tab_index = 0)
# browser.close_browser()

### Cleaning up files even more
It seems that every article still has some comments (often identical) after the `TAGGED:` entry. So let's iterate through the files and remove these.

In [11]:
# Rewrite every saved article in place, keeping only the text that precedes
# the first occurrence of each marker. "TAGGED:" is the entry after which the
# leftover comments start; the second marker is a hard-coded phrase,
# presumably the start of another leftover comment (confirm before reuse).
files = sorted(os.listdir(article_folder))
for file in files:
    article_path = f"{article_folder}/{file}"

    with open(article_path, "r") as f:
        text = f.read()

    # cut out comments at end
    for marker in ("TAGGED:", "Baylor also went 4-0 "):
        text = text.partition(marker)[0]

    with open(article_path, "w") as f:
        f.write(text)


In [12]:
# Count the entries currently present in `article_folder`.
files = os.listdir(article_folder)
len(files)

568