# Imports

In [64]:
import time
from pathlib import Path

import pandas as pd
import requests
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager

# Data Scraping

We need to scrape workout data from WODWELL.com. There are too many pages to collect by hand, so rather than manually entering every workout name we automate the scraping with Selenium.

In [69]:
# Scrape every workout ("WOD") card from wodwell.com into two parallel lists:
# wod_names[i] is the <h1> title of a card and wod_text[i] is its full text.

# Create the driver (ChromeDriverManager installs the chromedriver or reuses its cached copy)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Infinite-scroll settings
SCROLL_PAUSE_TIME = 10  # seconds to wait after each scroll so new WODs can load
MAX_SCROLLS = 300       # upper bound on scroll iterations

wod_names = []
wod_text = []

# try/finally guarantees the browser is shut down even if the scrape fails
# part-way through (missing button, network error, manual interrupt, ...).
try:
    driver.get('https://wodwell.com/wods')

    # Click the "show all" button so infinite scroll is on.
    # NOTE(review): this absolute XPath is tied to the current page layout and
    # will raise NoSuchElementException if the site's markup changes.
    driver.find_element(By.XPATH, '/html/body/div[1]/div[6]/div[6]/div/div/div/div/button').send_keys("\n")

    # Get the initial scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")

    for i in range(MAX_SCROLLS):
        # Scroll down to the bottom of the page
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait for the next batch of workouts to load
        time.sleep(SCROLL_PAUSE_TIME)

        # Stop as soon as scrolling no longer makes the page any taller
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    workouts = driver.find_elements(By.XPATH, '//div[@class="wod-description"]')
    for wod in workouts:
        try:
            name = wod.find_element(By.TAG_NAME, 'h1').text
        except NoSuchElementException:
            # Matching div without an <h1> title: not a workout card, skip it
            print('non-wod element')
            continue
        text = wod.text
        # Append both values together so the two lists always stay aligned
        wod_names.append(name)
        wod_text.append(text)
finally:
    # quit() ends the whole WebDriver session (browser and chromedriver process);
    # close() would only close the current window.
    driver.quit()

print(len(wod_names))
print(len(wod_text))




[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [/Users/jake/.wdm/drivers/chromedriver/mac64/102.0.5005.61/chromedriver] found in cache


non-wod element
non-wod element
non-wod element
4116
4116


After scraping 4116 workouts from WODWELL, we store them in a DataFrame that pairs each workout name with its workout text. There is still plenty of data cleaning to do, and we may later scrape likes and comments as well, but the initial data gathering is done.

In [70]:
# Build one row per workout: the scraped name alongside its full text.
df = pd.DataFrame(data=[*zip(wod_names, wod_text)], columns=['Name', 'Text'])
# Preview the first five rows.
print(df.head(5))

           Name                                               Text
0         MURPH  MURPH\nCrossFit Hero WOD\nFor Time\n1 mile Run...
1            DT  DT\nCrossFit Hero WOD\n5 Rounds For Time\n12 D...
2         CINDY  CINDY\nCrossFit Benchmark "Girl" WOD\nAMRAP in...
3  BEAR COMPLEX  BEAR COMPLEX\nCrossFit Benchmark WOD\n5 Rounds...
4          FRAN  FRAN\nCrossFit "Girl" Benchmark WOD\n21-15-9 R...


In [81]:
# Save the dataframe as a CSV so we don't have to scrape the workouts again.
datapath = '../data/raw/namesAndText.csv'
# Create the target folder first: to_csv does not create missing directories,
# and failing here would throw away the result of the long scrape above.
Path(datapath).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(datapath, index=False)

In [80]:
# Summarize both text columns (count / unique / top / freq); a unique count
# below the row count means some names or texts appear more than once.
df.describe()

Unnamed: 0,Name,Text
count,4116,4116
unique,3895,4008
top,JUSTIN,EDWARD WHITE\n555 Fitness Hero WOD\nEMOM for 2...
freq,4,2
