In [31]:
# Project:     UFC Event Scraper 
# Author:      Will Carpenter
# Date:        Feb 4th, 2022 

# Importing 
import requests
import csv 
import re
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
from google.colab import files
import datetime
from datetime import date
from pytz import timezone
eastern = timezone('US/Eastern')
import threading 
from concurrent.futures import ThreadPoolExecutor
import multiprocessing
cores = multiprocessing.cpu_count()

In [37]:
# Scrape all UFC event urls from the two landing pages (completed + upcoming).

start_time = time.time()

events_upcoming  = 'http://ufcstats.com/statistics/events/upcoming?page=all'
events_completed = 'http://ufcstats.com/statistics/events/completed?page=all'

# Only anchors whose href contains this prefix are kept as event pages.
# Each collected link is later fetched to retrieve the relevant information
# (date, location, number of listed fights).
event_string = 'http://ufcstats.com/event-details/'

# Seconds to wait on a landing page before giving up instead of hanging the cell.
landing_page_timeout = 30


def _event_links(anchors):
  """Return the href of every anchor in `anchors` that contains `event_string`."""
  hrefs = (str(anchor.get('href')) for anchor in anchors)
  return [href for href in hrefs if event_string in href]


completed_page       = requests.get(events_completed, timeout=landing_page_timeout)
completed_page.raise_for_status()  # fail loudly rather than parse an error page
completed_soup       = BeautifulSoup(completed_page.text, 'html.parser')
all_completed_events = completed_soup.find_all('a')

upcoming_page        = requests.get(events_upcoming, timeout=landing_page_timeout)
upcoming_page.raise_for_status()
upcoming_soup        = BeautifulSoup(upcoming_page.text, 'html.parser')
all_upcoming_events  = upcoming_soup.find_all('a')

# Completed events first, then upcoming events (same order as before).
ufcEventsLinks = _event_links(all_completed_events) + _event_links(all_upcoming_events)

# ~1 seconds runtime
print('Time to execute: ', "{:0.5f}".format(time.time() - start_time))

# ufcEventsLinks


Time to execute:  1.01023


In [38]:
# ufcEvents = []

def concurrent_ufc_events(ufcEventsLinks, increments, fetch=None):
  """Fetch many event pages concurrently and collect one row per link.

  Args:
    ufcEventsLinks: iterable of event page URLs. Repeated URLs are fetched
      only once.
    increments: number of worker threads handed to ThreadPoolExecutor.
    fetch: callable taking one link and returning its row. Defaults to
      get_event_data; mainly useful for substituting a different scraper.

  Returns:
    dict mapping each successfully fetched link to the row returned by
    `fetch`, in first-seen link order. Links whose fetch raised are reported
    on stdout and left out of the result.
  """
  if fetch is None:
    fetch = get_event_data

  start_time = time.time()
  futures = dict()

  # Leaving the `with` block waits for every submitted task to finish,
  # so the elapsed time below covers all downloads.
  with ThreadPoolExecutor(max_workers = increments) as executor:
    for eventLink in ufcEventsLinks:
      if eventLink not in futures:
        futures[eventLink] = executor.submit(fetch, eventLink)

  completion_time = time.time() - start_time
  print("Time to complete: ", completion_time)

  # A worker's exception is re-raised by future.result(), not by submit(),
  # so this is where a failing link has to be handled.
  to_return = dict()
  for eventLink, future in futures.items():
    try:
      to_return[eventLink] = future.result()
    except Exception as error:
      print("Error on link: ", eventLink, error)
  return to_return

def get_event_data(eventLink, timeout=30):
  """Scrape a single event page into a flat row.

  Args:
    eventLink: URL of the event page to download.
    timeout: seconds to wait on the server before giving up, so a stalled
      connection cannot block a worker thread indefinitely.

  Returns:
    [eventLink, title, date, location, listedFights] where title, date and
    location are stripped strings ("---" when the page has no matching
    element) and listedFights is the number of anchors on the page whose
    href contains the fight-details URL prefix.

  Raises:
    requests.RequestException: on a connection failure, a timeout, or an
      HTTP error status.
  """
  fight_string = 'http://ufcstats.com/fight-details/'

  event_page = requests.get(eventLink, timeout=timeout)
  # An error page would otherwise be parsed into a row of "---" placeholders.
  event_page.raise_for_status()
  event_soup = BeautifulSoup(event_page.text, 'html.parser')

  # If several title headings exist, the last one wins.
  title = "---"
  for heading in event_soup.find_all('h2', {'class' : 'b-content__title'}):
    title = heading.text.strip()

  # Named event_date so the imported `date` class is not shadowed.
  event_date = "---"
  location   = "---"
  for detail in event_soup.find_all('li', {'class' : 'b-list__box-list-item'}):
    detail_text = detail.text.strip()
    if 'Date:' in detail_text:
      event_date = detail_text.replace('Date:', '').strip()
    if 'Location' in detail_text:
      location = detail_text.replace('Location:', '').strip()

  # Count the fight-details links found on the event page.
  listedFights = sum(
    1 for anchor in event_soup.find_all('a')
    if fight_string in str(anchor.get('href'))
  )

  return [eventLink, title, event_date, location, listedFights]

In [None]:
# Download every event page, then flatten the {url: row} mapping into a plain
# list of rows (same order as the mapping).
events = concurrent_ufc_events(ufcEventsLinks, 8) # use 8 workers for ~45 second runtime
ufcEvents = list(events.values())

In [None]:
# Create and clean dataset 

# Take "today" in US/Eastern (the `eastern` timezone defined with the imports)
# so the upcoming flag does not depend on the clock/timezone of whatever
# machine hosts the kernel.
today = datetime.datetime.now(eastern).date()
print("Today's date:", today)

df = pd.DataFrame(ufcEvents, columns=['eventLink', 'eventName', 'eventDateString', 'eventLocation', \
                                     'eventListedFights'])
# Clean/engineer dataset 
# Drop the comma so the string matches the "%B %d %Y" format below.
df['eventDateString'] = df['eventDateString'].replace(',', '', regex=True)
df['date'] = pd.to_datetime(df['eventDateString'], format="%B %d %Y")
# Keep plain date objects so they compare directly against `today`.
df['date'] = df['date'].dt.date
df = df.sort_values(by=['date'], ascending=False)
# 1 for events dated strictly after today, 0 otherwise.
df['upcoming'] = np.where(df['date']>today,1,0)
df # show the final dataset 



In [42]:
# Write the dataset to Drive as Excel and CSV, then download the CSV copy.
excel_path = '/content/drive/MyDrive/UFC Model/ufcEventsList.xlsx'
csv_path   = '/content/drive/MyDrive/UFC Model/ufcEventsList.csv'

df.to_excel(excel_path)
df.to_csv(csv_path)
files.download(csv_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>