In [None]:
import urllib.request
from bs4 import BeautifulSoup
import math
import matplotlib.pyplot as plt
from selenium import webdriver
import time
import numpy as np
from scipy.interpolate import splrep, splev

In [None]:
# Scoreboard dates to process, written as the "YYYY/MM/DD" path fragment that
# get_sb_url() appends to the scoreboard base URL.
dates = [
    "2018/03/15",
    "2018/03/16",
    "2018/03/17",
    "2018/03/18",
    "2018/03/22",
    "2018/03/23",
    "2018/03/24",
    "2018/03/25",
    # Every active entry ends with a comma on purpose: without it, un-commenting
    # a date below would make Python join two adjacent string literals into one
    # bogus date instead of adding a new list element.
    #"2018/03/31",
    #"2018/04/02",
]

In [None]:
# Generates scoreboard url

def get_sb_url(date):
    """Return the ncaa.com men's D1 basketball scoreboard URL for ``date``.

    ``date`` is appended verbatim to the scoreboard base path, so it must be
    a string such as "2018/03/15".
    """
    scoreboard_root = "https://www.ncaa.com/scoreboard/basketball-men/d1/"
    return scoreboard_root + date
    

In [None]:
# Produces urls for play-by-play pages

# retrieves html from generated requested_url
def retrieve_html(requested_url):
    """Fetch ``requested_url`` and return the raw response body as bytes.

    The request is sent with a ``User-Agent`` header of "Mozilla/5.0".  The
    response is opened in a ``with`` block so it is closed even when reading
    the body raises.
    """
    req = urllib.request.Request(requested_url, headers={'User-Agent':'Mozilla/5.0'})
    with urllib.request.urlopen(req) as response:
        return response.read()

# creates soup from html
def create_soup(html):
    """Parse ``html`` with BeautifulSoup's "html.parser" backend and return the result."""
    return BeautifulSoup(html, "html.parser")

# returns list of play-by-play links found on the scoreboard page for a date
def extract_play_links(date, filter_mm):
    """Collect play-by-play URLs from the scoreboard page for ``date``.

    Args:
        date: Date path fragment handed to ``get_sb_url``.
        filter_mm: When true, keep only games whose fourth child node
            (``game.contents[3]``) has a ``.string`` that is not None; this
            file uses that as the marker for March Madness tournament games.
            When false, every ``gamecenter`` link on the scoreboard is used.

    Returns:
        A list of absolute "https://www.ncaa.com...play-by-play" URLs.  The
        list is empty when the page has no element with ``id="scoreboard"``.
    """
    sbpage_soup = create_soup(retrieve_html(get_sb_url(date)))
    sb = sbpage_soup.find(id='scoreboard')
    if sb is None:
        # No scoreboard element on the page: report "no games" rather than
        # failing with an AttributeError on None.
        return []

    def play_by_play_url(link_tag):
        # Single place where the absolute play-by-play URL is assembled.
        return 'https://www.ncaa.com' + link_tag['href'] + '/play-by-play'

    if not filter_mm:
        return [play_by_play_url(tag) for tag in sb.find_all(class_='gamecenter')]

    # returns only march madness tournament games
    links = []
    for game in sb.find_all(class_='game-contents'):
        # Skip entries that lack the marker node, or whose marker is empty.
        if len(game.contents) <= 3 or game.contents[3].string is None:
            continue
        link_tag = game.find(class_='gamecenter')
        if link_tag is not None:
            links.append(play_by_play_url(link_tag))

    return links

#print(extract_play_links(dates[0],True))

In [None]:
# Handles time calculations

# Converts time to decimal
def time_to_decimal(t):
    """Convert a "minutes:seconds" string into minutes as a float.

    Only the first two colon-separated fields are read, e.g. "10:30" -> 10.5.
    """
    fields = t.split(':')
    whole_minutes = int(fields[0])
    fractional_minutes = int(fields[1]) / 60
    return whole_minutes + fractional_minutes

# Converts decimal to time
def decimal_to_time(d):
    """Convert minutes expressed as a number into a "M:SS" string.

    The value is rounded to the nearest whole second, so float noise such as
    19.65 -> 38.9999... seconds yields "19:39" rather than being truncated to
    "19:38".  Seconds are zero-padded to two digits ("10:03", not "10:3").
    The output can be parsed back by ``time_to_decimal``.
    """
    # Work in whole seconds so that rounding up to 60 rolls over into the
    # minutes field instead of producing "M:60".
    total_seconds = int(round(d * 60))
    minutes, seconds = divmod(total_seconds, 60)
    return "{}:{:02d}".format(minutes, seconds)
    
# Converts countdown to increasing decimal time
def increasing_time(t, start = 20):
    """Return ``start`` minus the decimal value of clock string ``t``.

    With a countdown clock this gives the minutes elapsed since the period
    began; ``start`` is the clock value at the beginning of the period
    (default 20).
    """
    return start - time_to_decimal(t)

In [None]:
# Obtains plot data

# Gets data through selenium
def retrieve_html_selenium(url):
    """Open ``url`` in a new Chrome WebDriver session and return its page source.

    The driver is always shut down, even when navigation or reading the page
    source raises, so a failed fetch does not leave a browser process behind.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        return driver.page_source
    finally:
        driver.quit()

def get_home_score(score):
    """Parse the first '-'-separated field of ``score`` as an int (used as the home score)."""
    home_part = score.split('-')[0]
    return int(home_part)

def get_away_score(score):
    """Parse the second '-'-separated field of ``score`` as an int (used as the away score)."""
    away_part = score.split('-')[1]
    return int(away_part)

# Returns list containing clock strings, home scores, and away scores
def extract_times_and_scores(period):
    """Pull the scoring entries out of one period's markup.

    Every element with class ``score`` inside ``period`` is read in pairs:
    the even-indexed tag supplies the time text and the following
    odd-indexed tag supplies the score text.  Pairs whose score tag has no
    ``.string`` are skipped.

    Returns:
        ``[times, home_scores, away_scores]`` -- three parallel lists.
        ``times`` holds the time text as plain ``str`` (not yet converted to
        decimal minutes); the score lists hold ints.
    """
    times_list = []
    home_scores_list = []
    away_scores_list = []
    score_tags = period.find_all(class_='score')

    for idx in range(1, len(score_tags), 2):
        score_text = score_tags[idx].string
        if score_text is None:
            continue
        times_list.append(str(score_tags[idx - 1].string))
        # The score helpers already return ints, so no further conversion.
        home_scores_list.append(get_home_score(score_text))
        away_scores_list.append(get_away_score(score_text))

    return [times_list, home_scores_list, away_scores_list]

# Returns match_data list of times, home scores, and away scores by period
def extract_play_data(matchpage_soup):
    """Return one ``[times, home_scores, away_scores]`` entry per period.

    Periods are the elements with class ``allperiod`` in ``matchpage_soup``,
    kept in document order.
    """
    return [
        extract_times_and_scores(period_tag)
        for period_tag in matchpage_soup.find_all(class_='allperiod')
    ]
    
#extract_play_data('https://www.ncaa.com/game/basketball-men/d1/2018/03/15/oklahoma-rhode-island/play-by-play')

In [None]:
# Obtains match information

# Match information
# - home_team
# - away_team
# - home_seed
# - away_seed
# - home_score
# - away_score
# - round_number
# - date
# - loc
# - ot_status
def extract_match_info(matchpage_soup):
    """Build the match-information dict from the page's score banner.

    The banner is the element with ``id="score-breakdown"``.  Each helper
    below receives the banner and writes its own keys into the shared dict,
    in the order listed.
    """
    banner = matchpage_soup.find(id='score-breakdown')
    info = {}

    populators = (
        get_teams_and_seeds,
        get_final_score,
        get_round_number,
        get_match_date,
        get_match_loc,
        get_ot_status,
    )
    for populate in populators:
        populate(banner, info)

    return info

def get_teams_and_seeds(banner, info):
    """Store team names and seeds from the banner's ``school`` tags in ``info``.

    The first ``school`` tag is treated as the home team and the second as
    the away team.  Names come from each tag's ``<a>`` element and seeds from
    its ``ranking`` element; all values are stored as ``str``.
    """
    schools = banner.find_all(class_='school')

    # Names are written before seeds so the keys keep their original order.
    for key, idx in (('home_team', 0), ('away_team', 1)):
        info[key] = str(schools[idx].find('a').string)
    for key, idx in (('home_seed', 0), ('away_seed', 1)):
        info[key] = str(schools[idx].find(class_='ranking').string)

def get_final_score(banner, info):
    """Store the banner's first two ``score`` tag strings as the home and away scores."""
    score_tags = banner.find_all(class_='score')
    for key, tag_index in (('home_score', 0), ('away_score', 1)):
        info[key] = str(score_tags[tag_index].string)

def get_round_number(banner, info):
    """Store the text of the banner's ``bracket-round`` tag under ``info['round_number']``."""
    info['round_number'] = str(banner.find(class_='bracket-round').string)

def get_match_date(banner, info):
    """Store the whitespace-stripped text of the ``game-date`` tag under ``info['date']``."""
    raw_date = str(banner.find(class_='game-date').string)
    info['date'] = raw_date.strip()

def get_match_loc(banner, info):
    """Store the text of the banner's ``location`` tag under ``info['loc']``."""
    info['loc'] = str(banner.find(class_='location').string)

def get_ot_status(banner, info):
    """Store the whitespace-stripped text of the ``game-time-info`` tag under ``info['ot_status']``."""
    raw_status = str(banner.find(class_='game-time-info').string)
    info['ot_status'] = raw_status.strip()
    
#extract_match_info(create_soup(retrieve_html_selenium('https://www.ncaa.com/game/basketball-men/d1/2018/03/15/oklahoma-rhode-island/play-by-play')))

In [None]:
# Generates plots

# Combines increasing times and scores from all periods
def combine_match(match_data):
    """Flatten per-period data into one ``[times, home_scores, away_scores]`` list.

    Clock strings are converted to minutes elapsed since the start of the
    game.  Periods 0 and 1 are treated as 20-minute periods; every later
    period is treated as a 5-minute period that starts after minute 40.
    """
    elapsed_minutes = []
    home_scores = []
    away_scores = []

    for period_index, period_data in enumerate(match_data):
        clock_strings = period_data[0]
        if period_index >= 2:
            elapsed_minutes.extend(
                increasing_time(clock, 5) + (2 * 20) + ((period_index - 2) * 5)
                for clock in clock_strings
            )
        else:
            elapsed_minutes.extend(
                increasing_time(clock) + (period_index * 20)
                for clock in clock_strings
            )

        home_scores = home_scores + period_data[1]
        away_scores = away_scores + period_data[2]

    return [elapsed_minutes, home_scores, away_scores]

# Return difference of scores in list
def score_difference(period_data):
    """Return ``period_data[1][i] - period_data[2][i]`` for every index of ``period_data[1]``."""
    entry_count = len(period_data[1])
    return [period_data[1][i] - period_data[2][i] for i in range(entry_count)]

# Extends ends for difference plot (handles any number of overtime periods)
def extend_difference_ends(period_data):
    """Pad the score-difference series so it spans the whole game.

    A ``(0, 0)`` point is added at the start, and the final difference is
    repeated at the end of the game.  The end is minute 40 for regulation;
    when the last recorded time is past 40, the end is rounded up to the next
    5-minute overtime boundary (45, 50, ...), so multi-overtime games are not
    closed off at 45.

    Args:
        period_data: ``[times, home_scores, away_scores]`` with times in
            increasing minutes.

    Returns:
        ``[times, differences]`` as two lists of equal length.  When
        ``period_data[0]`` is empty, a flat zero line over regulation is
        returned.
    """
    times = period_data[0]
    if not times:
        # Nothing recorded: avoid indexing [-1] on empty lists.
        return [[0, 40], [0, 0]]

    diff = score_difference(period_data)

    end_time = 40
    if times[-1] > 40:
        # Round up to the end of the overtime period containing the last entry.
        end_time = 40 + 5 * math.ceil((times[-1] - 40) / 5)

    return [[0] + times + [end_time], [0] + diff + [diff[-1]]]

# Removes multiple point values for certain times
def remove_doubles(x,y):
    """Collapse runs of equal consecutive ``x`` values, keeping the last ``y`` of each run.

    Args:
        x: Sequence of x values; only adjacent duplicates are merged.
        y: Sequence of y values, indexed in parallel with ``x``.

    Returns:
        ``[x_out, y_out]``; ``[[], []]`` when ``x`` is empty.
    """
    if not x:
        # Empty input: nothing to seed the output with.
        return [[], []]

    x_out = [x[0]]
    y_out = [y[0]]
    for i in range(1, len(x)):
        if x[i] == x[i-1]:
            # Same x as the previous point: the later y value wins.
            y_out[-1] = y[i]
        else:
            x_out.append(x[i])
            y_out.append(y[i])

    return [x_out, y_out]

# Generates difference plot
def make_difference_plot(period_data, info):
    """Plot the home-minus-away score difference over game time and save it as a PNG.

    Args:
        period_data: ``[times, home_scores, away_scores]`` for the whole game.
        info: Match details; the keys ``home_team``, ``away_team``,
            ``home_seed``, ``away_seed``, ``home_score``, ``away_score`` and
            ``round_number`` are read and concatenated as strings.

    Side effects:
        Saves "<round_number> - <home_team> vs <away_team>.png" via
        ``plt.savefig`` and then clears the "difference" figure.

    The x-axis covers at least 45 minutes and grows in 5-minute steps when
    the data runs longer, so games with several overtimes are not cut off.
    The y-axis stays fixed at -30..30.
    """
    # Data adjustments
    extended_data = extend_difference_ends(period_data)
    times, diffs = remove_doubles(extended_data[0], extended_data[1])

    # Right edge of the plot: 45 as before, or the next multiple of 5 at or
    # beyond the last time stamp when the game ran longer.
    x_max = max(45, 5 * math.ceil(max(times) / 5))

    # Plot style
    plot_color = 'xkcd:azure'

    def bold_font(size):
        # All text shares one font definition and differs only in size.
        return {'name':'Sathu', 'color':'black', 'weight':'bold', 'size':size}

    title_font = bold_font(24)
    axis_font = bold_font(16)
    team_font = bold_font(18)

    # difference plot figure
    plt.figure("difference", figsize=(15,7.5))
    ax = plt.gca()
    plt.plot(times, diffs, c=plot_color, linewidth=2)

    # text labels
    plt.title('('+info['home_seed']+') '+info['home_team']+' - vs - '+info['away_team']+' ('+info['away_seed']+')',
              fontdict=title_font)
    plt.xlabel('Minutes',fontdict=axis_font)
    plt.ylabel('Point Difference',fontdict=axis_font)
    plt.text(1,21,info['home_team'] + ' ~ ' + info['home_score'],fontdict=team_font)
    plt.text(1,-24,info['away_team'] + ' ~ ' + info['away_score'],fontdict=team_font)

    # guide lines: short tick marks at minute 20, minute 40, and at every
    # further 5-minute boundary that lies inside the plot
    plt.grid(axis='y')
    for boundary in [20] + list(range(40, x_max, 5)):
        plt.axvline(boundary,0.47,0.53,linewidth=2,c='gray')
    plt.axhline(0,linewidth=3,c='black')

    # remove frame and ticks
    for side in ('bottom', 'left', 'top', 'right'):
        ax.spines[side].set_color('white')
    plt.tick_params(length=0)

    # plot limits
    plt.xlim(0,x_max)
    plt.ylim(-30,30)

    # plot ticks (y labels show the size of the lead on both sides of zero)
    plt.yticks([-25,-20,-15,-10,-5,0,5,10,15,20,25],
               [25,20,15,10,5,0,5,10,15,20,25],fontsize=12)
    plt.xticks(list(range(5, x_max + 1, 5)),fontsize=14)

    # save and clear
    plt.savefig(info['round_number']+' - '+info['home_team']+' vs '+info['away_team']+".png")
    plt.gcf().clear()

    return None

# Generates all plots
def generate_plots(game_url):
    """Fetch one play-by-play page and write its score-difference plot.

    The page is loaded through Selenium and parsed once; match details and
    per-period scoring data are both read from that parse.
    """
    page_source = retrieve_html_selenium(game_url)
    matchpage_soup = create_soup(page_source)

    match_info = extract_match_info(matchpage_soup)
    all_periods = combine_match(extract_play_data(matchpage_soup))

    make_difference_plot(all_periods, match_info)

#generate_plots('https://www.ncaa.com/game/basketball-men/d1/2018/03/15/oklahoma-rhode-island/play-by-play')

In [None]:
# Operational Code

# For every configured date, collect the play-by-play links from that day's
# scoreboard (second argument True = filter_mm, i.e. tournament games only)
# and generate the plots for each game.
for date in dates:
    play_links = extract_play_links(date, True)
    
    for game_url in play_links:
        generate_plots(game_url)
        # Wait 15 seconds before the next game
        # (presumably to avoid hammering ncaa.com -- TODO confirm).
        time.sleep(15)
        