In [1]:
from bs4 import *
from bs4.element import PageElement
import time
import requests
import re
import numpy as np
import pandas as pd
from tqdm import tqdm, trange
import json
import pickle
from pathlib import Path
import sys
import os
sys.path.insert(0,'../../')
import scripts.base as base
import inspect
import difflib

BBREF_HOME = 'https://www.basketball-reference.com'
BBREF_SEASON = 'https://www.basketball-reference.com/leagues'
BBREF_TEAMS = 'https://www.basketball-reference.com/teams'
BBREF_PLAYERS = 'https://www.basketball-reference.com/players'
BBREF_BOXSCORES = 'https://www.basketball-reference.com/boxscores'

VAR_LOCAL_HOST = base.LOCAL_HOST

In [18]:


def set_variable(variable_name: str, value):
    """
    Sets a variable in the current module with the given value.
    :param variable_name: The name of the variable to set.
    :param value: The value to set for the variable.
    """
    try:
        globals()[variable_name] = value
        print(f"{variable_name} set to {value} in the current module.")
    except Exception as e:
        print(f"Error setting variable: {e}")

class BBRefHTML:
    def __init__(self,href='') -> None:
        self.href_ = href
        self.html_url_ = f'{BBREF_HOME}{href}'
        self.params = { param_name:locals()[param_name] 
                        for param_name in inspect.signature(self.__init__).parameters 
                        if param_name != 'self'}

    def fetch(self):
        self.html_text_  = base.fetch_html(self.html_url_,source=VAR_LOCAL_HOST)
        self.html_soup_  = base.bs4_soup(self.html_text_)

    def pull(self):
        self.html_text_online_  = base.fetch_html(self.html_url_)
        self.html_soup_online_  = base.bs4_soup(self.html_text_)

    def delta(self, element_selector: str):
        # Find the difference between the pulled and local version
        local_element = self.html_soup_.select_one(element_selector)
        online_element = self.html_soup_online_.select_one(element_selector)

        if local_element is None or online_element is None:
            print("One or both of the elements could not be found.")
            return None

        local_lines = local_element.prettify().splitlines()
        online_lines = online_element.prettify().splitlines()
        differences = list(difflib.unified_diff(local_lines, online_lines))
        return differences

    def save(self,content_only=True):
        save_url = f"{VAR_LOCAL_HOST}/{self.html_url_}" 
        if not self.html_url_.endswith('.html'):
            save_url += ".html"

        if content_only and self.content_div():
            html_text = self.content_div().text
        else:
            html_text = self.html_text_
        base.save(html_text,f'{save_url}')

    def content_div(self):
        return self.html_soup_.find('div',{'id':'content'})
    
    def filter_div(self):
        return self.html_soup_.find('div',{'class':'filter'}) 
    
class SeasonsListHTML(BBRefHTML):
    def __init__(self, href='/leagues') -> None:
        super().__init__(href)

    def parse_hrefs(self):
        seasons_hrefs = [a['href'] for th in self.html_soup_.find_all('th', {'data-stat': 'season'}) 
                                for a in th.find_all('a')]
        return seasons_hrefs
    
class SeasonHTML(BBRefHTML):
    def parse_hrefs(self):
        season_boxscores_hrefs = []
        if self.filter_div(): 
            # Iterate through each month to get the entire list
            monthly_boxscores_hrefs = [a['href'] for a in self.filter_div().select('a')]
            for monthly_boxscores_href in monthly_boxscores_hrefs:
                monthly_boxscores = BBRefHTML(href=monthly_boxscores_href)
                monthly_boxscores_schedule_table = monthly_boxscores.html_soup_.find('table', {'id': 'schedule'})
                season_boxscores_hrefs += [a['href'] for th in monthly_boxscores_schedule_table.find_all('td',{'data-stat':'box_score_text'}) 
                                           for a in th]
        else:
            # Otherwise just return the main page schedule
            boxscores_schedule_table = self.html_soup_.find('table', {'id': 'schedule'})
            season_boxscores_hrefs = [a['href'] for th in boxscores_schedule_table.find_all('td',{'data-stat':'box_score_text'}) 
                                      for a in th]
        return season_boxscores_hrefs

class BoxscoresHTML(BBRefHTML):
    def parse_hrefs(self):
        box_scores_hrefs = []
        if self.filter_div():
            sub_boxscores_hrefs = [a['href'] for a in self.filter_div().select('a')]
            box_scores_hrefs += sub_boxscores_hrefs
        else:
            box_scores_hrefs.append(self.href_)
        return box_scores_hrefs  

class BoxscoresPlayByPlayHTML(BoxscoresHTML):
    
    pass

class BoxscoresPlusMinusHTML(BoxscoresHTML):
    pass

class BoxscoresShotChartHTML(BoxscoresHTML):
    pass

class PlayersListHTML(BBRefHTML):
    pass

class PlayerHTML(BBRefHTML):
    pass

class TeamsListHTML(BBRefHTML):
    pass

class TeamHTML:
    pass

# SeasonHTML.get_seasons_hrefs(from_local=False)

# a = BBRefHTML(href='/leagues',from_local=False)

In [None]:
# Pipeline

# set_variable('VAR_LOCAL_HOST',base.LOCAL_HOST) # Change this if you want to save to a different folder
href = ''
html_obj = BBRefHTML(href)
update = True
# pull new files
# update files

# Try fetch from local host
local_file_exists  = html_obj.fetch()
if not local_file_exists:
    # If fails, pull the online version 
    online_file_exists = html_obj.pull()
    if online_file_exists:
        # If successful, save the to local host
        html_obj.save()
else:
    # If file exists,  
    if update:
        online_file_exists = html_obj.pull()
        delta = html_obj.delta()
        if delta:
            html_obj.save() # save online version

    html_obj.parse()



In [2]:
html_text = base.fetch_html(f'{BBREF_HOME}/boxscores/200910270CLE.html',source=VAR_LOCAL_HOST)
html_soup = base.bs4_soup(None)
html_soup

Could not make soup, due to object of type 'NoneType' has no len()


In [None]:
#  global.nonempty_tables_num: 3, table_count: 3 

   
#  no Local/Partials/NoteBottom.tt2 

# <div class="section_wrapper" id="bottom_nav">
# <div class="section_heading"><span class="section_anchor" data-label="Team and League Schedules" data-no-inpage="1" id="inner_nav_bottom_link"></span><h2>Team and League Schedules</h2></div>
# <div class="section_content" id="bottom_nav_container">
# <p><a href="/boxscores/"><u>Dec 25, 2011 NBA Scores &amp; Boxes</u></a></p>
# <ul class="">
# <li><a href="/teams/MIA/2012_games.html"><u>Miami Heat Schedule</u></a></li>
# <li><a href="/teams/DAL/2012_games.html"><u>Dallas Mavericks Schedule</u></a></li>
# <li><a href="/leagues/NBA_2012_games.html"><u>2011-12 NBA Schedule &amp; Results</u></a></li>
# </ul></div></div>
# </div>

In [3]:
class BBRefHTML:
    def __init__(self,href='') -> None:
        # Init the object
        self.href_ = href
        self.html_url_ = f'{BBREF_HOME}{href}'
        self.html_text_ = None
        self.html_soup_ = None

    def fetch(self,content_only=True):
        self.html_text_  = base.fetch_html(self.html_url_,source=VAR_LOCAL_HOST)
        self.html_soup_  = base.bs4_soup(self.html_text_)
        if content_only and self.html_soup_:
            self.html_soup_ = self.content_div()
        return self.html_soup_
    
    def pull(self,content_only=True):
        self.html_text_local_ = self.html_text_ #local version
        self.html_soup_local_ = self.html_soup_
        self.html_text_  = base.fetch_html(self.html_url_,source=None) #online version
        self.html_soup_  = base.bs4_soup(self.html_text_)
        if content_only and self.html_soup_:
            self.html_soup_ = self.content_div()
        return self.html_soup_
    
    def save(self,content_only=True):
        save_url = f"{VAR_LOCAL_HOST}/{self.html_url_}" 
        if not self.html_url_.endswith('.html'):
            save_url += ".html"
        if content_only and self.html_soup_ and self.content_div():
            html_text = str(self.content_div())
        else:
            html_text = self.html_text_

        try:
            base.save(html_text,f'{save_url}')
            return True
        except Exception as e:
            print(f'Could not make soup, due to {e}')
            return False
    
    def delta(self, element_selector = None):
        # Find the difference between the online and local version
        local_element = self.html_soup_local_
        online_element = self.html_soup_
        if element_selector:
            local_element = local_element.select_one(element_selector)
            online_element = online_element.select_one(element_selector)
        if local_element is None or online_element is None:
            print("One or both of the elements could not be found.")
            return None
        local_lines = local_element.prettify().splitlines()
        online_lines = online_element.prettify().splitlines()
        differences = list(difflib.unified_diff(local_lines, online_lines))
        return differences
    
    def content_div(self):
        if self.html_soup_.get('id') == 'content':
            return self.html_soup_
        else:
            return self.html_soup_.find('div',{'id':'content'})
    
    def filter_div(self):
        return self.html_soup_.find('div',{'class':'filter'}) 
    
    def sync(self,update=True):
        # The goal is to sync an html file between the local version and the online version. 
        # It first try to pulls and saves the file from the web if the file doesnt exist yet in the local database. 
        # Otherwise it checks for the differences and notify of the differences if exists, or lack thereof. 
        # It then prompts to the user if they want to go ahead and pull the updated version
        # STATUS_CODES
        STATUS_ERROR_SAVING_FILE   = -10
        STATUS_ONLINE_NOT_EXISTED  = -1
        STATUS_NO_NEW_UPDATES      = 0
        STATUS_PULLED_NEW_FROM_ONLINE  = 1
        STATUS_NEW_UPDATES_RESOLVED_N  = 2
        STATUS_NEW_UPDATES_RESOLVED_Y  = 3

        local_file_exists = self.fetch()
        # If local files doesn't exist
        if not local_file_exists:
            online_file_exists = self.pull()
            if not online_file_exists:
                return STATUS_ONLINE_NOT_EXISTED
            else:
                saved = self.save()
                if saved:
                    return STATUS_PULLED_NEW_FROM_ONLINE
                else:
                    return STATUS_ERROR_SAVING_FILE
        else:
            online_file_exists = self.pull()
            if not online_file_exists:
                return STATUS_ONLINE_NOT_EXISTED
            else:
                delta = self.delta()
                if not delta:
                    return STATUS_NO_NEW_UPDATES
                else:
                    for diff_line in delta:
                        print(diff_line)
                    if not update:
                        return STATUS_NEW_UPDATES_RESOLVED_N
                    else:
                        saved = self.save()
                        if saved:
                            return STATUS_NEW_UPDATES_RESOLVED_Y
                        else:
                            return STATUS_ERROR_SAVING_FILE
                    
                
href = '/boxscores/201112250DAL.html'
href = '/boxscores/201105310MIA.html'
html_obj = BBRefHTML(href)

a = html_obj.sync(update=False)
a        

0

In [56]:
html_obj.pull()

<div class="box" id="content" role="main">
<h1>Miami Heat at Dallas Mavericks Box Score, December 25, 2011</h1>
<div class="section_wrapper setup_commented commented" id="all_other_scores">
<div class="section_heading assoc_other_scores" id="other_scores_sh">
<span class="section_anchor" data-label="All Games This Date" id="other_scores_link"></span><h2></h2>
</div><div class="placeholder"></div>
<div class="section_content" id="div_other_scores">
<div class="game_summaries compressed">
<h2>NBA Scores — Dec 25, 2011</h2>
<div class="game_summary nohover current">
<table class="teams poptip" data-tip="Miami Heat at Dallas Mavericks">
<tbody>
<tr class="winner">
<td><a href="/teams/MIA/2012.html">MIA</a></td>
<td class="right">105</td>
<td class="right gamelink">
<a href="/boxscores/201112250DAL.html">F<span class="no_mobile">inal</span></a>
</td>
</tr>
<tr class="loser">
<td><a href="/teams/DAL/2012.html">DAL</a></td>
<td class="right">94</td>
<td class="right"> 
			</td>
</tr>
</tbody>

In [36]:
# html_obj.html_soup_local_.prettify().splitlines()
# html_obj.html_soup_.prettify().splitlines()
html_obj.delta()

[]