In [1]:
import pandas as pd
import datetime
import os
import sys
from datetime import date,timedelta
from bs4 import BeautifulSoup

# Root directory of the downloaded player pages. The default is the original
# hard-coded location; set SCOREBOARD_PLAYERS_PATH to run the notebook against
# a copy stored elsewhere without editing this cell.
# normpath strips any trailing separator: the walk cell derives player ids by
# slicing `len(players_path) + 1` characters off each directory name, which is
# only correct when the root does not end with a separator.
players_path = os.path.normpath(
    os.environ.get(
        "SCOREBOARD_PLAYERS_PATH",
        r"C:\Corpora\scoreboard\Players\www.scoreboard.com\player",
    )
)

In [2]:
def get_transfers(soup):
    """Extract the rows of a player's transfer table.

    Looks for the table with class ``base-table transfer-table`` and turns
    every 4-cell body row into ``[player_id, date, from, to, reason]``.

    NOTE: ``player_id`` is not a parameter; it is read from the notebook's
    global namespace at call time, so the caller must set it beforehand.

    Args:
        soup: parsed page (anything offering BeautifulSoup's ``find`` API).

    Returns:
        A list of rows; empty when the page has no transfer table.

    Raises:
        ValueError: if a date cell does not match ``"%b %d, %y"``.
        TypeError: if a from/to cell contains no ``<a>`` element.
    """
    table = soup.find('table', {'class': 'base-table transfer-table'})
    if not table:
        return []

    rows = []
    for row in table.find('tbody').findAll('tr'):
        cells = row.findAll('td')
        if len(cells) != 4:
            # Not a data row (wrong cell count): ignore it.
            continue
        when_cell, from_cell, to_cell, reason_cell = cells

        moved_on = datetime.datetime.strptime(when_cell.text, "%b %d, %y")
        # The href is trimmed by 6 leading characters and 1 trailing character,
        # and the remaining slashes become underscores -- presumably dropping a
        # fixed URL prefix and the trailing "/" (verify against the pages).
        origin = from_cell.find('a')["href"][6:-1].replace("/", "_")
        destination = to_cell.find('a')["href"][6:-1].replace("/", "_")

        rows.append([player_id, moved_on, origin, destination, reason_cell.text])
    return rows

In [3]:
def get_injuries(soup):
    """Extract the rows of a player's injury-history table.

    Looks for the table with class ``base-table injury-history-table`` and
    turns every 3-cell body row into ``[player_id, date_from, date_to, reason]``.

    NOTE: ``player_id`` is not a parameter; it is read from the notebook's
    global namespace at call time, so the caller must set it beforehand.

    Args:
        soup: parsed page (anything offering BeautifulSoup's ``find`` API).

    Returns:
        A list of rows; empty when the page has no injury table. ``date_to``
        is ``None`` when the end-date cell cannot be parsed as a date.

    Raises:
        ValueError: if a start-date cell does not match ``"%b %d, %y"``.
    """
    records = []
    injury_table = soup.find('table', {'class': 'base-table injury-history-table'})
    if not injury_table:
        return records

    for tr in injury_table.find('tbody').findAll('tr'):
        tds = tr.findAll('td')
        if len(tds) != 3:
            # Not a data row (wrong cell count): ignore it.
            continue
        date_from = datetime.datetime.strptime(tds[0].text, "%b %d, %y")

        # Only a date-format mismatch is tolerated here (presumably an injury
        # with no end date yet -- verify against the pages). Anything else,
        # including a kernel interrupt, must propagate instead of being
        # silently turned into a missing end date.
        try:
            date_to = datetime.datetime.strptime(tds[1].text, "%b %d, %y")
        except ValueError:
            date_to = None

        records.append([player_id, date_from, date_to, tds[2].text])
    return records

In [4]:
def get_dobs(player_soup):
    """Return the player's date of birth, or ``None`` if the page has none.

    The birth date is read from the ``<script>`` inside the
    ``player-birthdate`` div, which sits inside the ``fsbody`` div. The
    number is taken from ``text[36:-3]`` -- this assumes the script wraps the
    value in a fixed 36-character prefix and 3-character suffix (verify
    against the pages) -- and is interpreted as seconds since the Unix epoch.

    The conversion is plain epoch arithmetic for every timestamp. Previously
    only negative values were handled this way, while non-negative values
    went through ``date.fromtimestamp``, which converts using the machine's
    local time zone; the same page could therefore produce different dates
    on different machines, and pre- and post-1970 births were treated
    inconsistently.

    Args:
        player_soup: parsed page (anything offering BeautifulSoup's ``find`` API).

    Returns:
        ``datetime.date`` or ``None`` when there is no birthdate div.

    Raises:
        AttributeError: if the page has no ``fsbody`` div or the birthdate
            div has no script.
        ValueError: if the sliced script text is not an integer.
    """
    player_page = player_soup.find('div', {'id': 'fsbody'})
    player_birthdate = player_page.find('div', {'class': 'player-birthdate'})
    if not player_birthdate:
        return None

    timestamp = int(player_birthdate.find('script').text[36:-3])
    # date + timedelta keeps only whole days, so this yields the UTC calendar
    # date for positive and negative timestamps alike.
    return date(1970, 1, 1) + timedelta(seconds=timestamp)

In [5]:
# Walk every downloaded player page once and collect three record lists
# (dates of birth, transfers, injuries) plus a list of pages that failed.
transfers = []
dobs = []
injuries = []
errs = []
for dirname, dirnames, filenames in os.walk(players_path):
    for filename in filenames:
        if not filename.endswith(".html"):
            continue

        # The id is the directory path relative to players_path with the
        # backslash separators replaced by underscores. get_transfers and
        # get_injuries read this module-level name, so it must be assigned
        # before they are called.
        player_id = dirname[len(players_path) + 1:].replace("\\", "_")

        with open(os.path.join(dirname, filename), "r", encoding='utf8') as r:
            transfer_soup = BeautifulSoup(r, "lxml")

        # A page without the type div makes find() return None, hence the
        # AttributeError; such pages get an empty type name.
        try:
            player_type_name = transfer_soup.find('div',  {'class':'player-type-name'}).text.strip()
        except AttributeError:
            player_type_name = ""

        # `except Exception` rather than a bare except: a kernel interrupt
        # (KeyboardInterrupt) must stop this long loop instead of being logged
        # as one more failed page.
        try:
            dobs.append([player_id, get_dobs(transfer_soup)])
            transfers.extend(get_transfers(transfer_soup))
            injuries.extend(get_injuries(transfer_soup))
        except Exception:
            ve = sys.exc_info()[0]
            # Failures on pages whose type is "Coach" are not recorded.
            if player_type_name != "Coach":
                errs.append([os.path.join(dirname, filename), str(ve)])


ages_df = pd.DataFrame(dobs, columns=['player_id', 'dob'])
transfers_df = pd.DataFrame(transfers, columns=['player_id', 'date', 'from', 'to', 'reason'])
injuries_df = pd.DataFrame(injuries, columns=['player_id','from', 'to', 'reason'])

In [10]:
# Report the row count of each frame, then write it to a CSV file in the
# current working directory (the DataFrame index is written as well, since
# to_csv is called with its defaults).
print(transfers_df.shape[0])
transfers_df.to_csv(path_or_buf="transfers.csv")

print(ages_df.shape[0])
ages_df.to_csv(path_or_buf="dobs.csv")

print(injuries_df.shape[0])
injuries_df.to_csv(path_or_buf="injuries.csv")

190862
55762
17064
