In [1]:
#!pip install beautifulsoup4

In [2]:
import io
import sys
import time
import urllib.request
from datetime import datetime

import lxml
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Load the player scores table from CSV (path is relative to the notebook's
# working directory). The preview in the next cell shows the columns
# player_name, season, week and score.
df_player_scores = pd.read_csv('../data/player_scores.csv')

In [4]:
# Show the loaded frame as the cell output to sanity-check the columns and values.
df_player_scores

Unnamed: 0,player_name,season,week,score
0,Marco Wölfli,100,0,3
1,Marco Wölfli,100,1,1
2,Marco Wölfli,100,2,2
3,Marco Wölfli,100,3,2
4,Marco Wölfli,100,4,4
...,...,...,...,...
7454,Guillaume Hoarau,2100,9,0
7455,Guillaume Hoarau,2100,10,0
7456,Guillaume Hoarau,2100,11,0
7457,Guillaume Hoarau,2100,12,4


In [5]:
def html_call(url, delay=0.5, timeout=5):
    """Download ``url`` and return the raw response body.

    The request carries a Firefox-on-macOS ``User-Agent`` header. The function
    pauses before sending the request so that calling it repeatedly in a loop
    does not fire requests back to back.

    Parameters
    ----------
    url : str
        Address to fetch.
    delay : float, optional
        Seconds to sleep before the request is sent (default 0.5). A falsy
        value (``0`` or ``None``) skips the pause.
    timeout : float, optional
        Timeout in seconds handed to ``urllib.request.urlopen`` (default 5).

    Returns
    -------
    bytes
        The undecoded response body.

    Raises
    ------
    Exception
        Nothing is caught here: any error raised by ``urlopen`` or ``read``
        (for example ``urllib.error.URLError``) propagates to the caller.
    """
    # here we define the headers for the request
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:63.0) Gecko/20100101 Firefox/63.0'}

    # this request object will integrate your URL and the headers defined above
    req = urllib.request.Request(url=url, headers=headers)

    # throttle: wait before every request
    if delay:
        time.sleep(delay)

    # calling urlopen this way will automatically handle closing the request
    with urllib.request.urlopen(req, timeout=timeout) as response:
        page_html = response.read()
    return page_html

In [6]:
def parse_play_date(play_date, season):
    """Turn a schedule date string plus a season index into a ``datetime``.

    Parameters
    ----------
    play_date : str
        Date text whose first four characters are discarded (presumably a
        weekday prefix -- confirm against the scraped table). The remainder
        must look like ``'21.07. 16:00'`` (day.month. hour:minute).
    season : int
        0-based season index used to look up the calendar year.

    Returns
    -------
    datetime.datetime
        The parsed kick-off time in the year mapped to ``season``.

    Raises
    ------
    KeyError
        If ``season`` has no entry in the year table (e.g. 2, 7, 12, 17).
    ValueError
        If the remaining text does not match ``'%d.%m. %H:%M'``.
    """
    # Calendar year per 0-based season index. Indices 2, 7, 12 and 17 are
    # deliberately absent: they are the EM/WM seasons, numbered 3, 13 and
    # 8, 18 when counted from 1.
    year_by_season = {
        0: 2011,
        1: 2012, 3: 2012,
        4: 2013, 5: 2013,
        6: 2014, 8: 2014,
        9: 2015, 10: 2015,
        11: 2016, 13: 2016,
        14: 2017, 15: 2017,
        16: 2018, 18: 2018,
        19: 2019, 20: 2019,
    }

    day_and_time = play_date[4:]
    year = year_by_season[season]
    # The year is appended to the text so strptime receives a complete date.
    return datetime.strptime(f'{day_and_time} {year}', '%d.%m. %H:%M %Y')

In [7]:
def dream_date_parse():
    """Scrape the kick-off time of YB's match for every season and week.

    For each 0-based season index in ``range(21)`` (except the skipped ones)
    the season's schedule page is downloaded with ``html_call``. Every
    ``<table class="article">`` on the page is treated as one week, numbered
    from 0 in page order. Within a week the row whose column 1 or column 2
    equals ``'YB'`` is located, and its column 0 is converted with
    ``parse_play_date``.

    Prints one ``season: N`` progress line per processed season.

    Returns
    -------
    pandas.DataFrame
        One row per (season, week) with columns ``season`` (int, the season
        id used in the URL: 100, 200, ...), ``week`` (0-based table position)
        and ``date_time`` (the parsed kick-off ``datetime``).

    Raises
    ------
    RuntimeError
        If downloading or parsing any season fails. The original exception is
        chained as ``__cause__``. Failing loudly here avoids handing a
        non-DataFrame placeholder to the merge that consumes the result.
    """
    # skip EM (3, 13) and WM (8, 18); those are 1-based season numbers, the
    # loop index below is 0-based
    skipped_seasons = (2, 7, 12, 17)

    records = []
    for season in range(21):
        if season in skipped_seasons:
            continue
        print(f'season: {season+1}')
        # Season ids on the site are the 1-based season number times 100.
        season_id = (season + 1) * 100
        url = f'https://www.mydreamteam.ch/spielplan_super_league.php?season={season_id}'
        try:
            html = html_call(url)
            soup = BeautifulSoup(html, 'html.parser')
            weeks = soup.find_all('table', {'class': 'article'})
            for week_nr, week in enumerate(weeks):
                # Wrap the markup in a file-like object: passing literal HTML
                # to read_html is deprecated in recent pandas releases.
                df_week = pd.read_html(io.StringIO(str(week)))[0]
                is_yb_match = (df_week[1] == 'YB') | (df_week[2] == 'YB')
                play_date = df_week.loc[is_yb_match, 0].values[0]
                records.append({
                    'season': season_id,
                    'week': week_nr,
                    'date_time': parse_play_date(play_date, season),
                })
        except Exception as exc:
            # Exception (not a bare except) lets KeyboardInterrupt through.
            raise RuntimeError(
                f'could not collect match dates for season {season_id} ({url})'
            ) from exc

    df_datestring_to_dto = pd.DataFrame(records, columns=['season', 'week', 'date_time'])
    # Keep the merge key an integer even when no rows were collected.
    df_datestring_to_dto['season'] = df_datestring_to_dto['season'].astype(int)
    return df_datestring_to_dto

In [8]:
# Run the scraper over all seasons. It downloads one schedule page per season
# (with a pause before each request) and prints a progress line for each.
df_datestring_to_dto = dream_date_parse()

season: 1
season: 2
season: 4
season: 5
season: 6
season: 7
season: 9
season: 10
season: 11
season: 12
season: 14
season: 15
season: 16
season: 17
season: 19
season: 20
season: 21


In [9]:
# Attach the kick-off timestamp to every score row by (season, week). The left
# join keeps all score rows, including any without a matching schedule entry.
df_player_scores_parsed = df_player_scores.merge(
    df_datestring_to_dto,
    how='left',
    on=['season', 'week'],
)


In [11]:
# Show the merged frame to check that the date_time column was filled in.
df_player_scores_parsed

Unnamed: 0,player_name,season,week,score,date_time
0,Marco Wölfli,100,0,3,2011-07-21 16:00:00
1,Marco Wölfli,100,1,1,2011-07-28 16:00:00
2,Marco Wölfli,100,2,2,2011-08-04 16:00:00
3,Marco Wölfli,100,3,2,2011-08-10 19:00:00
4,Marco Wölfli,100,4,4,2011-09-01 16:00:00
...,...,...,...,...,...
7454,Guillaume Hoarau,2100,9,0,2019-10-06 16:00:00
7455,Guillaume Hoarau,2100,10,0,2019-10-19 19:00:00
7456,Guillaume Hoarau,2100,11,0,2019-10-27 16:00:00
7457,Guillaume Hoarau,2100,12,4,2019-11-03 16:00:00


In [12]:
# Persist the merged frame next to the input file; index=False leaves the
# DataFrame's row index out of the written CSV.
df_player_scores_parsed.to_csv( '../data/player_scores_parsed.csv', index=False)