In [None]:
!pip3 install icalendar
!pip3 install beautifulsoup4

In [None]:
import os
import re
from urllib.parse import urljoin

import pandas as pd
import requests
import slugify
from bs4 import BeautifulSoup

clubs_url = "http://www.cpliege.be/caleclub.asp"

# Fetch the club index page. The timeout keeps the cell from hanging forever on
# an unresponsive server, and raise_for_status() prevents parsing an HTTP error
# page as if it were the club list.
response = requests.get(clubs_url, timeout=30)
response.raise_for_status()
clubs_url_html = response.text

soup = BeautifulSoup(clubs_url_html, 'html.parser')

# Read the HTML and collect every link. Anchors without an href attribute are
# excluded because club['href'] would raise KeyError on them.
clubs = soup.find_all('a', href=True)

# Transform the links into a dict mapping club name -> absolute URL.
clubs_dict = {}
for club in clubs:
    # Collapse every run of whitespace (space, tab, newline, return, formfeed)
    # into a single space and trim both ends.
    club_name = " ".join(club.text.split())
    if not club_name:
        # A link without text has no usable name (it would become an empty
        # dict key and, later, an empty file name), so skip it.
        continue
    # urljoin resolves relative hrefs against the index page and leaves
    # already-absolute hrefs untouched, unlike plain string concatenation.
    clubs_dict[club_name] = urljoin(clubs_url, club['href'])

print(clubs_dict)


In [None]:

def _normalize_hour(value):
    """Return a kick-off time as "H:MM" text; missing values are returned unchanged."""
    if pd.isna(value):
        return value
    # The source writes times as "20.30"; when parsed as a number this becomes
    # 20.3, hence the "." -> ":" swap and the zero padding of the minutes below.
    text = str(value).strip().replace(".", ":")
    hours, separator, minutes = text.partition(":")
    if not separator:
        # No minutes part at all (e.g. an integer hour such as 20).
        return hours + ":00"
    return hours + ":" + minutes.ljust(2, "0")


def get_club_agenda(club_url):
    """Download and clean the match agenda published at ``club_url``.

    The first HTML table of the page is read with its sixth row (``header=5``)
    as header, and is expected to have nine columns, which are renamed to
    Code, Unknown, Weekday, Date, Hour, Équipe 1, Équipe 2, Catégorie, Autre.

    Cleaning steps:
      * rows whose "Unnamed: 7" cell (renamed to "Catégorie") is empty or is a
        parenthesised value such as "(...)" are dropped;
      * rows without a date are dropped;
      * the "Unknown" and "Weekday" columns are dropped;
      * "Hour" is normalised to "H:MM" text (a missing hour stays missing);
      * "Date" is parsed from "dd/mm/yy" into datetimes;
      * rows are sorted by "Catégorie", then "Date".

    Parameters
    ----------
    club_url : str
        Address of the club agenda page (anything ``pandas.read_html`` accepts).

    Returns
    -------
    pandas.DataFrame
        Columns: Code, Date, Hour, Équipe 1, Équipe 2, Catégorie, Autre.

    Raises
    ------
    KeyError
        If the table has no "Unnamed: 7" column.
    ValueError
        If the table does not have exactly nine columns, or a date does not
        match the "dd/mm/yy" format.
    """
    raw_agenda = pd.read_html(club_url, header=5)[0]

    # remove if column "Unnamed: 7" is empty OR starts with "(" and ends with ")"
    category = raw_agenda["Unnamed: 7"]
    is_parenthesised = (
        category.str.startswith("(", na=False) & category.str.endswith(")", na=False)
    )
    # .copy() so the assignments below act on an independent frame instead of a
    # slice of raw_agenda (avoids SettingWithCopyWarning / lost writes).
    agenda = raw_agenda[~(category.isnull() | is_parenthesised)].copy()

    print(len(agenda))

    # rename columns
    agenda.columns = ["Code", "Unknown", "Weekday", "Date", "Hour", "Équipe 1", "Équipe 2", "Catégorie", "Autre"]

    # drop the columns that are not exported
    agenda = agenda.drop(columns=["Unknown", "Weekday"])

    # keep only rows that have a date
    agenda = agenda[agenda["Date"].notnull()].copy()

    # Hour as "H:MM" text
    agenda["Hour"] = agenda["Hour"].map(_normalize_hour)

    # Date as datetime
    agenda["Date"] = pd.to_datetime(agenda["Date"], format="%d/%m/%y")

    # order by category and then by date
    return agenda.sort_values(by=["Catégorie", "Date"])

# Export the cleaned agenda of every club to ./data/<slugified club name>.csv.
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("./data", exist_ok=True)

for club_name, club_url in clubs_dict.items():
    print(club_name, club_url)
    agenda = get_club_agenda(club_url)
    agenda.to_csv(os.path.join("./data", slugify.slugify(club_name) + ".csv"), index=False)

In [None]:
from icalendar import Calendar, Event
import uuid
from datetime import datetime

def generate_ics(agenda, url):
    """Write one iCalendar (.ics) file per match of ``agenda`` into ./data/.

    Each file holds a single event named "<Catégorie>: <Équipe 1> vs <Équipe 2>"
    that starts at the row's Date + Hour (interpreted as Europe/Brussels local
    time) and ends 90 minutes later. The file is named
    "<Catégorie> - <Équipe 1> vs <Équipe 2>.ics"; path separators in those
    values are replaced by "-" so they cannot point outside ./data/.

    Parameters
    ----------
    agenda : pandas.DataFrame
        One row per match with the columns "Catégorie", "Équipe 1",
        "Équipe 2", "Date" (anything ``pandas.to_datetime`` parses) and
        "Hour" ("H:MM" text).
    url : str
        Value stored in the URL property of every event.

    Rows without a date or an hour are skipped because no start time can be
    built for them. Returns None.
    """
    os.makedirs("./data", exist_ok=True)

    for _, match in agenda.iterrows():
        if pd.isna(match["Date"]) or pd.isna(match["Hour"]):
            continue

        category = str(match["Catégorie"])
        home_team = str(match["Équipe 1"])
        away_team = str(match["Équipe 2"])
        name = category + ': ' + home_team + " vs " + away_team

        # Combine date and kick-off time, then add the assumed match duration.
        naive_start = pd.to_datetime(
            pd.to_datetime(match["Date"]).strftime("%Y-%m-%d") + " " + str(match["Hour"])
        )
        naive_end = naive_start + pd.Timedelta(minutes=90)

        # set brussels timezone
        startime = naive_start.tz_localize('Europe/Brussels')
        endtime = naive_end.tz_localize('Europe/Brussels')

        # No venue information is available in the agenda.
        location = ""

        # "/" and "\" in a team or category name would be read as directories.
        file_title = (category + " - " + home_team + " vs " + away_team)
        file_title = file_title.replace("/", "-").replace("\\", "-")
        filename = "./data/" + file_title + ".ics"

        cal = Calendar()
        cal.add('prodid', 'Made with ❤️ by Martin Erpicum')
        cal.add('version', '2.0')
        cal.add('method', "PUBLISH")
        cal.add('X-WR-TIMEZONE', "Europe/Brussels")

        e = Event()
        e.add('summary', name)
        e.add('dtstart', startime)
        e.add('dtend', endtime)
        # RFC 5545 requires DTSTAMP to be expressed in UTC.
        e.add('dtstamp', pd.Timestamp.now(tz="UTC").to_pydatetime())
        e.add('location', location)
        e.add('priority', 5)
        e.add('sequence', 1)
        e.add('description', name)
        e.add('url', url)
        e.add('uid', str(uuid.uuid4()))
        cal.add_component(e)

        # save to file; the context manager closes it even if writing fails
        with open(filename, 'wb') as ics_file:
            ics_file.write(cal.to_ical())
        


