In [None]:
# !pip install requests
# !pip install beautifulsoup4
# !pip install pandas
# !pip install openpyxl # May be included in pandas installation

In [None]:
# Clear variables
%reset

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Catalogue of the EGU24 programme pages this notebook scrapes.
# Layout: {category: {programme name: {"url": ..., "num_special": ...}}}
#   - "url" is the page that get_prog_dict() / get_data() download.
#   - "num_special" (optional) is the number of leading session blocks on that
#     page that get_data() skips; get_data() uses 0 when the key is absent.
top_level_dict = {
    "Union-wide": {
        "Union Symposia (US)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5213",
        },
        "Great Debates (GDB)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5214",
        },
        "Medal & Award Lectures and Celebrations (MAL)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5215",
        },
        "Short Courses (SC)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5216",
        },
        "Education and Outreach Sessions (EOS)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5217",
        },
        "Networking (NET)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5218",
        },
        "Feedback and admin meetings (FAM)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5219",
        },
        "Press conferences (PC)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5220",
        },
    },
    "Cross-cutting themes":{
        "Art-Science": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessions-of-special-interest/Art",
        },
        "GeoCinema": {
            "url": "https://webforms.copernicus.org/EGU24/pop-up-geocinema-events/t",
        },
        "Policy": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessions-of-special-interest/Policy",
        },
        "EDI": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessions-of-special-interest/EDI",
        },
    },
    "Community-led": {
        "Townhall Meetings (TM)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5221",
        },
        "Splinter Meetings (SPM)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5222",
        },
        "Lectures organized by related scientific societies (LRS)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5223",
        },
        "Pop-up networking events": {
            "url": "https://webforms.copernicus.org/EGU24/pop-up-networking-events/all",
        },
        "Pop-up exhibitor events": {
            "url": "https://webforms.copernicus.org/EGU24/pop-up-exhibitor-events",
        },
    },
    "Inter- and Transdisciplinary Sessions": {
        "Digital Geosciences": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5458",
        },
        "Impacts of Climate and Weather in an Inter-and Transdisciplinary context": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5459",
        },
        "Environment and Society in Geosciences": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5460",
        },
        "Risk, Resilience and Adaptation": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5461",
        },
        "General ITS sessions": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5462",
        },
    },
    # Only the disciplinary pages carry "num_special" (leading blocks to skip).
    "Disciplinary sessions": {
        "Atmospheric Sciences (AS)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5225",
            "num_special": 2
        },
        "Biogeosciences (BG)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5226",
            "num_special": 2
        },
        "Climate: Past, Present & Future (CL)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5227",
            "num_special": 4
        },
        "Cryospheric Sciences (CR)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5228",
            "num_special": 2,
        },
        "Earth Magnetism & Rock Physics (EMRP)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5229",
            "num_special": 1,
        },
        "Energy, Resources and the Environment (ERE)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5230",
            "num_special": 2,
        },
        "Earth & Space Science Informatics (ESSI)":{
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5231",
            "num_special": 3,
        },
        "Geodesy (G)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5232",
            "num_special": 2,
        },
        "Geodynamics (GD)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5233",
            "num_special": 2,
        },
        "Geosciences Instrumentation & Data Systems (GI)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5234",
            "num_special": 2,
        },
        "Geomorphology (GM)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5235",
            "num_special": 3,
        },
        "Geochemistry, Mineralogy, Petrology & Volcanology (GMPV)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5236",
            "num_special": 2,
        },
        "Hydrological Sciences (HS)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5237",
            "num_special": 4,
        },
        "Natural Hazards (NH)":{
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5238",
            "num_special": 4,
        },
        "Nonlinear Processes in Geosciences (NP)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5239",
            "num_special": 2,
        },
        "Ocean Sciences (OS)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5240",
            "num_special": 3,
        },
        "Planetary & Solar System Sciences (PS)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5241",
            "num_special": 2,
        },
        "Seismology (SM)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5242",
            "num_special": 2,
        },
        "Stratigraphy, Sedimentology & Palaeontology (SSP)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5243",
            "num_special": 2,
        },
        "Soil System Sciences (SSS)":{
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5244",
            "num_special": 3,
        },
        "Solar-Terrestrial Sciences (ST)":{
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5245",
            "num_special": 3,
        },
        "Tectonics & Structural Geology (TS)": {
            "url": "https://meetingorganizer.copernicus.org/EGU24/sessionprogramme/5246",
            "num_special": 2,
        },
    }
}

In [3]:
# Only works for disciplinary

def get_prog_dict(top_level_dict, dict_item):
    """Build a programme-code -> programme-title lookup for one category.

    Every page listed under ``top_level_dict[dict_item]`` is downloaded and
    each element with the CSS class ``s2simulatorSecondCol`` is read as
    ``"<code> – <title>"`` (en dash separator).

    Parameters
    ----------
    top_level_dict : dict
        Mapping ``{category: {programme name: {"url": ...}}}``.
    dict_item : str
        Category key whose pages are scraped.

    Returns
    -------
    dict
        ``{code: title}`` for every entry found on every page. If the same
        code appears more than once, the last occurrence wins.

    Raises
    ------
    KeyError
        If ``dict_item`` is not a key of ``top_level_dict`` or an entry has
        no ``"url"``.
    requests.exceptions.RequestException
        If a page cannot be fetched (including a timeout).
    """
    programme_dict = {}

    for programme, item in top_level_dict[dict_item].items():

        print(programme)

        # A timeout keeps one unresponsive page from hanging the whole run.
        page = requests.get(item['url'], timeout=30)
        soup = BeautifulSoup(page.content, 'html.parser')

        for pg in soup.find_all(class_='s2simulatorSecondCol'):
            # Split on the first en dash only, so a title that itself contains
            # an en dash is kept whole.
            pg_code, separator, pg_title = pg.get_text(strip=True).partition('–')
            if not separator:
                # Not a "<code> – <title>" entry; there is nothing to record.
                continue

            # Record inside the loop so every entry on the page is kept,
            # not just the last one.
            programme_dict[pg_code.strip()] = pg_title.strip()

    return programme_dict

In [4]:
# Build the programme code -> title lookup from the "Disciplinary sessions" pages;
# the result is passed to every get_data() call further down.
programme_dict = get_prog_dict(top_level_dict, "Disciplinary sessions")

Atmospheric Sciences (AS)
Biogeosciences (BG)
Climate: Past, Present & Future (CL)
Cryospheric Sciences (CR)
Earth Magnetism & Rock Physics (EMRP)
Energy, Resources and the Environment (ERE)
Earth & Space Science Informatics (ESSI)
Geodesy (G)
Geodynamics (GD)
Geosciences Instrumentation & Data Systems (GI)
Geomorphology (GM)
Geochemistry, Mineralogy, Petrology & Volcanology (GMPV)
Hydrological Sciences (HS)
Natural Hazards (NH)
Nonlinear Processes in Geosciences (NP)
Ocean Sciences (OS)
Planetary & Solar System Sciences (PS)
Seismology (SM)
Stratigraphy, Sedimentology & Palaeontology (SSP)
Soil System Sciences (SSS)
Solar-Terrestrial Sciences (ST)
Tectonics & Structural Geology (TS)


In [5]:

def _first_text(node, css_class, default=''):
    """Return the stripped text of the first element under ``node`` with ``css_class``, or ``default`` if there is none."""
    matches = node.find_all(class_=css_class)
    return matches[0].get_text(strip=True) if matches else default


def _text_after(parts, index, marker):
    """Return the text that follows ``marker`` in ``parts[index]``, or '' when the part or the marker is missing."""
    if index >= len(parts) or marker not in parts[index]:
        return ''
    return parts[index].split(marker)[1].strip()


def _parse_schedules(block):
    """Classify every ``mo_scheduling_string`` under ``block`` and return the values keyed by output column."""
    schedules = {
        'Session Orals Schedule': '',
        'Session Posters Onsite Attendance': '',
        'Session Posters Onsite Display': '',
        'Session Posters Virtual Attendance': '',
        'Session Posters Virtual Display': '',
        'Session PICO Schedule': '',
        'Session Other Schedule': '',
    }

    for schedule in block.find_all(class_='mo_scheduling_string'):

        sch_text = schedule.get_text(strip=True)

        if "Orals|Enter live session" in sch_text:
            schedules['Session Orals Schedule'] = sch_text.split("Orals|Enter live session")[1].strip()
        elif "Posters on site|Attendance" in sch_text:
            onsite_parts = sch_text.split("|")
            schedules['Session Posters Onsite Attendance'] = _text_after(onsite_parts, 1, "Attendance")
            schedules['Session Posters Onsite Display'] = _text_after(onsite_parts, 2, "Display")
        elif "Posters virtual|Enter live session" in sch_text:
            # Parse the virtual-poster string itself. Reading the on-site parts
            # here would copy the on-site times, or fail when the session has
            # no on-site posters.
            virtual_parts = sch_text.split("|")
            schedules['Session Posters Virtual Attendance'] = _text_after(virtual_parts, 1, "Attendance")
            schedules['Session Posters Virtual Display'] = _text_after(virtual_parts, 2, "Display")
        elif "PICO|Enter live session" in sch_text:
            schedules['Session PICO Schedule'] = sch_text.split("PICO|Enter live session")[1].strip()
        else:
            # Unrecognised format: show it and keep the raw text.
            print(sch_text)
            schedules['Session Other Schedule'] = sch_text.strip()

    return schedules


def get_data(top_level_dict, dict_item, programme_dict):
    """Scrape every session block of one category into column-oriented data.

    Each page under ``top_level_dict[dict_item]`` is downloaded, the first
    ``num_special`` session blocks of the page are skipped (0 when the entry
    has no ``"num_special"`` key) and one row is produced per remaining block.
    Progress and unrecognised schedule strings are printed.

    Parameters
    ----------
    top_level_dict : dict
        Mapping ``{category: {programme name: {"url": ..., "num_special": ...}}}``.
    dict_item : str
        Category key whose pages are scraped; also stored in the
        ``'Category'`` column.
    programme_dict : dict
        Programme code -> title lookup. Codes that are not present are
        reported with a message and get an empty ``'Programme Title'``.

    Returns
    -------
    dict
        Column name -> list of values, all lists the same length (one item
        per session block). Fields missing from a block are stored as ''.

    Raises
    ------
    KeyError
        If ``dict_item`` is not a key of ``top_level_dict`` or an entry has
        no ``"url"``.
    requests.exceptions.RequestException
        If a page cannot be fetched (including a timeout).
    """
    data = {
        'Category': [],
        'Top Level Session': [],
        'Programme Code': [],
        'Programme Title': [],
        'Programme URL': [],
        'Session Code': [],
        'Session Title': [],
        'Session Co-Organizer': [],
        'Session Conveners': [],
        'Session Orals Schedule': [],
        'Session Posters Onsite Attendance': [],
        'Session Posters Onsite Display': [],
        'Session Posters Virtual Attendance': [],
        'Session Posters Virtual Display': [],
        'Session PICO Schedule': [],
        'Session Other Schedule': []
    }

    for programme, item in top_level_dict[dict_item].items():

        print('\n', programme)

        # A timeout keeps one unresponsive page from hanging the whole run.
        page = requests.get(item['url'], timeout=30)
        soup = BeautifulSoup(page.content, 'html.parser')

        num_special = item.get('num_special', 0)

        for block in soup.find_all(class_='co_mto_programme-session-block mb-0')[num_special:]:

            block_code = _first_text(block, 'co_mto_programme-session-block-number-number')

            block_title = _first_text(block, 'co_mto_programme-session-block-title active', default=None)
            if block_title is None:
                print("No block title")
                block_title = ''

            print(block_code, ' - ', block_title)

            block_co_organizer = _first_text(block, 'co_mto_programme-session-block-cosponsoring-coorganizing')
            block_conveners = _first_text(block, 'co_mto_programme-session-block-so')

            schedules = _parse_schedules(block)

            data['Category'].append(dict_item)
            data['Top Level Session'].append(programme)

            # The programme code is the part of the session code before the first dot.
            prog_code = block_code.split('.')[0]
            data['Programme Code'].append(prog_code)
            if prog_code in programme_dict:
                data['Programme Title'].append(programme_dict[prog_code])
            else:
                print("Cannot find Programme Code:", prog_code)
                data['Programme Title'].append('')

            data['Programme URL'].append(item['url'])
            data['Session Code'].append(block_code)
            data['Session Title'].append(block_title)
            data['Session Co-Organizer'].append(block_co_organizer)
            data['Session Conveners'].append(block_conveners)
            for column, value in schedules.items():
                data[column].append(value)

    return data
        

In [None]:
# page = requests.get( top_level_dict['Disciplinary sessions']['Atmospheric Sciences (AS)'] )

In [None]:
# soup = BeautifulSoup(page.content, 'html.parser')

In [None]:
# programme_dict = {}

# for pg in soup.find_all(class_='s2simulatorSecondCol'):
#     pg_items = pg.get_text(strip=True).split('–')
#     pg_code = pg_items[0].strip()
#     pg_title = pg_items[1].strip()

#     programme_dict[pg_code] = pg_title

In [None]:
# for block in soup.find_all(class_='co_mto_programme-session-block mb-0')[2:]:

#     block_code = block.find_all(class_='co_mto_programme-session-block-number-number')[0].get_text(strip=True)
#     block_title = block.find_all(class_='co_mto_programme-session-block-title active')[0].get_text(strip=True)

#     try:
#         block_co_organizer = block.find_all(class_='co_mto_programme-session-block-cosponsoring-coorganizing')[0].get_text(strip=True)
#     except:
#         block_co_organizer = ''

#     block_conveners = block.find_all(class_='co_mto_programme-session-block-so')[0].get_text(strip=True)

#     for schedule in block.find_all(class_='mo_scheduling_string'):

#         sch_text = schedule.get_text(strip=True)

#         if "Orals|Enter live session" in sch_text:
#             orals_schedule = sch_text.split("Orals|Enter live session")[1].strip()
#         elif "Posters on site|Attendance" in sch_text:
#             t1 = sch_text.split("|")
#             posters_onsite_attendance = t1[1].split("Attendance")[1].strip()
#             posters_onsite_display = t1[2].split("Display")[1].strip()
#         elif "Posters virtual|Enter live session" in sch_text:
#             t2 = sch_text.split("|")
#             posters_virtual_attendance = t1[1].split("Attendance")[1].strip()
#             posters_virtual_display = t1[2].split("Display")[1].strip()
#         elif "PICO|Enter live session" in sch_text:
#             pico_schedule = sch_text.split("PICO|Enter live session")[1].strip()
#         else:
#             print(sch_text)
#             other_schedule = sch_text.split("Enter live session")[1].strip()

#     data['Session Code'].append(block_code)
#     data['Session Title'].append(block_title)
#     data['Session Co-Organizer'].append(block_co_organizer)
#     data['Session Conveners'].append(block_conveners)
#     data['Session Orals Schedule'].append(orals_schedule)
#     data['Session Posters Onsite Attendance'].append(posters_onsite_attendance)
#     data['Session Posters Onsite Display'].append(posters_onsite_display)
#     data['Session Posters Virtual Attendance'].append(posters_virtual_attendance)
#     data['Session Posters Virtual Display'].append(posters_virtual_display)

In [6]:
def check_day(text, day, existing_value):
    """Return 1 if the day flag is already set or ``day`` occurs in ``text``, otherwise 0.

    ``text`` is only inspected when ``existing_value`` is not 1, so a flag
    that is already set is never cleared.
    """
    already_flagged = existing_value == 1
    if already_flagged or day in text:
        return 1
    return 0

def get_df(data):
    """Turn scraped column data into a DataFrame with one 0/1 flag column per weekday.

    A weekday column ('Mon' .. 'Fri') is 1 for a row when the day's
    abbreviation occurs in any of that row's schedule columns, else 0.

    Parameters
    ----------
    data : dict
        Column name -> list of values, as returned by ``get_data``. It must
        contain all seven ``'Session ... Schedule/Attendance/Display'`` columns.

    Returns
    -------
    pandas.DataFrame
        The input columns followed by 'Mon', 'Tue', 'Wed', 'Thu', 'Fri'.
        Empty input yields an empty frame that still has the weekday columns.

    Raises
    ------
    KeyError
        If one of the schedule columns is missing from ``data``.
    """
    df = pd.DataFrame(data)

    # List of days of the week
    days_of_week = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri']

    schedule_columns = [
        'Session Orals Schedule',
        'Session Posters Onsite Attendance',
        'Session Posters Onsite Display',
        'Session Posters Virtual Attendance',
        'Session Posters Virtual Display',
        'Session PICO Schedule',
        'Session Other Schedule',
    ]

    # Create a new column for each day of the week. Column-wise string checks
    # replace the row-wise df.apply, which is slow and misbehaves on an empty frame.
    for day in days_of_week:
        mentioned = pd.Series(False, index=df.index)
        for column in schedule_columns:
            # regex=False: the day abbreviation is matched as a plain substring.
            hits = df[column].astype(str).str.contains(day, regex=False)
            mentioned = mentioned | hits.astype(bool)
        df[day] = mentioned.astype(int)

    return df

In [7]:
# Scrape each category of top_level_dict and derive its weekday-flag DataFrame.
# The same programme_dict is passed for every category; programme codes it does
# not contain are reported by get_data() and stored with an empty programme title.
data_union_wide = get_data(top_level_dict, 'Union-wide', programme_dict)
df_union_wide = get_df(data_union_wide)

data_cross_cutting = get_data(top_level_dict, 'Cross-cutting themes', programme_dict)
df_cross_cutting = get_df(data_cross_cutting)

data_community = get_data(top_level_dict, 'Community-led', programme_dict)
df_community = get_df(data_community)

data_trans = get_data(top_level_dict, 'Inter- and Transdisciplinary Sessions', programme_dict)
df_trans = get_df(data_trans)

data_disciplinary = get_data(top_level_dict, 'Disciplinary sessions', programme_dict)
df_disciplinary = get_df(data_disciplinary)


 Union Symposia (US)
US1  -  Advancing Measurements and Observations in the Geosciences
Cannot find Programme Code: US1
US2  -  Climate emergency, human agency: making sense of the current state of scientific knowledge on climate change to strengthen climate literacy
Programme|Enter live sessionMon, 15 Apr, 08:30–10:15(CEST)Room E1
Cannot find Programme Code: US2
US3  -  Bridging the scales: The Arctic methane and permafrost challenge
Cannot find Programme Code: US3
US4  -  Deep-time Digital Earth
Cannot find Programme Code: US4
US5  -  The EU Critical Raw Materials Act – how geoscientists can directly inform European policy and regulation
Programme|Enter live sessionMon, 15 Apr, 10:45–12:30(CEST)Room E1
Cannot find Programme Code: US5
US6  -  Misunderstanding or malice? Getting to the bottom of geoscience disinformation
Programme|Enter live sessionWed, 17 Apr, 08:30–10:15(CEST)Room E1
Cannot find Programme Code: US6

 Great Debates (GDB)
GDB1  -  Plate motions were constant through g

In [8]:
# Stack the per-category frames into one table; ignore_index=True renumbers the rows from 0.
df_final = pd.concat([df_union_wide, df_cross_cutting, df_community, df_trans, df_disciplinary], ignore_index=True)

In [9]:
# Write the combined table to 'egu_schedule.xlsx' (relative path), leaving out the row index.
df_final.to_excel('egu_schedule.xlsx', index=False)