In [17]:
# !pip install requests
# !pip install beautifulsoup4
# !pip install pandas
# !pip install openpyxl # May be included in pandas installation

In [18]:
# Clear variables
%reset

In [19]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [20]:
# Address prefix shared by every programme-group page on egu26.eu.
_PG_BASE_URL = "https://www.egu26.eu/pg/"


def _pg(code, num_special=None):
    """Return the config entry for the programme-group page `code`.

    The entry always carries "url". "num_special" is added only when given;
    get_data() skips that many leading session blocks on the page.
    """
    entry = {"url": _PG_BASE_URL + code}
    if num_special is not None:
        entry["num_special"] = num_special
    return entry


# Scraping configuration: category -> programme name -> page settings.
top_level_dict = {
    "Keynote sessions": {
        "Union Symposia (US)": _pg("US"),
        "Great Debates (GDB)": _pg("GDB"),
        "Medal & Award Lectures (MAL)": _pg("MAL"),
    },
    "Union-wide events": {
        "Short Courses (SC)": _pg("SC"),
        "Education and Outreach Sessions (EOS)": _pg("EOS"),
    },
    "Community-led": {
        # Hosted on a different site, so the address is spelled out in full.
        "Splinter meetings": {
            "url": "https://webforms.copernicus.org/EGU26/splinter-meetings",
        },
        "Lectures organized by related scientific societies (LRS)": _pg("LRS"),
    },
    "Inter- and Transdisciplinary Sessions": {
        "Digital Geosciences": _pg("ITS1"),
        "Impacts of Climate and Weather in an Inter-and Transdisciplinary context": _pg("ITS2"),
        "Environment and Society in Geosciences": _pg("ITS3"),
        "Risk, Resilience, Mitigation and Adaptation": _pg("ITS4"),
        "General ITS sessions": _pg("ITS5"),
    },
    "Disciplinary sessions": {
        "Atmospheric Sciences (AS)": _pg("AS", 3),
        "Biogeosciences (BG)": _pg("BG", 2),
        "Climate: Past, Present & Future (CL)": _pg("CL", 2),
        "Cryospheric Sciences (CR)": _pg("CR", 2),
        "Earth Magnetism & Rock Physics (EMRP)": _pg("EMRP", 2),
        "Energy, Resources and the Environment (ERE)": _pg("ERE", 2),
        "Earth & Space Science Informatics (ESSI)": _pg("ESSI", 1),
        "Geodesy (G)": _pg("G", 1),
        "Geodynamics (GD)": _pg("GD", 2),
        "Geosciences Instrumentation & Data Systems (GI)": _pg("GI", 3),
        "Geomorphology (GM)": _pg("GM", 5),
        "Geochemistry, Mineralogy, Petrology & Volcanology (GMPV)": _pg("GMPV", 2),
        "Hydrological Sciences (HS)": _pg("HS", 4),
        "Natural Hazards (NH)": _pg("NH", 6),
        "Nonlinear Processes in Geosciences (NP)": _pg("NP", 2),
        "Ocean Sciences (OS)": _pg("OS", 2),
        "Planetary & Solar System Sciences (PS)": _pg("PS", 6),
        "Seismology (SM)": _pg("SM", 3),
        "Stratigraphy, Sedimentology & Palaeontology (SSP)": _pg("SSP", 3),
        "Soil System Sciences (SSS)": _pg("SSS", 4),
        "Solar-Terrestrial Sciences (ST)": _pg("ST", 2),
        "Tectonics & Structural Geology (TS)": _pg("TS", 2),
    },
}

In [21]:
# Only works for the "Disciplinary sessions" category

def get_prog_dict(top_level_dict, dict_item):
    """Scrape the programme code -> programme title lookup for one category.

    Every page listed under ``top_level_dict[dict_item]`` is downloaded, and
    each element with the CSS class ``s2simulatorSecondCol`` is read as
    ``"<code> – <title>"`` (code and title separated by an en dash).

    Parameters
    ----------
    top_level_dict : dict
        Category name -> {programme name -> {"url": ...}}.
    dict_item : str
        Key of the category in `top_level_dict` to scrape.

    Returns
    -------
    dict
        Programme code -> programme title, merged over all pages of the
        category. Entries without an en dash are ignored, and pages that
        answer with an HTTP error status are skipped with a printed notice.
    """
    programme_dict = {}

    for programme, item in top_level_dict[dict_item].items():

        print(programme)

        # A timeout keeps one unresponsive page from hanging the whole run.
        page = requests.get(item['url'], timeout=30)
        if not page.ok:
            print(f"  Skipping {item['url']}: HTTP {page.status_code}")
            continue
        soup = BeautifulSoup(page.content, 'html.parser')

        # Add to programme dict
        for pg in soup.find_all(class_='s2simulatorSecondCol'):
            # Split on the first en dash only, so a title that itself
            # contains an en dash is kept whole.
            pg_code, separator, pg_title = pg.get_text(strip=True).partition('–')
            if not separator:
                # Not a "<code> – <title>" entry; nothing to record.
                continue

            programme_dict[pg_code.strip()] = pg_title.strip()

    return programme_dict

In [22]:
# Build the programme code -> title lookup from the "Disciplinary sessions" pages.
programme_dict = get_prog_dict(top_level_dict, "Disciplinary sessions")

Atmospheric Sciences (AS)
Biogeosciences (BG)
Climate: Past, Present & Future (CL)
Cryospheric Sciences (CR)
Earth Magnetism & Rock Physics (EMRP)
Energy, Resources and the Environment (ERE)
Earth & Space Science Informatics (ESSI)
Geodesy (G)
Geodynamics (GD)
Geosciences Instrumentation & Data Systems (GI)
Geomorphology (GM)
Geochemistry, Mineralogy, Petrology & Volcanology (GMPV)
Hydrological Sciences (HS)
Natural Hazards (NH)
Nonlinear Processes in Geosciences (NP)
Ocean Sciences (OS)
Planetary & Solar System Sciences (PS)
Seismology (SM)
Stratigraphy, Sedimentology & Palaeontology (SSP)
Soil System Sciences (SSS)
Solar-Terrestrial Sciences (ST)
Tectonics & Structural Geology (TS)


In [23]:
# Display the scraped code -> title mapping for a visual check.
programme_dict

{'AS1': 'Meteorology',
 'AS2': 'Boundary Layer Processes',
 'AS3': 'Atmospheric Composition, Chemistry and Aerosols',
 'AS4': 'Interdisciplinary Processes',
 'AS5': 'Methods and Techniques',
 'AS6': 'Short Courses',
 'BG1': 'General Biogeosciences',
 'BG2': 'Methods in Biogeosciences',
 'BG3': 'Terrestrial Biogeosciences',
 'BG4': 'Marine and Freshwater Biogeosciences',
 'BG5': 'Palaeobiogeosciences',
 'BG6': 'Geomicrobiomes and their function',
 'BG7': 'Extraterrestrial and Extreme Environment Biogeosciences',
 'BG8': 'Biogeosciences, Policy and Society',
 'BG9': 'Earth System Remote Sensing and Modelling',
 'BG10': 'Interdisciplinary topics in Biogeosciences',
 'CL0': 'Inter- and Transdisciplinary Sessions',
 'CL1.1': 'Past Climate - Deep Time',
 'CL1.2': 'Past Climate - Last ~2.6 Ma',
 'CL2': 'Present Climate',
 'CL3.1': 'Future Climate',
 'CL3.2': 'Future Climate',
 'CL4': 'Climate Studies Across Timescales',
 'CL5': 'Tools for Climate Studies',
 'CL6': 'Short Courses',
 'CR1': 'Th

In [24]:

def _first_text(block, css_class):
    """Return the stripped text of the first `css_class` element in `block`, or None."""
    matches = block.find_all(class_=css_class)
    return matches[0].get_text(strip=True) if matches else None


def _text_after(text, marker):
    """Return the stripped text following the first `marker` in `text`, or '' if absent."""
    pieces = text.split(marker)
    return pieces[1].strip() if len(pieces) > 1 else ''


def _parse_schedules(block):
    """Collect the schedule strings of one session block, keyed by output column."""
    schedules = {
        'Session Orals Schedule': '',
        'Session Posters Onsite Attendance': '',
        'Session Posters Onsite Display': '',
        'Session Posters Virtual Attendance': '',
        'Session Posters Virtual Display': '',
        'Session PICO Schedule': '',
        'Session Other Schedule': '',
    }

    for schedule in block.find_all(class_='mo_scheduling_string'):

        sch_text = schedule.get_text(strip=True)

        if "Orals|Enter live session" in sch_text:
            schedules['Session Orals Schedule'] = _text_after(sch_text, "Orals|Enter live session")
        elif "Posters on site|Attendance" in sch_text:
            # Pad so a missing "|Display ..." part yields '' instead of an IndexError.
            fields = sch_text.split("|") + ['', '']
            schedules['Session Posters Onsite Attendance'] = _text_after(fields[1], "Attendance")
            schedules['Session Posters Onsite Display'] = _text_after(fields[2], "Display")
        elif "Posters virtual|Enter live session" in sch_text:
            fields = sch_text.split("|") + ['', '']
            schedules['Session Posters Virtual Attendance'] = _text_after(fields[1], "Attendance")
            schedules['Session Posters Virtual Display'] = _text_after(fields[2], "Display")
        elif "PICO|Enter live session" in sch_text:
            schedules['Session PICO Schedule'] = _text_after(sch_text, "PICO|Enter live session")
        else:
            # Unrecognised layout: show it and keep the raw text.
            print(sch_text)
            schedules['Session Other Schedule'] = sch_text.strip()

    return schedules


def _resolve_programme_code(session_code, programme_dict):
    """Return the programme code that the session code belongs to.

    The longest dotted prefix of `session_code` that is a key of
    `programme_dict` wins, so "CL1.1.3" maps to "CL1.1" when that key exists.
    Without any match the part before the first dot is returned.
    """
    parts = session_code.split('.')
    for size in range(len(parts) - 1, 0, -1):
        candidate = '.'.join(parts[:size])
        if candidate in programme_dict:
            return candidate
    return parts[0]


def get_data(top_level_dict, dict_item, programme_dict):
    """Scrape all session blocks of one category into column-oriented data.

    Parameters
    ----------
    top_level_dict : dict
        Category name -> {programme name -> {"url": ..., "num_special": ...}}.
        "num_special" is optional; that many leading session blocks of the
        page are skipped.
    dict_item : str
        Key of the category in `top_level_dict` to scrape.
    programme_dict : dict
        Programme code -> programme title, used to fill 'Programme Title'.

    Returns
    -------
    dict
        Column name -> list of values, one list entry per scraped session;
        all lists have the same length. Elements missing from a session
        block are recorded as ''. Pages that answer with an HTTP error
        status are skipped with a printed notice.
    """
    data = {
        'Category': [],
        'Top Level Session': [],
        'Programme Code': [],
        'Programme Title': [],
        'Programme URL': [],
        'Session Code': [],
        'Session Title': [],
        'Session Co-Organizer': [],
        'Session Conveners': [],
        'Session Orals Schedule': [],
        'Session Posters Onsite Attendance': [],
        'Session Posters Onsite Display': [],
        'Session Posters Virtual Attendance': [],
        'Session Posters Virtual Display': [],
        'Session PICO Schedule': [],
        'Session Other Schedule': []
    }

    # Tolerate a missing lookup; every programme title is then left blank.
    title_lookup = programme_dict or {}

    for programme, item in top_level_dict[dict_item].items():

        print('\n', programme)

        # A timeout keeps one unresponsive page from hanging the whole run.
        page = requests.get(item['url'], timeout=30)
        if not page.ok:
            print(f"  Skipping {item['url']}: HTTP {page.status_code}")
            continue
        soup = BeautifulSoup(page.content, 'html.parser')

        num_special = item.get('num_special', 0)

        for block in soup.find_all(class_='co_mto_programme-session-block mb-0')[num_special:]:

            block_code = _first_text(block, 'co_mto_programme-session-block-number-number') or ''

            block_title = _first_text(block, 'co_mto_programme-session-block-title')
            if block_title is None:
                print("No block title")
                block_title = ''

            print(block_code, ' - ', block_title)

            block_co_organizer = _first_text(block, 'co_mto_programme-session-block-cosponsoring-coorganizing') or ''
            block_conveners = _first_text(block, 'co_mto_programme-session-block-so') or ''

            schedules = _parse_schedules(block)

            prog_code = _resolve_programme_code(block_code, title_lookup)
            if prog_code in title_lookup:
                prog_title = title_lookup[prog_code]
            else:
                print("Cannot find Programme Code:", prog_code)
                prog_title = ''

            data['Category'].append(dict_item)
            data['Top Level Session'].append(programme)
            data['Programme Code'].append(prog_code)
            data['Programme Title'].append(prog_title)
            data['Programme URL'].append(item['url'])
            data['Session Code'].append(block_code)
            data['Session Title'].append(block_title)
            data['Session Co-Organizer'].append(block_co_organizer)
            data['Session Conveners'].append(block_conveners)
            for column, value in schedules.items():
                data[column].append(value)

    return data
        

In [25]:
# page = requests.get( top_level_dict['Disciplinary sessions']['Atmospheric Sciences (AS)'] )

In [26]:
# soup = BeautifulSoup(page.content, 'html.parser')

In [27]:
# programme_dict = {}

# for pg in soup.find_all(class_='s2simulatorSecondCol'):
#     pg_items = pg.get_text(strip=True).split('–')
#     pg_code = pg_items[0].strip()
#     pg_title = pg_items[1].strip()

#     programme_dict[pg_code] = pg_title

In [28]:
# for block in soup.find_all(class_='co_mto_programme-session-block mb-0')[2:]:

#     block_code = block.find_all(class_='co_mto_programme-session-block-number-number')[0].get_text(strip=True)
#     block_title = block.find_all(class_='co_mto_programme-session-block-title active')[0].get_text(strip=True)

#     try:
#         block_co_organizer = block.find_all(class_='co_mto_programme-session-block-cosponsoring-coorganizing')[0].get_text(strip=True)
#     except:
#         block_co_organizer = ''

#     block_conveners = block.find_all(class_='co_mto_programme-session-block-so')[0].get_text(strip=True)

#     for schedule in block.find_all(class_='mo_scheduling_string'):

#         sch_text = schedule.get_text(strip=True)

#         if "Orals|Enter live session" in sch_text:
#             orals_schedule = sch_text.split("Orals|Enter live session")[1].strip()
#         elif "Posters on site|Attendance" in sch_text:
#             t1 = sch_text.split("|")
#             posters_onsite_attendance = t1[1].split("Attendance")[1].strip()
#             posters_onsite_display = t1[2].split("Display")[1].strip()
#         elif "Posters virtual|Enter live session" in sch_text:
#             t2 = sch_text.split("|")
#             posters_virtual_attendance = t1[1].split("Attendance")[1].strip()
#             posters_virtual_display = t1[2].split("Display")[1].strip()
#         elif "PICO|Enter live session" in sch_text:
#             pico_schedule = sch_text.split("PICO|Enter live session")[1].strip()
#         else:
#             print(sch_text)
#             other_schedule = sch_text.split("Enter live session")[1].strip()

#     data['Session Code'].append(block_code)
#     data['Session Title'].append(block_title)
#     data['Session Co-Organizer'].append(block_co_organizer)
#     data['Session Conveners'].append(block_conveners)
#     data['Session Orals Schedule'].append(orals_schedule)
#     data['Session Posters Onsite Attendance'].append(posters_onsite_attendance)
#     data['Session Posters Onsite Display'].append(posters_onsite_display)
#     data['Session Posters Virtual Attendance'].append(posters_virtual_attendance)
#     data['Session Posters Virtual Display'].append(posters_virtual_display)

In [29]:
def check_day(text, day, existing_value):
    """Return 1 if the day flag is already set or `day` occurs in `text`, else 0.

    `existing_value` is the flag computed so far; once it equals 1 it stays 1
    and `text` is not inspected.
    """
    already_flagged = existing_value == 1
    return int(already_flagged or day in text)

def get_df(data):
    """Build the session DataFrame and add one 0/1 flag column per weekday.

    Parameters
    ----------
    data : dict
        Column name -> list of values, as returned by get_data(). It must
        contain the seven 'Session ... Schedule/Attendance/Display' columns.

    Returns
    -------
    pandas.DataFrame
        The input columns followed by 'Mon', 'Tue', 'Wed', 'Thu' and 'Fri'.
        A day column holds 1 when the day's abbreviation occurs in any of the
        session's schedule columns and 0 otherwise. Input without rows yields
        an empty frame that still has the day columns.
    """
    df = pd.DataFrame(data)

    # List of days of the week
    days_of_week = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri']

    schedule_columns = [
        'Session Orals Schedule',
        'Session Posters Onsite Attendance',
        'Session Posters Onsite Display',
        'Session Posters Virtual Attendance',
        'Session Posters Virtual Display',
        'Session PICO Schedule',
        'Session Other Schedule',
    ]

    # Join all schedule fields of a session into one string. The newline
    # separator prevents a day name from being formed across two fields.
    combined = df[schedule_columns[0]].astype(str)
    for column in schedule_columns[1:]:
        combined = combined + '\n' + df[column].astype(str)

    # One vectorised substring test per day replaces the row-wise apply calls.
    for day in days_of_week:
        df[day] = combined.str.contains(day, regex=False).astype('int64')

    return df

In [30]:
# Scrape every top-level category, in the order the categories are listed
# in top_level_dict.
(
    data_keynotes,
    data_union_wide,
    data_community,
    data_trans,
    data_disciplinary,
) = [
    get_data(top_level_dict, category, programme_dict)
    for category in (
        'Keynote sessions',
        'Union-wide events',
        'Community-led',
        'Inter- and Transdisciplinary Sessions',
        'Disciplinary sessions',
    )
]

# Turn each scraped result into a DataFrame with the per-day flag columns.
df_keynotes, df_union_wide, df_community, df_trans, df_disciplinary = [
    get_df(scraped)
    for scraped in (
        data_keynotes,
        data_union_wide,
        data_community,
        data_trans,
        data_disciplinary,
    )
]


 Union Symposia (US)
US1  -  Redefining excellence and academic career pathways in the geosciences
Cannot find Programme Code: US1
US2  -  Two Faces of Earth: Hazards and Resources
Cannot find Programme Code: US2
US3  -  Seas of the Sun: The Cluster Story
Cannot find Programme Code: US3
US4  -  Towards a new RoadMap for Environmental Stewardship and Sustainability in Space Exploration
Cannot find Programme Code: US4
US5  -  Have we broken the Earth’s water cycle? Pathways from Planetary Boundaries to local Safe Operating Spaces
Cannot find Programme Code: US5
US6  -  Climate change, morals, values and policies
Cannot find Programme Code: US6
US7  -  Evidence-based policymaking in an era of increasing climate hazards and risks.
Cannot find Programme Code: US7
US8  -  “Greennovation”: how can scientists support the green transition
Cannot find Programme Code: US8
US9  -  Methane at 250 – History,  Sources, Sinks and Climate Feedbacks.
Cannot find Programme Code: US9
US10  -  The Niscemi

In [31]:
# Stack the per-category frames into one table; ignore_index=True renumbers the rows from 0.
df_final = pd.concat([df_keynotes, df_union_wide, df_community, df_trans, df_disciplinary], ignore_index=True)

In [32]:
# Write the combined schedule to an Excel workbook; index=False leaves out the row index.
df_final.to_excel('egu_schedule.xlsx', index=False)