In [23]:
# found a repository with demo HTML files for IPCC glossary!
url = "https://raw.githubusercontent.com/petermr/semanticClimate/main/ipcc/ar6/test/total_glossary/new_total_demo.html"

# lets fetch it and save it locally
output_file = "ipcc_demo.html"
import requests
# reference: https://www.w3schools.com/python/module_requests.asp, it used to fetch data form webpages

response = requests.get(url)
with open(output_file, "w", encoding="utf-8") as file:
    file.write(response.text)
        
    print(f"Saved to '{output_file}'")


Saved to 'ipcc_demo.html'


Cleaning the data

In [24]:
# Lets make a clean function to parse the HTML and extract concepts and definitions
# According to AI, Beatifulsoup is a great library for parsing HTML documents
from bs4 import BeautifulSoup
# used reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/

def parse_ipcc_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    
    concepts = []

    # looking at the file, each concept is in a container with class 'gloss-term'. So lets gather all these containers and ignore the rest
    containers = soup.find_all('div', class_='gloss-term')
    print(f"Found {len(containers)} term containers.")

    # looking at the structure, the concepts names are in h4 tags
    # So lets find get them
    for container in containers:
        concept_element = container.find('h4')
        if not concept_element: #added later
            continue    # skip if no h4 found # added later
        concept_name = concept_element.get_text(strip=True) # throws away all the HTML tags

        # inside the containers, definitions are in class 'definition' but with alot of nested tags:
        # they also go on for multiple paragraphs, and we have to get all the text inside
        def_tag = container.find('p', class_='definition')


        #________________________________________________________________________________________________________________________________________
        #HOWEVER, THE HTML is inconsistent and not all definitions start with <p class='definition'>, one for example start with <div class="fs-6 p-2 mb-0"
        #SO WE HAVE TO CATCH THAT CASE TOO
        if not def_tag:
            def_tag = container.find('p')  # finding the first <p> tag
        #________________________________________________________________________________________________________________________________________


        if not def_tag:
            print(f"SKIPPING: '{concept_name}'")   #broken HTML case handling
            continue

        definition_text = def_tag.get_text(" ", strip=True)

        # .find only gets the first matching tag, but we need all the paragraphs inside the definition
        # So we have to go until we reach a </div> tag, gathering all the <p> tags with class 'definition'
        for next_p in def_tag.find_next_siblings(): #.find_next_siblings is a function thats reads like a human, one sentence at a time
            if next_p.name == 'p':
                # append this paragraph's text to the definition_text
                definition_text += " " + next_p.get_text(" ", strip=True)
            # but if we reach </div> tag, we stop
            elif next_p.name == 'div':
                break
            

        concepts.append({'concept': concept_name, 'definition': definition_text})

    return concepts

In [25]:
# excecuting the function on the saved HTML file
with open(output_file, "r", encoding="utf-8") as file:
    html_content = file.read()
    concepts = parse_ipcc_html(html_content)

# saving the concepts to a CSV file
import csv
csv_file = "ipcc_concepts.csv"
with open(csv_file, "w", newline="", encoding="utf-8-sig") as file:  #The encoding utf-8-sig is to make sure excel reads it properly
    writer = csv.DictWriter(file, fieldnames=["concept", "definition"])
    writer.writeheader()
    writer.writerows(concepts)

# and saving it a JSON file because its easy to read and use later
import json
#Reference for using json.dump https://www.geeksforgeeks.org/python/json-dump-in-python/

json_file = "ipcc_concepts.json"
with open(json_file, "w", encoding="utf-8") as file:
    json.dump(concepts, file, ensure_ascii=False, indent=2) #ensure_ascii=False, causes non-ascii characters to be saved properly
print(f"extracted {len(concepts)} terms.")

Found 915 term containers.


PermissionError: [Errno 13] Permission denied: 'ipcc_concepts.csv'

In [None]:
# lets see if its gone well
for i in range(915):
    print(f"Name: {concepts[i]['concept']}")
    print(f"Definition: {concepts[i]['definition']}")
    print("-----")

Name: ablation
Definition: All processes that reduce the mass of a glacier , ice sheet , or snow cover. The main processes are melting, and for glaciers also calving (or, when the glacier nourishes an ice shelf , discharge of ice across the grounding line ), but other processes such as sublimation and loss of wind-blown snow can also contribute to ablation. Ablation also refers to the mass lost by any of these processes.
-----
Name: abrupt change
Definition: A change in the system that is substantially faster than the typical rate of the changes in its history.
-----
Name: abrupt climate change
Definition: A large-scale abrupt change in the climate system that takes place over a few decades or less, persists (or is anticipated to persist) for at least a few decades and causes substantial impacts in human and/or natural systems .
-----
Name: acceptability of policy or system change
Definition: The extent to which a policy or system change is evaluated unfavourably or favourably, or reje