In [2]:
import json


with open('countries.json', 'r') as file:
    countryList = json.load(file)["countries"]

url_template = "https://gadebate.un.org/sites/default/files/gastatements/79/{code}_en.pdf"
pdf_urls = [url_template.format(code=country['iso_code'].lower()) for country in countryList]

print(pdf_urls)

['https://gadebate.un.org/sites/default/files/gastatements/79/ad_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/ae_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/af_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/ag_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/al_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/am_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/ao_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/ar_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/at_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/au_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/az_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/ba_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/bb_en.pdf', 'https://gadebate.un.org/sites/default/files/gasta

In [3]:
%pip install requests -q

Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import requests
import time

os.makedirs('./speeches', exist_ok=True)

for url in pdf_urls:
    try:
        response = requests.get(url)
        response.raise_for_status()
        filename = os.path.join('./speeches', url.split('/')[-1])
        with open(filename, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded: {filename}")
        time.sleep(0.2)
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {url}: {e}")


Failed to download https://gadebate.un.org/sites/default/files/gastatements/79/ad_en.pdf: 404 Client Error: Not Found for url: https://gadebate.un.org/sites/default/files/gastatements/79/ad_en.pdf
Downloaded: ./speeches/ae_en.pdf
Failed to download https://gadebate.un.org/sites/default/files/gastatements/79/af_en.pdf: 404 Client Error: Not Found for url: https://gadebate.un.org/sites/default/files/gastatements/79/af_en.pdf
Downloaded: ./speeches/ag_en.pdf
Downloaded: ./speeches/al_en.pdf
Downloaded: ./speeches/am_en.pdf
Downloaded: ./speeches/ao_en.pdf
Failed to download https://gadebate.un.org/sites/default/files/gastatements/79/ar_en.pdf: 404 Client Error: Not Found for url: https://gadebate.un.org/sites/default/files/gastatements/79/ar_en.pdf
Downloaded: ./speeches/at_en.pdf
Downloaded: ./speeches/au_en.pdf
Downloaded: ./speeches/az_en.pdf
Downloaded: ./speeches/ba_en.pdf
Downloaded: ./speeches/bb_en.pdf
Downloaded: ./speeches/bd_en.pdf
Downloaded: ./speeches/be_en.pdf
Failed to dow

In [5]:
%pip install pytesseract -q

Note: you may need to restart the kernel to use updated packages.


In [9]:
import pytesseract
from pdf2image import convert_from_path
import glob
import os

# Get list of PDF files in the folder
pdf_files = glob.glob('./speeches/*.pdf')

for pdf_file in pdf_files:
    # Determine the corresponding .txt file name
    txt_file = pdf_file.replace('.pdf', '.txt')
    
    # Check if the .txt file already exists
    if os.path.exists(txt_file):
        print(f"{txt_file} already exists. Skipping conversion.")
        continue
    
    # Convert PDF to images
    images = convert_from_path(pdf_file)
    
    # Perform OCR on each image and collect the text
    text = ""
    for image in images:
        text += pytesseract.image_to_string(image)
    
    # Write the text to a file with the same name but .txt extension
    with open(txt_file, 'w') as f:
        f.write(text)
    
    print(f"Converted {pdf_file} to {txt_file}")

./speeches/sc_en.txt already exists. Skipping conversion.
./speeches/so_en.txt already exists. Skipping conversion.
./speeches/gm_en.txt already exists. Skipping conversion.
./speeches/ee_en.txt already exists. Skipping conversion.
./speeches/tz_en.txt already exists. Skipping conversion.
./speeches/dk_en.txt already exists. Skipping conversion.
./speeches/tv_en.txt already exists. Skipping conversion.
./speeches/gy_en.txt already exists. Skipping conversion.
./speeches/bd_en.txt already exists. Skipping conversion.
./speeches/az_en.txt already exists. Skipping conversion.
./speeches/kp_en.txt already exists. Skipping conversion.
./speeches/na_en.txt already exists. Skipping conversion.
./speeches/kh_en.txt already exists. Skipping conversion.
./speeches/mk_en.txt already exists. Skipping conversion.
./speeches/ls_en.txt already exists. Skipping conversion.
./speeches/om_en.txt already exists. Skipping conversion.
./speeches/za_en.txt already exists. Skipping conversion.
./speeches/in_

In [7]:
%pip install openai python-dotenv  -q

Note: you may need to restart the kernel to use updated packages.


In [11]:
import glob
from llm import LLM
from pydantic import BaseModel, Field
from typing import List
from db import Database

llm = LLM()
db = Database()

SYSTEM_PROMPT = "You are an expert in analyzing speeches for mentions of Sustainability Development Goals (SDGs)."
USER_PROMPT_TEMPLATE = (
    "Read the provided speech text carefully. Your task is to identify whether any of the United Nations Sustainable Development Goals (SDGs) are mentioned, "
    "either directly by name or implicitly through themes, initiatives, or issues discussed in the speech. For each SDG identified, provide a brief explanation "
    "of how it is referenced in the text. Consider both explicit mentions and contextual implications related to the SDG objectives. Present your findings as a "
    "JSON array of objects, where each object includes the SDG number and an explanation using markdown.:\n\n{speech_content}"
)

class SDGExplanation(BaseModel):
    sdg_number: int = Field(..., description="The number of the Sustainable Development Goal.")
    explanation: str = Field(..., description="An explanation of how the SDG was mentioned.")

class SDGExplanations(BaseModel):
    sdgs: List[SDGExplanation] = Field(..., description="An array of objects representing the Sustainable Development Goals and their explanations.")

def create_table():
    db.execute("""
    CREATE TABLE IF NOT EXISTS sdg_mentions (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        country_code TEXT NOT NULL,
        sdg_number INTEGER NOT NULL,
        explanation TEXT NOT NULL
    );
    """)

def get_txt_files(directory='./speeches/*.txt'):
    return glob.glob(directory)

def read_speech_content(file_path):
    with open(file_path, 'r') as file:
        return file.read()

def generate_sdg_explanations(speech_content):
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT_TEMPLATE.format(speech_content=speech_content)}
    ]
    return llm.generate(messages, SDGExplanations)

def store_sdg_mentions(country_code, sdg_explanations):
    for sdg in sdg_explanations.sdgs:
        db.execute("INSERT INTO sdg_mentions (country_code, sdg_number, explanation) VALUES (?, ?, ?)", 
                   (country_code, sdg.sdg_number, sdg.explanation))

def analysis_exists(country_code):
    db.execute("SELECT 1 FROM sdg_mentions WHERE country_code = ?", (country_code,))
    return db.fetchone() is not None

create_table()
txt_files = get_txt_files()

if txt_files:
    for txt_file in txt_files:
        country_code = txt_file.split('/')[-1].split('_')[0]
        
        if analysis_exists(country_code):
            print(f"Analysis for country {country_code} already exists. Skipping.")
            continue

        speech_content = read_speech_content(txt_file)
        response = generate_sdg_explanations(speech_content)
        print(f"Generated SDG explanations for country {country_code}")
        store_sdg_mentions(country_code, response)
        print(f"Stored SDG mentions for country {country_code}")
else:
    print("No .txt files found in the speeches directory.")
    
print("Analysis complete.")

Analysis for country dm already exists. Skipping.
Analysis for country qa already exists. Skipping.
Analysis for country bb already exists. Skipping.
Analysis for country rs already exists. Skipping.
Analysis for country bz already exists. Skipping.
Generating response with model gpt-4o-mini-2024-07-18
Generated SDG explanations for country fi
Stored SDG mentions for country fi
Generating response with model gpt-4o-mini-2024-07-18
Generated SDG explanations for country se
Stored SDG mentions for country se
Generating response with model gpt-4o-mini-2024-07-18
Generated SDG explanations for country th
Stored SDG mentions for country th
Generating response with model gpt-4o-mini-2024-07-18
Generated SDG explanations for country si
Stored SDG mentions for country si
Generating response with model gpt-4o-mini-2024-07-18
Generated SDG explanations for country pw
Stored SDG mentions for country pw
Generating response with model gpt-4o-mini-2024-07-18
Generated SDG explanations for country ir

In [16]:
query = """
SELECT sdg_number, COUNT(*) as count
FROM sdg_mentions
GROUP BY sdg_number
ORDER BY count DESC;
"""

db.execute(query)
result = db.fetchall()

for row in result:
    print(f"SDG {row[0]}: {row[1]} mentions")


SDG 17: 111 mentions
SDG 13: 105 mentions
SDG 16: 101 mentions
SDG 1: 93 mentions
SDG 5: 70 mentions
SDG 10: 62 mentions
SDG 4: 60 mentions
SDG 3: 52 mentions
SDG 2: 35 mentions
SDG 11: 25 mentions
SDG 8: 21 mentions
SDG 14: 20 mentions
SDG 15: 19 mentions
SDG 9: 10 mentions
SDG 7: 10 mentions
SDG 6: 8 mentions
SDG 12: 4 mentions


In [17]:
query_sdg_17 = """
SELECT country_code, explanation
FROM sdg_mentions
WHERE sdg_number = 17;
"""

db.execute(query_sdg_17)
result_sdg_17 = db.fetchall()

for row in result_sdg_17:
    print(f"Country: {row[0]}, Explanation: {row[1]}")

Country: at, Explanation: The appeal for multilateralism and effective global governance reflects the spirit of SDG 17, which emphasizes partnerships for the goals, cooperation among countries to address global challenges.
Country: ae, Explanation: The speech calls for collective action and international cooperation on various issues, supporting the goal of strengthening the means of implementation and revitalizing the global partnership for sustainable development.
Country: ag, Explanation: The call for global cooperation and the need for support from the international community in addressing the vulnerabilities of Small Island Developing States relates to SDG 17, which focuses on strengthening the means of implementation and revitalizing the global partnership for sustainable development.
Country: ao, Explanation: Angola's focus on international cooperation to combat terrorism and promote development evokes SDG 17: Partnerships for the Goals. The call for collaborative efforts indica