In [10]:
import json

# Load the country lookup table. JSON text is UTF-8, so decode it explicitly
# instead of relying on the platform's locale encoding.
with open("data/resources/countries.json", "r", encoding="utf-8") as file:
    country_lookup = json.load(file)

# Languages to try for each country, in order of preference.
languages = ["en", "fr", "es", "ru"]

# Statement PDF location; {code} and {lang} are filled in by the download cell.
url_template = (
    "https://gadebate.un.org/sites/default/files/gastatements/79/{code}_{lang}.pdf"
)

print(f"Total countries: {len(country_lookup)}")

Total countries: 247


In [11]:
import os
import time

import requests

PDFS_DIR = "data/raw/pdfs"
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get() can block forever
REQUEST_DELAY = 0.2  # seconds to wait after every request, successful or not

os.makedirs(PDFS_DIR, exist_ok=True)

# For every country, try each language in order and keep the first PDF that
# downloads successfully.
for code in country_lookup.keys():
    code_lower = code.lower()

    # Make the cell safe to re-run: do not fetch a speech that is already on disk.
    if any(
        os.path.exists(os.path.join(PDFS_DIR, f"{code_lower}_{lang}.pdf"))
        for lang in languages
    ):
        print(f"Speech for country {code} already downloaded. Skipping.")
        continue

    for lang in languages:
        url = url_template.format(code=code_lower, lang=lang)
        filename = os.path.join(PDFS_DIR, f"{code_lower}_{lang}.pdf")
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Failed to download {url}: {e}")
            continue  # try the next language
        finally:
            # Throttle failed requests too, so a run of 404s does not hammer the server.
            time.sleep(REQUEST_DELAY)

        with open(filename, "wb") as f:
            f.write(response.content)
        print(f"Downloaded: {filename}")
        break  # Exit the language loop if download is successful
    else:
        print(f"Could not download speech for country {code} in any language.")

print("Download complete.")

Failed to download https://gadebate.un.org/sites/default/files/gastatements/79/af_en.pdf: 404 Client Error: Not Found for url: https://gadebate.un.org/sites/default/files/gastatements/79/af_en.pdf
Failed to download https://gadebate.un.org/sites/default/files/gastatements/79/af_fr.pdf: 404 Client Error: Not Found for url: https://gadebate.un.org/sites/default/files/gastatements/79/af_fr.pdf
Failed to download https://gadebate.un.org/sites/default/files/gastatements/79/af_es.pdf: 404 Client Error: Not Found for url: https://gadebate.un.org/sites/default/files/gastatements/79/af_es.pdf
Failed to download https://gadebate.un.org/sites/default/files/gastatements/79/af_ru.pdf: 404 Client Error: Not Found for url: https://gadebate.un.org/sites/default/files/gastatements/79/af_ru.pdf
Could not download speech for country AF in any language.
Failed to download https://gadebate.un.org/sites/default/files/gastatements/79/ax_en.pdf: 404 Client Error: Not Found for url: https://gadebate.un.org/sit

KeyboardInterrupt: 

In [None]:
import glob
import os

import pytesseract
from pdf2image import convert_from_path

# Maps the two-letter language suffix of a PDF file name to the language code
# passed to Tesseract.
tesseract_lang_codes = {"en": "eng", "fr": "fra", "es": "spa", "ru": "rus"}

TEXT_DIR = "data/processed/text"

# open(..., "w") does not create missing parent directories, so make sure the
# output directory exists before the first transcript is written.
os.makedirs(TEXT_DIR, exist_ok=True)

# PDFS_DIR is defined by the download cell, which must have been run first.
pdf_files = glob.glob(os.path.join(PDFS_DIR, "*.pdf"))

for pdf_file in pdf_files:
    filename = os.path.basename(pdf_file)
    # Swap only the extension; str.replace would touch every ".pdf" in the path.
    txt_file = os.path.join(TEXT_DIR, os.path.splitext(filename)[0] + ".txt")

    if os.path.exists(txt_file):
        print(f"{txt_file} already exists. Skipping conversion.")
        continue

    # File names look like "<country>_<lang>.pdf"; fall back to English when the
    # language suffix is missing or unknown.
    parts = filename.split("_")
    if len(parts) >= 2:
        lang_code = parts[-1].split(".")[0]
        tesseract_lang = tesseract_lang_codes.get(lang_code, "eng")
    else:
        tesseract_lang = "eng"

    # Rasterise the PDF and OCR each resulting image in order.
    images = convert_from_path(pdf_file)
    text = "".join(
        pytesseract.image_to_string(image, lang=tesseract_lang) for image in images
    )

    with open(txt_file, "w") as f:
        f.write(text)

    print(f"Converted {pdf_file} to {txt_file}")

print("Conversion complete.")

data/processed/text/sc_en.txt already exists. Skipping conversion.
data/processed/text/ht_fr.txt already exists. Skipping conversion.
data/processed/text/mx_es.txt already exists. Skipping conversion.
data/processed/text/so_en.txt already exists. Skipping conversion.
data/processed/text/gm_en.txt already exists. Skipping conversion.
data/processed/text/ee_en.txt already exists. Skipping conversion.
data/processed/text/tz_en.txt already exists. Skipping conversion.
data/processed/text/dk_en.txt already exists. Skipping conversion.
data/processed/text/tv_en.txt already exists. Skipping conversion.
data/processed/text/gy_en.txt already exists. Skipping conversion.
data/processed/text/bd_en.txt already exists. Skipping conversion.
data/processed/text/az_en.txt already exists. Skipping conversion.
data/processed/text/kp_en.txt already exists. Skipping conversion.
data/processed/text/bj_fr.txt already exists. Skipping conversion.
data/processed/text/cu_es.txt already exists. Skipping convers

In [12]:
import glob
import os
from typing import Dict, List, Tuple

from pydantic import BaseModel, Field

from db import Database
from llm import LLM

# Module-level clients shared by the helper functions defined later in this cell.
llm = LLM()  # used by generate_country_mentions()
db = Database()  # used by create_table(), store_country_mentions(), analysis_exists()

# System message sent with every analysis request.
SYSTEM_PROMPT = "You are an expert in analyzing speeches for mentions of other countries."

# User message template; {speech_content} is replaced with the speech text.
# NOTE(review): the wording asks for a single object with 'sentiment' and
# 'explanation', while generate_country_mentions() requests the CountryMentions
# schema (a list of per-country mentions) -- confirm the prompt matches the
# intended task.
USER_PROMPT_TEMPLATE = (
    "Read the provided speech text carefully. "
    "Your task is to determine whether the speech is optimistic or pessimistic about the country's future. "
    "Optimistic means that the speech is expressing confidence that things are improving or are good. "
    "Pessimistic means that the speech is expressing worry that things are getting worse or are pretty bad. "
    "Make sure to only include real countries' iso codes (the United Nations, continents like Africa or NATO are NOT considered countries). "
    "Present your findings as a JSON object with the fields 'sentiment' (either 'optimistic' or 'pessimistic') and 'explanation' using markdown. "
    "Include in the explanation quotations in English from the speech to support the sentiment. "
    "Make sure to ALWAYS translate the quotations to English."
    "\n\n{speech_content}"
)


# One mention of a country found in a speech. Instances are the elements of
# CountryMentions.mentions, and store_country_mentions() writes each one as a
# row of the country_mentions table.
# NOTE(review): documented with comments rather than a docstring because
# pydantic may surface a model docstring in the generated schema -- confirm
# before adding one.
class CountryMention(BaseModel):
    # Stored in the mentioned_country column.
    country: str = Field(..., description="The name of the mentioned country.")
    # Stored in the mentioned_country_code column.
    country_code: str = Field(
        ..., description="The two letter ISO code of the mentioned country."
    )
    # Plain str, so nothing here restricts the value; the export cell filters on
    # the exact strings "optimistic" and "pessimistic".
    sentiment: str = Field(
        ..., description="The sentiment of the mention (optimistic or pessimistic)."
    )
    # Stored in the explanation column.
    explanation: str = Field(
        ..., description="An explanation of how the country was mentioned."
    )


# Container for every mention extracted from one speech. This class is the
# second argument passed to llm.generate() in generate_country_mentions(), and
# store_country_mentions() iterates over its `mentions` attribute.
class CountryMentions(BaseModel):
    mentions: List[CountryMention] = Field(
        ...,
        description="An array of objects representing the country mentions and their sentiments.",
    )


def create_table():
    """Create the ``country_mentions`` table if it is not already present.

    The statement is run through the module-level ``db`` object.
    """
    ddl = """
    CREATE TABLE IF NOT EXISTS country_mentions (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        country_code TEXT NOT NULL,
        mentioned_country TEXT NOT NULL,
        mentioned_country_code TEXT NOT NULL,
        sentiment TEXT NOT NULL,
        explanation TEXT NOT NULL
    );
    """
    db.execute(ddl)


def get_txt_files(directory=None) -> Dict[str, List[Tuple[str, str]]]:
    """Group transcript files by the country code encoded in their file name.

    File names are expected to look like ``<country>_<lang>.txt``; names without
    an underscore are ignored.

    Args:
        directory: Glob pattern selecting the files (despite its name, the value
            is passed straight to ``glob.glob``). When omitted, every ``*.txt``
            file under ``TEXT_DIR`` is used; the default is resolved at call
            time so it always reflects the current ``TEXT_DIR``.

    Returns:
        Mapping of country code to a list of ``(lang_code, file_path)`` tuples,
        built from the matched paths in sorted order.
    """
    if directory is None:
        directory = os.path.join(TEXT_DIR, "*.txt")

    country_files: Dict[str, List[Tuple[str, str]]] = {}
    # glob returns paths in arbitrary, filesystem-dependent order; sort them so
    # repeated runs process countries in the same sequence.
    for file in sorted(glob.glob(directory)):
        parts = os.path.basename(file).split("_")
        if len(parts) < 2:
            continue  # not a "<country>_<lang>" name
        country_code = parts[0]
        lang_code = parts[1].split(".")[0]
        country_files.setdefault(country_code, []).append((lang_code, file))
    return country_files


def read_speech_content(file_path):
    """Return the entire contents of the text file at ``file_path``."""
    with open(file_path, "r") as handle:
        contents = handle.read()
    return contents


def generate_country_mentions(speech_content):
    """Send one speech to the LLM and return its reply.

    Builds a system message from ``SYSTEM_PROMPT`` and a user message from
    ``USER_PROMPT_TEMPLATE`` with the speech text filled in, then passes both
    to ``llm.generate`` together with the ``CountryMentions`` model.

    Returns whatever ``llm.generate`` returns -- presumably a
    ``CountryMentions`` instance; verify against the ``llm`` module.
    """
    system_message = {"role": "system", "content": SYSTEM_PROMPT}
    user_message = {
        "role": "user",
        "content": USER_PROMPT_TEMPLATE.format(speech_content=speech_content),
    }
    return llm.generate([system_message, user_message], CountryMentions)


def store_country_mentions(country_code, country_mentions):
    """Insert one ``country_mentions`` row for every mention in a result.

    Args:
        country_code: Code of the country whose speech was analysed; written to
            the ``country_code`` column of every row.
        country_mentions: Object exposing a ``mentions`` iterable whose items
            have ``country``, ``country_code``, ``sentiment`` and
            ``explanation`` attributes.
    """
    insert_sql = "INSERT INTO country_mentions (country_code, mentioned_country, mentioned_country_code, sentiment, explanation) VALUES (?, ?, ?, ?, ?)"
    for item in country_mentions.mentions:
        row = (
            country_code,
            item.country,
            item.country_code,
            item.sentiment,
            item.explanation,
        )
        db.execute(insert_sql, row)


def analysis_exists(country_code):
    """Report whether ``country_mentions`` already holds a row for this country.

    Runs a ``LIMIT 1`` lookup through the module-level ``db`` object and returns
    ``True`` when ``db.fetchone()`` yields a row, ``False`` otherwise.
    """
    lookup_sql = "SELECT 1 FROM country_mentions WHERE country_code = ? LIMIT 1"
    db.execute(lookup_sql, (country_code,))
    first_row = db.fetchone()
    return first_row is not None


# Driver: make sure the table exists, then analyse each country's speech once.
create_table()
country_files_dict = get_txt_files()

for country_code, files in country_files_dict.items():
    if analysis_exists(country_code):
        print(f"Analysis for country {country_code} already exists. Skipping.")
        continue

    # English transcripts sort first (False < True); the sort is stable, so the
    # other languages keep their original relative order.
    files_sorted = sorted(files, key=lambda x: x[0] != "en")
    for lang_code, file_path in files_sorted:
        speech_content = read_speech_content(file_path)
        if not speech_content.strip():
            continue  # blank transcript; try the next language

        response = generate_country_mentions(speech_content)
        print(f"Generated country mentions for country {country_code} ({lang_code})")
        store_country_mentions(country_code, response)
        print(f"Stored country mentions for country {country_code}")
        break  # one analysed transcript per country is enough
    else:
        # Reached only when no transcript had any non-whitespace content.
        print(f"No valid speech content found for country {country_code}")

print("Analysis complete.")

Generating response with model gpt-4o-mini
Generated country mentions for country dm (en)
Stored country mentions for country dm
Generating response with model gpt-4o-mini
Generated country mentions for country qa (en)
Stored country mentions for country qa
Generating response with model gpt-4o-mini
Generated country mentions for country bb (en)
Stored country mentions for country bb
Generating response with model gpt-4o-mini
Generated country mentions for country rs (en)
Stored country mentions for country rs
Generating response with model gpt-4o-mini
Generated country mentions for country ne (fr)
Stored country mentions for country ne
Generating response with model gpt-4o-mini
Generated country mentions for country bz (en)
Stored country mentions for country bz
Generating response with model gpt-4o-mini
Generated country mentions for country fi (en)
Stored country mentions for country fi
Generating response with model gpt-4o-mini
Generated country mentions for country se (en)
Stored 

In [13]:
import json
from typing import Dict, List, Tuple


def get_mentions_by_target(db: Database, sentiment: str) -> List[Tuple]:
    """Fetch every stored mention that has the given sentiment.

    Args:
        db: Object whose ``execute(query, params)`` result offers ``fetchall()``.
        sentiment: Value compared for equality with the ``sentiment`` column.

    Returns:
        The rows from ``fetchall()``, each selecting ``mentioned_country_code``,
        ``mentioned_country``, ``country_code`` and ``explanation`` in that order.
    """
    query = """
    SELECT 
        mentioned_country_code,
        mentioned_country,
        country_code,
        explanation 
    FROM 
        country_mentions 
    WHERE 
        sentiment = ?;
    """
    cursor = db.execute(query, (sentiment,))
    return cursor.fetchall()


def process_mentions_by_target(results: List[Tuple]) -> Dict:
    """Group mention rows by the country that was mentioned.

    Args:
        results: Rows indexed as ``(mentioned_country_code, mentioned_country,
            country_code, explanation)``; index 1 is not used.

    Returns:
        Dict keyed by ``mentioned_country_code`` (in order of first appearance).
        Each value is a list, in row order, of dicts holding the upper-cased
        ``mentioning_country_code`` and the ``explanation``.
    """
    grouped: Dict = {}
    # Single pass over the rows; the previous nested comprehension rescanned the
    # whole result list once per row.
    for row in results:
        grouped.setdefault(row[0], []).append(
            {"mentioning_country_code": row[2].upper(), "explanation": row[3]}
        )
    return grouped


# Initialize database
db = Database()

# Get and process mentions, one grouping per sentiment value
optimistic_received = process_mentions_by_target(
    get_mentions_by_target(db, "optimistic")
)
pessimistic_received = process_mentions_by_target(
    get_mentions_by_target(db, "pessimistic")
)

# Display results
print("Optimistic Mentions Received:")
print(json.dumps(optimistic_received, indent=4))
print("\nPessimistic Mentions Received:")
print(json.dumps(pessimistic_received, indent=4))

# Save to files
for filename, data in [
    ("website/data/optimistic_received.json", optimistic_received),
    ("website/data/pessimistic_received.json", pessimistic_received),
]:
    # open(..., "w") does not create missing directories, so create the target
    # directory first (`os` is imported by the earlier cells).
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "w") as file:
        json.dump(data, file, indent=4)

Optimistic Mentions Received:
{
    "HT": [
        {
            "mentioning_country_code": "DM",
            "explanation": "The speech mentions a 'glimmer of hope' for the Haitian people and expresses optimism that 'ordinary Haitian citizens will once again be able to live, work, and feel at peace in Haiti,' indicating a positive perspective on the future."
        },
        {
            "mentioning_country_code": "GT",
            "explanation": "Guatemala expresses its willingness to support Haiti in restoring security and stability, saying, 'Guatemala expresses its willingness to support any effort aimed at restoring security and stability in Haiti.' This shows a commitment to positive engagement and support."
        },
        {
            "mentioning_country_code": "HT",
            "explanation": "The speech expresses hope for a better future for Haiti, stating, \"...Ha\u00efti se remettra plus forte, plus r\u00e9siliente et engagera, confiante, sa marche assur\u00e9e vers