In [None]:
import json

with open('countries.json', 'r') as file:
    country_lookup = json.load(file)

languages = ['en', 'fr', 'es', 'ru']
url_template = "https://gadebate.un.org/sites/default/files/gastatements/79/{code}_{lang}.pdf"

print(f"Total countries: {len(country_lookup)}")

Total countries: 246


In [None]:
%pip install requests -q

Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import requests
import time

os.makedirs('./speeches', exist_ok=True)

for code in country_lookup.keys():
    for lang in languages:
        url = url_template.format(code=code.lower(), lang=lang)
        try:
            response = requests.get(url)
            response.raise_for_status()
            filename = os.path.join('./speeches', f"{code.lower()}_{lang}.pdf")
            with open(filename, 'wb') as f:
                f.write(response.content)
            print(f"Downloaded: {filename}")
            time.sleep(0.2)
            break  # Exit the language loop if download is successful
        except requests.exceptions.RequestException as e:
            print(f"Failed to download {url}: {e}")
    else:
        print(f"Could not download speech for country {code} in any language.")
        
print("Download complete.")

Failed to download https://gadebate.un.org/sites/default/files/gastatements/79/af_en.pdf: 404 Client Error: Not Found for url: https://gadebate.un.org/sites/default/files/gastatements/79/af_en.pdf
Failed to download https://gadebate.un.org/sites/default/files/gastatements/79/af_fr.pdf: 404 Client Error: Not Found for url: https://gadebate.un.org/sites/default/files/gastatements/79/af_fr.pdf
Failed to download https://gadebate.un.org/sites/default/files/gastatements/79/af_es.pdf: 404 Client Error: Not Found for url: https://gadebate.un.org/sites/default/files/gastatements/79/af_es.pdf
Could not download speech for country AF in any language.
Failed to download https://gadebate.un.org/sites/default/files/gastatements/79/ax_en.pdf: 404 Client Error: Not Found for url: https://gadebate.un.org/sites/default/files/gastatements/79/ax_en.pdf
Failed to download https://gadebate.un.org/sites/default/files/gastatements/79/ax_fr.pdf: 404 Client Error: Not Found for url: https://gadebate.un.org/sit

In [6]:
%pip install pytesseract -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [7]:
import pytesseract
from pdf2image import convert_from_path
import glob
import os

tesseract_lang_codes = {'en': 'eng', 'fr': 'fra', 'es': 'spa', 'ru': 'rus'}

pdf_files = glob.glob('./speeches/*.pdf')

for pdf_file in pdf_files:
    txt_file = pdf_file.replace('.pdf', '.txt')
    
    if os.path.exists(txt_file):
        print(f"{txt_file} already exists. Skipping conversion.")
        continue
    
    filename = os.path.basename(pdf_file)
    parts = filename.split('_')
    if len(parts) >= 2:
        lang_code = parts[-1].split('.')[0]
        tesseract_lang = tesseract_lang_codes.get(lang_code, 'eng')
    else:
        tesseract_lang = 'eng'
    
    images = convert_from_path(pdf_file)
    
    text = ""
    for image in images:
        text += pytesseract.image_to_string(image, lang=tesseract_lang)
    
    with open(txt_file, 'w') as f:
        f.write(text)
    
    print(f"Converted {pdf_file} to {txt_file}")
    
print("Conversion complete.")

./speeches/sc_en.txt already exists. Skipping conversion.
Converted ./speeches/ht_fr.pdf to ./speeches/ht_fr.txt
Converted ./speeches/mx_es.pdf to ./speeches/mx_es.txt
./speeches/so_en.txt already exists. Skipping conversion.
./speeches/gm_en.txt already exists. Skipping conversion.
./speeches/ee_en.txt already exists. Skipping conversion.
./speeches/tz_en.txt already exists. Skipping conversion.
./speeches/dk_en.txt already exists. Skipping conversion.
./speeches/tv_en.txt already exists. Skipping conversion.
./speeches/gy_en.txt already exists. Skipping conversion.
./speeches/bd_en.txt already exists. Skipping conversion.
./speeches/az_en.txt already exists. Skipping conversion.
./speeches/kp_en.txt already exists. Skipping conversion.
Converted ./speeches/bj_fr.pdf to ./speeches/bj_fr.txt
Converted ./speeches/cu_es.pdf to ./speeches/cu_es.txt
Converted ./speeches/cd_fr.pdf to ./speeches/cd_fr.txt
./speeches/na_en.txt already exists. Skipping conversion.
Converted ./speeches/bf_fr.pd

In [8]:
%pip install openai python-dotenv  -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [17]:
from db import Database
import glob
from llm import LLM
from pydantic import BaseModel, Field
from typing import List, Dict
import os

llm = LLM()
db = Database()

SYSTEM_PROMPT = "You are an expert in analyzing speeches for mentions of other countries."

USER_PROMPT_TEMPLATE = (
    "Read the provided speech text carefully. Your task is to determine whether the speech is optimistic or pessimistic about the country's future. Optimistic means that the speech is expressing confidence that things are improving or are good. Pessimistic means that the speech is expressing worry that things are getting worse or are pretty bad. Present your findings as a JSON object with the fields 'sentiment' (either 'optimistic' or 'pessimistic') and 'explanation' using markdown. Include in the explanation quotations from the speech to support the sentiment.\n\n{speech_content}"
)

class CountryMention(BaseModel):
    country: str = Field(..., description="The name of the mentioned country.")
    country_code: str = Field(..., description="The two letter ISO code of the mentioned country.")
    sentiment: str = Field(..., description="The sentiment of the mention (optimistic or pessimistic).")
    explanation: str = Field(..., description="An explanation of how the country was mentioned.")

class CountryMentions(BaseModel):
    mentions: List[CountryMention] = Field(..., description="An array of objects representing the country mentions and their sentiments.")

def create_table():
    db.execute("""
    CREATE TABLE IF NOT EXISTS country_mentions (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        country_code TEXT NOT NULL,
        mentioned_country TEXT NOT NULL,
        mentioned_country_code TEXT NOT NULL,
        sentiment TEXT NOT NULL,
        explanation TEXT NOT NULL
    );
    """)

def get_txt_files(directory='./speeches/*.txt') -> Dict[str, List[str]]:
    files = glob.glob(directory)
    country_files = {}
    for file in files:
        filename = os.path.basename(file)
        parts = filename.split('_')
        if len(parts) >= 2:
            country_code = parts[0]
            lang_code = parts[1].split('.')[0]
            if country_code not in country_files:
                country_files[country_code] = []
            country_files[country_code].append((lang_code, file))
    return country_files

def read_speech_content(file_path):
    with open(file_path, 'r') as file:
        return file.read()

def generate_country_mentions(speech_content):
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT_TEMPLATE.format(speech_content=speech_content)}
    ]
    return llm.generate(messages, CountryMentions)

def store_country_mentions(country_code, country_mentions):
    for mention in country_mentions.mentions:
        db.execute("INSERT INTO country_mentions (country_code, mentioned_country, mentioned_country_code, sentiment, explanation) VALUES (?, ?, ?, ?, ?)", 
                   (country_code, mention.country, mention.country_code, mention.sentiment, mention.explanation))

def analysis_exists(country_code):
    db.execute("SELECT 1 FROM country_mentions WHERE country_code = ? LIMIT 1", (country_code,))
    return db.fetchone() is not None

create_table()
country_files_dict = get_txt_files()

for country_code, files in country_files_dict.items():
    if analysis_exists(country_code):
        print(f"Analysis for country {country_code} already exists. Skipping.")
        continue

    # Prioritize English files
    files_sorted = sorted(files, key=lambda x: 0 if x[0] == 'en' else 1)
    for lang_code, file_path in files_sorted:
        speech_content = read_speech_content(file_path)
        if speech_content.strip():  # Check if content is not empty
            response = generate_country_mentions(speech_content)
            print(f"Generated country mentions for country {country_code} ({lang_code})")
            store_country_mentions(country_code, response)
            print(f"Stored country mentions for country {country_code}")
            break  # Analysis done for this country
    else:
        print(f"No valid speech content found for country {country_code}")

print("Analysis complete.")

Analysis for country dm already exists. Skipping.
Analysis for country qa already exists. Skipping.
Analysis for country bb already exists. Skipping.
Generating response with model gpt-4o-mini-2024-07-18
Generated country mentions for country rs (en)
Stored country mentions for country rs
Analysis for country ne already exists. Skipping.
Generating response with model gpt-4o-mini-2024-07-18
Generated country mentions for country bz (en)
Stored country mentions for country bz
Generating response with model gpt-4o-mini-2024-07-18
Generated country mentions for country fi (en)
Stored country mentions for country fi
Generating response with model gpt-4o-mini-2024-07-18
Generated country mentions for country se (en)
Stored country mentions for country se
Generating response with model gpt-4o-mini-2024-07-18
Generated country mentions for country mc (fr)
Stored country mentions for country mc
Generating response with model gpt-4o-mini-2024-07-18
Generated country mentions for country th (en)

KeyboardInterrupt: 

In [18]:
from db import Database
import json
from typing import Dict, List, Tuple

def get_mentions_by_target(db: Database, sentiment: str) -> List[Tuple]:
    query = """
    SELECT 
        mentioned_country_code,
        mentioned_country,
        country_code,
        explanation 
    FROM 
        country_mentions 
    WHERE 
        sentiment = ?;
    """
    return db.execute(query, (sentiment,)).fetchall()

def process_mentions_by_target(results: List[Tuple]) -> Dict:
    return {
        row[0]: [
            {
                "mentioning_country_code": r[2].upper(),
                "explanation": r[3]
            }
            for r in results if r[0] == row[0]
        ]
        for row in results
    }

# Initialize database
db = Database()

# Get and process mentions
optimistic_received = process_mentions_by_target(get_mentions_by_target(db, 'optimistic'))
pessimistic_received = process_mentions_by_target(get_mentions_by_target(db, 'pessimistic'))

# Display results
print("Optimistic Mentions Received:")
print(json.dumps(optimistic_received, indent=4))
print("\nPessimistic Mentions Received:")
print(json.dumps(pessimistic_received, indent=4))

# Save to files
for filename, data in [
    ('public/optimistic_received.json', optimistic_received),
    ('public/pessimistic_received.json', pessimistic_received)
]:
    with open(filename, 'w') as file:
        json.dump(data, file, indent=4)

Optimistic Mentions Received:
{
    "HT": [
        {
            "mentioning_country_code": "DM",
            "explanation": "The speech expresses hope for Haiti's future, stating, 'there is a glimmer of hope, the dawning of a new day, as progress is being made.' This indicates a positive outlook regarding the situation in Haiti."
        }
    ],
    "YE": [
        {
            "mentioning_country_code": "QA",
            "explanation": "The Amir expresses hope for peace in Yemen, saying: \"we look forward to preserving the 2022 truce and proceeding therefrom towards a comprehensive ceasefire, resolving the crisis, ensuring Yemen's unity and achieving the aspirations of its brotherly people.\""
        }
    ],
    "SY": [
        {
            "mentioning_country_code": "QA",
            "explanation": "There is a hopeful tone concerning Syria's future, as the Amir mentions: \"hope that the parties and countries involved in the crisis will be convinced of the necessity for dialogu