In [2]:
import json


# Load the country records. JSON text is UTF-8, so decode it explicitly instead of
# relying on the platform's locale encoding (which would garble or reject
# non-ASCII characters on systems whose default is not UTF-8).
with open('countries.json', 'r', encoding='utf-8') as file:
    countryList = json.load(file)["countries"]

# Build one candidate statement URL per country from its lower-cased ISO code.
# Not every URL is expected to exist; the download step reports the ones that fail.
url_template = "https://gadebate.un.org/sites/default/files/gastatements/79/{code}_en.pdf"
pdf_urls = [url_template.format(code=country['iso_code'].lower()) for country in countryList]

print(pdf_urls)

['https://gadebate.un.org/sites/default/files/gastatements/79/ad_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/ae_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/af_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/ag_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/al_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/am_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/ao_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/ar_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/at_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/au_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/az_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/ba_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/bb_en.pdf', 'https://gadebate.un.org/sites/default/files/gasta

In [3]:
%pip install requests -q

Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import requests
import time

# Make sure the download target exists; re-running the cell is harmless.
os.makedirs('./speeches', exist_ok=True)

for url in pdf_urls:
    # Store each PDF under the last path segment of its URL, e.g. "ae_en.pdf".
    filename = os.path.join('./speeches', url.split('/')[-1])
    try:
        # Without a timeout requests waits indefinitely, so a single stalled
        # connection would hang the whole cell. Timeouts raise a
        # RequestException subclass and are reported by the handler below.
        response = requests.get(url, timeout=30)
        # Turn HTTP errors (e.g. 404 for a missing statement) into exceptions so
        # no error page is ever written to disk as a ".pdf".
        response.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded: {filename}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {url}: {e}")
    finally:
        # Pause between requests after failures as well as successes, so a run
        # of missing files does not hit the server back-to-back.
        time.sleep(0.2)


Failed to download https://gadebate.un.org/sites/default/files/gastatements/79/ad_en.pdf: 404 Client Error: Not Found for url: https://gadebate.un.org/sites/default/files/gastatements/79/ad_en.pdf
Downloaded: ./speeches/ae_en.pdf
Failed to download https://gadebate.un.org/sites/default/files/gastatements/79/af_en.pdf: 404 Client Error: Not Found for url: https://gadebate.un.org/sites/default/files/gastatements/79/af_en.pdf
Downloaded: ./speeches/ag_en.pdf
Downloaded: ./speeches/al_en.pdf
Downloaded: ./speeches/am_en.pdf
Downloaded: ./speeches/ao_en.pdf
Failed to download https://gadebate.un.org/sites/default/files/gastatements/79/ar_en.pdf: 404 Client Error: Not Found for url: https://gadebate.un.org/sites/default/files/gastatements/79/ar_en.pdf
Downloaded: ./speeches/at_en.pdf
Downloaded: ./speeches/au_en.pdf
Downloaded: ./speeches/az_en.pdf
Downloaded: ./speeches/ba_en.pdf
Downloaded: ./speeches/bb_en.pdf
Downloaded: ./speeches/bd_en.pdf
Downloaded: ./speeches/be_en.pdf
Failed to dow

In [5]:
%pip install pytesseract -q

Note: you may need to restart the kernel to use updated packages.


In [9]:
import pytesseract
from pdf2image import convert_from_path
import glob
import os

# Get list of PDF files in the folder
pdf_files = glob.glob('./speeches/*.pdf')

for pdf_file in pdf_files:
    # Swap only the file extension. str.replace('.pdf', '.txt') would rewrite every
    # '.pdf' occurrence in the path and would leave a differently-cased extension
    # untouched, making the "already exists" check below match the PDF itself.
    txt_file = os.path.splitext(pdf_file)[0] + '.txt'

    # A previous run already produced the text for this PDF; OCR is slow, so skip it.
    if os.path.exists(txt_file):
        print(f"{txt_file} already exists. Skipping conversion.")
        continue

    # Render every page of the PDF as an image.
    images = convert_from_path(pdf_file)

    # OCR each page and concatenate the page texts in page order.
    text = "".join([pytesseract.image_to_string(image) for image in images])

    # Write the text to a file with the same name but .txt extension
    with open(txt_file, 'w') as f:
        f.write(text)

    print(f"Converted {pdf_file} to {txt_file}")

./speeches/sc_en.txt already exists. Skipping conversion.
./speeches/so_en.txt already exists. Skipping conversion.
./speeches/gm_en.txt already exists. Skipping conversion.
./speeches/ee_en.txt already exists. Skipping conversion.
./speeches/tz_en.txt already exists. Skipping conversion.
./speeches/dk_en.txt already exists. Skipping conversion.
./speeches/tv_en.txt already exists. Skipping conversion.
./speeches/gy_en.txt already exists. Skipping conversion.
./speeches/bd_en.txt already exists. Skipping conversion.
./speeches/az_en.txt already exists. Skipping conversion.
./speeches/kp_en.txt already exists. Skipping conversion.
./speeches/na_en.txt already exists. Skipping conversion.
./speeches/kh_en.txt already exists. Skipping conversion.
./speeches/mk_en.txt already exists. Skipping conversion.
./speeches/ls_en.txt already exists. Skipping conversion.
./speeches/om_en.txt already exists. Skipping conversion.
./speeches/za_en.txt already exists. Skipping conversion.
./speeches/in_

In [7]:
%pip install openai python-dotenv  -q

Note: you may need to restart the kernel to use updated packages.


In [1]:
import glob
from llm import LLM
from pydantic import BaseModel, Field
from typing import List
from db import Database
import random

# Shared LLM client; generate_country_mentions() sends its chat messages through it.
llm = LLM()
# Shared database handle used to create, fill and query the country_mentions table.
db = Database()

# System message that frames the model's role for every request.
SYSTEM_PROMPT = "You are an expert in analyzing speeches for mentions of other countries."
# User message template; the speech text is substituted for {speech_content}.
# The sentiment labels "supportive" and "disproving" are kept exactly as written
# because the stored values are later filtered on these literal strings.
USER_PROMPT_TEMPLATE = (
    "Read the provided speech text carefully. Your task is to identify whether any other countries are mentioned, either explicitly by name or implicitly through references to their actions, policies, or characteristics. Make sure to only include real countries (the United Nations, continents like Africa or NATO are not considered countries). For each country mentioned, categorize if the speech is supportive or disproving of the current government of the country. Present your findings as a JSON array of objects, where each object includes the country two letter ISO code, the sentiment (supportive or disproving), and an explanation using markdown.\n\n{speech_content}"
)

# One country reference extracted from a speech. Instances are the items of
# CountryMentions.mentions and are written to the database by
# store_country_mentions(). Documented with comments rather than a class
# docstring so the model's generated schema stays exactly as it was.
# NOTE(review): the Field descriptions are presumably forwarded to the LLM as part
# of the response schema -- confirm in llm.LLM before rewording them.
class CountryMention(BaseModel):
    # Name of the country that the speech refers to.
    country: str = Field(..., description="The name of the mentioned country.")
    # Two-letter ISO code of that country.
    country_code: str = Field(..., description="The two letter ISO code of the mentioned country.")
    # Sentiment label; later queries filter on the literal values 'supportive' and 'disproving'.
    sentiment: str = Field(..., description="The sentiment of the mention (supportive or disproving).")
    # Free-text justification for the mention and its sentiment.
    explanation: str = Field(..., description="An explanation of how the country was mentioned.")

# Container for every CountryMention found in one speech. This class is what
# generate_country_mentions() passes to llm.generate() alongside the messages.
class CountryMentions(BaseModel):
    # All mentions found in the speech; required field.
    mentions: List[CountryMention] = Field(..., description="An array of objects representing the country mentions and their sentiments.")

def create_table():
    """Create the ``country_mentions`` table unless it is already present.

    Each row links the speaking country (``country_code``) to one country it
    mentioned, together with the sentiment label and the explanation.
    """
    schema_sql = """
    CREATE TABLE IF NOT EXISTS country_mentions (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        country_code TEXT NOT NULL,
        mentioned_country TEXT NOT NULL,
        mentioned_country_code TEXT NOT NULL,
        sentiment TEXT NOT NULL,
        explanation TEXT NOT NULL
    );
    """
    db.execute(schema_sql)

def get_txt_files(directory='./speeches/*.txt'):
    """Return the list of paths matching a glob pattern.

    Args:
        directory: Despite its name, this is a glob pattern (not a folder);
            the default matches every ``.txt`` file in ``./speeches``.

    Returns:
        The matching paths as returned by ``glob.glob`` (no ordering is applied).
    """
    matching_paths = glob.glob(directory)
    return matching_paths

def read_speech_content(file_path):
    """Return the complete text content of the file at ``file_path``.

    The file is opened in text mode with the interpreter's default encoding.
    """
    with open(file_path, 'r') as speech_file:
        content = speech_file.read()
    return content

def generate_country_mentions(speech_content):
    """Ask the LLM which countries a speech mentions.

    Args:
        speech_content: Text of the speech; it is substituted into
            ``USER_PROMPT_TEMPLATE``.

    Returns:
        Whatever ``llm.generate`` returns when called with the two chat
        messages and the ``CountryMentions`` model class.
    """
    system_message = {"role": "system", "content": SYSTEM_PROMPT}
    user_message = {
        "role": "user",
        "content": USER_PROMPT_TEMPLATE.format(speech_content=speech_content),
    }
    return llm.generate([system_message, user_message], CountryMentions)

def store_country_mentions(country_code, country_mentions):
    """Insert one ``country_mentions`` row per mention made by a speaker.

    Args:
        country_code: Code of the country whose speech was analysed.
        country_mentions: Object whose ``mentions`` attribute is iterated; each
            item supplies ``country``, ``country_code``, ``sentiment`` and
            ``explanation``.
    """
    insert_sql = "INSERT INTO country_mentions (country_code, mentioned_country, mentioned_country_code, sentiment, explanation) VALUES (?, ?, ?, ?, ?)"
    for item in country_mentions.mentions:
        row = (country_code, item.country, item.country_code, item.sentiment, item.explanation)
        db.execute(insert_sql, row)

def analysis_exists(country_code):
    """Report whether any mention rows are already stored for a speaker.

    Args:
        country_code: Code of the speaking country to look up.

    Returns:
        ``True`` if the lookup yields at least one row, otherwise ``False``.
    """
    lookup_sql = "SELECT 1 FROM country_mentions WHERE country_code = ?"
    db.execute(lookup_sql, (country_code,))
    # The row is read back from the db object itself, not from execute()'s return value.
    first_row = db.fetchone()
    if first_row is None:
        return False
    return True

import os  # imported here because this cell's import block does not bring it in

# Make sure the results table exists before any lookups or inserts.
create_table()
txt_files = get_txt_files()

if not txt_files:
    print("No .txt files found in the speeches directory.")
else:
    random.shuffle(txt_files)

    for txt_file in txt_files:
        # File names look like "<code>_en.txt". Take the base name with os.path so the
        # code is extracted correctly whichever path separator glob returned;
        # splitting on '/' alone yields "speeches\\<code>" for backslash-separated paths.
        country_code = os.path.basename(txt_file).split('_')[0]

        # Skip speakers that already have stored rows so an interrupted run can be resumed.
        if analysis_exists(country_code):
            print(f"Analysis for country {country_code} already exists. Skipping.")
            continue

        speech_content = read_speech_content(txt_file)
        response = generate_country_mentions(speech_content)
        print(f"Generated country mentions for country {country_code}")
        print(response.json())
        store_country_mentions(country_code, response)
        print(f"Stored country mentions for country {country_code}")

print("Analysis complete.")

Generating response with model gpt-4o-mini-2024-07-18
Generated country mentions for country mn
{"mentions":[{"country":"Austria","country_code":"AT","sentiment":"supportive","explanation":"Mongolia collaborated with Austria to co-chair the Preparatory Committee for the Third United Nations Conference on Landlocked Developing Countries, signaling a positive relationship in tackling shared issues."},{"country":"Botswana","country_code":"BW","sentiment":"supportive","explanation":"Mongolia encourages all LLDCs and member states to engage in the upcoming conference in Botswana, indicating support for the country in facilitating significant discussions."}]}
Stored country mentions for country mn
Generating response with model gpt-4o-mini-2024-07-18
Generated country mentions for country kz
{"mentions":[{"country":"Kazakhstan","country_code":"KZ","sentiment":"supportive","explanation":"The Deputy Prime Minister of Kazakhstan expresses full support for the President of the UN General Assembl

KeyboardInterrupt: 

In [2]:
from db import Database
import json
from typing import Dict, List, Tuple

def get_mentions_by_target(db: Database, sentiment: str) -> List[Tuple]:
    """Fetch every stored mention that carries the given sentiment label.

    Args:
        db: Database handle; the object returned by its ``execute`` must provide
            ``fetchall``.
        sentiment: Exact label to match against the ``sentiment`` column.

    Returns:
        Rows of ``(mentioned_country_code, mentioned_country, country_code,
        explanation)``.
    """
    query = """
    SELECT 
        mentioned_country_code,
        mentioned_country,
        country_code,
        explanation 
    FROM 
        country_mentions 
    WHERE 
        sentiment = ?;
    """
    cursor = db.execute(query, (sentiment,))
    rows = cursor.fetchall()
    return rows

def process_mentions_by_target(results: List[Tuple]) -> Dict:
    """Group mention rows by the country that was mentioned.

    Args:
        results: Rows whose index 0 is the mentioned country's code, index 2
            the mentioning country's code and index 3 the explanation.

    Returns:
        A dict keyed by mentioned-country code, in order of first appearance.
        Each value is the list of ``{"mentioning_country_code", "explanation"}``
        dicts for that country, in row order, with the mentioning code
        upper-cased.
    """
    grouped: Dict = {}
    # Single pass over the rows; rescanning the whole result set for every row
    # made this quadratic in the number of mentions.
    for row in results:
        grouped.setdefault(row[0], []).append(
            {
                "mentioning_country_code": row[2].upper(),
                "explanation": row[3],
            }
        )
    return grouped

# Open a handle to the database that holds the analysed mentions.
db = Database()

# Group the stored mentions by the country they target, once per sentiment label.
supportive_rows = get_mentions_by_target(db, 'supportive')
positive_received = process_mentions_by_target(supportive_rows)
disproving_rows = get_mentions_by_target(db, 'disproving')
negative_received = process_mentions_by_target(disproving_rows)

# Show both groupings as indented JSON.
print("Positive Mentions Received:")
print(json.dumps(positive_received, indent=4))
print("\nNegative Mentions Received:")
print(json.dumps(negative_received, indent=4))

# Persist each grouping as a JSON file in the current working directory.
outputs = {
    'positive_received.json': positive_received,
    'negative_received.json': negative_received,
}
for filename, data in outputs.items():
    with open(filename, 'w') as file:
        file.write(json.dumps(data, indent=4))

Positive Mentions Received:
{
    "UA": [
        {
            "mentioning_country_code": "AL",
            "explanation": "The Prime Minister expresses strong support for Ukraine in the context of its struggle against Russian aggression, stating Albania's commitment to stand by Ukraine as long as necessary."
        },
        {
            "mentioning_country_code": "NL",
            "explanation": "The Prime Minister explicitly states support for Ukraine, condemning Russian aggression and calling for help for Ukraine at every stage of the peace process."
        },
        {
            "mentioning_country_code": "CZ",
            "explanation": "The speech supports Ukraine's sovereignty and territorial integrity, indicating that peace should be based on international law such as the principles discussed during the Peace Summit in Switzerland."
        },
        {
            "mentioning_country_code": "JP",
            "explanation": "Ukraine is referred to in the context of Russ