In [2]:
import json


# Load the country list. JSON text is UTF-8, so decode it explicitly instead
# of relying on the platform's default text encoding.
with open('countries.json', 'r', encoding='utf-8') as file:
    countryList = json.load(file)["countries"]

# One URL per country: the lower-cased 'iso_code' value fills the {code} slot.
url_template = "https://gadebate.un.org/sites/default/files/gastatements/79/{code}_en.pdf"
pdf_urls = [url_template.format(code=country['iso_code'].lower()) for country in countryList]

print(pdf_urls)

['https://gadebate.un.org/sites/default/files/gastatements/79/ad_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/ae_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/af_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/ag_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/al_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/am_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/ao_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/ar_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/at_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/au_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/az_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/ba_en.pdf', 'https://gadebate.un.org/sites/default/files/gastatements/79/bb_en.pdf', 'https://gadebate.un.org/sites/default/files/gasta

In [3]:
%pip install requests -q

Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import requests
import time

# Directory that receives the downloaded PDFs.
os.makedirs('./speeches', exist_ok=True)

for url in pdf_urls:
    try:
        # Without a timeout, requests waits indefinitely on a stalled
        # connection and the whole loop would hang on a single URL.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        # Keep the remote file name (last URL path segment) locally.
        filename = os.path.join('./speeches', url.split('/')[-1])
        with open(filename, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded: {filename}")
    except requests.exceptions.RequestException as e:
        # HTTP error statuses, connection failures and timeouts all land
        # here; report and carry on with the next URL.
        print(f"Failed to download {url}: {e}")

    # Pause after every attempt, successful or not, so failed requests are
    # throttled as well.
    time.sleep(0.2)


Failed to download https://gadebate.un.org/sites/default/files/gastatements/79/ad_en.pdf: 404 Client Error: Not Found for url: https://gadebate.un.org/sites/default/files/gastatements/79/ad_en.pdf
Downloaded: ./speeches/ae_en.pdf
Failed to download https://gadebate.un.org/sites/default/files/gastatements/79/af_en.pdf: 404 Client Error: Not Found for url: https://gadebate.un.org/sites/default/files/gastatements/79/af_en.pdf
Downloaded: ./speeches/ag_en.pdf
Downloaded: ./speeches/al_en.pdf
Downloaded: ./speeches/am_en.pdf
Downloaded: ./speeches/ao_en.pdf
Failed to download https://gadebate.un.org/sites/default/files/gastatements/79/ar_en.pdf: 404 Client Error: Not Found for url: https://gadebate.un.org/sites/default/files/gastatements/79/ar_en.pdf
Downloaded: ./speeches/at_en.pdf
Downloaded: ./speeches/au_en.pdf
Downloaded: ./speeches/az_en.pdf
Downloaded: ./speeches/ba_en.pdf
Downloaded: ./speeches/bb_en.pdf
Downloaded: ./speeches/bd_en.pdf
Downloaded: ./speeches/be_en.pdf
Failed to dow

In [5]:
%pip install pytesseract -q

Note: you may need to restart the kernel to use updated packages.


In [9]:
import pytesseract
from pdf2image import convert_from_path
import glob
import os

# Get list of PDF files in the folder
pdf_files = glob.glob('./speeches/*.pdf')

for pdf_file in pdf_files:
    # Swap only the final extension. str.replace('.pdf', '.txt') would rewrite
    # every '.pdf' occurring anywhere in the path and would leave a '.PDF'
    # suffix untouched, making the output path equal to the input path.
    txt_file = os.path.splitext(pdf_file)[0] + '.txt'

    # Skip PDFs whose text file is already present
    if os.path.exists(txt_file):
        print(f"{txt_file} already exists. Skipping conversion.")
        continue

    # Convert PDF to images
    images = convert_from_path(pdf_file)

    # Perform OCR on each image and join the pieces once, in image order
    text = "".join(pytesseract.image_to_string(image) for image in images)

    # Write the text to a file with the same name but .txt extension
    with open(txt_file, 'w') as f:
        f.write(text)

    print(f"Converted {pdf_file} to {txt_file}")

./speeches/sc_en.txt already exists. Skipping conversion.
./speeches/so_en.txt already exists. Skipping conversion.
./speeches/gm_en.txt already exists. Skipping conversion.
./speeches/ee_en.txt already exists. Skipping conversion.
./speeches/tz_en.txt already exists. Skipping conversion.
./speeches/dk_en.txt already exists. Skipping conversion.
./speeches/tv_en.txt already exists. Skipping conversion.
./speeches/gy_en.txt already exists. Skipping conversion.
./speeches/bd_en.txt already exists. Skipping conversion.
./speeches/az_en.txt already exists. Skipping conversion.
./speeches/kp_en.txt already exists. Skipping conversion.
./speeches/na_en.txt already exists. Skipping conversion.
./speeches/kh_en.txt already exists. Skipping conversion.
./speeches/mk_en.txt already exists. Skipping conversion.
./speeches/ls_en.txt already exists. Skipping conversion.
./speeches/om_en.txt already exists. Skipping conversion.
./speeches/za_en.txt already exists. Skipping conversion.
./speeches/in_

In [7]:
%pip install openai python-dotenv  -q

Note: you may need to restart the kernel to use updated packages.


In [11]:
import glob
from llm import LLM
from pydantic import BaseModel, Field
from typing import List
from db import Database
import random

# Module-level handles shared by the helper functions in this cell:
# `llm.generate(...)` is called with the chat messages and the CountryMentions
# model, and `db.execute(...)` runs the SQL statements.
llm = LLM()
db = Database()

# System-role message sent with every request.
SYSTEM_PROMPT = "You are an expert in analyzing speeches for mentions of other countries."

# User-role message template; `{speech_content}` is its only placeholder and
# is filled with str.format. Two typos that garbled the instructions are
# corrected ("United Nations of NATO" and "supportive of disproving").
# The sentiment label "disproving" is intentionally left as is: the SQL in
# this notebook filters on that exact stored value.
USER_PROMPT_TEMPLATE = (
    "Read the provided speech text carefully. "
    "Your task is to identify whether any other countries are mentioned, either explicitly by name "
    "or implicitly through references to their actions, policies, or characteristics. "
    "Make sure to only include real countries (the United Nations or NATO are not considered countries). "
    "For each country mentioned, categorize if the speech is supportive or disproving of the current "
    "government of the country. "
    "Present your findings as a JSON array of objects, where each object includes the country two letter "
    "ISO code, the sentiment (supportive or disproving), and an explanation using markdown."
    "\n\n{speech_content}"
)

# One country reference extracted from a speech. All four fields are required
# strings (Field(...) with no default). Documented with `#` comments rather
# than a docstring so the model definition itself stays untouched.
class CountryMention(BaseModel):
    # Name of the mentioned country; stored in the `mentioned_country` column.
    country: str = Field(..., description="The name of the mentioned country.")
    # Two-letter code of the mentioned country; stored in `mentioned_country_code`.
    country_code: str = Field(..., description="The two letter ISO code of the mentioned country.")
    # Free-form string; the queries in this notebook match 'supportive' and 'disproving'.
    sentiment: str = Field(..., description="The sentiment of the mention (supportive or disproving).")
    # Justification text; stored in the `explanation` column.
    explanation: str = Field(..., description="An explanation of how the country was mentioned.")

# Top-level result model: wraps the list of CountryMention entries for one
# speech. It is the model passed as the second argument to llm.generate(...).
class CountryMentions(BaseModel):
    # Required list; iterated entry by entry when the results are stored.
    mentions: List[CountryMention] = Field(..., description="An array of objects representing the country mentions and their sentiments.")

def create_table():
    """Create the ``country_mentions`` table unless it already exists.

    Sends a single ``CREATE TABLE IF NOT EXISTS`` statement through the
    module-level ``db`` handle. The table has an auto-incrementing integer
    ``id`` plus five NOT NULL text columns.
    """
    ddl = """
    CREATE TABLE IF NOT EXISTS country_mentions (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        country_code TEXT NOT NULL,
        mentioned_country TEXT NOT NULL,
        mentioned_country_code TEXT NOT NULL,
        sentiment TEXT NOT NULL,
        explanation TEXT NOT NULL
    );
    """
    db.execute(ddl)

def get_txt_files(directory='./speeches/*.txt'):
    """Return the paths that match a glob pattern.

    Args:
        directory: Despite its name, a glob pattern rather than a directory.
            Defaults to every ``.txt`` file directly inside ``./speeches``.

    Returns:
        A list of matching path strings, in whatever order the ``glob``
        module yields them; empty when nothing matches.
    """
    matches = list(glob.iglob(directory))
    return matches

def read_speech_content(file_path):
    """Return the entire text of the file at ``file_path``.

    The file is opened in text mode for reading, with no explicit encoding
    argument, and is closed before the function returns.
    """
    with open(file_path, 'r') as speech_file:
        content = speech_file.read()
    return content

def generate_country_mentions(speech_content):
    """Request a country-mention analysis for one speech.

    Builds a two-message conversation (system prompt, then the user prompt
    template with ``speech_content`` substituted in) and forwards it to
    ``llm.generate`` together with the ``CountryMentions`` model.

    Args:
        speech_content: Text of the speech to analyse.

    Returns:
        Whatever ``llm.generate`` returns for that conversation and model.
    """
    user_prompt = USER_PROMPT_TEMPLATE.format(speech_content=speech_content)
    system_message = {"role": "system", "content": SYSTEM_PROMPT}
    user_message = {"role": "user", "content": user_prompt}
    return llm.generate([system_message, user_message], CountryMentions)

def store_country_mentions(country_code, country_mentions):
    """Insert one ``country_mentions`` row per extracted mention.

    Args:
        country_code: Code identifying the speech being stored; written to
            the ``country_code`` column of every inserted row.
        country_mentions: Object with a ``mentions`` iterable whose items
            expose ``country``, ``country_code``, ``sentiment`` and
            ``explanation``.
    """
    insert_sql = "INSERT INTO country_mentions (country_code, mentioned_country, mentioned_country_code, sentiment, explanation) VALUES (?, ?, ?, ?, ?)"
    for entry in country_mentions.mentions:
        row_values = (
            country_code,
            entry.country,
            entry.country_code,
            entry.sentiment,
            entry.explanation,
        )
        db.execute(insert_sql, row_values)

def analysis_exists(country_code):
    """Tell whether any stored row has this ``country_code``.

    Runs a parameterised ``SELECT 1`` through ``db`` and then asks ``db`` for
    the first row of the result.

    Returns:
        True when ``db.fetchone()`` yields something other than None.
    """
    db.execute("SELECT 1 FROM country_mentions WHERE country_code = ?", (country_code,))
    first_row = db.fetchone()
    return first_row is not None

import os  # imported here so this cell does not depend on an earlier cell having run

# Make sure the results table exists, then collect the speech transcripts.
create_table()
txt_files = get_txt_files()

if not txt_files:
    print("No .txt files found in the speeches directory.")
else:
    # Process the speeches in random order.
    random.shuffle(txt_files)

    for txt_file in txt_files:
        # The part of the file name before the first '_' is the country code.
        # os.path.basename also copes with the backslash separators glob
        # returns on Windows, which a plain split('/') would not.
        country_code = os.path.basename(txt_file).split('_')[0]

        # Speeches that already have stored rows are not sent to the LLM again.
        if analysis_exists(country_code):
            print(f"Analysis for country {country_code} already exists. Skipping.")
            continue

        speech_content = read_speech_content(txt_file)
        response = generate_country_mentions(speech_content)
        print(f"Generated country mentions for country {country_code}")
        print(response.json())
        store_country_mentions(country_code, response)
        print(f"Stored country mentions for country {country_code}")

print("Analysis complete.")

Generating response with model gpt-4o-mini-2024-07-18
Generated country mentions for country uz
{"mentions":[{"country":"Afghanistan","country_code":"AF","sentiment":"supportive","explanation":"Uzbekistan expresses a pragmatic policy towards Afghanistan, emphasizing its readiness to contribute to the country's economic reconstruction and development of infrastructure."},{"country":"Central Asian countries","country_code":"CA","sentiment":"supportive","explanation":"Uzbekistan advocates for cooperation among Central Asian countries on regional issues, such as climate action and sustainable development, indicating a supportive sentiment towards these nations."},{"country":"Qatar","country_code":"QA","sentiment":"supportive","explanation":"Uzbekistan acknowledges and supports collaborative efforts with Qatar related to anti-corruption initiatives, indicating a positive view."}]}
Stored country mentions for country uz
Generating response with model gpt-4o-mini-2024-07-18
Generated country 

In [12]:
from db import Database
import json

db = Database()

# Every stored mention whose sentiment label is 'supportive'.
query = """
SELECT 
    country_code, 
    mentioned_country,
    mentioned_country_code, 
    explanation 
FROM 
    country_mentions 
WHERE 
    sentiment = 'supportive';
"""

results = db.execute(query).fetchall()

# Group the rows by the country_code column; each value is the list of
# mentions recorded under that code, in result order.
positive_mentions = {}
for country_code, mentioned_country, mentioned_country_code, explanation in results:
    mention = {
        "mentioned_country_code": mentioned_country_code,
        "mentioned_country": mentioned_country,
        "explanation": explanation
    }
    positive_mentions.setdefault(country_code, []).append(mention)

print(json.dumps(positive_mentions, indent=4))

# Persist the same structure to positive_mentions.json
with open('positive_mentions.json', 'w') as out_file:
    json.dump(positive_mentions, out_file, indent=4)
    

{
    "si": [
        {
            "mentioned_country_code": "UA",
            "mentioned_country": "Ukraine",
            "explanation": "The Prime Minister condemns the aggression against Ukraine, framing it as a violation of the UN Charter."
        },
        {
            "mentioned_country_code": "PS",
            "mentioned_country": "Palestine",
            "explanation": "The Prime Minister expresses concern for the rights and futures of Palestinian children, advocating for their well-being and a solution to the conflict."
        },
        {
            "mentioned_country_code": "AF",
            "mentioned_country": "Africa (as a continent)",
            "explanation": "The Prime Minister advocates for stronger representation of African voices in the UN Security Council."
        }
    ],
    "kz": [
        {
            "mentioned_country_code": "KZ",
            "mentioned_country": "Kazakhstan",
            "explanation": "The deputy prime minister expresses full suppo

In [8]:
from db import Database
import json

db = Database()

# Code of the mentioned country whose negative mentions are listed.
country_code_to_query = 'RU'

# The country code is bound through a `?` placeholder, as the INSERT and
# SELECT helpers in this notebook already do, instead of being interpolated
# into the SQL text. 'disproving' is the label the prompt asks the model to
# emit, so it is matched verbatim.
query_disapproving = """
SELECT 
    country_code, 
    explanation 
FROM 
    country_mentions 
WHERE 
    mentioned_country_code = ? AND 
    sentiment = 'disproving';
"""

results_disapproving = db.execute(query_disapproving, (country_code_to_query,)).fetchall()

# Group the explanations by the country_code column, in result order.
disapproving_mentions = {}
for country_code, explanation in results_disapproving:
    disapproving_mentions.setdefault(country_code, []).append(explanation)

print(json.dumps(disapproving_mentions, indent=4))

{
    "be": [
        "Russia is mentioned in a negative context as the aggressor in the invasion of Ukraine, criticizing its government and aggressive actions."
    ],
    "sm": [
        "San Marino condemns the Russian Federation's aggression against Ukraine, emphasizing support for Ukraine's political independence and territorial integrity."
    ],
    "ee": [
        "The speech explicitly condemns Russia for its aggression against Ukraine, characterizing it as a threat to international peace and security, along with accusations of brutal violations of international law."
    ]
}


In [14]:
# Number of stored rows per (mentioned country code, sentiment label) pair.
query_sentiments = """
SELECT 
    mentioned_country_code,
    sentiment, 
    COUNT(*) as count 
FROM 
    country_mentions 
GROUP BY 
    mentioned_country_code, 
    sentiment;
"""

results_sentiments = db.execute(query_sentiments).fetchall()

# Fold the grouped rows into one positive/negative tally per mentioned code.
# A code gets an entry even when its label is neither 'supportive' nor
# 'disproving'; such rows simply add nothing to either counter.
sentiment_counts = {}
for mentioned_country_code, sentiment, count in results_sentiments:
    tally = sentiment_counts.setdefault(mentioned_country_code, {'positive': 0, 'negative': 0})
    if sentiment == 'supportive':
        tally['positive'] += count
    elif sentiment == 'disproving':
        tally['negative'] += count

# Order by most negative mentions first, breaking ties by most positive
# mentions; entries equal on both keep their original relative order.
sorted_sentiments = sorted(
    sentiment_counts.items(),
    key=lambda entry: (entry[1]['negative'], entry[1]['positive']),
    reverse=True,
)

for country_code, counts in sorted_sentiments:
    print(f"Country: {country_code}, Positive Mentions: {counts['positive']}, Negative Mentions: {counts['negative']}")

Country: IL, Positive Mentions: 25, Negative Mentions: 36
Country: UA, Positive Mentions: 48, Negative Mentions: 31
Country: RU, Positive Mentions: 7, Negative Mentions: 31
Country: US, Positive Mentions: 7, Negative Mentions: 22
Country: SD, Positive Mentions: 9, Negative Mentions: 18
Country: LB, Positive Mentions: 6, Negative Mentions: 18
Country: PS, Positive Mentions: 47, Negative Mentions: 17
Country: IR, Positive Mentions: 4, Negative Mentions: 13
Country: CU, Positive Mentions: 19, Negative Mentions: 10
Country: KP, Positive Mentions: 2, Negative Mentions: 9
Country: AF, Positive Mentions: 10, Negative Mentions: 8
Country: CN, Positive Mentions: 9, Negative Mentions: 8
Country: MM, Positive Mentions: 7, Negative Mentions: 7
Country: HT, Positive Mentions: 13, Negative Mentions: 6
Country: YE, Positive Mentions: 6, Negative Mentions: 6
Country: SY, Positive Mentions: 4, Negative Mentions: 6
Country: CD, Positive Mentions: 4, Negative Mentions: 5
Country: VE, Positive Mentions: 7