In [14]:
# Download the corpus from
# https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/0TJX8Y

# Unzip the file named 'dataverse_files' to a directory of your choice on your hard drive.
# [This may take a few minutes.]
# Then untar UNGDC_1946-2023.tgz.

# In the directory you have chosen, you now have more than 8,000 speeches in plain text format (with UTF-8 encoding).
# Each speech is named using the following convention: ISO 3166-1 alpha-3 country code, followed by the UN Session number, followed by the year.
# E.g. USA_73_2018.txt is the full text of the speech the United States' representative gave during the UN's 73rd General Debate session in 2018.

# The directory below contains one folder per session, each holding that session's speeches.
# E.g. `Session 01 - 1946` contains the speeches of the first session, held in 1946.
# The path is relative, so it is resolved against the notebook's working directory.
# NOTE: the name `dir` shadows Python's builtin `dir()`; it is kept because the
# loading cell further down reads this variable by that name.
dir = "dataverse_files/UN General Debate Corpus/TXT"


In [15]:
# Every speech is persisted in a local SQLite database file (un_speeches.db).
# The `speeches` table holds one row per speech:
#   country - ISO 3166-1 alpha-3 country code
#   session - UN session number
#   year    - year of the speech
#   text    - full text of the speech
# (country, session, year) is the primary key, so each combination is unique.

import sqlite3

# DDL kept in a named string so the schema is easy to spot and reuse.
# IF NOT EXISTS makes re-running this cell harmless.
create_speeches_table = """CREATE TABLE IF NOT EXISTS speeches (
    country TEXT,
    session INTEGER,
    year INTEGER,
    text TEXT,
    PRIMARY KEY (country, session, year)
)"""

# `conn` and `c` are reused by the loading cell below.
conn = sqlite3.connect("un_speeches.db")
c = conn.cursor()

c.execute(create_speeches_table)
conn.commit()

In [18]:
# Walk every session folder under `dir` and load each speech into the
# `speeches` table. The country ISO code, session number and year are parsed
# from the filename (e.g. USA_73_2018.txt), not from the folder name.
# Uses `dir`, `conn` and `c` defined in the cells above.

import os

for folder in os.listdir(dir):
    folder_path = os.path.join(dir, folder)

    if not os.path.isdir(folder_path):
        continue

    for filename in os.listdir(folder_path):
        if not filename.endswith(".txt"):
            continue

        # Validate that the filename adheres to the schema ISO_xx_xxxx.txt.
        # splitext drops only the trailing extension (str.replace would also
        # remove a ".txt" occurring in the middle of the name), and the
        # session/year parts must be numeric because they go into INTEGER columns.
        parts = os.path.splitext(filename)[0].split("_")
        if len(parts) != 3 or not (parts[1].isdigit() and parts[2].isdigit()):
            print(f"Skipping file {filename} as it does not adhere to the schema.")
            continue

        country, session, year = parts

        # Check if the entry already exists
        c.execute(
            "SELECT 1 FROM speeches WHERE country = ? AND session = ? AND year = ?",
            (country, session, year),
        )
        if c.fetchone() is not None:
            print(
                f"Entry for {country}, {session}, {year} already exists. Skipping insertion."
            )
            continue

        # Read the file only when it is actually going to be inserted, so a
        # re-run over an already-populated database does no needless file I/O.
        # The corpus is UTF-8; passing the encoding explicitly avoids depending
        # on the platform's locale default (e.g. cp1252 on Windows).
        with open(os.path.join(folder_path, filename), "r", encoding="utf-8") as f:
            text = f.read()

        print(f"Inserting {country}, {session}, {year}")
        c.execute(
            "INSERT INTO speeches VALUES (?, ?, ?, ?)",
            (country, session, year, text),
        )

    # Commit once per session folder rather than once per row: far fewer
    # transactions, while progress is still saved after every folder.
    conn.commit()

# NOTE: closing the connection means the database setup cell must be re-run
# before this cell can be executed again.
conn.close()
print("Done.")

Entry for BEL, 05, 1950 already exists. Skipping insertion.
Entry for BLR, 05, 1950 already exists. Skipping insertion.
Entry for FRA, 05, 1950 already exists. Skipping insertion.
Entry for PAK, 05, 1950 already exists. Skipping insertion.
Entry for TUR, 05, 1950 already exists. Skipping insertion.
Entry for BRA, 05, 1950 already exists. Skipping insertion.
Entry for ETH, 05, 1950 already exists. Skipping insertion.
Entry for DOM, 05, 1950 already exists. Skipping insertion.
Entry for URY, 05, 1950 already exists. Skipping insertion.
Entry for PAN, 05, 1950 already exists. Skipping insertion.
Entry for VEN, 05, 1950 already exists. Skipping insertion.
Entry for YUG, 05, 1950 already exists. Skipping insertion.
Entry for GRC, 05, 1950 already exists. Skipping insertion.
Entry for PHL, 05, 1950 already exists. Skipping insertion.
Entry for POL, 05, 1950 already exists. Skipping insertion.
Entry for CHN, 05, 1950 already exists. Skipping insertion.
Entry for ECU, 05, 1950 already exists. 