## Cleaning and Extraction of Speech Transcripts

### Text Extraction from Word Documents OCR’d with ABBYY FineReader

In [None]:
# Define the parent folder containing the original Word files
parent_folder = r"PATH_TO_ORIGINAL_WORD_FILES"  # Replace with your folder path containing Word files

# Define the output folder for saving the extracted text files
output_folder = r"PATH_TO_EXTRACTED_TEXT"  # Replace with your folder path for saving processed text files

In [None]:
import os
from docx import Document
from langdetect import DetectorFactory, detect

# langdetect samples randomly by default, so the same paragraph can be kept in
# one run and dropped in the next; a fixed seed makes the extraction repeatable.
DetectorFactory.seed = 0

# Ensure the output folder exists (no error when it is already there)
os.makedirs(output_folder, exist_ok=True)


def is_english_langdetect(paragraph_text):
    """Return True when langdetect classifies ``paragraph_text`` as English.

    Any detection failure is treated as "not English", so the paragraph is
    dropped rather than aborting the run.
    """
    try:
        return detect(paragraph_text) == 'en'
    except Exception:
        return False  # If detection fails, assume the text is not English


# Iterate over all Word files in the parent folder (sorted for a stable order)
for file_name in sorted(os.listdir(parent_folder)):
    # Accept ".DOCX" as well, and skip the "~$name.docx" lock files Word leaves
    # next to open documents: they are not valid documents and cannot be loaded.
    if not file_name.lower().endswith('.docx') or file_name.startswith('~$'):
        continue

    # Load the document
    file_path = os.path.join(parent_folder, file_name)
    doc = Document(file_path)

    # Keep the English paragraphs, undoing hyphenation and line breaks inside each
    kept_paragraphs = []
    for para in doc.paragraphs:
        if para.text.strip() and is_english_langdetect(para.text):
            kept_paragraphs.append(
                para.text.strip().replace('-\n', '').replace('\n', ' ')
            )
    processed_text_langdetect = "\n\n".join(kept_paragraphs)

    # Save the processed text to a new text file
    output_file_name = os.path.splitext(file_name)[0] + ".txt"
    output_file_path = os.path.join(output_folder, output_file_name)

    with open(output_file_path, "w", encoding="utf-8") as text_file:
        text_file.write(processed_text_langdetect.strip())

    print(f"Processed and saved: {output_file_path}")

### Cleaning and Parsing the Extracted Text

In [None]:
# Get file paths for the target text files
import glob
import os

# Change the following directory to your temporary text files folder
in_dir = "PATH_TO_TEMP_FILES/"
# os.path.join builds the right pattern whether or not in_dir ends with a
# separator; sorting makes the processing order reproducible because glob
# returns matches in arbitrary order.
files = sorted(glob.glob(os.path.join(in_dir, '*.txt')))
num_files = len(files)
print("{} files".format(num_files))

In [None]:
# Read the OCR correction table and discard incomplete rows
import pandas as pd

# Replace with your corrections file path
corrections_path = "PATH_TO_CORRECTIONS_FILE/corrections.csv"
corr_df = pd.read_csv(corrections_path)
# Rows with any missing cell cannot be used as a before/after pair
corr_df.dropna(inplace=True)
corr_df.head(10)

In [None]:
# Create a dictionary for the correction mappings
before = list(corr_df.before)
after = list(corr_df.after)
# Pair each "before" string with its replacement. Pairs whose "before" is empty
# after stripping are skipped: str.replace("", x) would insert x between every
# character of the text.
corr_list = [
    (str(src).strip(), str(dst).strip())
    for src, dst in zip(before, after)
    if str(src).strip()
]
corr_dict = dict(corr_list)
keys = corr_dict.keys()
len(corr_dict)

In [None]:
corr_dict["The PRESIDENT;"]

In [None]:
# Load the titles (honorific and position labels) table
# Replace with your titles file path
titles_df = pd.read_csv("PATH_TO_TITLES_FILE/titles.csv", dtype=str)
titles_df.head(10)

In [None]:
# Split the titles table into one list of labels per category
_title_type = titles_df["type"]
position_list = titles_df.loc[_title_type == "position", "title"].tolist()
title_pre_list = titles_df.loc[_title_type == "pre", "title"].tolist()
title_post_list = titles_df.loc[_title_type == "not_pre", "title"].tolist()
print(len(position_list), len(title_pre_list), len(title_post_list))

In [None]:
# Check the contents
title_pre_list[:10]

In [None]:
# Turn the label lists into regex alternations ("a|b|c")
positions_or = "|".join(str(pos).strip() for pos in position_list)
pre_or = "|".join(str(title).strip() for title in title_pre_list)
_post_titles = [str(title).strip() for title in title_post_list]
post_or1 = "|".join(_post_titles)
# Post-positioned labels are preceded by free text (anything but parentheses/colon)
post_or2 = "|".join(r"[^\(\):]+\s" + title for title in _post_titles)
titles_all_or = "|".join((pre_or, post_or2, positions_or))
titles_all_or

In [None]:
# Clean and format the meeting transcript text files
import re
from langdetect import detect

def clean_PV(path):
    """Clean one OCR'd meeting transcript and return its text with basic counts.

    The file is read as UTF-8 and the global correction table (``corr_dict``)
    is applied to the whole text. Each non-empty line is then normalised
    (typical OCR misreadings of articles, "President", honorifics and the
    colon after a speaker label), merged into the previous kept line when it
    starts with a lowercase letter, and dropped when it looks like a header,
    footer, footnote or stray symbol line. Lines seen before the first
    "the meeting was called to order" / "held at|in" marker are collected
    separately from the main body, and processing stops after the first
    "the meeting rose" / "the council rose" line of the main body.

    Relies on the module-level names ``corr_dict``, ``pre_or``, ``post_or2``,
    ``positions_or`` and ``titles_all_or``.

    Args:
        path: Path of the extracted text file to clean.

    Returns:
        tuple: ``(text, num_words, num_paras)``. ``text`` is the main body,
        or everything that was kept when no start marker was found, with
        paragraphs separated by blank lines.
    """
    # Start and end markers of the main body
    reg_start_lenient = re.compile(r"^(the meeting was called to order|held (at|in)).+$", re.IGNORECASE)
    reg_end_lenient = re.compile(r"^(the meeting rose|the council rose).+$", re.IGNORECASE)

    # Regular expressions to identify headers and footers
    reg_head1 = re.compile(r"^(\d+|\d+-\d+|\d+/\d+.{0,1}|\d{2}/\d{2}/\d{4}|\d{2}-\d{5}.*|S/PV\.\s*\d{4})$")
    reg_head2 = re.compile(r"^(\d{2}/\d{2}/\d{4}\s+\S+\s+S/PV\.\s*\d{4}|S/PV\.\s*\d{4}\s+\S+\s+\d{2}/\d{2}/\d{4})$")
    reg_head3 = re.compile(r"^(\d{4}\D{2}\smeeting|Security Council|(Twe|Thi|For|Fif|Six|Sev)\D+\syear|\d+\s\S+\s\d{4})$")
    reg_head4 = re.compile(r"^(\(|\{)((" + pre_or + r").+?(\,.+)*?|(" + post_or2 + r")\s*?(\,.+)*?|" + positions_or + r")(\)|\})$", re.IGNORECASE)

    # Regular expression to identify footnotes
    reg_footnote = re.compile(r"^(\d+\s*/\s+.*$|(\d+?\/\s+|)(ibid\.|op\.cit\.|idem|notes see\s+?|see\s+.*?official records of the (general assembly|security council)|see\s+.*?(general assembly|security council) resolution|official records of the (general assembly|security council)|for the text of the report see).*$)", re.IGNORECASE)

    # Matching for paragraph numbering and punctuation at the beginning
    reg_paranum = re.compile(r"^(|\d|\d\s*?\d|\d\s*?\d\s*?\d)\s*?(\.|\,|:|;|\*\.|\.\*|\^\.|■)\s*(.+)$")
    # Corrections for misrecognized articles
    reg_article1 = re.compile(r"(^|\s+)(t|T)(he|ho|te|h®|h©|ha|h\*|he •)\s+")
    reg_article2 = re.compile(r"(^|\s+)(z|Z)he\s+")
    # Corrections for misrecognized 'president'
    reg_president_inclusive = re.compile(r"(^|\s+)(the|The)\s+?(p|P)(resident|residemt|rkideht|residet|resment|résident|RESIDENT|RESIDEMT|RKIDEHT|RESIDET|RESMENT|RÉSIDENT)")
    # Corrections for misrecognized honorifics
    reg_mr = re.compile(r"^(Me|Hr|Mf|Uh|Nr)\.\s+")
    reg_ms = re.compile(r"^(Ma)\.\s+")
    reg_title_inclusive = re.compile(r"(^|\s+)(Mr|Mrs|Ms|Dr)(|\,|;|:|»|\s+?\.)\s+")
    reg_title_nospace = re.compile(r"(^|\s+)(Mr|Mrs|Ms|Dr)(|\.|\,|;|:|»)([A-Z])")
    # Correction for double parentheses misrecognition
    reg_doubleparenth = re.compile(r"^(" + titles_all_or + r")([^\(\):]*?\([^\(\):]+?\))\S\s*?(\([^\(\):]+?\))\s*?:\s+", re.IGNORECASE)
    # Exclude cases where a colon is misrecognized (e.g., listing attendees)
    reg_present = re.compile(r"^((" + pre_or + r")[^\(\):]+?\([^\(\):]+?\)\,\s*?){2,}?", re.IGNORECASE)
    # Corrections for colon misrecognition: cases following parentheses and in chairman's statements
    reg_colon1 = re.compile(r"^(" + titles_all_or + r")([^\(\):]*?\([^\(\):]+?\))\s*?(\([^\(\):]+?\))*?\s*?(:|;|\,|\.|’\.|'\.|-\.|!|\?|i|r|t|s|\*\.|\.\*|\*|»|\,»|\"\,|\}\.|\.•)\s+", re.IGNORECASE)
    reg_colon2 = re.compile(r"^(" + positions_or + r")\s*?(:|;|’\.|'\.|-\.|!|\?|r|t|s|\*\.|\.\*|\*|»|\,»|\"\,|\}\.|\.•)\s+", re.IGNORECASE)
    reg_colon3 = re.compile(r"^The (President|PRESIDENT)(|\s*?(i|\.|\,))\s+(I|Members of the Council have before them|In accordance with|The Security Council will now|The first speaker is|The next speaker is|There are no further speakers|There are no more speakers|There were \d+ votes)\s+")

    # Correction for misrecognized 'I' as 'X/x'
    reg_ix = re.compile(r"(^|\s+)(X|x)(|t|ts|n|f)\s+?(\w)")
    # Matching for headers containing speaker info at the beginning
    reg_headin = re.compile(r"(^|\s+)\(((" + pre_or + r")[^\(\)]+?(\,[^\(\)]+)*?|(" + post_or2 + r")\s*?(\,[^\(\)]+)*?|" + positions_or + r")\)\s+", re.IGNORECASE)
    # Matching for extra symbols
    reg_remove = re.compile(r"^\*(\s|\*)+$")
    # Matching for hyphenation within words
    reg_hyphen = re.compile(r"^(.+)\w\-$")

    # The extraction step writes UTF-8, so read it back as UTF-8 instead of
    # relying on the platform's default encoding.
    with open(path, 'r', encoding="utf-8") as f:
        txt = f.read()

    # First apply the correction mapping to the entire text
    for wrong, right in corr_dict.items():
        txt = txt.replace(wrong, right)

    main = False      # True once the start-of-meeting marker has been seen
    processed = []    # paragraphs of the main body
    processed2 = []   # paragraphs kept before the main body starts

    # Process the text line by line
    for i, line in enumerate(txt.split("\n")):
        line = line.strip()

        # Skip empty lines
        if not line:
            continue

        # Correct typical misrecognitions
        line = line.replace("{", "(").replace("[", "(").replace("}", ")").replace("]", ")")
        line = reg_paranum.sub(r'\3', line).strip()
        line = reg_ix.sub(r"\1I\3 \4", line)
        line = reg_article1.sub(r"\1\2he ", line)
        if reg_article2.match(line):
            # Keep the capitalisation of the misread first letter
            if line[0].isupper():
                line = reg_article2.sub(r"\1The ", line)
            else:
                line = reg_article2.sub(r"\1the ", line)
        line = reg_president_inclusive.sub(r"\1\2 \3resident", line)
        line = reg_mr.sub(r"Mr. ", line)
        line = reg_ms.sub(r"Ms. ", line)
        line = reg_title_inclusive.sub(r"\1\2. ", line)
        line = reg_title_nospace.sub(r"\1\2. \4", line)
        # The result must be assigned back; otherwise this correction is a no-op
        line = reg_doubleparenth.sub(r"\1\2\3: ", line)
        if not reg_present.match(line):
            line = reg_colon1.sub(r"\1\2\3: ", line)
        line = reg_colon2.sub(r"\1: ", line)
        line = reg_colon3.sub(r"The \1: \4 ", line)
        line = line.replace("::", ":")
        if reg_headin.match(line):
            line = reg_headin.sub("", line)

        # If the line starts with a lowercase letter, attach it to the previous
        # paragraph of whichever list is currently being filled
        target = processed if main else processed2
        if i > 0 and line and line[0].islower() and target:
            preceding = target[-1]
            if reg_hyphen.match(preceding):
                # Drop only the trailing line-break hyphen; hyphens inside the
                # paragraph (e.g. "Secretary-General") must survive
                target[-1] = preceding[:-1] + line
            else:
                target[-1] = preceding + " " + line
            continue

        if not line:
            continue
        if reg_head1.match(line) or reg_head2.match(line) or reg_head3.match(line) or reg_head4.match(line):
            continue
        if reg_footnote.match(line):
            continue
        if reg_remove.match(line):
            continue

        # Determine if we are in the main body of the document
        if not main and reg_start_lenient.match(line):
            main = True

        if main:
            processed.append(line.strip())
        else:
            processed2.append(line.strip())

        # Stop at the closing line of the meeting
        if main and reg_end_lenient.match(line):
            break

    text = '\n\n'.join(processed if main else processed2).strip()

    num_words = len(text.split())
    num_paras = len(text.split('\n\n'))

    return text, num_words, num_paras

In [None]:
# Process and clean the target files
import os
import re

# Set the output directory for cleaned texts
out_dir = r"PATH_TO_CLEANED_TEXT"  # Replace with your folder path for cleaned text files
os.makedirs(out_dir, exist_ok=True)
# Tag to append to the cleaned file names
tag_out = "_clean"

# Regular expression for extracting meeting numbers from file names
reg_meetingnum = re.compile(r"S_PV\.(\d+).*?\.txt")

# List to record meeting number, document name, word count, paragraph count, and file path
results = []
for i, f_path in enumerate(files):
    if i % 100 == 0:
        print("Processing document {}/{}".format(i+1, num_files))
    base_name = os.path.basename(f_path)
    # A file that does not follow the "S_PV.<number>...txt" naming is reported
    # and skipped instead of aborting the whole batch with a ValueError.
    num_match = reg_meetingnum.search(base_name)
    if num_match is None:
        print("skipping (no meeting number in file name)", base_name)
        continue
    meeting_num = int(num_match.group(1))
    file_name = os.path.splitext(base_name)[0]
    text, num_words, num_paras = clean_PV(f_path)
    out_file = file_name + tag_out + ".txt"
    out_path = os.path.join(out_dir, out_file)
    # Write UTF-8 explicitly, matching the encoding used by the extraction step
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(text)
    results.append([meeting_num, file_name, num_words, num_paras, out_path])
print("{} files have been processed.".format(len(results)))

In [None]:
# Save the cleaned document list as a DataFrame
cols = ["meeting_num", "doc_name", "word_count", "para_count", "path"]
df_cleaned = pd.DataFrame(results, columns=cols).sort_values(["meeting_num", "doc_name"]).reset_index(drop=True)
df_cleaned.head(20)

In [None]:
df_cleaned.shape

In [None]:
# Save the document list
out_path = r"PATH_TO_OUTPUT_DF/UNSC_PV_cleaned.csv"  # Replace with your output path
df_cleaned.to_csv(out_path, na_rep='NULL')

### Merging Meeting Data with the Cleaned Document List

In [None]:
# Load the cleaned document list DataFrame
import pandas as pd

# Replace with your cleaned document list file path
in_path = r"PATH_TO_OUTPUT_DF/UNSC_PV_cleaned.csv"
df_cleaned = pd.read_csv(in_path, na_values='NULL', index_col=0)
df_cleaned.head(20)

In [None]:
# Load the meetings data
# Replace with your meetings file path
in_path = r"PATH_TO_MEETINGS_FILE/meetings.tsv"
df_meetings = pd.read_csv(in_path, na_values='NULL', index_col=0, sep='\t')
df_meetings.tail(10)

In [None]:
# Look up the meeting record for every cleaned document, collect its details,
# and write the document's word count back into the meetings table
import numpy as np

# List of documents to skip
to_skip = ["S_PV.2977(PartI)-EN", "S_PV.2977(PartII)(closed)-EN", "S_PV.2977(PartII)(closed-resumption1)-EN",
           "S_PV.2977(PartII)(closed-resumption2)-EN", "S_PV.2977(PartII)(closed-resumption3)-EN",
           "S_PV.2977(PartII)(closed-resumption4)-EN", "S_PV.2977(PartII)(closed-resumption5)-EN", "S_PV.3160_RU"]

num_docs = df_cleaned.shape[0]
results = []
for i in range(num_docs):
    if i % 100 == 0:
        print("Processing document {}/{}".format(i+1, num_docs))
    series = df_cleaned.iloc[i]
    meeting_num = series["meeting_num"]
    doc_name = series["doc_name"]

    # Resolve the single meeting row for this document, if there is one
    match = None
    if doc_name not in to_skip:
        select = (df_meetings["meeting_num"] == meeting_num)
        num_hits = select.sum()
        if num_hits > 1:
            # Several meetings share the number: disambiguate by record symbol
            print("multiple matches", doc_name)
            df_meetings_select = df_meetings[select]
            select2 = (df_meetings_select["record"] == doc_name.replace("_", "/"))
            if select2.sum() == 0:
                print("in the end, no match", doc_name)
            else:
                match = df_meetings_select[select2].iloc[0]
        elif num_hits == 0:
            print("no match", doc_name)
        else:
            match = df_meetings[select].iloc[0]

    if match is None:
        # Placeholder row for skipped or unmatched documents
        results.append(["NULL", -1, -1, -1, "NULL", "NULL", "NULL", "NULL"])
        continue

    results.append([match[col] for col in ("record_id", "year", "month", "day",
                                           "topic", "agenda", "pres_name", "pres_country")])
    df_meetings.loc[match.name, "word_count"] = int(series["word_count"])
print("{} documents have been processed.".format(len(results)))

In [None]:
# Combine the extracted meeting data with the document list
cols = ["record_id", "year", "month", "day", "topic", "agenda", "pres_name", "pres_country"]
df_to_add = pd.DataFrame(results, columns=cols)
df_combined = pd.concat([df_cleaned, df_to_add], axis=1)
df_combined.head(10)

In [None]:
df_combined.shape

In [None]:
# Save the augmented document list
out_path = r"PATH_TO_OUTPUT_DF/UNSC_PV_cleaned.csv"  # Replace with your output path
df_combined.to_csv(out_path, na_rep='NULL')

In [None]:
df_combined.tail(10)

In [None]:
# Check the updated meetings data
df_meetings.tail(10)

In [None]:
df_meetings.shape

In [None]:
# Save the updated meetings data
out_path = r"PATH_TO_MEETINGS_FILE/meetings.tsv"  # Replace with your meetings output path
df_meetings.to_csv(out_path, sep='\t', na_rep='NULL')

### Extracting Speaker and Speech Data from Cleaned Text

In [None]:
# Load the cleaned document list
import pandas as pd

# Replace with your cleaned document list file path
in_path = r"PATH_TO_OUTPUT_DF/UNSC_PV_cleaned.csv"
df_cleaned = pd.read_csv(in_path, na_values='NULL', index_col=0)
df_cleaned.head(10)

In [None]:
df_cleaned.shape

In [None]:
# Remove rows with no matching meeting data
df_cleaned.dropna(subset=["record_id"], inplace=True)
df_cleaned.shape

In [None]:
# Load the titles table
titles_df = pd.read_csv("PATH_TO_TITLES_FILE/titles.csv", dtype=str)
titles_df.head(10)

In [None]:
# Split the titles table into one list of labels per category
_title_type = titles_df["type"]
position_list = titles_df.loc[_title_type == "position", "title"].tolist()
title_pre_list = titles_df.loc[_title_type == "pre", "title"].tolist()
title_post_list = titles_df.loc[_title_type == "not_pre", "title"].tolist()
print(len(position_list), len(title_pre_list), len(title_post_list))

In [None]:
# Turn the label lists into regex alternations ("a|b|c")
positions_or = "|".join(str(pos).strip() for pos in position_list)
pre_or = "|".join(str(title).strip() for title in title_pre_list)
_post_titles = [str(title).strip() for title in title_post_list]
post_or1 = "|".join(_post_titles)
# Post-positioned labels are preceded by free text (anything but parentheses/colon)
post_or2 = "|".join(r"[^\(\):]+\s" + title for title in _post_titles)
positions_or

In [None]:
# Function to determine if a string is a name (only surname)
import re

def is_nameonly(string, name_thres=5):
    """Tell whether ``string`` looks like a bare personal name.

    Name particles ("de", "van", "al-", "d'", ...) that follow whitespace are
    removed first. The remainder counts as a name when it has at most
    ``name_thres`` whitespace-separated parts and none of them starts with a
    lowercase letter.
    """
    particle_words = re.compile(r"\s+(of|the|de|la|le|du|des|les|von|zu|auf|und|van|der|do|dos|da|das|e|del|el|of that Ilk|di|dei|de\’|de\'|della|dal|dalla|dai|tot|thoe|af|aw|ag|al|à|den|wa)\b")
    particle_prefixes = re.compile(r"\s+(d\'|d\’|el\-|al\-)")
    whitespace_run = re.compile(r"\s+")

    # Prefix particles go first so "al-X" is handled before the word list sees "al"
    without_prefixes = particle_prefixes.sub(r" ", string)
    without_particles = particle_words.sub(r" ", without_prefixes)
    tokens = whitespace_run.sub(r" ", without_particles).split()

    if len(tokens) > name_thres:
        return False
    return not any(token[0].islower() for token in tokens)

# Function to extract speaker and speech information from a transcript
def speech_extraction(path, speaker_thres=5, remove_parentheses=True, remove_quotes=True):
    """Split a cleaned transcript into per-speaker speech records.

    The file is read as UTF-8 and split into paragraphs on blank lines. A
    paragraph that begins with a speaker label followed by a colon starts a
    new speech. Recognised labels are: an honorific plus a name, a name plus
    a post-positioned title, or a position label such as "The President".
    For the first two forms the label must also pass ``is_nameonly``.
    Paragraphs before the first speaker are ignored. Procedural lines
    (meeting start/end, agenda, section titles, decisions, votes) are left
    out of the speech text.

    Relies on the module-level names ``pre_or``, ``post_or1``,
    ``positions_or`` and ``is_nameonly``.

    Args:
        path: Path of the cleaned transcript.
        speaker_thres: Maximum number of name parts passed to ``is_nameonly``.
        remove_parentheses: Drop parenthesised passages from the speech text.
        remove_quotes: Drop quoted passages from the speech text.

    Returns:
        A list of ``[order, speaker, country, speech, word_count, procedural]``
        rows for the non-empty speeches, or ``None`` when no speaker was found
        or the numbers of speakers and speeches disagree.
    """
    # Regular expressions for different speaker patterns
    reg_speaker1 = re.compile(r"^(" + pre_or + r")(\s+?\w+?[^\(\):]*?)(\([^\(\):]+?\))??\s*?(\([^\(\):]+?\))*?\s*?:\s*?(.+)$", re.IGNORECASE)
    reg_speaker2 = re.compile(r"^(" + pre_or + r")(\s+?\w+?[^\(\):]*?)\(([^\(\):]+?)\)\s*?(\([^\(\):]+?\))*?\s*?:\s*?(.+)$", re.IGNORECASE)
    reg_speaker3 = re.compile(r"^([^\(\):]+?\s)(" + post_or1 + r")([^\(\):]*?)(\([^\(\):]+?\))??\s*?(\([^\(\):]+?\))*?\s*?:\s*?(.+)$", re.IGNORECASE)
    reg_speaker4 = re.compile(r"^([^\(\):]+?\s)(" + post_or1 + r")([^\(\):]*?)\(([^\(\):]+?)\)\s*?(\([^\(\):]+?\))*?\s*?:\s*?(.+)$", re.IGNORECASE)
    reg_speaker5 = re.compile(r"^(" + positions_or + r")\s*?(\([^\(\)]+?\))*?\s*?:\s*?(.+)$", re.IGNORECASE)
    reg_speaker6 = re.compile(r"^(mr\. president|mrs\. president)\s*?(\([^\(\)]+?\))*?\s*?:\s*?(.+)$", re.IGNORECASE)

    # Parenthesised language notes ("spoke in ...") are not country names
    reg_notcountry = re.compile(r"\([^\(\):]*?(transl|interpret|spoke)[^\(\):]+?\)", re.IGNORECASE)

    # Phrases showing that the presiding officer speaks in a national capacity
    reg_notprocedural = re.compile(r"(in my capacity as|in my national capacity|resume my functions as)", re.IGNORECASE)

    reg_parentheses = re.compile(r"\([^\(\)]+?\)", re.IGNORECASE)

    reg_quote1 = re.compile(r"“[^“”\n]+?(”|\n)")
    reg_quote2 = re.compile(r"\"[^\"\"\n]+?(\"|\n)")

    reg_start = re.compile(r"^(the meeting (was called to order|resumed|was resumed|was suspended)|held (at|in)).+$", re.IGNORECASE)
    reg_agenda = re.compile(r"^the agenda was (adopted|that).+$", re.IGNORECASE)
    reg_end = re.compile(r"^(the meeting (rose|was suspended)|the council (rose|suspended)).+$", re.IGNORECASE)
    reg_end_search = re.compile(r"(the meeting (rose|was suspended)|the council (rose|suspended))", re.IGNORECASE)
    reg_sectitle = re.compile(r"^(adoption of the agenda|expression of.+|opening statement).*$", re.IGNORECASE)
    reg_descript = re.compile(r"^(there being no objection|.*?it (is|was) so decided|it (is|was) so agreed|at the invitation of the president).*$", re.IGNORECASE)
    reg_vote = re.compile(r"^((a vote|vote|a secret ballot) was taken|(the proposal|the draft resolution) was (rejected|adopted)|in (favour|favor):|.*?against:|.*?abstaining:).*$", re.IGNORECASE)

    reg_space1 = re.compile(r"\s—\s")
    reg_space2 = re.compile(r"\s\s+")
    reg_space3 = re.compile(r"\s+")

    presiding = ("the president", "the chairman", "the acting president")

    # Read as UTF-8 explicitly, the encoding the extraction step writes
    with open(path, 'r', encoding="utf-8") as f:
        txt = f.read()

    # Number of end markers still expected; the last one closes the document.
    # With no marker at all, use a count that is never reached.
    to_end = len(reg_end_search.findall(txt))
    if to_end == 0:
        to_end = 999

    order = 0            # number of speeches started so far
    para_list = []       # paragraphs of the speech currently being collected
    procedural = False   # flag of the speech currently being collected
    speaker = None       # speaker of the speech currently being collected
    speakers = []
    countries = []
    speeches = []
    procedural_flags = []
    breakout = False

    for para in txt.split("\n\n"):
        para = para.strip()

        if reg_speaker1.match(para) or reg_speaker3.match(para) or reg_speaker5.match(para) or reg_speaker6.match(para):
            para = reg_notcountry.sub("", para)
            para = reg_speaker6.sub(r"The President \2: \3", para)

            # Only a confirmed speech start may replace the current speaker;
            # a speaker-like paragraph that fails the name check is ordinary
            # text and must not disturb the state used further down.
            new_speaker = None
            new_country = None
            if reg_speaker1.match(para):
                candidate = reg_speaker1.sub(r"\1\2", para).strip()
                if is_nameonly(candidate, speaker_thres):
                    new_speaker = candidate
                    if reg_speaker2.match(para):
                        new_country = reg_speaker2.sub(r"\3", para).strip()
                    else:
                        new_country = "n.a."
                    para = reg_speaker1.sub(r"\5", para).strip()
            elif reg_speaker3.match(para):
                candidate = reg_speaker3.sub(r"\1\2\3", para).strip()
                if is_nameonly(candidate, speaker_thres):
                    new_speaker = candidate
                    if reg_speaker4.match(para):
                        new_country = reg_speaker4.sub(r"\4", para).strip()
                    else:
                        new_country = "n.a."
                    para = reg_speaker3.sub(r"\6", para).strip()
            elif reg_speaker5.match(para):
                new_speaker = reg_speaker5.sub(r"\1", para).strip()
                if new_speaker.lower() == "the president":
                    new_speaker = "The President"
                    new_country = "n.a."
                elif new_speaker.lower() == "the chairman":
                    new_speaker = "The Chairman"
                    new_country = "n.a."
                elif new_speaker.lower() == "the acting president":
                    new_speaker = "The Acting President"
                    new_country = "n.a."
                else:
                    # Any other position label is attributed to the United Nations
                    new_country = "United Nations"
                para = reg_speaker5.sub(r"\3", para).strip()

            if new_speaker is not None:
                order += 1
                speaker = new_speaker
                speakers.append(new_speaker)
                countries.append(new_country)

                # Close the previous speech before starting the new one
                if order > 1:
                    speeches.append("\n".join(block for block in para_list if block))
                    procedural_flags.append(procedural)
                    para_list = []

                # Statements by the presiding officer are procedural by default
                procedural = new_speaker.lower() in presiding

        # Nothing is collected before the first speaker
        if order == 0:
            continue

        if reg_end.match(para):
            to_end -= 1
            if to_end == 0:
                speeches.append("\n".join(block for block in para_list if block))
                procedural_flags.append(procedural)
                para_list = []
                breakout = True
                break

        if (reg_start.match(para) or reg_agenda.match(para) or reg_end.match(para)
                or reg_sectitle.match(para) or reg_descript.match(para) or reg_vote.match(para)):
            continue

        if speaker.lower() in ("the president", "the chairman"):
            if reg_notprocedural.search(reg_space2.sub(' ', para.replace('\n', ' '))):
                procedural = False

        if remove_parentheses:
            para = reg_parentheses.sub("", para)

        if remove_quotes:
            para = reg_space2.sub(' ', reg_space1.sub(" ", reg_quote1.sub("", para)))
            para = reg_space2.sub(' ', reg_space1.sub(" ", reg_quote2.sub("", para)))

        para_list.append(reg_space3.sub(" ", para.strip()))

    # Close the last speech when the loop ended without an end marker
    if not breakout:
        speeches.append("\n".join(block for block in para_list if block))
        procedural_flags.append(procedural)

    if order == 0:
        print("no speaker", path)
        return None

    num_speakers = len(speakers)
    num_speeches = len(speeches)

    if num_speakers != num_speeches:
        print("speaker-speech discrepancies", num_speakers, num_speeches, path)
        return None

    # Number the non-empty speeches consecutively
    records = []
    counter = 1
    for name, country, speech, flag in zip(speakers, countries, speeches, procedural_flags):
        count = len(speech.split())
        if count == 0:
            continue
        records.append([counter, name, country, speech, count, flag])
        counter += 1

    return records

In [None]:
# Extract text from each file and process speaker/speech data
import os

import pandas as pd

# Options
save = True
speaker_thres = 5
remove_parentheses = True
remove_quotes = False

# Set the output directory for extracted speech texts
out_dir = r"PATH_TO_SPEECH_OUTPUT"  # Replace with your folder path for speech output files
tag_in = "_clean"
tag_out = "_extracted"
if save:
    os.makedirs(out_dir, exist_ok=True)

# List with additional information for Secretary General (start year and name),
# newest first so the first entry with start <= year is the one in office
SG_list = [(2017, 'Mr. Guterres'), (2007, 'Mr. Ban'), (1997, 'Mr. Annan'), (1992, 'Mr. Boutros-Ghali'),
          (1982, 'Mr. Perez de Cuellar'), (1972, 'Mr. Waldheim'), (1961, 'U Thant'),
          (1953, 'Mr. Hammarskjöld'), (1946, 'Mr. Lie')]

# Filter if needed
filt = [True] * df_cleaned.shape[0]
# Explicit copy: a column is added to df_select later, which must not be a
# chained assignment on a slice of df_cleaned
df_select = df_cleaned[filt].copy()
# Count the selected rows so the loop stays in range when the filter drops documents
num_docs = df_select.shape[0]


def _text_or_na(value):
    """Return ``value`` as text, or "n.a." when it is missing (NaN/None/empty)."""
    # Missing cells come back from read_csv as NaN floats, not as the string "nan"
    if pd.isna(value) or str(value) in ("", "nan"):
        return "n.a."
    return str(value)


speech_flags = []
data = []
for i in range(num_docs):
    if i % 100 == 0:
        print("Preprocessing Doc {} (out of {})...".format(i+1, num_docs))

    # Get meeting information
    series = df_select.iloc[i]
    record_id = series["record_id"]
    doc_name = series["doc_name"]
    meeting_num = series["meeting_num"]
    year = series["year"]
    month = series["month"]
    day = series["day"]
    topic = series["topic"]
    agenda = series["agenda"]
    pres_name = series["pres_name"]
    pres_country = series["pres_country"]
    path = series['path']

    # Extract speech records
    records = speech_extraction(path, speaker_thres=speaker_thres, remove_parentheses=remove_parentheses,
                                remove_quotes=remove_quotes)

    if records is None:
        speech_flags.append(False)
        continue

    for record in records:
        order, speaker, country, speech, count, procedural = record
        president = False
        secretary_general = False

        # format() also works when record_id was parsed as a number
        speech_id = "{}-{}".format(record_id, str(order).zfill(3))

        if speaker.lower() in ("the president", "the acting president"):
            # Replace the role label with the presiding person from the meetings data
            president = True
            speaker = _text_or_na(pres_name)
            country = _text_or_na(pres_country)
        elif speaker.lower().startswith("the secretary") and speaker.lower().endswith("general"):
            secretary_general = True
            for start, person in SG_list:
                if year >= start:
                    speaker = person
                    break
            country = "United Nations"
        elif speaker.lower() == "the chairman":
            president = True
            speaker = "Mr. N. J. O. MAKIN"
            country = "Australia"

        row = [speech_id, record_id, doc_name, meeting_num, year, month, day, topic, agenda, order, speaker, country, president, secretary_general, procedural, count, speech]
        data.append(row)
    speech_flags.append(True)

    if save:
        out_file = os.path.basename(path).replace(tag_in, tag_out)
        out_path = os.path.join(out_dir, out_file)
        # Write UTF-8 explicitly so non-ASCII names survive on every platform
        with open(out_path, "w", encoding="utf-8") as f:
            for record in records:
                line1 = "{}, {}, {}, {}\n".format(record[0], record[1], record[2], record[-1])
                line2 = record[3] + "\n\n"
                f.write(line1)
                f.write(line2)

print('{} records have been processed.'.format(len(speech_flags)))
print('{} speeches have been extracted.'.format(len(data)))

In [None]:
# Create a DataFrame from the extracted speech data
labels = ["speech_id", "record_id", "doc_name", "meeting_num", "year", "month", "day", "topic", "agenda",
          "order", "speaker", "country", "president", "secretary_general", "procedural", "count", "speech"]
df_speeches = pd.DataFrame(data, columns=labels)
df_speeches.head(20)

In [None]:
# Check the shape of the speeches DataFrame
df_speeches.shape

In [None]:
# Add a flag in the document list indicating availability of speech data.
# assign() returns a new frame, which avoids setting a column on a frame that
# was produced by filtering df_cleaned (chained-assignment warning).
df_select = df_select.assign(speech_data=speech_flags)
df_select.head(10)

In [None]:
# Check the shape of the updated document list
df_select.shape

In [None]:
# Count the number of transcripts with no speech data
(df_select.speech_data == False).sum()

In [None]:
# Save the new speech data
out_path = r"PATH_TO_OUTPUT_DF/speeches_new.tsv"  # Replace with your output path for new speeches
df_speeches.to_csv(out_path, sep='\t', na_rep='NULL')

In [None]:
# Save the updated document list
out_path = r"PATH_TO_OUTPUT_DF/UNSC_PV_cleaned.csv"
df_select.to_csv(out_path, na_rep='NULL')

### Reflecting Speech Data Availability in the Meetings Data

In [None]:
# Read the saved document list back from disk; the first CSV column is
# used as the index and "NULL" entries are parsed as missing values
import pandas as pd

in_path = r"PATH_TO_OUTPUT_DF/UNSC_PV_cleaned.csv"
df_cleaned = pd.read_csv(filepath_or_buffer=in_path, index_col=0, na_values="NULL")
# Preview the first 10 documents
df_cleaned.iloc[:10]

In [None]:
# Read the tab-separated meetings table; the first column is used as the
# index and "NULL" entries are parsed as missing values
in_path = r"PATH_TO_MEETINGS_FILE/meetings.tsv"
df_meetings = pd.read_csv(filepath_or_buffer=in_path, sep="\t", index_col=0, na_values="NULL")
# Preview the last 10 meetings
df_meetings.iloc[-10:]

In [None]:
# Update the meetings data with the speech_data flag from the document list.
#
# For every document in df_cleaned (except those listed in to_skip) the
# matching row of df_meetings is located by meeting_num and its "speeches"
# value is set to the document's speech_data flag. When several meetings share
# the same meeting_num, the match is narrowed down by comparing the "record"
# column with the document name (underscores replaced by slashes).
import numpy as np

# Documents that are excluded from the update
to_skip = ["S_PV.2977(PartI)-EN", "S_PV.2977(PartII)(closed)-EN", "S_PV.2977(PartII)(closed-resumption1)-EN",
           "S_PV.2977(PartII)(closed-resumption2)-EN", "S_PV.2977(PartII)(closed-resumption3)-EN",
           "S_PV.2977(PartII)(closed-resumption4)-EN", "S_PV.2977(PartII)(closed-resumption5)-EN", "S_PV.3160_RU"]

num_docs = df_cleaned.shape[0]
counter = 0  # number of meeting rows that received a flag

# Walk the three needed columns in lockstep instead of materialising a full
# row Series with .iloc for every document.
doc_fields = zip(df_cleaned["doc_name"], df_cleaned["meeting_num"], df_cleaned["speech_data"])
for i, (doc_name, meeting_num, speech_data) in enumerate(doc_fields):
    if i % 100 == 0:
        print("Processing document {}/{}".format(i + 1, num_docs))
    if doc_name in to_skip:
        continue

    # All meetings carrying this document's meeting number
    candidates = df_meetings[df_meetings["meeting_num"] == meeting_num]
    num_matches = candidates.shape[0]

    if num_matches == 0:
        print("no match", doc_name)
        continue

    if num_matches > 1:
        # Ambiguous meeting number: disambiguate via the record identifier
        print("multiple matches", doc_name)
        candidates = candidates[candidates["record"] == doc_name.replace("_", "/")]
        if candidates.empty:
            print("in the end, no match", doc_name)
            continue

    # Write the flag to the first remaining candidate, addressed by index label
    df_meetings.loc[candidates.index[0], "speeches"] = speech_data
    counter += 1
print("{} rows have been updated.".format(counter))

In [None]:
# Check the updated meetings data: preview the first 10 rows,
# including the "speeches" values written by the update loop
df_meetings.head(10)

In [None]:
# Total of the "speeches" flag column after the update (each True counts as 1)
df_meetings.speeches.sum()

In [None]:
# Order the meetings by record_id (ascending) and renumber the index from 0
df_meetings_sorted = df_meetings.sort_values(by="record_id").reset_index(drop=True)
# Preview the first 20 sorted meetings
df_meetings_sorted.iloc[:20]

In [None]:
# Preview the last 20 rows of the sorted meetings data
df_meetings_sorted.tail(20)

In [None]:
# Check the shape of the sorted meetings data: (number of rows, number of columns)
df_meetings_sorted.shape

In [None]:
# Write the sorted meetings table to a tab-separated file, with missing values
# stored as "NULL". The target path is the same string the meetings data was
# loaded from, so that file is replaced.
out_path = r"PATH_TO_MEETINGS_FILE/meetings.tsv"
df_meetings_sorted.to_csv(path_or_buf=out_path, na_rep="NULL", sep="\t")

### Merging Speech Data

In [None]:
# Read the previously existing speeches table (tab-separated); the first
# column is used as the index and "NULL" entries are parsed as missing values
import pandas as pd

# Replace with the old speeches file path
in_path = r"PATH_TO_OLD_SPEECHES/speeches_old.tsv"
df_speeches1 = pd.read_csv(filepath_or_buffer=in_path, sep="\t", index_col=0, na_values="NULL")
# Preview the first 10 old speeches
df_speeches1.iloc[:10]

In [None]:
# Check the shape of the old speech data: (number of rows, number of columns)
df_speeches1.shape

In [None]:
# Preview the last few rows of the old speech data
df_speeches1.tail()

In [None]:
# Read the newly extracted speeches (tab-separated); the first column is
# used as the index and "NULL" entries are parsed as missing values
in_path = r"PATH_TO_OUTPUT_DF/speeches_new.tsv"
df_speeches2 = pd.read_csv(filepath_or_buffer=in_path, sep="\t", index_col=0, na_values="NULL")
# Preview the first 10 new speeches
df_speeches2.iloc[:10]

In [None]:
# Check the shape of the new speech data: (number of rows, number of columns)
df_speeches2.shape

In [None]:
# Stack the old and new speeches into one table, order the rows by
# speech_id, then year, month and day, and renumber the index from 0
sort_keys = ["speech_id", "year", "month", "day"]
speeches_combined = (
    pd.concat([df_speeches1, df_speeches2], ignore_index=True)
    .sort_values(by=sort_keys)
    .reset_index(drop=True)
)
# Preview the first 10 combined speeches
speeches_combined.iloc[:10]

In [None]:
# Preview the last 10 rows of the combined speech data
speeches_combined.tail(10)

In [None]:
# Check the shape of the combined speeches data: (number of rows, number of columns);
# the row count is the sum of the old and new speech tables
speeches_combined.shape

In [None]:
# Write the merged speeches table to a tab-separated file;
# missing values are written out as the literal string "NULL"
out_path = r"PATH_TO_SPEECH_OUTPUT/speeches.tsv"
speeches_combined.to_csv(path_or_buf=out_path, na_rep="NULL", sep="\t")

In [None]:
# Number of distinct values in the country column of the combined speeches
distinct_countries = speeches_combined["country"].unique()
len(distinct_countries)