 ## Python code for extracting and preprocessing UNSC speech transcripts

### Procedures for speech extraction

In [None]:
# Parsing and cleaning UNSC speech records
import re

def clean_PV(path, form="pdf"):
    """
    Arguments:
    - path (string): path to an input plain text file (converted from either MS doc file or PDF)
    - form (string): format of the original file obtained from UN Document system ('doc': MS Word, 'pdf': PDF)
    
    Returns:
    - president (tupl): the counbcil's president name ('string') and country ('string')
    - agenda (string): meeting agenda
    - text (string): cleaned text
    - num_words (integer): # of words contained in the text
    - num_paras (integer): # of paragraphs contained in the text
    """
    
    f = open(path, 'r')
    lines = f.readlines()
    num_lines = len(lines)
    main = False
    search_president = True
    search_agenda = [True, False]
    agenda_para = 0
    pdf_correction1 = [True, False]
    text_to_move = []
    pdf_correction2 = False
    
    president = []
    agenda = ''
    processed = []
    
    reg0 = re.compile(r"^(adoption of the agenda|expression of.+|opening statement).*$")
    reg1 = re.compile(r"^.+(\.{1}|\?|\")$")
    reg2 = re.compile(r"^.+:.*$")
    reg3 = re.compile(r"[\t\r\n\f]")
    reg4 = re.compile(r"\s\s+")
    reg5 = re.compile(r"^(\d+|\d+/\d+.{0,1}|\d{2}/\d{2}/\d{4}|\d{2}-\d{5}.*|S/PV\.\s*\d{4})$")
    reg6 = re.compile(r"^(\d{4}\D{2}\smeeting|Security Council|(For|Fif|Six|Sev)\D+\syear|\d+\s\S+\s\d{4})$")
    reg7 = re.compile(r"^\(((mr|mrs|ms|dr|sir|lord).+,.+|the president|the secretary-general|the secretary general)\)$")
    reg8 = re.compile(r"^(mr|mrs|ms|dr|sir|lord|the president|the secretary-general|the secretary general).*:.*$")
    reg9 = re.compile(r"^(mr|mrs|ms|dr|sir|lord|the president|the secretary-general|the secretary general).*$")
    reg10 = re.compile(r"^(arabic|russian|chinese|french|spanish|english).+$")
    reg11 = re.compile(r"^\*(\s|\*)+$")
    
    for i, line in enumerate(lines):
        
        line = str(line).strip()
        
        if len(line) == 0:
            continue
        
        if not main:
            
            reg = re.compile(r"^the meeting (was called to order|resumed|was resumed|was suspended).+$")
            if reg.match(line.strip('\t').lower()):
                main= True
                continue
            
             if search_president:
                reg = re.compile(r"^(mr|mrs|ms|dr|sir|lord).+$")
                if reg.match(line.lower()):
                    reg = re.compile(r"\.(\s\.)+")
                    pres_name = reg.sub('', line).replace('later', '').strip()
                    president.append(pres_name)
                    continue
                reg = re.compile(r"^\(.+\)$")
                if reg.match(line):
                    pres_country = line.replace('(', '').replace(')', '').strip()
                    president.append(pres_country)
                    search_president = False
                    continue
            
            if search_agenda[0]:
                if line.lower() == 'agenda':
                    search_agenda[1] = True
                    continue
                if search_agenda[1]:
                    agenda = line
                    if agenda.isupper():
                        agenda = agenda.title()
                    search_agenda[0] = False
                    continue

            continue
        
        reg = re.compile(r"^the meeting (rose|was suspended).+$")
        if reg.match(line.lower()) or (i >= num_lines - 1):
            break
            
        if form == 'pdf':
            
            if pdf_correction1[0]:
                reg = re.compile(r"^the president.*:")
                if reg0.match(line.lower()) or reg.match(line.lower()):
                    pdf_correction1[0] = False
                elif (len(line) > 0) and (not pdf_correction1[1]):
                    pdf_correction1[1] = True
            
            if not pdf_correction2:
                if line == 'The Security Council will now':
                    pdf_correction2 = True
                    line_to_add = len(processed)
            else:
                reg = re.compile(r"^(begin|resume)$")
                if reg.match(line):
                    processed[line_to_add] = (processed[line_to_add] + ' ' + line)
                    continue
                elif line == 'its':
                    processed[line_to_add] = (processed[line_to_add] + ' ' + line)
                    pdf_correction2 = False
                    continue
            
            page_break = False
            if (reg5.match(line)) or (reg6.match(line)) or (reg7.match(line.lower())):
                page_break = True
            elif (agenda_para > 1) and (line.lower() in agenda.lower()):
                page_break = True
            elif (agenda_para == 0) and ('resumption' in path.lower()) and (line.lower() in agenda.lower()):
                page_break = True
            if page_break:
                if pdf_correction1[1] and (len(text_to_move) > 0):
                    processed.extend(text_to_move)
                    text_to_move = []
                    pdf_correction1[1] = False
                continue
        
        if search_agenda[0]:
            if 'the agenda was adopted' in line.lower():
                search_agenda[1] = True
            elif search_agenda[1]:
                agenda = line
                if agenda.isupper():
                    agenda = agenda.title()
                search_agenda[0] = False
        
        if line[0].isupper() and reg0.match(line.lower()):
            line = '--para--' + line + '--para--'
          
        else:
            
            if reg1.match(line):
                line = line + '--para--'
            
            if reg2.match(line.strip('--para--')):
                 if (line[0].isupper()) and (reg8.match(line.lower())):
                    line = '--para--' + line
                    if len(processed) > 0:
                        previous = processed[-1]
                        if not previous.endswith('\n'):
                            processed[-1] = previous + '\n'
                
                elif len(processed) > 0:
                    previous = processed[-1]
                    if reg9.match(previous.lower()):
                        processed[-1] = '\n' + previous
                        if len(processed) > 1:
                            preceding = processed[-2]
                            if not preceding.endswith('\n'):
                                processed[-2] = preceding + '\n'
        
        if agenda_para == 0:
            if 'adoption of the agenda' in line.lower():
                agenda_para += 1
        elif agenda_para == 1:
            if reg8.match(line.lower().strip('--para--')):
                agenda_para += 1
            elif not line.endswith('--para--'):
                if i + 1 < num_lines:
                    next_line = lines[i+1].strip()
                    if (len(next_line) > 0) and next_line[0].isupper():
                        line = line + '--para--'
                    elif len(next_line) == 0:
                        line = line + '--para--'
        
        if reg11.match(line):
            continue
        
        line = reg4.sub(' ', reg3.sub(' ', line))
        line = line.strip().replace('--para--', '\n')
        if len(line) == 0:
            continue
        
        if (form == 'pdf') and pdf_correction1[0]:
            text_to_move.append(line)
            continue
        
        processed.append(line)
    
    text = ' '.join(processed).replace('\n ', '\n').replace(' \n', '\n').strip()
    num_words = len(text.split())
    num_paras = len(text.split('\n\n'))
    return president, agenda, text, num_words, num_paras

In [None]:
# Extraction of speech texts and other information from a single cleaned meeting record
import re

def speech_extraction(path, president_as_delegate=True, remove_quotes=True, speaker_thres=5):
    """
    Split a cleaned meeting record into individual speeches.

    The record is split on blank lines; a paragraph counts as a speech when
    it starts with a title ("Mr", "Ms", ..., "The President",
    "The Secretary-General") and contains a colon. Text before the first
    colon is the speaker heading, text after it is the speech.

    Arguments:
    - path (string): path to an input text file that contains a cleaned speech record, i.e., output of clean_PV()
    - president_as_delegate (boolean): if True, a statement by "The President" is kept only when it
      contains one of the phrases "in my capacity as", "in my national capacity" or
      "resume my functions as"; all other presidential statements are discarded
    - remove_quotes (boolean): if True, all in-speech quotes are removed
    - speaker_thres (integer): a threshold number of tokens for detecting a speaker's name; a heading
      longer than this is not treated as a speaker, and the paragraph is appended to the previous speech

    Returns:
    - records (2d list): each row consists of information on a single speech, which includes:
      - speaker (string): the speaker's name (title-cased)
      - country (string): the country represented by the speaker ('n.a.' if unavailable)
      - speech (string): speech text
      - count (integer): word count of the speech
    """

    with open(path, 'r') as f:
        text = f.read()
    paras = text.split('\n\n')

    speaker_turn = re.compile(r"^(mr|mrs|ms|dr|sir|lord|the president|the secretary-general|the secretary general).*:.*")
    national_capacity = re.compile(r"(in my capacity as|in my national capacity|resume my functions as)")
    curly_quote = re.compile(r"“[^“\n]+”")
    curly_quote_open = re.compile(r"“[^”\n]+\n")      # unclosed quote running to the end of the line
    spaced_dash = re.compile(r"\s—\s")
    multi_space = re.compile(r"\s\s+")
    any_space = re.compile(r"\s")
    straight_quote = re.compile(r"\"[^\"\n]+\"")
    straight_quote_open = re.compile(r"\"[^\"\n]+\n")  # unclosed quote running to the end of the line

    # One [speaker, country, speech] entry per paragraph; "NULL" marks a
    # paragraph that is not a retained speech.
    entries = []

    for para in paras:
        para = para.strip()
        speaker = "NULL"
        country = "NULL"
        speech = "NULL"

        if speaker_turn.match(para.lower()):
            head, _, rest = para.partition(':')
            speech = rest.strip()

            if remove_quotes:
                # First pass removes curly quotes, second pass straight quotes.
                speech = curly_quote_open.sub('', curly_quote.sub('', speech))
                speech = multi_space.sub(' ', spaced_dash.sub(' ', speech))
                speech = straight_quote_open.sub('', straight_quote.sub('', speech))
                speech = multi_space.sub(' ', spaced_dash.sub(' ', speech))

            part = head.split('(')[0].strip()
            if len(part.split()) > speaker_thres:
                # Heading too long to be a name: treat the paragraph as a
                # continuation of the previous speech, if there is one.
                if entries and entries[-1][2] != "NULL":
                    entries[-1][2] = entries[-1][2] + ' ' + para
                continue

            speaker = part.strip()
            country = 'n.a.'
            if ('(' in head) and (')' in head):
                country = head.split('(')[1].split(')')[0].strip()

            if speaker.lower() in ('mr. president', 'ms. president'):
                speaker = 'The President'

            if speaker.lower() == 'the president' and president_as_delegate:
                flattened = multi_space.sub(' ', speech.replace('\n', ' ').lower())
                if not national_capacity.search(flattened):
                    speaker = "NULL"
                    country = "NULL"
                    speech = "NULL"

        entries.append([speaker, country, speech])

    records = []
    for raw_speaker, country, speech in entries:
        if raw_speaker.upper() == "NULL":
            continue
        speaker = raw_speaker.title()
        if speaker.lower() == 'the president':
            country = 'n.a.'
        elif country.lower().startswith(('spoke', 'interpretation')):
            # The parenthesis held a language note rather than a country.
            country = 'n.a.'
        count = len(multi_space.sub(' ', any_space.sub(' ', speech)).split())
        if count == 0:
            continue
        records.append([speaker, country, speech, count])

    return records

### Procedures for text preprocessing
Input  
speeches (list): a list of extracted speeches  
Output  
corpus_final (list): a list of bags of preprocessed POS-tagged tokens

In [None]:
# part-of-speech (POS) tagging
import nltk
from nltk import tokenize

# Tokenize each speech and attach a POS tag to every token,
# reporting progress once every 1000 documents.
tagged_speeches = []
for i, speech in enumerate(speeches):
    if not i % 1000:
        print('processing doc %i...' % i)
    words = tokenize.word_tokenize(speech)
    tagged_speeches.append(nltk.pos_tag(words))
print('%i docs have been tagged.' % len(tagged_speeches))

In [None]:
# Standard text-preprocessing steps:
# bag-of-words conversion, lower-case conversion, stopword removal...
import re
from nltk import tokenize
from nltk.corpus import stopwords
stops = stopwords.words('english')
# Set copy of the stopword list so the per-token membership test is O(1).
stop_set = set(stops)
# Matches every character that is not a letter, digit, underscore or hyphen.
reg = re.compile(r'[^a-zA-Z_0-9\-]')

# For each tagged speech: lower-case every token, drop disallowed characters,
# and keep the (token, POS) pair unless the token is now empty or a stopword.
tagged_corpus = []
for i, tagged in enumerate(tagged_speeches):
    if i % 5000 == 0:
        print("processing doc %i..." % i)
    processed = []
    for token, pos in tagged:
        token = reg.sub('', token.lower())
        if token and token not in stop_set:
            processed.append((token, pos))
    tagged_corpus.append(processed)
print("%i docs have been processed." % len(tagged_corpus))

In [None]:
# Spelling conversion from British to American spelling

# file paths of the spelling lists
path1 = 'supplements/british.txt'
path2 = 'supplements/american.txt'

# Each file holds one spelling per line; the two lists are paired line by line.
with open(path1, 'r') as f:
    british = f.read().replace(' ', '').split('\n')
with open(path2, 'r') as f:
    american = f.read().replace(' ', '').split('\n')
spell_dict = dict(zip(british, american))

# Replace every token that has a British spelling entry with its American
# counterpart. The lookup goes through the dict (O(1)) rather than scanning
# the `british` list for every token; a British entry without an American
# counterpart (lists of unequal length) leaves the token unchanged.
tagged_corpus2 = []
count = 0
for i, tagged in enumerate(tagged_corpus):
    if i % 5000 == 0:
        print("processing doc %i..." % i)
    processed = []
    for token, pos in tagged:
        converted = spell_dict.get(token)
        if converted is None:
            processed.append((token, pos))
        else:
            processed.append((converted, pos))
            count += 1
    tagged_corpus2.append(processed)
print("%i docs have been processed." % len(tagged_corpus2))
print("%i words have been converted." % count)

In [None]:
# Combining each word and its POS tag with a colon
# Turn every document into one space-separated string of "word:POS" tokens.
tagged_corpus3 = []
for i, tagged in enumerate(tagged_corpus2):
    if not i % 5000:
        print("processing doc %i..." % i)
    doc = ' '.join(':'.join((word, pos)) for word, pos in tagged)
    tagged_corpus3.append(doc)
print("%i docs have been processed." % len(tagged_corpus3))

In [None]:
# Starting phrase detection...
from gensim.models.word2vec import LineSentence
from gensim.models.phrases import Phrases, Phraser

# Split each "word:POS" document back into a list of unigram tokens.
corpus_uni = [doc.strip().split(" ") for doc in tagged_corpus3]
print("%i docs have been converted." % len(corpus_uni))

In [None]:
# First iteration: generating bigrams
# Phrase-detection settings passed to Phrases as min_count and threshold.
min_count = 1000
thres = 5.0

# Learn bigram statistics on the unigram corpus, then apply the model to it.
phrases_bi = Phrases(corpus_uni, min_count=min_count, threshold=thres)
bigram = Phraser(phrases_bi)
transformed_bi = bigram[corpus_uni]

# Materialise the transformed corpus as a list of token lists.
corpus_bi = list(transformed_bi)
len(corpus_bi)

In [None]:
# Second iteration: generating trigrams
# Phrase-detection settings passed to Phrases as min_count and threshold.
min_count = 1000
thres = 5.0

# Run phrase detection again on the bigram corpus so that a bigram can merge
# with a neighbouring token.
phrases_tri = Phrases(corpus_bi, min_count=min_count, threshold=thres)
trigram = Phraser(phrases_tri)
transformed_tri = trigram[corpus_bi]

# Materialise the final corpus as a list of token lists.
corpus_final = list(transformed_tri)
len(corpus_final)