<a href="https://colab.research.google.com/github/wannli/notebooks/blob/main/Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [60]:
# on boot
!pip install -q pdfminer.six
!pip install -q pygsheets
import pygsheets
# !pip install -U spacy[cuda]
import pandas as pd
import numpy
import os, random, fnmatch
import pdfminer
import spacy
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from pdfminer.high_level import extract_text
from io import StringIO
import re
import seaborn as sns
import fnmatch,os
import functools, random, sys, time
from tqdm.notebook import tqdm

from google.colab import drive
drive.mount('/content/drive')

%load_ext google.colab.data_table

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
The google.colab.data_table extension is already loaded. To reload it, use:
  %reload_ext google.colab.data_table


# Text extraction model

## Define text extraction functions

In [56]:
# Function that converts PDFs to text files
# These margins tell the parser how the PDF file is structured, i.e. how to identify characters, words, and lines
# The only tweaks that need to be made in this function, if at all, are to these margins. 
# See pdfminer.six documentation here: https://pdfminersix.readthedocs.io/_/downloads/en/latest/pdf/

def convert_pdf_to_txt(path):
    """Extract the text of the PDF at ``path`` and return it as one string.

    Every page is fed through a pdfminer ``TextConverter`` that writes into an
    in-memory buffer. The layout margins below are the only values that
    should need tuning for differently laid-out PDFs.
    """
    manager = PDFResourceManager()
    converter = None
    try:
        with StringIO() as buffer:
            with open(path, 'rb') as pdf_handle:
                layout = pdfminer.layout.LAParams(line_margin=3.0, word_margin=3.0, char_margin=30.0)
                converter = TextConverter(manager, buffer, codec='utf-8', laparams=layout)
                page_interpreter = PDFPageInterpreter(manager, converter)
                for pdf_page in PDFPage.get_pages(pdf_handle):
                    page_interpreter.process_page(pdf_page)
                # Read the buffer before the with-block closes it
                extracted = buffer.getvalue()
    finally:
        # Close the converter even when a page fails to parse
        if converter:
            converter.close()
    return extracted

# Function that removes boilerplate text that is not part of the meeting. Is a sub-function of the function clean()
def remove_boilerplate(text):
    """Return ``text`` with the fixed disclaimer lines of a record removed.

    Each fragment below is deleted wherever it occurs, in the listed order;
    afterwards one pass turns every pair of spaces into a single space.
    """
    fragments = (
        "This record contains the text of speeches delivered in English and of the translation of speeches",
        "delivered in other languages. Corrections should be submitted to the original languages only.",
        "They should be incorporated in a copy of the record and sent under the signature of a member",
        "of the delegation concerned to the Chief of the Verbatim Reporting Service, room U-0506",
        "(verbatimrecords@un.org). Corrected records will be reissued electronically on the Official",
        "Document System of the United Nations (http://documents.un.org).",
        "This record contains the text of speeches delivered in English and of the interpretation of",
        "speeches delivered in the other languages. Corrections should be submitted to the original",
        "languages only. They should be incorporated in a copy of the record and sent under the signature",
        "of a member of the delegation concerned to the Chief of the Verbatim Reporting Service, room",
        "U-506. Corrections will be issued after the end of the session in a consolidated corrigendum.",
        "In accordance with decision 74/562, and without setting a precedent for mandated high-level",
        "meetings planned for future high-level weeks, the official records of the General Assembly",
        "will be supplemented by annexes containing pre-recorded statements submitted by Heads",
        "of State or other dignitaries, submitted to the President no later than the day on which such",
        "statements are delivered in the Assembly Hall. Submissions in this regard should be made to",
        "estatements@un.org.",
        "C-154A. Corrections will be issued after the end of the session in a consolidated corrigendum.",
        "asdf",
    )
    for fragment in fragments:
        text = text.replace(fragment, "")

    # Single pass only: a run of four spaces becomes two, not one
    return text.replace("  ", " ")

# Function that searches a line of text for any of the regular expressions that trigger a new paragraph. Is a sub-function of the function clean()
def search_using_regex(line,regex_tuple):
    """Return the match of the first pattern in ``regex_tuple`` found in ``line``.

    Patterns are tried in order with ``re.search``; ``None`` is returned when
    none of them matches (or when ``regex_tuple`` is empty).
    """
    for pattern in regex_tuple:
        found = re.search(pattern, line)
        if found:
            return found
    return None

# Function that reformats a line of text to be part of a paragraph, by parsing for full stops:".". Is a sub-function of the function clean()
# Some cases, like "Mr." or "St." or "V." (someone's initials) or "a.m." are not the end of a sentence. These bad line breaks are removed as long as they are listed in the tuple outliers
def split_by_sentence(line):
    """Put each sentence of ``line`` on its own line and return the result.

    A sentence end is a full stop, a full stop followed by a closing quote or
    closing bracket, or a question mark, followed by one or two spaces. Breaks
    placed after abbreviations listed in ``outliers`` ("Mr.", initials,
    "a.m.", ...) are undone again.

    Fixes relative to the previous version:
    * an empty or whitespace-only ``line`` no longer raises IndexError;
    * the bracket rule was written as '.\\] ' (a literal backslash), so it
      never matched real text; it now matches '.] ';
    * the two-space variants are applied before the one-space ones, otherwise
      they can never match and a stray space is left at the start of the
      next sentence;
    * the initials pattern used the range ``A-z`` (which also covers
      punctuation such as ``_`` and ``]``) instead of ``A-Z``.
    """
    line = line.replace("\n\n", "")

    # Longer (two-space) separators first so they are not half-consumed by
    # the one-space rule.
    sentence_breaks = (
        (".  ", ".\n"),
        ('.”  ', '.”\n'),
        ('.]  ', '.]\n'),
        ("?  ", "?\n"),
        (". ", ".\n"),
        ('.” ', '.”\n'),
        ('.] ', '.]\n'),
        ("? ", "?\n"),
    )
    for separator, replacement in sentence_breaks:
        line = line.replace(separator, replacement)

    # A line ending directly in "." or "?" (no trailing space) still needs its break
    stripped = line.lstrip()
    if stripped and stripped[-1] in (".", "?"):
        line = line + "\n"

    # Abbreviations after which a break is not a sentence end. The entries are
    # used as regular expressions, so an unescaped "." matches any character.
    outliers = (r"([A-Z][a-zA-Z]?\.)", "Mrs.", r", para\.", "p.m.", "a.m.", "Messrs.", "Mmes.")
    bad_line_breaks = []
    for pattern in outliers:
        bad_line_breaks += re.findall(pattern, line)
    for abbreviation in bad_line_breaks:
        line = line.replace(abbreviation + "\n", abbreviation + " ")
    return line

# Function that cleans up the raw text and organizes it into paragraphs
def clean_pv(text):
    """Clean the raw text of a meeting record and regroup it into paragraphs.

    The work is done in two passes over the lines of ``text``:

    1. After ``remove_boilerplate`` has run, every kept line is prefixed with
       a double newline if it starts with one of the "breaker" phrases or
       patterns defined below (a new paragraph), otherwise with a single
       newline. Lines that contain no letter and no full stop and are not a
       bare 1-3 digit number are dropped.
    2. The marked lines are concatenated. Lines up to and including the one
       starting with "The meeting was called to order at" each stay on their
       own line; after that, lines are merged into paragraphs, and for some
       speakers (see ``split_sentences``) each sentence is put on its own
       line via ``split_by_sentence``.

    Returns the cleaned text as a single string.
    """
    text = remove_boilerplate(text)     
    
    # This section of the function parses the text line by line to mark where paragraphs begin and end
    cleaned_text = ""
    cleaned_list = []
    line_list = text.split("\n")
    #print(line_list)
    # record_name is taken from the first line that matches the pattern below
    # (a capital letter, later "/NN/", ending in a digit). It stays "" when no line matches.
    record_name = ""
    for line in line_list:
        if re.search(r"[A-Z](.*)?\/[0-9][0-9]\/.*[0-9]$",line):
            # NOTE(review): if the matching line contains no "A", find() returns -1 and only the last character is kept
            record_name = line[line.find("A"):].strip()
            break
    # The lists of breakers below tell the parser to trigger a new paragraph when found
    # breakers holds plain prefixes checked with str.startswith; breakers2-5 are regular expressions
    # that must match at position 0 of the line
    breakers = ("The meeting rose at", "Agenda item", "Annex", "In the absence of the President,", 
                "A pre-recorded video statement was shown", "The meeting was suspended", "Address by"
               )
    breakers2 = ("The President:", "The Temporary President:", "The Acting President:", "The Chair:",
                 r"The President \(",r"The Acting President \(",r"The Chair \(", "A recorded vote", 
                 "In favour:", "Against:", "Abstaining:", r"\[Subsequently", "Number of ballot papers:", 
                 "Number of ballot papers", "Number of invalid ballots:", "Number of valid ballots:", 
                 "Abstentions:", "Number of members present and voting:","Number of members voting:", 
                 "Required majority:", "Required absolute majority:","Number of votes obtained:",
                 "Having obtained the required majority", "Required simple majority:", "It was so decided.",
                 "A vote was taken by secret ballot"
                )
    breakers3 = (r"Mr\.((.*\()|(.*:))",r"Ms\.((.*\()|(.*:))",r"Mrs\.((.*\()|(.*:))",r"Monsignor((.*\()|(.*:))",
                 r"The (Deputy )?Secretary( |-)General((.*\()|(:))", r"Prince((.*\()|(.*:))", r"King((.*\()|(.*:))", 
                 r"Queen((.*\()|(.*:))", r"Princess((.*\()|(.*:))", r"President((.*\()|(.*:))", 
                 r"Prime Minister((.*\()|(.*:))", r"Judge((.*\()|(.*:))"
                )
    breakers4 = (r"\([a-z]\) [A-Z]", r"\([a-z][a-z]\) ", r"Group [A-Z] —")
    # NOTE(review): in the three "oral amendment|decision" patterns the "|" splits the whole pattern, i.e. they match
    # either "The (draft )oral amendment" or "decision ... was adopted/rejected/withdrawn" - confirm this is intended
    breakers5 = (r"(The )?[dD]raft resolution(.*)?was adopted", r"(The )?[dD]raft resolution(.*)?was rejected", r"(The )?[dD]raft resolution(.*)?was withdrawn",
                 r"(The )?[dD]raft decision(.*)?was adopted", r"(The )?[dD]raft decision(.*)?was rejected", r"(The )?[dD]raft decision(.*)?was withdrawn",
                 r"(The )?([dD]raft )?amendment(.*)?was adopted", r"(The )?([dD]raft )?amendment(.*)?was rejected", r"(The )?([dD]raft )?amendment(.*)?was withdrawn",
                 r"(The )?([dD]raft )?motion(.*)?was adopted", r"(The )?([dD]raft )?motion(.*)?was rejected", r"(The )?([dD]raft )?motion(.*)?was withdrawn", 
                 r"The ([dD]raft )?oral amendment|decision(.*)?was adopted", r"The ([dD]raft )?oral amendment|decision(.*)?was rejected", 
                 r"The ([dD]raft )?oral amendment|decision(.*)?was withdrawn", r"The.*paragraph(s)?.*(was|were) (not )?(retained|included)",
                 r"The.*paragraph(s)?",r"The draft (resolution|amendment|decision) as a whole"
                )
    titles = ("Mr.", "Ms.", "Mrs.")
    # A line is kept only if it contains a letter or a full stop, or consists solely of a 1-3 digit number
    strip_keys = (r"[a-zA-Z]",r"\.",r"^[0-9][0-9]?[0-9]?$")
    for line in line_list:
        line = line.lstrip()
        line = line.rstrip()
        line = line.replace(record_name,"")
        if line.strip():
            if search_using_regex(line, strip_keys):                                # The first three if statements remove blank lines, headers, footers, etc.
                search_breakers2 = search_using_regex(line,breakers2)
                search_breakers3 = search_using_regex(line,breakers3)
                search_breakers4 = search_using_regex(line,breakers4)
                search_breakers5 = search_using_regex(line,breakers5)
                if line.startswith(breakers):                             # New paragraph if a line of text contains certain phrases like "Agenda item..."
                    line = "\n\n" + line 
                elif search_breakers2 and search_breakers2.start() == 0:  # New paragraph if a line of text is the start of the presiding officer's statements   
                    line = "\n\n" + line
                elif search_breakers3 and search_breakers3.start() == 0:  # New paragraph if a line of text is part of a delegate's statement
                    line = "\n\n" + line   
                elif search_breakers4 and search_breakers4.start() == 0:  # New paragraph if a line of text is an agenda sub-item like "(a)" or "(b)"                    
                    line = "\n\n" + line 
                elif search_breakers5 and search_breakers5.start() == 0:  # New paragraph if a line of text is an agenda document or a vote result on a draft resolution/amendment/decision                
                    line = "\n\n" + line 
                else:                                                     # A line of text belongs to the middle or end of a paragraph
                    line = "\n" + line 
                cleaned_list.append(line)
    
    # This section of the function actually splits up the text by paragraph based on the markers placed in the previous section
    meeting_called = False                                                        # False until the meeting is called to order. The meeting info text at the start of a meeting record needs to be formatted differently than the rest of the document 
    split_sentences = False                                                       # False if a statement needs all sentences to be lumped into one line. True if each sentence of a statement is a new line
    meeting_segment = False                                                       # False if a line is not part of an agenda item heading. True if it is
    bad_break_alert = False                                                       # This checks to see whether the words in breakers3 like "Mr./Ms./Mrs." are used incidentally or to signal that a new speaker is making a statement
    #print(cleaned_list)
    for line in cleaned_list:                                                     
        if meeting_called:
            search_breakers2 = search_using_regex(line.lstrip(),breakers2)
            search_breakers3 = search_using_regex(line.lstrip(),breakers3)
            search_breakers4 = search_using_regex(line.lstrip(),breakers4)
            search_breakers5 = search_using_regex(line.lstrip(),breakers5)
            # The previous line matched breakers3 but had no ":"; this line decides the mode:
            # a ":" here switches sentence splitting off, otherwise it stays on
            if bad_break_alert and split_sentences:
                if line.find(":") == -1:
                    split_sentences = True
                else:
                    split_sentences = False
                bad_break_alert = False
            if line.lstrip().startswith(breakers):                                # These if statements all correspond to the ones above that denote new paragraphs. These ones implement the paragraph splitting
                if line.lstrip().startswith("Agenda item"):
                    meeting_segment = True
                line = line.replace("\n"," ")
                line = "\n\n" + line.lstrip() + " "
                split_sentences = False
            elif search_breakers2 and search_breakers2.start() == 0:              # Presiding Officers have their statements split into a new line for each sentence
                meeting_segment = False
                line = line.replace("\n"," ")
                line = "\n\n" + line.lstrip() + " "
                split_sentences = True
            elif search_breakers3 and search_breakers3.start() == 0:              # Delegates have their statements lumped into one line
                meeting_segment = False
                # While splitting sentences, a title inside a list of names ("), M", ") and M") continues the current statement
                if split_sentences and (line.find("), M") != -1 or line.find(") and M") != -1 or line.find("), \n") != -1):
                    line = line.replace("\n"," ") + " "
                    line = split_by_sentence(line) 
                else:
                    line = line.replace("\n"," ")
                    line = "\n\n" + line.lstrip() + " "
                if line.find(":") == -1:
                    bad_break_alert = True
                else:
                    split_sentences = False
            elif search_breakers4 and search_breakers4.start() == 0:
                # Sub-items directly under an "Agenda item" heading stay on the heading's line
                if meeting_segment:
                    line = line.replace("\n"," ") + " "
                else:
                    line = line.replace("\n"," ")
                    line = "\n\n" + line.lstrip() + " "
                split_sentences = False
            elif search_breakers5 and search_breakers5.start() == 0:
                # Vote results only start a new paragraph while sentence splitting is on
                if split_sentences:
                    line = line.replace("\n"," ")
                    line = "\n\n" + line.lstrip() + " "
                else:
                    line = line.replace("\n"," ") + " "
            else:
                line = line.replace("\n"," ") + " "
                if split_sentences:
                    line = split_by_sentence(line) 
            if (line.find("(Department for General") > -1                # DGACM reps who speak at the meeting have their statements split by sentence
                and line.lstrip().startswith(titles)
               ): 
                bad_break_alert = False
                split_sentences = True
            cleaned_text = cleaned_text + line.lstrip(" ")
        else:
            # Meeting information before the call to order: one source line per output line
            if line.lstrip().startswith("In the absence of the"):
                line = line.replace("\n","") + " "
            else:
                line = line.replace("\n","")
                line = line + "\n"
            line = line.replace("took\n","took ")
            line = line.replace("took the\n","took the ")
            line = re.sub(r"[A-Z]\/[0-9][0-9]\/.*[0-9]$","",line)
            cleaned_text = cleaned_text + line  
        if line.lstrip().startswith("The meeting was called to order at"):        
            meeting_called = True
    # Collapse the runs of newlines used as paragraph markers, rejoin lines broken after "and",
    # and close up the space left after a hyphen
    cleaned_text = cleaned_text.replace("\n\n\n\n","\n\n")
    cleaned_text = cleaned_text.replace("\n\n\n","\n\n")
    cleaned_text = cleaned_text.replace("\n\n","\n")
    cleaned_text = cleaned_text.replace("and \n","and ") 
    cleaned_text = cleaned_text.replace("- ","-")
    return cleaned_text 

## Source PDFs and extract the text

- Choose the source in the form on the right: either an entire session or a random sample of records.

In [None]:
#@title { form-width: "20%" }
mode = "random" #@param ["random", "session"]
random_size = 10 #@param {type:"integer"}
session =  71#@param {type:"integer"}

p = '/content/drive/MyDrive/Intergov data project/pdf'

# Select the PDFs to convert: every record of one session, or a random sample.
if mode == "session":
  PVs = fnmatch.filter(os.listdir(p),'A?'+str(session)+'*.pdf')
else:
  # Sample without replacement so the same record is not converted twice,
  # and only from PDF files; the sample shrinks if fewer files are available.
  available_PVs = fnmatch.filter(os.listdir(p), '*.pdf')
  PVs = random.sample(available_PVs, min(max(random_size, 0), len(available_PVs)))

for selected_PV in tqdm(PVs):
  # Extract the PDF text, clean it and write it to /content/<record name>.txt
  text = convert_pdf_to_txt(p+'/'+selected_PV)
  path = '/content/'+selected_PV
  with open(os.path.splitext(path)[0] + ".txt", "w", encoding='utf-8') as file:
    file.write(clean_pv(text))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




## One-time conversion of all PDFs

In [None]:
#@title { form-width: "20%" }
run = False #@param {type:"boolean"}

# Guarded by the form checkbox above: converts every file in the Drive pdf
# folder and stores the cleaned text next to it in the txt folder.
if run:
  p = '/content/drive/MyDrive/Intergov data project/pdf/'
  q = '/content/drive/MyDrive/Intergov data project/txt/'
  PVs = os.listdir(p)

  for selected_PV in tqdm(PVs):
    text = convert_pdf_to_txt(f"{p}{selected_PV}")
    path = f"{q}{selected_PV}"
    # Swap the four-character extension of the source name for ".txt"
    with open(f"{path[:-4]}.txt", "w", encoding='utf-8') as file:
      file.write(clean_pv(text))

# Text classification model

## Import training data

In [None]:
# Labelled examples, published as CSV from a Google Sheet
reviews=pd.read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vSe_6IqJaZagNg34cE2cidkjnYQKJ12qGHU7EmlyTWCJIn-FdgP4FNDuaV1232n2XJZrSPK1TJUs6_Y/pub?gid=0&single=true&output=csv")
# reviews.dropna(inplace=True)
# One (text, label) pair per row, kept both as a column and as a plain list
reviews['tuples'] = list(zip(reviews['text'], reviews['train']))
train = reviews['tuples'].tolist()

## Build the model

In [None]:
# Start from the small English pipeline
nlp=spacy.load("en_core_web_sm")

# Append the built-in text categorizer as the last pipeline component
textcat_config = {"exclusive_classes": True, "architecture": "simple_cnn"}
textcat=nlp.create_pipe("textcat", config=textcat_config)
nlp.add_pipe(textcat, last=True)
nlp.pipe_names

# Register every label from the published label sheet with the categorizer
labels_=pd.read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vQ_JfnrLsFfoj9LzZZegxzJdVBHGD3iWR3MwQglCLXDgm-pUB5Zv1oi5yx09brBD48Y74rRObGx-JmC/pub?gid=0&single=true&output=csv").dropna()
pST = labels_['Labels'].tolist()
for label_name in pST:
  textcat.add_label(label_name)

## Prepare training data

In [None]:
import random

def load_data(limit=0, split=0.8):
    """Shuffle the labelled examples and split them into train and dev parts.

    Reads the module-level ``train`` list of ``(text, label)`` pairs and the
    module-level ``pST`` list of all labels.

    Args:
        limit: if non-zero, only the last ``limit`` shuffled examples are
            used. The default 0 uses all examples.
        split: fraction of the examples that goes into the training part.

    Returns:
        ``((train_texts, train_cats), (dev_texts, dev_cats))`` where the
        texts are tuples of strings and the cats are lists of dicts mapping
        every label in ``pST`` to True (the example's label) or False.
        Both parts are empty when there are no examples.
    """
    # Shuffle a copy so the shared ``train`` list is not reordered as a side effect
    examples = list(train)
    random.shuffle(examples)
    if limit:
        examples = examples[-limit:]
    if not examples:
        return ((), []), ((), [])

    texts, labels = zip(*examples)

    # pST is mutually exclusive: exactly the example's own label is True
    cats = [
        {candidate: bool(label == candidate) for candidate in pST}
        for label in labels
    ]

    # Splitting the training and evaluation data
    cutoff = int(len(examples) * split)
    return (texts[:cutoff], cats[:cutoff]), (texts[cutoff:], cats[cutoff:])

# Shuffle the labelled examples and split them into training and evaluation sets
(train_texts, train_cats), (dev_texts, dev_cats) = load_data()

# Pair each training text with its annotation dict in the {'cats': ...} form
train_data = [(text, {'cats': cats}) for text, cats in zip(train_texts, train_cats)]

## Set functions for precision, recall and f-score

In [None]:
def evaluate(tokenizer, textcat, texts, cats):
    """Score ``textcat`` on ``texts`` against the gold categories ``cats``.

    Each text is tokenized and piped through ``textcat``; every predicted
    label that also appears in the gold dict of that text (except
    "NEGATIVE") is counted as a true/false positive/negative using 0.5 as
    the threshold on both the score and the gold value.

    Returns a dict with precision ("textcat_p"), recall ("textcat_r") and
    F-score ("textcat_f").
    """
    threshold = 0.5
    # The tiny offsets on fp/fn keep the divisions below from dividing by zero
    counts = {"tp": 0.0, "fp": 1e-8, "fn": 1e-8, "tn": 0.0}

    docs = (tokenizer(text) for text in texts)
    for index, doc in enumerate(textcat.pipe(docs)):
        gold = cats[index]
        for label, score in doc.cats.items():
            if label not in gold or label == "NEGATIVE":
                continue
            predicted_yes = score >= threshold
            predicted_no = score < threshold
            gold_yes = gold[label] >= threshold
            gold_no = gold[label] < threshold
            if predicted_yes and gold_yes:
                counts["tp"] += 1.0
            elif predicted_yes and gold_no:
                counts["fp"] += 1.0
            elif predicted_no and gold_no:
                counts["tn"] += 1
            elif predicted_no and gold_yes:
                counts["fn"] += 1

    precision = counts["tp"] / (counts["tp"] + counts["fp"])
    recall = counts["tp"] / (counts["tp"] + counts["fn"])
    if (precision + recall) == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}


# Number of training iterations (passes over the training data)
n_iter = 10

## Train the model

In [None]:
from spacy.util import minibatch, compounding

# Train only the text categorizer: every other pipeline component is disabled meanwhile
other_pipes = [name for name in nlp.pipe_names if name != 'textcat']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training() # initialize the model weights randomly

    for i in tqdm(range(n_iter)):
        losses = {}
        # Batch size compounds from 4 towards 32 over the course of an iteration
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)

        # Score the dev set with the optimizer's averaged parameters
        with textcat.model.use_params(optimizer.averages):
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)

        # One header row, then one row of loss / precision / recall / F-score per iteration
        if i == 0:
            print('\t'.join(f'{heading:^5}' for heading in ('LOSS', 'P', 'R', 'F')))
        print(f"{losses['textcat']:.3f}\t{scores['textcat_p']:.3f}\t{scores['textcat_r']:.3f}\t{scores['textcat_f']:.3f}")

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

LOSS 	  P  	  R  	  F  
10.894	0.899	0.794	0.844
4.319	0.920	0.901	0.910
2.223	0.910	0.893	0.901
1.669	0.907	0.896	0.902
1.571	0.904	0.904	0.904
1.213	0.897	0.906	0.902
1.003	0.894	0.898	0.896
1.203	0.894	0.896	0.895
1.113	0.891	0.893	0.892
1.133	0.889	0.893	0.891



## Apply the model

In [None]:
#@title { form-width: "20%" }
mode = "local" #@param ["local", "random", "session"]
random_size = 10 #@param {type:"integer"}
session =  71#@param {type:"integer"}

q = '/content/drive/MyDrive/Intergov data project/txt/'

# Choose the directory and the text files to classify.
if mode == "local":

  path = "/content/"
  files = os.listdir(path)

elif mode == "random":

  path = q
  # Sample distinct text files; the sample shrinks if fewer files are available
  available_files = fnmatch.filter(os.listdir(q), '*.txt')
  files = random.sample(available_files, min(max(random_size, 0), len(available_files)))

else: # mode == "session"

  path = q
  # The txt folder holds ".txt" files, so filtering on "*.pdf" here selected nothing
  files = fnmatch.filter(os.listdir(q),'A?'+str(session)+'*.txt')

l = []
j = 1  # NOTE(review): not used in this cell; kept in case a later cell reads it

for file in tqdm(files): # we iterate through all files
  if file.endswith(".txt"):
    # Read from the directory selected above (previously always /content/),
    # using the encoding the text files were written with
    with open(os.path.join(path, file), 'r', encoding='utf-8') as f:
      c = f.read()
    lines = c.splitlines()
    for i, line in enumerate(lines): # and predict the label for each sentence
      doc=nlp(line)
      stats = doc.cats
      prediction=max(stats, key=stats.get)
      prob=max(stats.values())
      l.append({"file": file, "line": i, "prob": prob, "pred": prediction, "text": line, "noise": doc.cats})

# store as df
df = pd.DataFrame(l)


In [None]:
# We extract second-highest probability and the difference between both probabilities for further analysis

import math
import numpy as np

o = df.round(2)

# Expand the per-line {label: score} dicts into one column per label
o_noise = o['noise'].apply(pd.Series).round(2)
o_noise.replace(0, np.nan, inplace=True)

json = o_noise.to_dict('records',)
# Was "json_clean = json_2nd []", which is a syntax error and stopped the cell from running
json_clean = []

for r in json:
  # Drop labels with no score, a score of exactly 1, or a score below 0.11
  delete = [k for k in r if math.isnan(r[k]) or r[k] == 1 or r[k] < 0.11]
  for k in delete:
    del r[k]
  # Highest score first
  r_sorted = dict(sorted(r.items(),key=lambda item: item[1],reverse=True))
  json_clean.append(r_sorted)

o['noise'] = json_clean # update 'noise' with clean prediction list

# Second-highest remaining score per line, or 0 when fewer than two labels remain
second = []
for r in o['noise']:
  if len(r) > 1:
    second.append(sorted(r.values())[-2])
  else:
    second.append(0)

o['prob_2nd'] = second # include second-highest probability
o['prob_diff'] = o['prob']-o['prob_2nd'] # include difference between highest and second-highest probability
o['prob_diff'] = o['prob_diff'].round(2)

df = o

# Named entity recognition model

## Build the rule-based model

In [None]:
# Colab runs on Spacy 2.x. That's OK.

import spacy
import pandas as pd
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.language import Language

# Reload the small English pipeline; its tokenizer is customised below
nlp = spacy.load("en_core_web_sm")

# Modify tokenizer infix patterns
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, HYPHENS
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex

# Infix rules added on top of the ellipsis and icon lists
custom_infixes = [
    # arithmetic operator between digits
    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
    # full stop between a lower-case letter/quote and an upper-case letter/quote
    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
    ),
    # comma between two letters
    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
    # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS) # to avoid splitting on hyphens
    # r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), # to avoid splitting UN doc symbols, replaced with below
    r"(?<=[{a}0-9])[:<>=](?=[{a}])".format(a=ALPHA)
]
infixes = LIST_ELLIPSES + LIST_ICONS + custom_infixes

# Install the compiled infix pattern on the pipeline's tokenizer
infix_re = compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_re.finditer

### Long list of patterns

In [None]:
patterns = [
             {"label": "Entitled", "pattern": [
                                        {"ORTH" : "entitled", "OP": "?"},
                                        {"TAG" : "``"},
                                        {"OP": "?"},
                                        {"OP": "?"},
                                        {"OP": "?"},
                                        {"OP": "?"},
                                        {"OP": "?"},
                                        {"OP": "?"},
                                        {"OP": "?"},
                                        {"OP": "?"},
                                        {"OP": "?"},
                                        {"OP": "?"},
                                        {"OP": "?"},
                                        {"OP": "?"},
                                        {"OP": "?"},
                                        {"OP": "?"},
                                        {"OP": "?"},
                                        {"OP": "?"},
                                        {"OP": "?"},
                                        {"OP": "?"},
                                        {"OP": "?"},
                                        {"TAG" : "''"}
                                        ]},
            {"label": "Vote", "pattern": [
                                        {"TEXT" : {"REGEX": "Abstaining|Against"}},
                                        {"TEXT" : ":"}
                                        ]},
            {"label": "Vote", "pattern": [
                                        {"TEXT" : "In"},
                                        {"TEXT" : "favour"},
                                        {"TEXT" : ":"}
                                        ]},
            {"label": "Vote_outcome", "pattern": [
                                        {"TEXT" : {"REGEX": "[rejected|adopted|retained]"}, "OP": "?"},
                                        {"TEXT" : "by", "OP": "?"},
                                        {"LIKE_NUM" : True},
                                        {"TEXT" : "votes"},
                                        {"TEXT" : "to"},
                                        {"LIKE_NUM" : True},
                                        {"OP": "?"},
                                        {"TEXT" : "with", "OP": "?"},
                                        {"LIKE_NUM" : True, "OP": "?"},
                                        {"TEXT" : "abstentions", "OP": "?"}
                                        ]},
            {"label": "Vote_outcome", "pattern": [
                                        {"TEXT" : {"REGEX": "[rejected|adopted|retained]"}, "OP": "?"},
                                        {"TEXT" : "by", "OP": "?"},
                                        {"LIKE_NUM" : True},
                                        {"TEXT" : "votes"},
                                        {"TEXT" : "to"},
                                        {"TEXT" : "none"},
                                        {"OP": "?"},
                                        {"TEXT" : "with", "OP": "?"},
                                        {"LIKE_NUM" : True, "OP": "?"},
                                        {"TEXT" : "abstentions", "OP": "?"}
                                        ]},
            {"label": "RPT", "pattern": [
                                       {"TEXT" : "Report"},
                                       {"TEXT" : "of"},
                                       {"TEXT" : "the"},
                                       {"TEXT" : {"REGEX": "First|Second|Third|Fourth|Fifth|Sixth|Credentials"}},
                                       {"TEXT" : "Committee"},
                                       {"TAG": "-LRB-"},
                                       {},
                                       {"TAG": "-RRB-"}
                                       ]},
            {"label": "RPT", "pattern": [
                                       {"TEXT" : {"REGEX": "[Rr]eport"}},
                                       {"TEXT" : {"REGEX": "of"}},
                                       {"TEXT" : "the"},
                                       {"TEXT" : "Secretary-General"},
                                       {"TAG": "-LRB-"},
                                       {},
                                       {"OP": "?"},
                                       {"OP": "?"},
                                       {"TAG": "-RRB-"}
                                       ]},
            {"label": "RPT", "pattern": [
                                       {"TEXT" : {"REGEX": "[Nn]ote"}},
                                       {"TEXT" : {"REGEX": "by"}},
                                       {"TEXT" : "the"},
                                       {"TEXT" : "Secretary-General"},
                                       {"TAG": "-LRB-"},
                                       {},
                                       {"OP": "?"},
                                       {"OP": "?"},
                                       {"TAG": "-RRB-"}
                                       ]},
            {"label": "C", "pattern": [
                                       {"TEXT" : {"REGEX": "First|Second|Third|Fourth|Fifth|Sixth|Credentials"}},
                                       {"TEXT" : "Committee"}
                                       ]},
            {"label": "PARA", "pattern": [
                                        {"TEXT" : {"REGEX": "[Pp][ara.|aragraph]"}},
                                        {"LIKE_NUM" : True},
                                        {"TEXT" : "to", "OP":"?"},
                                        {"LIKE_NUM" : True, "OP":"?"}                                        
                                        ]},
            {"label": "PARA", "pattern": [
                                        {"TEXT" : {"REGEX": "[Oo]perative"}},
                                        {"TEXT" : {"REGEX": "para.|paragraph"}},
                                        {"LIKE_NUM" : True}
                                        ]},
            {"label": "PARA", "pattern": [
                                        {},
                                        {"TEXT" : "preambular"},
                                        {"TEXT": "paragraph"}
                                        ]},
            {"label": "AI", "pattern": [
                                        {"TEXT" : {"REGEX": "[Aa]genda"}},
                                        {"TEXT" : {"REGEX": "item|items"}},
                                        {"TEXT" : {"REGEX": r"[0-9]{1,3}"}},
                                        {"TEXT" : "and", "OP": "?"},
                                        {"TEXT" : {"REGEX": r"^[0-9]{1,3}$"}, "OP": "?"}
                                        ]},
            {"label": "ASI", "pattern": [
                                        {"TEXT" : {"REGEX": "[Ss]ub-item"}, "OP": "?"},
                                        {"TAG": "-LRB-"}, # ( { [
                                        {"TEXT" : {"REGEX": r"^[a-z]{1,2}$"}},
                                        {"TAG": "-RRB-"},
                                        {"TEXT": "of", "OP": "?"},
                                        {"TEXT": "agenda", "OP": "?"},
                                        {"TEXT": "item", "OP": "?"},
                                        {"TEXT" : {"REGEX": r"^[0-9]{1,3}$"}, "OP": "?"}                                         
                                        ]},
            {"label": "AS", "pattern": [
                                        {"TAG": "-LRB-"},
                                        {"TEXT" : "continued"},
                                        {"TAG": "-RRB-"}
                                        ]},
            {"label": "DR", "pattern": [
                                        {"TEXT" : {"REGEX": "[Dd]raft"}},
                                        {"TEXT" : {"REGEX": "resolution"}},
                                        {"TEXT" : {"REGEX":r"^A/[0-9]{1,2}/L\.[0-9]{1,3}/"}}
                                        ]},
            {"label": "DR", "pattern": [
                                        {"TEXT" : {"REGEX": "[Dd]raft"}},
                                        {"TEXT" : {"REGEX": "resolution"}},
                                        {"TEXT" : {"REGEX":r"^A/[0-9]{1,2}/L\.[0-9]{1,3}"}}
                                        ]},
            {"label": "DD", "pattern": [
                                        {"TAG" : "CD"},
                                        {"TEXT" : "draft"},
                                        {"TEXT" : "decisions"}
                                        ]},
            {"label": "DR", "pattern": [
                                        {"TAG" : "CD"},
                                        {"TEXT" : "draft"},
                                        {"TEXT" : "resolutions"}
                                        ]},
            # DR: "draft resolution", optionally followed by a Roman/Arabic numeral
            # and an optional "to <numeral>" range. The numeral class is [IVX0-9];
            # commas are deliberately not part of it, otherwise a "," token after
            # "draft resolution" would be pulled into the entity.
            {"label": "DR", "pattern": [
                                        {"TEXT" : {"REGEX": "[Dd]raft"}},
                                        {"TEXT" : {"REGEX": "resolution"}},
                                        {"TEXT" : {"REGEX": r"^[IVX0-9]{1,7}$"}, "OP": "?"},
                                        {"TEXT" : "to", "OP": "?"},
                                        {"TEXT" : {"REGEX": r"^[IVX0-9]{1,7}$"}, "OP": "?"}
                                        ]},
            # DA: "draft amendment", optionally followed by a Roman/Arabic numeral.
            # The numeral class excludes "," so trailing punctuation is not absorbed.
            {"label": "DA", "pattern": [
                                        {"TEXT" : {"REGEX": "[Dd]raft"}},
                                        {"TEXT" : {"REGEX": "amendment"}},
                                        {"TEXT" : {"REGEX": r"^[IVX0-9]{1,7}$"}, "OP": "?"}
                                        ]},
            # DOD: "draft oral decision", optionally followed by a Roman/Arabic numeral.
            # The numeral regex is anchored and comma-free so that only a token made
            # entirely of I/V/X/digits extends the entity (not "," or words such as "In").
            {"label": "DOD", "pattern": [
                                        {"TEXT" : {"REGEX": "[Dd]raft"}},
                                        {"TEXT" : {"REGEX": "oral"}},
                                        {"TEXT" : {"REGEX": "decision"}},
                                        {"TEXT" : {"REGEX": r"^[IVX0-9]{1,7}$"}, "OP": "?"}
                                        ]},
            # R: "resolution" followed by a token made entirely of Roman-numeral
            # letters and/or digits (e.g. "resolution II", "resolution 1514").
            # Anchored and comma-free: "resolution ," or "resolution In" must not be
            # labelled. Session/number symbols such as "75/1" are covered by the
            # separate "[0-9]{1,2}/[0-9]{1,3}" R pattern.
            {"label": "R", "pattern": [
                                       {"TEXT" : {"REGEX": "[Rr]esolution"}},
                                       {"TEXT" : {"REGEX": r"^[IVX0-9]{1,7}$"}}
                                       ]},
            {"label": "R", "pattern": [
                                       {"TEXT" : {"REGEX": "[Rr]esolution"}},
                                       {"TEXT" : {"REGEX": r"[0-9]{1,2}/[0-9]{1,3}"}}
                                       ]},
            # D: "decision" followed by a token made entirely of Roman-numeral letters
            # and/or digits. Anchored and comma-free so that "decision ," or
            # "decision In" is not labelled. Symbols such as "75/1" are covered by the
            # separate "[0-9]{1,2}/[0-9]{1,3}" D pattern.
            {"label": "D", "pattern": [
                                       {"TEXT" : {"REGEX": "[Dd]ecision"}},
                                       {"TEXT" : {"REGEX": r"^[IVX0-9]{1,7}$"}}
                                       ]},
            {"label": "D", "pattern": [
                                       {"TEXT" : {"REGEX": "[Dd]ecision"}},
                                       {"TEXT" : {"REGEX": r"[0-9]{1,2}/[0-9]{1,3}"}}
                                       ]},
            # DD: "draft decision", optionally followed by a Roman/Arabic numeral and
            # an optional "to <numeral>" range. Both numerals use the same anchored,
            # comma-free class so punctuation and ordinary words are not absorbed.
            # "draft decision A/<session>/L.<n>" is handled by the dedicated DD
            # patterns that match the document symbol.
            {"label": "DD", "pattern": [
                                        {"TEXT" : {"REGEX": "[Dd]raft"}},
                                        {"TEXT" : {"REGEX": "decision"}},
                                        {"TEXT" : {"REGEX": r"^[IVX0-9]{1,7}$"}, "OP": "?"},
                                        {"TEXT" : "to", "OP": "?"},
                                        {"TEXT" : {"REGEX": r"^[IVX0-9]{1,7}$"}, "OP": "?"}
                                        ]},
            {"label": "DD", "pattern": [
                                        {"TEXT" : {"REGEX": "[Dd]raft"}},
                                        {"TEXT" : {"REGEX": "decision"}},
                                        {"TEXT" : {"REGEX":r"A/[0-9]{1,2}/L\.[0-9]{1,3}"}}
                                        ]},
            {"label": "DD", "pattern": [
                                        {"TEXT" : {"REGEX": "[Dd]raft"}},
                                        {"TEXT" : {"REGEX": "decision"}},
                                        {"TEXT" : {"REGEX":r"A/[0-9]{1,2}/L\.[0-9]{1,3}/" }}
                                        ]},
            {"label": "DR_MOD", "pattern": [{"TEXT" : "as", "OP": "?"},{"TEXT" : "orally"},{}]},
            {"label": "DR_MOD", "pattern": [{"TEXT" : "as"},{"TEXT" : "a"},{"TEXT" : "whole"}]},
            {"label": "Symbol", "pattern": [{"TEXT": "Document", "OP":"?"},{"TEXT" : {"REGEX":r"^A/[0-9]{1,2}/[0-9]{1,3}/" }}]},
            {"label": "Symbol", "pattern": [{"TEXT": "Document", "OP":"?"},{"TEXT" : {"REGEX":r"^A/[0-9]{1,2}/[0-9]{1,3}"}}]},
            {"label": "L", "pattern": [{"TEXT": "Document", "OP":"?"},{"TEXT" : {"REGEX":r"^A/[0-9]{1,2}/L\.[0-9]{1,3}/" }}]},
            {"label": "L", "pattern": [{"TEXT": "Document", "OP":"?"},{"TEXT" : {"REGEX":r"^A/[0-9]{1,2}/L\.[0-9]{1,3}"}}]},
            {"label": "Symbol", "pattern": [{"TEXT": "Document", "OP":"?"},{"TEXT" : {"REGEX":r"^A/(AC|C)\.[0-9]{1,3}/[0-9]{1,3}"}}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Afghanistan"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Albania"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Algeria"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Andorra"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Angola"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Antigua"},{"TEXT":"and"},{"TEXT":"Barbuda"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Argentina"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Armenia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Australia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Austria"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Azerbaijan"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Bahamas"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Bahrain"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Bangladesh"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Barbados"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Belarus"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Belgium"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Belize"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Benin"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Bhutan"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Bolivia"},{"TAG":"-LRB-"},{"TEXT":"Plurinational"},{"TEXT":"State"},{"TEXT":"of"},{"TAG":"-RRB-"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Bosnia"},{"TEXT":"and"},{"TEXT":"Herzegovina"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Botswana"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Brazil"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Brunei"},{"TEXT":"Darussalam"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Bulgaria"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Burkina"},{"TEXT":"Faso"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Burundi"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Cabo"},{"TEXT":"Verde"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Cambodia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Cameroon"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Canada"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Central"},{"TEXT":"African"},{"TEXT":"Republic"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Chad"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Chile"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"China"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Colombia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Comoros"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Congo"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Cook"},{"TEXT":"Islands"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Costa"},{"TEXT":"Rica"}]},
            # Accept both the typographic (’) and the straight (') apostrophe.
            {"label": "UNPE", "pattern": [{"TEXT":"Côte"},{"TEXT": {"REGEX": r"d['’]Ivoire"}}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Croatia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Cuba"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Cyprus"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Czechia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Czech"},{"TEXT":"Republic"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Democratic"},{"TEXT":"People"},{},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Korea"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Democratic"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"the"},{"TEXT":"Congo"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Denmark"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Djibouti"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Dominica"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Dominican"},{"TEXT":"Republic"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Ecuador"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Egypt"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"El"},{"TEXT":"Salvador"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Equatorial"},{"TEXT":"Guinea"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Eritrea"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Estonia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Eswatini"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Ethiopia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Fiji"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Finland"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"France"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Gabon"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Gambia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Georgia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Germany"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Ghana"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Greece"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Grenada"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Guatemala"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Guinea-Bissau"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Guinea"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Guyana"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Haiti"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Holy"},{"TEXT":"See"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Honduras"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Hungary"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Iceland"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"India"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Indonesia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Iran"},{"TAG":"-LRB-"},{"TEXT":"Islamic"},{"TEXT":"Republic"},{"TEXT":"of"},{"TAG":"-RRB-"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Iraq"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Ireland"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Israel"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Italy"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Jamaica"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Japan"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Jordan"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Kazakhstan"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Kenya"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Kiribati"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Kuwait"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Kyrgyzstan"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Lao"},{"TEXT":"People"},{},{"TEXT":"Democratic"},{"TEXT":"Republic"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Latvia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Lebanon"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Lesotho"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Liberia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Libya"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Liechtenstein"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Lithuania"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Luxembourg"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Madagascar"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Malawi"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Malaysia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Maldives"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Mali"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Malta"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Marshall"},{"TEXT":"Islands"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Mauritania"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Mauritius"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Mexico"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Micronesia"},{"TAG":"-LRB-"},{"TEXT":"Federated"},{"TEXT":"States"},{"TEXT":"of"},{"TAG":"-RRB-"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Monaco"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Mongolia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Montenegro"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Morocco"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Mozambique"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Myanmar"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Namibia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Nauru"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Nepal"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Netherlands"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"New"},{"TEXT":"Zealand"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Nicaragua"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Niger"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Nigeria"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Niue"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"North"},{"TEXT":"Macedonia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Norway"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Oman"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Pakistan"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Palau"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Panama"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Papua"},{"TEXT":"New"},{"TEXT":"Guinea"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Paraguay"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Peru"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Philippines"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Poland"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Portugal"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Qatar"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Korea"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Moldova"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Romania"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Russian"},{"TEXT":"Federation"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Rwanda"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Saint"},{"TEXT":"Kitts"},{"TEXT":"and"},{"TEXT":"Nevis"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Saint"},{"TEXT":"Lucia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Saint"},{"TEXT":"Vincent"},{"TEXT":"and"},{"TEXT":"the"},{"TEXT":"Grenadines"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Samoa"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"San"},{"TEXT":"Marino"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Sao"},{"TEXT":"Tome"},{"TEXT":"and"},{"TEXT":"Principe"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Saudi"},{"TEXT":"Arabia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Senegal"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Serbia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Seychelles"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Sierra"},{"TEXT":"Leone"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Singapore"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Slovakia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Slovenia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Solomon"},{"TEXT":"Islands"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Somalia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"South"},{"TEXT":"Africa"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"South"},{"TEXT":"Sudan"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Spain"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Sri"},{"TEXT":"Lanka"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"State"},{"TEXT":"of"},{"TEXT":"Palestine"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Sudan"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Suriname"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Sweden"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Switzerland"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Syrian"},{"TEXT":"Arab"},{"TEXT":"Republic"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Tajikistan"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Thailand"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Arab"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Egypt"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Argentine"},{"TEXT":"Republic"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Bolivarian"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Venezuela"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Central"},{"TEXT":"African"},{"TEXT":"Republic"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Co-operative"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Guyana"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Commonwealth"},{"TEXT":"of"},{"TEXT":"Dominica"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Commonwealth"},{"TEXT":"of"},{"TEXT":"the"},{"TEXT":"Bahamas"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Cook"},{"TEXT":"Islands"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Czech"},{"TEXT":"Republic"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Democratic"},{"TEXT":"People"},{},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Korea"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Democratic"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Sao"},{"TEXT":"Tome"},{"TEXT":"and"},{"TEXT":"Principe"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Democratic"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"the"},{"TEXT":"Congo"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Democratic"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Timor-Leste"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Democratic"},{"TEXT":"Socialist"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Sri"},{"TEXT":"Lanka"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Dominican"},{"TEXT":"Republic"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Eastern"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Uruguay"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Federal"},{"TEXT":"Democratic"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Ethiopia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Federal"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Germany"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Federal"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Nigeria"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Federal"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Somalia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Federated"},{"TEXT":"States"},{"TEXT":"of"},{"TEXT":"Micronesia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Federative"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Brazil"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"former"},{"TEXT":"Yugoslav"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Macedonia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"French"},{"TEXT":"Republic"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Gabonese"},{"TEXT":"Republic"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Grand"},{"TEXT":"Duchy"},{"TEXT":"of"},{"TEXT":"Luxembourg"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Hashemite"},{"TEXT":"Kingdom"},{"TEXT":"of"},{"TEXT":"Jordan"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Hellenic"},{"TEXT":"Republic"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Holy"},{"TEXT":"See"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Independent"},{"TEXT":"State"},{"TEXT":"of"},{"TEXT":"Papua"},{"TEXT":"New"},{"TEXT":"Guinea"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Independent"},{"TEXT":"State"},{"TEXT":"of"},{"TEXT":"Samoa"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Islamic"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Afghanistan"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Islamic"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Iran"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Islamic"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Mauritania"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Islamic"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Pakistan"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Kingdom"},{"TEXT":"of"},{"TEXT":"Bahrain"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Kingdom"},{"TEXT":"of"},{"TEXT":"Belgium"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Kingdom"},{"TEXT":"of"},{"TEXT":"Bhutan"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Kingdom"},{"TEXT":"of"},{"TEXT":"Cambodia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Kingdom"},{"TEXT":"of"},{"TEXT":"Denmark"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Kingdom"},{"TEXT":"of"},{"TEXT":"Eswatini"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Kingdom"},{"TEXT":"of"},{"TEXT":"Lesotho"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Kingdom"},{"TEXT":"of"},{"TEXT":"Morocco"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Kingdom"},{"TEXT":"of"},{"TEXT":"Norway"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Kingdom"},{"TEXT":"of"},{"TEXT":"Saudi"},{"TEXT":"Arabia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Kingdom"},{"TEXT":"of"},{"TEXT":"Spain"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Kingdom"},{"TEXT":"of"},{"TEXT":"Sweden"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Kingdom"},{"TEXT":"of"},{"TEXT":"Thailand"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Kingdom"},{"TEXT":"of"},{"TEXT":"the"},{"TEXT":"Netherlands"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Kingdom"},{"TEXT":"of"},{"TEXT":"Tonga"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Kyrgyz"},{"TEXT":"Republic"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Lao"},{"TEXT":"People"},{},{"TEXT":"Democratic"},{"TEXT":"Republic"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Lebanese"},{"TEXT":"Republic"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"People"},{},{"TEXT":"Democratic"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Algeria"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"People"},{},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Bangladesh"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"People"},{},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"China"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Plurinational"},{"TEXT":"State"},{"TEXT":"of"},{"TEXT":"Bolivia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Portuguese"},{"TEXT":"Republic"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Principality"},{"TEXT":"of"},{"TEXT":"Andorra"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Principality"},{"TEXT":"of"},{"TEXT":"Liechtenstein"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Principality"},{"TEXT":"of"},{"TEXT":"Monaco"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Albania"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Angola"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Armenia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Austria"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Azerbaijan"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Belarus"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Benin"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Botswana"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Bulgaria"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Burundi"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Cabo"},{"TEXT":"Verde"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Cameroon"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Chad"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Chile"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Colombia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Costa"},{"TEXT":"Rica"}]},
            # Accept both the straight (') and the typographic (’) apostrophe.
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Côte"},{"TEXT": {"REGEX": r"^d['’]Ivoire$"}}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Croatia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Cuba"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Cyprus"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Djibouti"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Ecuador"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"El"},{"TEXT":"Salvador"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Equatorial"},{"TEXT":"Guinea"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Estonia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Fiji"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Finland"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Ghana"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Guatemala"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Guinea-Bissau"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Guinea"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Haiti"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Honduras"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Iceland"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"India"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Indonesia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Iraq"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Italy"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Kazakhstan"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Kenya"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Kiribati"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Korea"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Latvia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Liberia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Lithuania"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Madagascar"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Malawi"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Maldives"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Mali"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Malta"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Mauritius"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Moldova"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Mozambique"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Namibia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Nauru"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Nicaragua"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"North"},{"TEXT":"Macedonia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Palau"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Panama"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Paraguay"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Peru"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Poland"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Rwanda"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"San"},{"TEXT":"Marino"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Senegal"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Serbia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Seychelles"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Sierra"},{"TEXT":"Leone"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Singapore"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Slovenia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"South"},{"TEXT":"Africa"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"South"},{"TEXT":"Sudan"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Suriname"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Tajikistan"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"the"},{"TEXT":"Congo"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"the"},{"TEXT":"Gambia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"the"},{"TEXT":"Marshall"},{"TEXT":"Islands"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"the"},{"TEXT":"Niger"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"the"},{"TEXT":"Philippines"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"the"},{"TEXT":"Sudan"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"the"},{"TEXT":"Union"},{"TEXT":"of"},{"TEXT":"Myanmar"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Trinidad"},{"TEXT":"and"},{"TEXT":"Tobago"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Tunisia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Turkey"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Uganda"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Uzbekistan"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Vanuatu"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Yemen"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Zambia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Zimbabwe"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Russian"},{"TEXT":"Federation"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Slovak"},{"TEXT":"Republic"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Socialist"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Viet"},{"TEXT":"Nam"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"State"},{"TEXT":"of"},{"TEXT":"Eritrea"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"State"},{"TEXT":"of"},{"TEXT":"Israel"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"State"},{"TEXT":"of"},{"TEXT":"Kuwait"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"State"},{"TEXT":"of"},{"TEXT":"Libya"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"State"},{"TEXT":"of"},{"TEXT":"Palestine"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"State"},{"TEXT":"of"},{"TEXT":"Qatar"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Sultanate"},{"TEXT":"of"},{"TEXT":"Oman"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Swiss"},{"TEXT":"Confederation"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Syrian"},{"TEXT":"Arab"},{"TEXT":"Republic"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Togolese"},{"TEXT":"Republic"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"Union"},{"TEXT":"of"},{"TEXT":"the"},{"TEXT":"Comoros"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"United"},{"TEXT":"Arab"},{"TEXT":"Emirates"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"United"},{"TEXT":"Kingdom"},{"TEXT":"of"},{"TEXT":"Great"},{"TEXT":"Britain"},{"TEXT":"and"},{"TEXT":"Northern"},{"TEXT":"Ireland"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"United"},{"TEXT":"Mexican"},{"TEXT":"States"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"United"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Tanzania"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the"},{"TEXT":"United"},{"TEXT":"States"},{"TEXT":"of", "OP": "?"},{"TEXT":"America", "OP": "?"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Timor-Leste"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Togo"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Tonga"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Trinidad"},{"TEXT":"and"},{"TEXT":"Tobago"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Tunisia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Turkey"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Turkmenistan"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Tuvalu"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Uganda"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Ukraine"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"United"},{"TEXT":"Arab"},{"TEXT":"Emirates"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"United"},{"TEXT":"Kingdom"},{"TEXT":"of", "OP": "?"},{"TEXT":"Great", "OP": "?"},{"TEXT":"Britain", "OP": "?"},{"TEXT":"and", "OP": "?"},{"TEXT":"Northern", "OP": "?"},{"TEXT":"Ireland", "OP": "?"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"United"},{"TEXT":"Republic"},{"TEXT":"of"},{"TEXT":"Tanzania"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"United"},{"TEXT":"States"},{"TEXT":"of"},{"TEXT":"America"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Uruguay"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Uzbekistan"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Vanuatu"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Venezuela"},{"TAG":"-LRB-"},{"TEXT":"Bolivarian"},{"TEXT":"Republic"},{"TEXT":"of"},{"TAG":"-RRB-"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Viet"},{"TEXT":"Nam"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Yemen"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Zambia"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"Zimbabwe"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the", "OP": "?"},{"TEXT":"European"},{"TEXT":"Union"}]},
            {"label": "UNPE", "pattern": [{"TEXT":"the", "OP": "?"},{"TEXT":"Group"},{"TEXT":"and"},{"TEXT":"77"},{"TEXT":"and"},{"TEXT":"China"}]},
            {"label": "POO", "pattern": [{"TEXT":"point"},{"TEXT":"of"},{"TEXT":"order"}]},
            {"label": "MS", "pattern": [{"TEXT":"suspended"}]},
            {"label": "MS", "pattern": [{"TEXT":"resumed"}]},
            {"label": "MS", "pattern": [{"TEXT":"rose"}]},
            {"label": "MS", "pattern": [{"TEXT":"called"},{"TEXT":"to"},{"TEXT":"order"}]},
            {"label": "Rule", "pattern": [{"TEXT":"rule"},{"LIKE_NUM":True}]},
            {"label": "Entity_MOD", "pattern": [{"TEXT":"on"},{"TEXT":"behalf"},{"TEXT":"of"}]},
            {"label": "AS", "pattern": [{"TEXT":"concluded"},{"TEXT":"its"},{"TEXT":"consideration"}]},
            {"label": "AS", "pattern": [{"TEXT":"concluded"},{"TEXT":"this"},{"TEXT":"stage"},{"TEXT":"of"},{"TEXT":"its"},{"TEXT":"consideration"}]},
            {"label": "AS", "pattern": [{"TEXT":"conclude"},{"TEXT":"its"},{"TEXT":"consideration"}]},
            {"label": "AS", "pattern": [{"TEXT":"conclude"},{"TEXT":"this"},{"TEXT":"stage"},{"TEXT":"of"},{"TEXT":"its"},{"TEXT":"consideration"}]},
            {"label": "Vote", "pattern": [{"TEXT":"intended"},{"TEXT":"to"},{"TEXT":"vote"},{"TEXT":"in"},{"TEXT":"favour"}]},
            {"label": "AS", "pattern": [{"TEXT":"intended"},{"TEXT":"to"},{"TEXT":"vote"},{"TEXT":"against"}]},
            {"label": "AS", "pattern": [{"TEXT":"intended"},{"TEXT":"to"},{"TEXT":"abstain"}]},
            ]

### Continue

In [None]:
# Build a rule-based entity matcher on top of the loaded pipeline.
ruler = EntityRuler(nlp)
# Register it under the name "unNER", placed directly after the tagger.
nlp.add_pipe(ruler, name="unNER", after="tagger")
# Load the token patterns defined in the `patterns` list.
ruler.add_patterns(patterns)

# print(nlp.pipe_names)  # ['tagger', 'unNER', 'parser', 'ner']

## Apply the model

In [None]:
# Entity labels that are left out of the stored results.
skipped_labels = ('EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC',
                  'MONEY', 'NORP', 'ORG', 'PERCENT', 'PRODUCT',
                  'QUANTITY', 'WORK_OF_ART')

e = []                              # one list of (text, label) pairs per row of df
for t in tqdm(df['text']):          # push every text through the pipeline
  doc = nlp(t)
  # keep only the entities whose label is not in the skip list
  f = [(ent.text, ent.label_) for ent in doc.ents
       if ent.label_ not in skipped_labels]
  e.append(f)
df = df.assign(ent=e)               # attach the results as a new 'ent' column

# Data preparation for MPM

In [None]:
# Helpers that look ahead y rows from row x in df.

def pred(x,y=0,df=df):
    """Return the 'pred' value of the row labelled x + y in df.

    y is the look-ahead offset from row x (0 means row x itself). The df
    default is bound to the global df that exists when this def is executed.
    """
    row = x + y
    return df.loc[row, 'pred']

def text(x,y=0,df=df):
    """Return the 'text' value of the row labelled x + y in df.

    y is the look-ahead offset from row x (0 means row x itself). The df
    default is bound to the global df that exists when this def is executed.
    """
    row = x + y
    return df.loc[row, 'text']

def modu(x,y=0,col='text',df=df):
    """Return the value in column `col` of the row labelled x + y in df.

    y is the look-ahead offset from row x (0 means row x itself) and col is
    the column name as a string. The df default is bound to the global df
    that exists when this def is executed.
    """
    row = x + y
    return df.loc[row, col]

# counters

# Walk the labelled rows of df and print one summary line per meeting ("MT"),
# meeting segment ("MTS") or other proceeding ("P").
# Only v_i is read by the active code below; ms_i appears solely in
# commented-out code and mt_i is never used.
mt_i = ms_i = v_i = 0 # meeting, meeting segment and vote counters (NOT reset for x)

# Only row labels 0..399 are processed; the full-length bound is commented out.
for x in range(400): # len(df)):  
  i1 = i2 = i3 = i4 = i5 = i6 = i7 = i8 = i9 = 0 # search counters (reset for x); only i1..i6 are used below
  # Meeting code
  # On a row whose 'line' value is 0 (presumably the first line of a source
  # file -- confirm), find the offset from x to the nearest row carrying each
  # meeting-header label.
  # NOTE(review): these scans have no upper bound; if a label never occurs at
  # or below x the lookup runs past the last row (df.loc would presumably
  # raise KeyError -- confirm).
  if modu(x,0,'line') == 0:
    while pred(x,i1) != 'MeetingNumber': # search until hit MeetingNumber
      i1+=1
    while pred(x,i2) != 'MeetingDateTime': # same 
      i2+=1    
    while pred(x,i3) != 'MeetingLocation': # same
      i3+=1    
    while pred(x,i4) != 'PresidingOfficer': # same
      i4+=1    
    while pred(x,i5) != 'Session': # same
      i5+=1    

  # Meeting segment code
  # if ms_i > 0:
  #  ms_i -= 1
  #  continue # if this is a subsequent MeetingSegment
  # while pred(x,ms_i) == 'MeetingSegment': # lookdown to MeetingSegment
  #  ms_i+=1 

  # Skip the sentences with the following labels
  # (this also skips the MT print below when a line-0 row carries one of them)
  if pred(x) in ('Organization', 'Body', 'Session', 
                 'MeetingNumber', 'MeetingDateTime', 'MeetingLocation', 
                 'RecordType', 'PresidingOfficer', 
                 'General', 'Election', 'Introduction', 'TemporaryPresidingOfficer'):
    continue

  # For an ActionObject row, i6 becomes the offset to the next 'Action' row
  # (same unbounded forward scan as above).
  if pred(x) == 'ActionObject':
    while pred(x,i6) != 'Action':
      i6+=1

  if pred(x) == 'Action':
    continue # skip print because data copied to ActionObject

  # v_i carries over from earlier iterations: it is the number of consecutive
  # 'Vote' rows counted by the scan below when the last run was found. While
  # it is above 1, the current row is skipped and v_i is counted down.
  if v_i > 1:
    v_i -= 1
    continue # if this is a subsequent Vote

  # Reset so the scan below starts at row x itself.
  if v_i == 1:
    v_i = 0 # not so elegant
  
  # Count the consecutive 'Vote' rows starting at x (stays 0 if x is not a Vote).
  while pred(x,v_i) == 'Vote': # lookdown to Vote
    v_i+=1 

  # A line-0 row prints only the MT summary, because this `if` heads the
  # elif chain that follows.
  # NOTE(review): modu(x,i4,'ent')[0][0] takes the text of the first entity on
  # the PresidingOfficer row and would raise IndexError if that row has no
  # entities -- confirm this cannot happen.
  if modu(x,0,'line') == 0:
    # mte = modu(x,i1,'ent') + modu(x,i2,'ent') + modu(x,i3,'ent') + modu(x,i4,'ent') + modu(x,i5,'ent')
    print('MT      ', text(x,i1), text(x,i2), text(x,i3), modu(x,i4,'ent')[0][0], text(x,i5))
                                                          # select PERSON and UNPE from PresidingOfficer
  
  #elif pred(x) == 'MeetingSegment':                                  del
  #  msl = [text(x,y).strip() for y in range (0,ms_i)]                del
  #  mse = [modu(x,y,'ent') for y in range (0,ms_i)]                  del
  #  print('   MTS  ', x, 'MeetingSegmentG', mse, '. '.join(msl))     del
  #  ms_i-=1                                                          del

  elif pred(x) == 'MeetingSegment': #                                 thanks to Pranav this is now a lot more straightforward :-)
    print('   MTS  ', x, text(x), modu(x,0,'ent'))
  
  elif pred(x) == 'ActionObject': # here we merge ActionObject and Action, by looking forward to the next action for each ActionObject
    print('       P', x, 'Object+Action', modu(x,0,'ent')+modu(x,i6,'ent'), text(x), text(x,i6))
  elif pred(x) == 'Vote':
    # Gather the stripped text and the entities of the v_i consecutive Vote rows.
    vl = [text(x,y).strip() for y in range (0,v_i)]
    ve = [modu(x,y,'ent') for y in range (0,v_i)]
    print('       P', x, 'VoteRecord', ve, '. '.join(vl)) # here we merge in favor, against and abstentions into a single line
  else:
    # Any other label: print the row as-is with its label, entities and text.
    print('       P', x, pred(x), modu(x,0,'ent'), text(x))

MT       103rd plenary meeting Tuesday, 10 June 2008, 3 p.m. New York Kerim Sixty-second session
   MTS   1 Official Records []
       P 11 MeetingStatus [('called to order', 'MS'), ('3.05 p.m.', 'TIME')] The meeting was called to order at 3.05 p.m.
   MTS   14 Agenda item 44 (continued) Secretary-General for the comprehensive update on Implementation of the Declaration of Commitment national progress in implementing the 2001 on HIV/AIDS and the Political Declaration on Declaration of Commitment on HIV/AIDS and the 2006 Political Declaration (A/62/780). We would also like to thank the co-facilitators and the Joint United Report of the Secretary-General (A/62/780) Nations Programme on HIV/AIDS (UNAIDS) as the Note by the President of the General Assembly substantive secretariat for organizing this 2008 review. (A/62/CRP.1 and Corr.1) The report of the Secretary-General tells us that  [('Agenda item 44', 'AI'), ('(continued)', 'AS'), ('2001', 'DATE'), ('2006', 'DATE'), ('A/62/780', 'Symb

# Secondary

## Analysis and training

In [None]:
# Use seaborn's "whitegrid" theme for the plot below.
sns.set_theme(style="whitegrid")

# Box plot of the 'prob' column (x axis) for each value of the 'pred' column (y axis).
ax = sns.boxplot(y="pred", x="prob",data=df)
# Re-apply the current x tick labels; the result is bound to a name
# (presumably to keep it out of the cell output -- confirm).
ticks = ax.set_xticklabels(ax.get_xticklabels())
# Turn on the grid for the x axis.
grid = ax.xaxis.grid(True)

In [None]:
from google.colab import data_table
# Show df as a Colab data table: index column included, 10 rows per page.
data_table.DataTable(df,include_index=True, num_rows_per_page=10, max_rows=999999)

Unnamed: 0,file,line,prob,pred,text,noise,ent
0,A_62_PV-103-EN.txt,0,0.676788,MeetingSegment,United Nations,"{'Action': 0.0015051484806463122, 'ActionObjec...",[]
1,A_62_PV-103-EN.txt,1,0.489409,MeetingSegment,Official Records,"{'Action': 4.49472190666711e-06, 'ActionObject...",[]
2,A_62_PV-103-EN.txt,2,0.999501,Body,General Assembly,"{'Action': 1.8324375616884936e-07, 'ActionObje...",[]
3,A_62_PV-103-EN.txt,3,0.999985,Session,Sixty-second session,"{'Action': 5.885841005692782e-07, 'ActionObjec...",[]
4,A_62_PV-103-EN.txt,4,0.992600,MeetingNumber,103rd plenary meeting,"{'Action': 1.9521717575798903e-09, 'ActionObje...",[]
...,...,...,...,...,...,...,...
900,A_75_PV.7-EN.txt,117,1.000000,General,"Address by Mr. Laurentino Cortizo Cohen, Presi...","{'Action': 5.93354143596514e-10, 'ActionObject...","[(Laurentino Cortizo Cohen, PERSON), (the Repu..."
901,A_75_PV.7-EN.txt,118,0.996476,OralStatement,Annex XIV [Original: Mongolian; English transl...,"{'Action': 7.793380063958466e-06, 'ActionObjec...",[]
902,A_75_PV.7-EN.txt,119,1.000000,General,"Address by Mr. Battulga Khaltmaa, President of...","{'Action': 2.250736841347134e-08, 'ActionObjec...","[(Battulga Khaltmaa, PERSON), (Mongolia, UNPE)..."
903,A_75_PV.7-EN.txt,120,0.961701,Action,Annex XV,"{'Action': 0.9617013335227966, 'ActionObject':...",[]


In [None]:
# Hand-maintained selection of training examples: each entry pairs the row
# labels of df to export with the class label they are filed under.
# Empty braces are an empty dict rather than an empty set; both simply yield
# nothing when iterated, so unfilled entries are skipped.
s = [
     [{       }, 'Action'],
     [{  }, 'ActionObject'],
     [{      }, 'AgendaStatus'],
     [{         }, 'Clarification'],
     [{         }, 'Conclusion'],
     [{       }, 'Election'],
     [{ 1,178,237 }, 'General'],
     [{         }, 'Introduction'],
     [{     }, 'MeetingSegment'],
     [{         }, 'MeetingStatus'],
     [{    }, 'OralStatement'],
     [{    }, 'Sponsorship'],
     [{         }, 'Vote'],
     [{    }, 'VotingIntention'],
     
     ]  

# Open the first worksheet of the training spreadsheet with the service account.
gc = pygsheets.authorize(service_file='/content/drive/MyDrive/Intergov data project/drive.json')
sh = gc.open('Pipe - Training') 
sheet0 = sh[0]

# NOTE: nothing here may be named `text`. The notebook defines a text() helper
# for the MPM cell; rebinding that name to a list made re-running the MPM cell
# fail with "'list' object is not callable".

# Append one (label, text) row to the sheet per selected df row. Row labels are
# visited in sorted order so the order of appended rows is explicit.
for g in s:
  for i in sorted(g[0]):
    t = df.at[i, 'text']
    if t:  # skip rows whose text is empty
      sheet0.append_table(values=[g[1],t],dimension='ROWS',overwrite=False)

'''
Action, ActionObject, AgendaStatus, Body, Clarification, Conclusion, General, 
Introduction, MeetingDateTime, MeetingLocation, MeetingNumber, MeetingSegment
MeetingStatus, OralStatement, PresidingOfficer, RVRequested, RVTaken
Session, Sponsorship, TemporaryPresidingOfficer, Vote, VotingIntention
'''

'\nAction, ActionObject, AgendaStatus, Body, Clarification, Conclusion, General, \nIntroduction, MeetingDateTime, MeetingLocation, MeetingNumber, MeetingSegment\nMeetingStatus, OralStatement, PresidingOfficer, RVRequested, RVTaken\nSession, Sponsorship, TemporaryPresidingOfficer, Vote, VotingIntention\n'

## Storing predictions

In [None]:
# Authorise with the service account and open the first worksheet of the
# predictions spreadsheet.
gc = pygsheets.authorize(service_file='/content/drive/MyDrive/Intergov data project/drive.json')
sh = gc.open('Pipe - Predictions')
wks = sh[0]

# Google Sheets does not accept cells over 50k characters, so every column is
# converted to strings and cut off at that length.
df_ = df.astype(str)
df_ = df_.apply(lambda column: column.str[:50000])

# Write the frame into the sheet starting at cell (1, 1); per the original
# author's note this will crash if >10k.
wks.set_dataframe(df_, (1, 1))

In [None]:
import pandas as pd

# Location of the pickled frame on the mounted Drive.
pickle_path = '/content/drive/MyDrive/Intergov data project/w1.pkl'

# Write df to the pickle file ...
with open(pickle_path, 'wb') as file:
  df.to_pickle(file)

# ... and load the same file back into unpickled_df.
unpickled_df = pd.read_pickle(pickle_path)

## File persistence

In [None]:
#@title  { form-width: "20%" }
way = "per2vol" #@param ["vol2per", "per2vol"]
import os
import shutil

from google.colab import drive
drive.mount('/content/drive')

# Copy the top-level files between the Colab working directory ("volatile")
# and a folder on the mounted Drive ("persistent"), in the direction chosen
# by `way`.
volatile = '/content/'
persistent = '/content/drive/MyDrive/Intergov data project/persistent/'

# a is the source directory, b the destination. Any value of `way` other than
# "vol2per" is treated as "per2vol".
if way == "vol2per":
  a, b = volatile, persistent
else:
  a, b = persistent, volatile

# List the source first so that a missing source fails before anything is created.
files = os.listdir(a)

# Create the destination together with any missing parent folders. exist_ok
# replaces the former exists()/mkdir() pair, which failed when a parent was
# missing and could race with another process creating the folder.
os.makedirs(b, exist_ok=True)

# Copy regular files only; sub-directories are skipped (the copy is not recursive).
for f in files:
  p = os.path.join(a, f)
  d = os.path.join(b, f)
  if os.path.isfile(p):
    shutil.copy(p, d)


## NER playground

In [None]:
import spacy
from spacy import displacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.language import Language

from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, HYPHENS
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex


# Playground
nlp = spacy.load("en_core_web_sm")

# Custom infix rules for the tokenizer. The two commented-out patterns are kept
# for reference: the hyphen rule is left out, and the last rule is replaced by
# a variant without "/" to avoid splitting UN doc symbols.
operator_between_digits = r"(?<=[0-9])[+\-\*^](?=[0-9-])"
period_before_upper = r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
    al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
)
comma_between_letters = r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA)
# r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS)
# r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), # to avoid splitting UN doc symbols, replaced with below
symbol_before_letter = r"(?<=[{a}0-9])[:<>=](?=[{a}])".format(a=ALPHA)

infixes = LIST_ELLIPSES + LIST_ICONS + [
    operator_between_digits,
    period_before_upper,
    comma_between_letters,
    symbol_before_letter,
]

# Compile the rules and install them on the pipeline's tokenizer.
infix_re = compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_re.finditer

# Rule-based entity matcher loaded with `patterns`, inserted right after the tagger.
ruler = EntityRuler(nlp)
ruler.add_patterns(patterns)
nlp.add_pipe(ruler, after="tagger")
# print(nlp.pipe_names)  # ['tagger', 'parser', 'ner', 'print_info']

doc = nlp(u'''

The Acting President: I shall now put to the vote operative paragraph 12.
A recorded vote was taken. 
In favour: Afghanistan, Albania, Andorra, Antigua and Barbuda, Argentina, Armenia, Australia, Azerbaijan, Bahamas, Bahrain, Bangladesh, Barbados, Belarus, Belgium, Belize, Benin, Bhutan, Bosnia and Herzegovina, Botswana, Brunei Darussalam, Bulgaria, Burundi, Cabo Verde, Cambodia, Canada, China, Colombia, Comoros, Côte d’Ivoire, Croatia, Cyprus, Czech Republic, Democratic Republic of the Congo, Denmark, Djibouti, Dominica, Dominican Republic, Equatorial Guinea, Estonia, Eswatini, Ethiopia, Fiji, Finland, Gabon, Georgia, Germany, Ghana, Greece, Guatemala, Guinea, Guinea- Bissau, Guyana, Haiti, Honduras, Hungary, Iceland, Indonesia, Iraq, Italy, Jamaica, Japan, Jordan, Kazakhstan, Kenya, Kuwait, Kyrgyzstan, Lao People’s Democratic Republic, Latvia, Lebanon, Lesotho, Liberia, Libya, Lithuania, Luxembourg, Madagascar, Malawi, Malaysia, Maldives, Mali, Malta, Marshall Islands, Mauritania, Mauritius, Micronesia (Federated States of), Monaco, Mongolia, Montenegro, Morocco, Mozambique, Myanmar, Namibia, Nepal, Netherlands, Nicaragua, Niger, Norway, Oman, Pakistan, Panama, Papua New Guinea, Paraguay, Peru, Poland, Portugal, Qatar, Republic of Korea, Republic of Moldova, Romania, Saint Kitts and Nevis, Saint Lucia, Saint Vincent and the Grenadines, Samoa, Saudi Arabia, Senegal, Serbia, Seychelles, Sierra Leone, Singapore, Slovakia, Slovenia, Solomon Islands, South Sudan, Spain, Sri Lanka, Sudan, Suriname, Sweden, Tajikistan, the former Yugoslav Republic of Macedonia, Timor-Leste, Togo, Trinidad and Tobago, Tunisia, Turkey, Turkmenistan, Tuvalu, Uganda, Ukraine, United Arab Emirates, United Kingdom of Great Britain and Northern Ireland, United Republic of Tanzania, United States of America, Uruguay, Uzbekistan, Vanuatu, Viet Nam, Yemen, Zambia 
Against: France, South Africa 
Abstaining: Algeria, Angola, Austria, Bolivia (Plurinational State of), Brazil, Chile, Costa Rica, Cuba, Ecuador, Egypt, El Salvador, India, Iran (Islamic Republic of), Ireland, Israel, Liechtenstein, Mexico, New Zealand, Nigeria, Philippines, Russian Federation, San Marino, Sao Tome and Principe, Switzerland, Thailand, Venezuela (Bolivarian Republic of), Zimbabwe Operative paragraph 12 was retained by 148 votes to 2, with 27 abstentions. 
[Subsequently, the delegation of Sweden informed the Secretariat that it had intended to abstain.] 


 ''')

# x = 0
# y = 0
# c = []

# for ent in doc.ents:
#  if ent.label_ == "AI" or ent.label_ == "DD" or ent.label_ == "DR":
#    y+=1
#    x=0
#  else:
#    x+=1
#  c.append((x,y,ent.label_,ent.start,ent.end))

# print(c)
# data = pd.DataFrame.from_records(c, columns=['x', 'y', 'label', 'start', 'end'])

# xs = data[['x','y']]
# xs.head()


# Render the entities found in `doc` inline in the notebook (entity view).
displacy.render(doc, style="ent", jupyter=True)
# displacy.render(doc, style="dep", jupyter=True, options={'distance':90})
# for token in doc:
     # print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
       #     token.shape_, token.is_alpha, token.is_stop)
# print([(token.text) for token in doc])