# g1_generated_dictionary
Build a dictionary that maps each entity in the original dataset to the matching generated entities.


In [1]:
import urllib.request
import csv
import Levenshtein
from difflib import SequenceMatcher
import country_converter as coco
import numpy as np
import logging

In [2]:
# Suppress all log records of severity WARNING and below for the whole process.
# NOTE(review): presumably meant to silence country_converter's "not found"
# warnings emitted during matching -- confirm.
logging.disable(logging.WARNING)

In [3]:
def extract_statement_count():
    """Read every intervention row from the original dataset.

    The file ``Text/statements_count.csv`` is parsed with ``csv.reader``; for
    each non-empty row the first CSV field is split on tab characters and the
    first three resulting columns are kept, reordered as
    ``(column 0, column 2, column 1)``.

    Returns:
        list[tuple[str, str, str]]: one tuple per non-empty row, in file order.

    Raises:
        FileNotFoundError: if ``Text/statements_count.csv`` does not exist.
        IndexError: if a row has fewer than three tab-separated columns.
    """
    lines = []
    # The context manager guarantees the file handle is closed once the rows
    # have been consumed, even if a malformed row raises.
    with open('Text/statements_count.csv') as f:
        for row in csv.reader(f):
            if len(row) != 0:  # csv.reader yields [] for blank lines
                line = row[0].split('\t')
                lines.append((line[0], line[2], line[1]))
    return lines

In [4]:
def extract_entities():
    """Return the distinct parties / party groupings of the original dataset.

    The entity name is the first element of every tuple produced by
    ``extract_statement_count()``. Duplicates are removed through a set, so
    the order of the returned list is arbitrary.
    """
    unique_names = {row[0] for row in extract_statement_count()}
    return list(unique_names)

In [5]:
# Distinct entity names of the original dataset (arbitrary order).
entities_original = extract_entities()
# Generated entity names: one per line of the file, newline characters removed.
# The context manager closes the file handle as soon as the list is built.
with open("Text/entities_clean.txt") as entities_file:
    entities_generated = [p.replace('\n', '') for p in entities_file]

### The `extract_*` functions all map entities in the original data to the generated ones

In [6]:
def extract_initial(entities_gen, dict_e):
    """Match generated entities against the upper-cased original names.

    Every generated entity whose Levenshtein ratio to ``original.upper()``
    is greater than 0.9 is appended to ``dict_e[original]``.

    Args:
        entities_gen: iterable of generated entity strings.
        dict_e: mapping from original entity name to its list of matches;
            it is mutated in place.

    Returns:
        The same ``dict_e`` object.
    """
    # Upper-case each original name once instead of once per comparison.
    upper_originals = [(name, name.upper()) for name in extract_entities()]
    for candidate in entities_gen:
        for name, shouted in upper_originals:
            if Levenshtein.ratio(candidate, shouted) > 0.9:
                dict_e[name].append(candidate)

    return dict_e


In [7]:
def extract_title(entities_gen, dict_e):
    """Match generated and original entities after title-casing both.

    A generated entity is appended to ``dict_e[original]`` when the
    Levenshtein ratio between ``generated.title()`` and ``original.title()``
    is greater than 0.9. The appended value is the generated entity as given,
    not its title-cased form.

    Args:
        entities_gen: iterable of generated entity strings.
        dict_e: mapping from original entity name to its list of matches;
            it is mutated in place.

    Returns:
        The same ``dict_e`` object.
    """
    titled_originals = [(name, name.title()) for name in extract_entities()]
    for candidate in entities_gen:
        for name, titled_name in titled_originals:
            score = Levenshtein.ratio(candidate.title(), titled_name)
            if score > 0.9:
                dict_e[name].append(candidate)

    return dict_e

In [8]:
def extract_normal(entities_gen, dict_e):
    """Match generated and original entities after lower-casing both.

    A generated entity is appended to ``dict_e[original]`` when the
    Levenshtein ratio between ``generated.lower()`` and ``original.lower()``
    is greater than 0.9. The appended value is the generated entity as given.

    Args:
        entities_gen: iterable of generated entity strings.
        dict_e: mapping from original entity name to its list of matches;
            it is mutated in place.

    Returns:
        The same ``dict_e`` object.
    """
    lowered_originals = [(name, name.lower()) for name in extract_entities()]
    for candidate in entities_gen:
        for name, lowered_name in lowered_originals:
            score = Levenshtein.ratio(candidate.lower(), lowered_name)
            if score > 0.9:
                dict_e[name].append(candidate)
    return dict_e

In [9]:
def extract_g77china(entities_gen, dict_e):
    """Collect the known spellings of the G-77/China group under ``'G77'``.

    A generated entity is appended to ``dict_e['G77']`` when it is one of the
    hard-coded spellings below or when it starts with ``'G77'``.

    Args:
        entities_gen: iterable of generated entity strings.
        dict_e: mapping from original entity name to its list of matches;
            it is mutated in place and must already contain the key ``'G77'``
            if any entity matches.

    Returns:
        The same ``dict_e`` object.

    Raises:
        KeyError: if an entity matches and ``'G77'`` is not a key of ``dict_e``.
    """
    key = 'G77'
    known_spellings = (
        'G-77/CHINA',
        'G-77 AND CHINA',
        'G-77/ CHINA',
        'said the G-77/China',
        'for the G-77/China',
        'for the G-77/ China',
        'for the Group of 77 and China',
    )
    for eg in entities_gen:
        # The prefix test replaces Levenshtein.ratio('G77', eg[:3]) > 0.95:
        # on strings of at most three characters that ratio can only exceed
        # 0.95 when both strings are equal, i.e. when eg starts with 'G77'.
        if eg in known_spellings or eg.startswith(key):
            dict_e[key].append(eg)

    return dict_e

In [10]:
def extract_ldc(entities_gen, dict_e):
    """Collect the spellings of the Least Developed Countries under ``'LDCs'``.

    A generated entity is appended to ``dict_e['LDCs']`` when it equals one of
    the hard-coded spellings, and also when its Levenshtein ratio to
    ``'LEAST DEVELOPED COUNTRIES'`` is greater than 0.95.

    Args:
        entities_gen: iterable of generated entity strings.
        dict_e: mapping from original entity name to its list of matches;
            it is mutated in place.

    Returns:
        The same ``dict_e`` object.

    Raises:
        KeyError: if an entity matches and ``'LDCs'`` is not a key of ``dict_e``.
    """
    full_name = 'LEAST DEVELOPED COUNTRIES'
    known_spellings = (
        'for the LDCs',
        'for the Least Developed Countries',
        'for LDCs',
        'LDC GROUP',
    )
    for eg in entities_gen:
        if eg in known_spellings:
            dict_e['LDCs'].append(eg)
        # Kept as a separate test: the fuzzy match tolerates small typos in
        # the full name (a couple of inserted/deleted characters).
        if Levenshtein.ratio(full_name, eg) > 0.95:
            dict_e['LDCs'].append(eg)

    return dict_e

In [11]:
def extract_aosis(entities_gen, dict_e):
    """Map the spelled-out AOSIS name onto the ``'AOSIS'`` key.

    Every generated entity equal to ``'ALLIANCE OF SMALL ISLAND STATES'`` is
    appended to ``dict_e['AOSIS']``.

    Args:
        entities_gen: iterable of generated entity strings.
        dict_e: mapping from original entity name to its list of matches;
            it is mutated in place.

    Returns:
        The same ``dict_e`` object.

    Raises:
        KeyError: if an entity matches and ``'AOSIS'`` is not a key of
            ``dict_e``.
    """
    for eg in entities_gen:
        if eg == 'ALLIANCE OF SMALL ISLAND STATES':
            dict_e['AOSIS'].append(eg)
    return dict_e

In [12]:
def extract_sadc(entities_gen, dict_e):
    """Map the acronym ``'SADC'`` onto its spelled-out key.

    Every generated entity equal to ``'SADC'`` is appended to
    ``dict_e['Southern Africa Development Community']``.

    Args:
        entities_gen: iterable of generated entity strings.
        dict_e: mapping from original entity name to its list of matches;
            it is mutated in place.

    Returns:
        The same ``dict_e`` object.

    Raises:
        KeyError: if an entity matches and the key
            ``'Southern Africa Development Community'`` is missing.
    """
    # NOTE(review): create_dict() and add_special_cases() in this file use the
    # key 'Southern African Development Community' ("African"), whereas this
    # function writes to 'Southern Africa Development Community'. Confirm
    # which spelling the original dataset uses; the key is left unchanged here.
    target_key = 'Southern Africa Development Community'
    for eg in entities_gen:
        # Equality replaces Levenshtein.ratio('SADC', eg) > 0.95: against a
        # four-character string a single insertion or deletion already drops
        # the ratio below 0.9, so only an exact match can pass the threshold.
        if eg == 'SADC':
            dict_e[target_key].append(eg)

    return dict_e

In [13]:
def extract_eig(entities_gen, dict_e):
    """Collect the spellings of the Environmental Integrity Group.

    A generated entity is appended to
    ``dict_e['Environmental Integrity Group']`` when it equals
    ``'for the Environmental Integrity Group'`` or ``'EIG'``.

    Args:
        entities_gen: iterable of generated entity strings.
        dict_e: mapping from original entity name to its list of matches;
            it is mutated in place.

    Returns:
        The same ``dict_e`` object.

    Raises:
        KeyError: if an entity matches and the key
            ``'Environmental Integrity Group'`` is missing.
    """
    target_key = 'Environmental Integrity Group'
    for eg in entities_gen:
        # `eg == 'EIG'` replaces Levenshtein.ratio('EIG', eg) > 0.95: against
        # a three-character string any edit drops the ratio below 0.9, so
        # only an exact match can pass the threshold.
        if eg == 'for the Environmental Integrity Group' or eg == 'EIG':
            dict_e[target_key].append(eg)

    return dict_e

In [14]:
def remaining_original(dict_entities):
    """List the original entities that still need a match.

    An entity is returned when its list of matches is empty. The key
    ``'Russian Federation'`` is always returned, whatever its current matches
    (presumably so that it gets another pass in the next matching step --
    the reason is not visible here).

    Args:
        dict_entities: mapping from original entity name to its list of
            matches.

    Returns:
        list: the selected keys, in the iteration order of ``dict_entities``.
    """
    return [
        name
        for name, matches in dict_entities.items()
        if len(matches) == 0 or name == 'Russian Federation'
    ]


In [15]:
def create_dict(entities_original):
    """Build the initial mapping: every original entity gets an empty list.

    The key ``'Southern African Development Community'`` is always present in
    the result (with a fresh empty list), whether or not it appears in
    ``entities_original``.

    Args:
        entities_original: iterable of original entity names.

    Returns:
        dict: entity name -> new empty list (one distinct list per key).
    """
    mapping = {name: [] for name in entities_original}
    mapping['Southern African Development Community'] = []
    return mapping

In [16]:
def matching(entities_generated, remain_original, dict_entities):
    """Match the remaining original entities with country_converter.

    Calls ``coco.match(remain_original, entities_generated)`` and, for every
    key whose result is not the string ``'not_found'``, adds the result to
    ``dict_entities[key]``: a list result is concatenated, any other result
    is appended as a single item.

    Args:
        entities_generated: generated entity names, used as the list to
            match against.
        remain_original: original entity names still lacking a match.
        dict_entities: mapping from original entity name to its list of
            matches; it is mutated in place.

    Returns:
        The same ``dict_entities`` object.
    """
    lookup = coco.match(remain_original, entities_generated)
    for name, found in lookup.items():
        if found != 'not_found':
            # Normalise a single hit to a one-element list so both cases
            # extend the existing list in place.
            hits = found if type(found) == list else [found]
            dict_entities[name] += hits

    return dict_entities


In [17]:
def create_initials(entities_generated, dict_entities):
    """Match generated entities against the initials of each original name.

    For every key of ``dict_entities`` the initials are built from the
    upper-cased first character of each space-separated word. A generated
    entity is appended to the key's list when its Levenshtein ratio to those
    initials is greater than 0.90.

    Args:
        entities_generated: iterable of generated entity strings; it is
            iterated once per key, so it must be re-iterable (e.g. a list).
        dict_entities: mapping from original entity name to its list of
            matches; it is mutated in place.

    Returns:
        The same ``dict_entities`` object.
    """
    for e in dict_entities:
        # Skip empty tokens: consecutive spaces or an empty name would
        # otherwise make `x[0]` raise IndexError.
        initials = ''.join(x[0].upper() for x in e.split(' ') if x)
        for eg in entities_generated:
            if Levenshtein.ratio(initials, eg) > 0.90:
                dict_entities[e].append(eg)

    return dict_entities

In [18]:
def find_groups(entities_generated, dict_entities):
    """Match group acronyms after dropping the word ``'GROUP'``.

    For every key of ``dict_entities`` the initials are built from the
    upper-cased first character of each space-separated word. Each generated
    entity has every occurrence of ``'GROUP'`` removed and is appended
    (in its unmodified form) to the key's list when the Levenshtein ratio
    between the initials and the stripped text is greater than 0.95.

    Args:
        entities_generated: iterable of generated entity strings.
        dict_entities: mapping from original entity name to its list of
            matches; it is mutated in place.

    Returns:
        The same ``dict_entities`` object.
    """
    # The stripped form does not depend on the key, so compute it once.
    stripped = [(eg, eg.replace('GROUP', '')) for eg in entities_generated]
    for e in dict_entities:
        # Skip empty tokens: consecutive spaces or an empty name would
        # otherwise make `x[0]` raise IndexError.
        initials = ''.join(x[0].upper() for x in e.split(' ') if x)
        for eg, eg_g in stripped:
            if Levenshtein.ratio(initials, eg_g) > 0.95:
                dict_entities[e].append(eg)

    return dict_entities

In [19]:
def compute_last_chance(dict_entities):
    """Give still-unmatched entities a looser fuzzy match (ratio > 0.8).

    The generated entities are re-read from ``Text/entities_clean.txt`` (one
    per line, newline characters removed). Every key of ``dict_entities``
    whose list is empty is upper-cased and compared with each generated
    entity; entities with a Levenshtein ratio greater than 0.8 are appended
    to that key's list.

    Args:
        dict_entities: mapping from original entity name to its list of
            matches; it is mutated in place.

    Returns:
        The same ``dict_entities`` object.

    Raises:
        FileNotFoundError: if ``Text/entities_clean.txt`` does not exist.
    """
    # The context manager closes the file handle once the list is built.
    with open("Text/entities_clean.txt") as entities_file:
        entities_generated = [p.replace('\n', '') for p in entities_file]

    # Snapshot the unmatched keys first so that matches added below do not
    # influence which keys are processed.
    remain_original = [e for e in dict_entities if len(dict_entities[e]) == 0]

    for er in remain_original:
        er_upper = er.upper()  # invariant across the inner loop
        for e in entities_generated:
            if Levenshtein.ratio(er_upper, e) > 0.8:
                dict_entities[er].append(e)

    return dict_entities

In [20]:
def add_special_cases(dict_entities):
    """Append the hand-curated aliases to their original entities.

    Each ``(original entity, generated spelling)`` pair below is applied in
    order with ``dict_entities[original].append(spelling)``.

    Args:
        dict_entities: mapping from original entity name to its list of
            matches; it is mutated in place and must already contain every
            original entity listed below.

    Returns:
        The same ``dict_entities`` object.

    Raises:
        KeyError: at the first pair whose original entity is not a key of
            ``dict_entities`` (earlier pairs have already been applied).
    """
    # Several generated spellings are misspelt on purpose ('BELGUIM',
    # 'URAGUAY', 'SWITZER'): they are reproduced as they are matched.
    manual_aliases = (
        ('EU', 'EUROPEAN UNION'),
        ('Syrian Arab Republic', 'SYRIA'),
        ('United States', 'US'),
        ('United Arab Emirates', 'ARAB EMIRATES'),
        ('Umbrella Group', 'for the Umbrella Group'),
        ('EU', 'for the European Union'),
        ('Arab Group', 'for the Arab Group'),
        ('African Group', 'for the Africa Group'),
        ('Arab Group', 'for the League of Arab States'),
        ('Like Minded Developing Countries', 'for the Like Minded Developing Countries'),
        ('Group of 9', 'GROUP OF NINE'),
        ('Switzerland', 'SWITZER'),
        ('Like Minded Developing Countries', 'LMDC'),
        ('Like Minded Developing Countries', 'LMDCS'),
        ('Central Group Eleven', 'CG-11'),
        ('Micronesia', 'FEDERATED STATES OF MICRONESIA'),
        ('Arab Group', 'ARAB STATES'),
        # NOTE(review): maps the DRC spelling onto 'Congo, Republic' -- confirm
        # this is the intended original entity.
        ('Congo, Republic', 'THE DEMOCRATIC REPUBLIC OF THE CONGO'),
        ('Belgium', 'BELGUIM'),
        ('St. Vincent and the Grenadines', 'SAINT VINCENT AND THE GRENADINES'),
        ('Uruguay', 'URAGUAY'),
        ('Central America', 'CENTRAL AMERICAN STATES'),
        ('Central America', 'CENTRAL AMERICAN GROUP'),
        ('Southern African Development Community', 'SADC'),
    )
    for original, alias in manual_aliases:
        dict_entities[original].append(alias)
    return dict_entities


In [21]:
def compute_dictionary():
    """Build the mapping from original entities to generated entities.

    Starting from an empty mapping over the module-level ``entities_original``,
    the matching stages are applied in a fixed order against the module-level
    ``entities_generated``; the still-unmatched entities then go through
    ``matching``, the hand-written special cases are added, and a last loose
    fuzzy pass is run. Finally each list of matches is de-duplicated through a
    set, so the order of the matches in each list is arbitrary.

    Returns:
        dict: original entity name -> list of distinct generated entities.
    """
    mapping = create_dict(entities_original)

    # Order matters: every stage appends to the lists left by the previous one.
    stages = (
        extract_initial,
        extract_title,
        extract_normal,
        extract_g77china,
        extract_sadc,
        extract_aosis,
        extract_eig,
        extract_ldc,
        create_initials,
        find_groups,
    )
    for stage in stages:
        mapping = stage(entities_generated, mapping)

    unmatched = remaining_original(mapping)
    mapping = matching(entities_generated, unmatched, mapping)
    mapping = add_special_cases(mapping)
    mapping = compute_last_chance(mapping)

    # Several stages can add the same spelling; keep each one only once.
    for name in mapping:
        mapping[name] = list(set(mapping[name]))

    return mapping