In [3]:
import csv
import string
import nltk
import pprint
import pandas as pd
from time import time

"""
Part 2: 
    Objective: 
    
    Step 1: Read labeled tokens and group them by defined categories.

        Input:
            |     Token       |    Category   |
            |-----------------|---------------|
            |  B-person_name  |     Person    |
            |-----------------|---------------|
            | B-location_name |    Location   |
            |-----------------|---------------|
            |   B-film_actor  |    Person     |


    Output:
    
        Objective: This dictionary (category -> set of entity types) will be used for calculating the centroid of each category.
        entity_categories = {
            'Person'   : { 'person_name', 'film_actor' },
            'Location' : { 'location_name' }
        }
        
        Objective: This dictionary will be used for labeling words faster.
        Brief    : When labeling a word we need the category of its entity type. Looking that up in
        entity_categories would require looping over every category's set, so we also keep this reverse
        mapping (entity type -> category) to find the category with a single dictionary lookup.
        entity_category_dictionary = {
            'person_name' : 'Person',
            'location_name' : 'Location',
            'film_actor' : 'Person',
        }
        
        
    Step 2: Read dataset and label words with their corresponding categories.
        
        Labeling action:
            
            Corina        |  B-politician_name     ----|  Person
	        Casanova      |  I-politician_name     ----|
	        ,             |  O
	        İsviçre       |  B-person_nationality  ----| Nationality
	        Federal       |  O
	        Şansölyesidir |  B-governmental_jurisdiction_basic_title  | PersonType
	        .             |  O

        Output:
            labeled_words = {
                'Corina Casanova': 'Person',
                'İsviçre': 'Nationality',
                'Şansölyesidir' : 'PersonType',
                ..
            }

"""

Time to create entity categories: 0.0115 mins


In [7]:
#### Helpers ####
 
# Removes token identifiers from string.
def remove_token_identifiers(raw_token):
    """Strip the leading BIO marker ('B-' or 'I-') from a raw tag.

    Only a marker at the very start of the tag is removed, so an entity type
    that itself contains 'B-' or 'I-' is left intact. Tags without a marker
    (for example 'O') are returned unchanged.

    Args:
        raw_token: Tag string such as 'B-person_name' or 'I-person_name'.

    Returns:
        The tag without its leading marker, e.g. 'person_name'.
    """
    if raw_token.startswith(('B-', 'I-')):
        return raw_token[2:]
    return raw_token

# Checks whether token is inner.
def is_inner_token(raw_token):
    """Return True when the raw tag is an inner (continuation) tag.

    A tag is inner only when it starts with the 'I-' marker; an 'I-' that
    appears later in the tag (inside the entity type) does not count.

    Args:
        raw_token: Tag string such as 'B-person_name', 'I-person_name' or 'O'.

    Returns:
        bool: True for 'I-...' tags, False otherwise.
    """
    return raw_token.startswith('I-')

In [None]:
#### STEP 1 ####
# Builds two lookups from the labeled spreadsheet:
#   entity_categories          : category    -> set of entity types
#   entity_category_dictionary : entity type -> category

t = time()
my_printer = pprint.PrettyPrinter()

# Read file
df = pd.read_excel('data/labeled_categories.xlsx')

# Fallback category for tokens that do not belong to any of the chosen categories
fallback_category = 'Thing'
# Spreadsheet values that send a token to the fallback category
fallback_markers = {'', 'Delete', 'delete', 'Thing', 'thing'}
# Main category list
category_list = [
    'Person', 'Location', 'Event', 'Organization', 'DateTime', 'PersonType', 'Currency', 'Nationality', 'Ethnicity'
]
# Initialize category-entity dictionary with an empty set per known category
entity_categories = {fallback_category: set()}
entity_category_dictionary = {}

for category in category_list:
    entity_categories[category] = set()

# Walk the spreadsheet rows and fill both dictionaries
for raw_token, category in zip(df['Token'], df['Category']):
    # Blank cells are read as NaN; a row without a token cannot be classified
    if pd.isna(raw_token):
        continue

    token = remove_token_identifiers(raw_token)
    # Keep only the part before ' ='. partition() leaves the token intact when
    # the separator is missing, whereas slicing with find() == -1 would chop
    # off the last character.
    token = token.partition(' =')[0]

    # Fallback category check (a blank category cell arrives as NaN, not '')
    if pd.isna(category) or category in fallback_markers:
        entity_categories[fallback_category].add(token)
        entity_category_dictionary[token] = fallback_category
        continue

    # Undefined category check
    if category not in category_list:
        continue

    # Valid category
    entity_categories[category].add(token)
    entity_category_dictionary[token] = category

print('Time to create entity categories: {} mins'.format(round((time() - t) / 60, 4)))

In [4]:
#print(entity_categories)
#print(entity_category_dictionary['person_name'])
# my_printer.pprint(entity_categories['Location'])

Person


In [20]:
 #### STEP 2 ####
# Reads the tagged dataset and labels every entity (a begin tag plus the inner
# tags that directly follow it) with the category of its begin tag.

t = time()

# Upper bound on the number of dataset rows processed by this cell
max_rows = 1000

# Entity text -> category. When the same text occurs with different
# categories, the most recent occurrence wins.
labeled_words = {}
counter = 0

# Read dataset
with open("data/data_origin.DUMP", encoding="utf8") as tsv:
    for line in csv.reader(tsv, dialect="excel-tab"):
        # Column 1 holds the tag sequence, column 2 the sentence; rows that
        # are too short to contain both are skipped instead of crashing.
        if len(line) >= 3:
            tokenized = line[1]
            sentence = line[2]

            # NOTE(review): pairing words with tags by position assumes nltk
            # splits both strings into matching pieces - confirm on the data.
            all_words = nltk.word_tokenize(sentence)
            all_tokens = nltk.word_tokenize(tokenized)

            # Only positions that have both a word and a tag can be labeled
            paired_length = min(len(all_words), len(all_tokens))

            for x in range(paired_length):
                token_raw = all_tokens[x]

                if token_raw == 'O':
                    continue

                token = remove_token_identifiers(token_raw)

                if token not in entity_category_dictionary:
                    continue

                # We skip inner tokens, they are handled with their begin tokens
                if is_inner_token(token_raw):
                    continue

                # If we reach this point it means active token is a begin token.
                word = all_words[x]
                label = entity_category_dictionary[token]

                # Append the words of the inner tokens that follow. The bound
                # check keeps an entity that ends the sentence instead of
                # losing it to an IndexError.
                next_index = x + 1
                while next_index < paired_length and is_inner_token(all_tokens[next_index]):
                    word = word + ' ' + all_words[next_index]
                    next_index += 1

                labeled_words[word] = label
                print(word + ' : ' + label)

        counter += 1
        if counter == max_rows:
            break

print('Time to label the entities: {} mins'.format(round((time() - t) / 60, 4)))

Corina Casanova : Person
İsviçre : Nationality
Şansölyesidir : PersonType
Casanova : Person
İsviçre Federal Yüksek Mahkemesi : Thing
avukat : PersonType
Corina Casanova : Person
federal : Organization
Casanova : Person
Almanca : Thing
Fransızca : Thing
İtalyanca : Location
İspanyolca : Thing
İsviçre Dışişleri Bakanlığı : Organization
İsviçre : Organization
federal : Organization
İsviçre'nin : Nationality
Gilgit Baltistan : Organization
Pakistan : Location
Urduca : Thing
Gilgit : Organization
Pakistan : Location
Pakistan'ın : Location
72496 : Location
Gilgit'dir : Organization
Urduca : Thing
Peştuca : Thing
Wakhi : Thing
Denton : Location
Amerika Birleşik Devletleri'nde : Nationality
Teksas : Location
Denton : Location
Oklahoma : Location
Teksas : Location
Teksas'ın : Location
Teksas : Location
Teksas : Location
Apple'ın : Organization
sunucu : PersonType
Schiller : Person
2009'da : Thing
Mac OS X v10.5 : Thing
kullanıcı : Thing
Mac OS : Thing
Apple : Organization
Intel : Organization
P