In [9]:
import csv
import nltk
from time import time

"""
Part 1: 
    Objective: Read dataset and create a dictionary for all entity categories with their frequency.

    Output:
        sorted_categories = {
            'person_name'   : 1000,
            'location_name' : 1234
        }
"""

t = time()
with open("data/data_origin.DUMP", encoding="utf8") as tsv:
    raw_categories = {}
    raw_category_examples = {}
    for line in csv.reader(tsv, dialect="excel-tab"):
        tokenized = line[1]
        sentence = line[2]
        
        all_words = nltk.word_tokenize(sentence)
        all_tokens = nltk.word_tokenize(tokenized)

        for x in range(len(all_words)):
            try:
            
                token_raw = all_tokens[x]
                # Ignore 0's there are a lot of them
                if token_raw is 'O':
                    continue
                
                if token_raw.startswith('B-') or token_raw.startswith('I-'):
                    token = token_raw.replace('B-', '').replace('I-', '')
                    if token in raw_categories:
                        raw_categories[token] = raw_categories[token] + 1
                        if len(raw_category_examples[token]) < 15:
                            word = all_words[x]
                            raw_category_examples[token].append(word)
                            
                    else:
                        raw_categories[token] = 1
                        raw_category_examples[token] = [word]
            
            except IndexError:
                pass
              

    
print('Time to create dictionary: {} mins'.format(round((time() - t) / 60, 4)))

Time to create dictionary: 6.16 mins


In [13]:
import pandas as pd
# Order the categories from most to least frequent.
sorted_categories = dict(
    sorted(raw_categories.items(), key=lambda item: item[1], reverse=True)
)

# Collect every row first and build the frame in one go: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on each call.
records = [
    {
        'Token': token,
        'Count': count,
        'Examples': ", ".join(raw_category_examples[token]),
    }
    for token, count in sorted_categories.items()
]

# Explicit column order keeps the layout of the previously recorded output
# (and of the exported spreadsheet) stable; it also yields a well-formed empty
# frame when no categories were found.
df = pd.DataFrame(records, columns=['Count', 'Examples', 'Token'])

df.head(5)

Unnamed: 0,Count,Examples,Token
0,141205.0,"X'in, Hill, Ralf, Rudolf, Möller, Moeller, Moe...",person_name
1,89304.0,"Fransızca, Pakistan, Pakistan, Pakistan'ın, Te...",location_containedby
2,59708.0,"Casanova, İsviçre'nin, Amerika, Birleşik, Devl...",person_nationality
3,58080.0,"Mahkemesi, sunucu, oyunun, oyuncu, fizikçi, ki...",person_profession
4,43630.0,"Peştuca, Denton, Cisco, Cisco'da, Sucre, Sucre...",citytown_name


In [15]:
# Export the category table to an Excel workbook.
# The context manager saves and closes the file when the block exits;
# ExcelWriter.save() was deprecated in pandas 1.5 and removed in 2.0.
with pd.ExcelWriter('data/labeled_categories_with_examples.xlsx', engine='xlsxwriter') as writer:
    # Write the dataframe to the first sheet without the index column.
    df.to_excel(writer, sheet_name='Sheet1', index=False)
