# Preprocessing pipeline

In [1]:
from collections import Counter
from keras.preprocessing.sequence import pad_sequences
import numpy as np

Using TensorFlow backend.


In [2]:
# Read the whole person-name corpus into memory; it is split into lines later.
# The encoding is pinned so the read does not depend on the platform's default
# codec and agrees with the UTF-8 used when the cleaned data is written out.
with open('persons.txt', 'r', encoding='utf-8') as f:
    text1 = f.read()

# Creating the character dictionary

In [3]:
# Character vocabulary: every distinct character of the person corpus, with
# newlines folded into spaces, in sorted order. It is computed once and shared
# by both mappings instead of rebuilding and re-sorting the set per dict.
vocab = sorted(set(text1.replace('\n', ' ')))

# Ids start at 1 because id 0 is reserved for the padding token added below.
char2id = {char: idx for idx, char in enumerate(vocab, 1)}
id2char = dict(enumerate(vocab, 1))

char2id['<PAD>'] = 0
id2char[0] = '<PAD>'

print(char2id)

{' ': 1, "'": 2, '-': 3, '.': 4, 'A': 5, 'B': 6, 'C': 7, 'D': 8, 'E': 9, 'F': 10, 'G': 11, 'H': 12, 'I': 13, 'J': 14, 'K': 15, 'L': 16, 'M': 17, 'N': 18, 'O': 19, 'P': 20, 'Q': 21, 'R': 22, 'S': 23, 'T': 24, 'U': 25, 'V': 26, 'W': 27, 'X': 28, 'Y': 29, 'Z': 30, 'a': 31, 'b': 32, 'c': 33, 'd': 34, 'e': 35, 'f': 36, 'g': 37, 'h': 38, 'i': 39, 'j': 40, 'k': 41, 'l': 42, 'm': 43, 'n': 44, 'o': 45, 'p': 46, 'q': 47, 'r': 48, 's': 49, 't': 50, 'u': 51, 'v': 52, 'w': 53, 'x': 54, 'y': 55, 'z': 56, '<PAD>': 0}


# Inspecting the raw data before cleaning

In [4]:
# Split the corpus into lines and drop duplicates through a set; the order of
# the resulting list is whatever the set iteration yields.
unique_person_lines = set(text1.split('\n'))
person_names = list(unique_person_lines)
person_names[:5]

['', 'Terry Bross', 'Alieu Touray-Saidy', 'Albert Prince-Cox', 'Roy Vagelos']

# Cleaning and preprocessing person data

In [5]:
# Keep only names whose length is strictly between 3 and 40 characters and
# that do not begin with 'The '.
cleaned_person_names = [
    candidate
    for candidate in person_names
    if 3 < len(candidate) < 40 and not candidate.startswith('The ')
]

print('Lenght of raw names ', len(person_names))
print('Lenght of cleaned person names ', len(cleaned_person_names))
cleaned_person_names[:5]

Lenght of raw names  645406
Lenght of cleaned person names  641176


['Terry Bross',
 'Alieu Touray-Saidy',
 'Albert Prince-Cox',
 'Roy Vagelos',
 'Laura de la Torre Tur']

In [6]:
# Encode every cleaned person name as the list of its character ids
# (a character missing from char2id raises KeyError).
nameids = [list(map(char2id.__getitem__, person)) for person in cleaned_person_names]

print(nameids[:5])

[[24, 35, 48, 48, 55, 1, 6, 48, 45, 49, 49], [5, 42, 39, 35, 51, 1, 24, 45, 51, 48, 31, 55, 3, 23, 31, 39, 34, 55], [5, 42, 32, 35, 48, 50, 1, 20, 48, 39, 44, 33, 35, 3, 7, 45, 54], [22, 45, 55, 1, 26, 31, 37, 35, 42, 45, 49], [16, 31, 51, 48, 31, 1, 34, 35, 1, 42, 31, 1, 24, 45, 48, 48, 35, 1, 24, 51, 48]]


In [7]:
# Read the whole entity-name corpus into memory; it is split into lines later.
# The encoding is pinned so the read does not depend on the platform's default
# codec and agrees with the UTF-8 used when the cleaned data is written out.
with open('entities.txt', 'r', encoding='utf-8') as f:
    text2 = f.read()

In [8]:
# Split the corpus into lines and drop duplicates through a set; the order of
# the resulting list is whatever the set iteration yields.
unique_entity_lines = set(text2.split('\n'))
entity_names = list(unique_entity_lines)
entity_names[:5]

['Historical-ethnographic museum of Khinalug village',
 '',
 'S.V.VESTA',
 'Saint Lucy',
 'Mastaangi']

# Cleaning and preprocessing entity data 

In [9]:
# Keep only entity names whose length is strictly between 3 and 40 characters.
cleaned_entity_names = [
    entity for entity in entity_names if 3 < len(entity) < 40
]

print('Lenght of raw names ', len(entity_names))
# The count reported here is for the entity list, so the label names entities
# rather than persons.
print('Lenght of cleaned entity names ', len(cleaned_entity_names))
cleaned_entity_names[:5]

Lenght of raw names  1189452
Lenght of cleaned person names  1162501


['S.V.VESTA', 'Saint Lucy', 'Mastaangi', 'INS Bimlipatan', 'Tarucus legrasi']

In [10]:
# Encode every cleaned entity name as the list of its character ids.
# NOTE(review): char2id is built from the person corpus only, so an entity
# containing a character absent from that corpus raises KeyError here.
entityids = [list(map(char2id.__getitem__, entity)) for entity in cleaned_entity_names]
print(entityids[:5])

[[23, 4, 26, 4, 26, 9, 23, 24, 5], [23, 31, 39, 44, 50, 1, 16, 51, 33, 55], [17, 31, 49, 50, 31, 31, 44, 37, 39], [13, 18, 23, 1, 6, 39, 43, 42, 39, 46, 31, 50, 31, 44], [24, 31, 48, 51, 33, 51, 49, 1, 42, 35, 37, 48, 31, 49, 39]]


# Adding labels

In [11]:
# Binary targets, persons first: 1.0 for each encoded person name, then 0.0
# for each encoded entity name.
person_labels = np.ones(len(nameids))
entity_labels = np.zeros(len(entityids))
labels = np.concatenate((person_labels, entity_labels))
print(labels[:10])

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [12]:
# Total number of labels (labels is a 1-D array, so this equals len(labels)).
labels.shape[0]

1803677

In [13]:
# nameids and entityids are lists of variable-length id lists. Handing them to
# np.concatenate makes NumPy build arrays from ragged nested lists, which only
# worked through implicit object-array creation (deprecated in NumPy 1.19, a
# ValueError since 1.24). Build the 1-D object array explicitly instead.
# Element-wise assignment keeps each id list as a single element even if every
# sequence happened to have the same length.
all_ids = nameids + entityids  # persons first, then entities -- same order as the labels
inputs = np.empty(len(all_ids), dtype=object)
for position, ids in enumerate(all_ids):
    inputs[position] = ids
print(inputs[:5])

[list([24, 35, 48, 48, 55, 1, 6, 48, 45, 49, 49])
 list([5, 42, 39, 35, 51, 1, 24, 45, 51, 48, 31, 55, 3, 23, 31, 39, 34, 55])
 list([5, 42, 32, 35, 48, 50, 1, 20, 48, 39, 44, 33, 35, 3, 7, 45, 54])
 list([22, 45, 55, 1, 26, 31, 37, 35, 42, 45, 49])
 list([16, 31, 51, 48, 31, 1, 34, 35, 1, 42, 31, 1, 24, 45, 48, 48, 35, 1, 24, 51, 48])]


In [14]:
# Total number of encoded sequences (length of the first axis of inputs).
inputs.shape[0]

1803677

In [15]:
# Sanity check: there must be exactly one label per encoded sequence.
inputs.shape[0] == labels.shape[0]

True

# Saving inputs and labels 

In [16]:
# Persist the cleaned names and their labels as two text files, one item per
# line. `with` guarantees both handles are flushed and closed even if a write
# raises, whereas explicit close() calls run only on the success path.
with open('cleaned_inputs.txt', 'w', encoding='utf-8') as inputs_file, \
        open('labels.txt', 'w', encoding='utf-8') as labels_file:
    # Plain list concatenation keeps persons first and entities second (the
    # same order as `labels`) without materialising a fixed-width NumPy string
    # array just to iterate over it.
    for text in cleaned_person_names + cleaned_entity_names:
        inputs_file.write(text + '\n')

    for label in labels:
        # str() of the float labels yields '1.0' / '0.0'.
        labels_file.write(str(label) + '\n')