In [1]:
import pandas as pd
import numpy as np

from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import word_tokenize

In [2]:
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Translate an NLTK part-of-speech tag into the matching WordNet POS constant.

    The tag is matched by its leading letter: ``J`` -> adjective, ``V`` -> verb,
    ``N`` -> noun, ``R`` -> adverb.  Any other tag returns ``None``.
    """
    # (tag prefix, name of the wordnet attribute holding the POS constant);
    # checked in this order, the attribute is only looked up on a match.
    prefix_to_attribute = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attribute in prefix_to_attribute:
        if nltk_tag.startswith(prefix):
            return getattr(wordnet, attribute)
    return None

In [3]:
def lemmatize_sentence(sentence):
    """Lemmatize every whitespace-separated token of ``sentence``.

    The text is split with ``str.split()`` (so punctuation attached to a word
    stays part of that token), POS-tagged with ``pos_tag``, and each token whose
    tag maps to a WordNet POS is lemmatized with that POS.  Tokens whose tag has
    no WordNet equivalent are passed through unchanged.

    Parameters
    ----------
    sentence : str
        Raw text to lemmatize.

    Returns
    -------
    list of str
        One entry per input token, in the original order.
    """
    # Build the lemmatizer once rather than once per token.
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    for word, nltk_tag in pos_tag(sentence.split()):
        wordnet_pos = nltk_tag_to_wordnet_tag(nltk_tag)
        if wordnet_pos is None:
            lemmatized_sentence.append(word)
        else:
            lemmatized_sentence.append(lemmatizer.lemmatize(word, wordnet_pos))
    return lemmatized_sentence

In [4]:
# Raw training sentences. Their tokenized, hand-tagged counterparts are
# list_1 .. list_7; these strings are not referenced by the code visible in
# this notebook and serve as the human-readable form of the training data.
text_1 = 'Alvin Thomas and Cheryl Alvin walk their dog along River Yangtze.'
text_2 = 'Alvin Thomas has a dog when he was a child.'
text_3 = 'Alvin and Cheryl like their dog, Danielle Cheryl.'
text_4 = 'Alvin Thomas, Cheryl Alvin and Danielle live in Cave Yangtze.'
text_5 = 'Dog Danielle likes walking along River Yangtze.'
text_6 = 'Liking his name, dog Danielle walks along Cave Yangtze.'
text_7 = 'Alvin and Cheryl like having a child.'

In [5]:
# Sentence to be tagged; it is tokenized with word_tokenize in the decoding cell.
test_1 = "Alvin and Cheryl have a child, Thomas Cheryl."

In [6]:
# Hand-annotated training data: list_N holds the (token, tag) pairs of text_N.
# As the annotations show, 'B' marks the first token of a name, 'I' a following
# token of the same name, and 'O' every other token (including punctuation).
list_1 = [('Alvin','B'),('Thomas','I'),('and','O'),('Cheryl','B'),('Alvin','I'),('walk','O'),
          ('their','O'),('dog','O'),('along','O'),('River','B'),('Yangtze','I'),('.','O')]
list_2 = [('Alvin','B'),('Thomas','I'),('has','O'),('a','O'),('dog','O'),('when','O'),
          ('he','O'),('was','O'),('a','O'),('child','O'),('.','O')]
list_3 = [('Alvin','B'),('and','O'),('Cheryl','B'),('like','O'),('their','O'),('dog','O'),(',','O'),
          ('Danielle','B'),('Cheryl','I'),('.','O')]
list_4 = [('Alvin','B'),('Thomas','I'),(',','O'),('Cheryl','B'),('Alvin','I'),('and','O'),('Danielle','B'),('live','O'),
         ('in','O'),('Cave','B'),('Yangtze','I'),('.','O')]
list_5 = [('Dog','O'),('Danielle','B'),('likes','O'),('walking','O'),('along','O'),('River','B'),
          ('Yangtze','I'),('.','O')]
list_6 = [('Liking','O'),('his','O'),('name','O'),(',','O'),('dog','O'),('Danielle','B'),('walks','O'),
          ('along','O'),('Cave','B'),('Yangtze','I'),('.','O')]
list_7 = [('Alvin','B'),('and','O'),('Cheryl','B'),('like','O'),('having','O'),('a','O'),('child','O'),('.','O')]

In [7]:
# Emission table: for each tag (row), the share of its occurrences that fall on
# each lemmatized token (column), estimated from the annotated sentences.
# The sentences are listed explicitly instead of being looked up by name via eval().
all_list = []
for annotated_sentence in (list_1, list_2, list_3, list_4, list_5, list_6, list_7):
    all_list += annotated_sentence

tokens, entity = zip(*all_list)
# Each token is lower-cased and lemmatized on its own, as a one-word "sentence".
tokens = [lemmatize_sentence(token.lower())[0] for token in tokens]

# One record per annotated token; 'values' is the unit count summed by the pivot.
df = pd.DataFrame({'entity': list(entity), 'tokens': tokens, 'values': 1})
df = df.pivot_table(index='entity', columns='tokens', values='values', aggfunc='sum').fillna(0)
# Divide every row by its total so the entries of each tag sum to 1.
df = df.div(df.sum(axis=1), axis=0)

df

tokens,",",.,a,along,alvin,and,be,cave,cheryl,child,...,in,like,live,name,river,their,thomas,walk,when,yangtze
entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B,0.0,0.0,0.0,0.0,0.294118,0.0,0.0,0.117647,0.235294,0.0,...,0.0,0.0,0.0,0.0,0.117647,0.0,0.0,0.0,0.0,0.0
I,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.3,0.0,0.0,0.4
O,0.066667,0.155556,0.066667,0.066667,0.0,0.088889,0.022222,0.0,0.0,0.044444,...,0.022222,0.088889,0.022222,0.022222,0.0,0.044444,0.0,0.066667,0.022222,0.0


In [8]:
# Collect every (tag, following tag) pair inside each annotated sentence.
# Pairs are formed per sentence, so they never cross a sentence boundary.
# The sentences are listed explicitly instead of being looked up by name via eval().
transition_pairs = []
for annotated_sentence in (list_1, list_2, list_3, list_4, list_5, list_6, list_7):
    tags = [tag for _, tag in annotated_sentence]
    for first_token, next_token in zip(tags, tags[1:]):
        transition_pairs.append({'first_token': first_token, 'next_token': next_token})

# One row per adjacent tag pair, columns 'first_token' and 'next_token'.
df2 = pd.DataFrame(transition_pairs)
    

In [9]:
# Turn the tag-pair records into a transition table:
# rows = current tag, columns = following tag, each row divided by its total.
transition_counts = (
    df2.assign(values=1)
       .pivot_table(index='first_token', columns='next_token', aggfunc='sum')
       .fillna(0)
)
df2 = transition_counts.div(transition_counts.sum(axis=1), axis=0)

# The pivot leaves a top column level for the summed 'values' column; remove it.
df2.columns = df2.columns.droplevel(0)

In [10]:
# Start-of-sentence row '<s>': the share of annotated sentences whose first
# token carries each tag. It is computed from the data instead of being
# hard-coded, so it cannot drift from the annotations.
first_tags = [
    annotated_sentence[0][1]
    for annotated_sentence in (list_1, list_2, list_3, list_4, list_5, list_6, list_7)
]
tag_order = ['B', 'I', 'O']
start_row = pd.DataFrame(
    [[first_tags.count(tag) / len(first_tags) for tag in tag_order]],
    columns=tag_order,
    index=['<s>'],
)
# Drop any existing '<s>' row first so re-running this cell does not duplicate it.
df2 = pd.concat([start_row, df2.drop(index='<s>', errors='ignore')], axis=0)

In [11]:
# Transition table: row = previous tag ('<s>' is the sentence start), column = next tag.
df2

Unnamed: 0,B,I,O
<s>,0.714286,0.0,0.285714
B,0.0,0.588235,0.411765
I,0.0,0.0,1.0
O,0.315789,0.0,0.684211


In [12]:
# Greedy left-to-right tagging of test_1: for every token choose the tag with the
# largest emission(token, tag) * transition(previous tag -> tag) product, then
# use that tag as the context for the next token.
list_of_entity = []
previous = '<s>'
tokens = [lemmatize_sentence(token.lower())[0] for token in word_tokenize(test_1)]

for token in tokens:
    if token in df.columns:
        emission = df[token]
    else:
        # Token never seen in training: use a flat emission so the choice is
        # driven by the transition probabilities instead of raising KeyError.
        emission = pd.Series(1.0, index=df.index)

    scores = emission.multiply(df2.loc[previous])
    # idxmax returns the tag label directly (first label on ties, which
    # includes the all-zero case), avoiding a positional lookup into df.index.
    previous = scores.idxmax()
    list_of_entity.append((token, previous))

In [13]:
# (lemmatized token, predicted tag) pairs for test_1, in sentence order.
list_of_entity

[('alvin', 'B'),
 ('and', 'O'),
 ('cheryl', 'B'),
 ('have', 'O'),
 ('a', 'O'),
 ('child', 'O'),
 (',', 'O'),
 ('thomas', 'B'),
 ('cheryl', 'I'),
 ('.', 'O')]

In [14]:
# Distinct lemmatized tokens of the test sentence (tokens was reassigned in the decoding cell).
np.unique(tokens)

array([',', '.', 'a', 'alvin', 'and', 'cheryl', 'child', 'have', 'thomas'],
      dtype='<U6')

In [15]:
# Number of distinct lemmatized tokens in the test sentence.
len(np.unique(tokens))

9