In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import pprint, time
 
sentenceText = []
sentenceTag = []
sentences = []
tags = []

with open ('UPC-2016.txt','rt') as lines:
    for line in lines:
            splitLine = [i for i in (line.split('\t'))]
            if(len(splitLine) == 1):
                sentences.append(sentenceText)
                tags.append(sentenceTag)
                sentenceText = []
                sentenceTag = []
            else:
                sentenceText.append(splitLine[0])
                sentenceTag.append((splitLine[1].split('\n')[0]))

In [2]:
list_of_tuples = list(zip(sentences, tags))
df = pd.DataFrame(list_of_tuples,columns=['sentence', 'tags'])
df.head()

Unnamed: 0,sentence,tags
0,"[#, اولین, سیاره, سیاره, خارج, از, منظومه, شمس...","[DELM, ADJ_SUP, N_SING, N_SING, P, P, N_SING, ..."
1,"[طی, سالهای, اخیر, ،, بیش, از, دو, دوجین, سیار...","[P, N_PL, ADJ, DELM, ADJ, P, NUM, N_SING, N_SI..."
2,"[لیکن, ستاره‌شناسان, هرگز, این, سیاره‌ها, را, ...","[CON, N_PL, ADV_NEG, DET, N_PL, CLITIC, ADV, V..."
3,"[چنین, تصور, می‌شد, که, این, لرزش, به, علت, جا...","[ADV, N_SING, V_PA, CON, DET, N_SING, P, N_SIN..."
4,"[در, آخرین, تحقیق, به, عمل, آمده, نیز, ستاره‌ش...","[P, ADJ_SUP, N_SING, P, N_SING, ADJ_INO, CON, ..."


In [3]:
import random

dfIndexs = list(range(len(df)))
random.shuffle(dfIndexs)

testRate = int(len(df) * 0.1)
test_df = df.iloc[dfIndexs[:testRate]]
train_df = df.iloc[dfIndexs[testRate:]]

In [4]:
tagSet = set()
vocab = set()

for i in range(len(train_df)):
    tagSet.update(train_df.iloc[i]['tags'])
    vocab.update(train_df.iloc[i]['sentence']) #bulk insert

tagSet.remove('')
vocab.remove('')

In [5]:
def flatten_nested_list(listt):
    return [item for sublist in listt for item in sublist]

In [6]:
train_data_tag_list = flatten_nested_list(list(train_df['tags']) )
train_data_word_list = flatten_nested_list(list(train_df['sentence']) )


list_of_tuples = list(zip(train_data_word_list, train_data_tag_list))
df_train_word_tag = pd.DataFrame(list_of_tuples,columns=['word', 'tag'])
df_train_word_tag.head()

Unnamed: 0,word,tag
0,مریدان,N_PL
1,بسیاری,ADJ
2,داشت,V_PA
3,.,DELM
4,بعد,ADV_TIME


In [7]:
tag_count_dict = dict(df_train_word_tag['tag'].value_counts())
word_count_dict = dict(df_train_word_tag['word'].value_counts())

In [8]:
# compute  Transition Probability
def t2_given_t1(t2, t1, tag_count_dict = tag_count_dict):
    count_t1 = tag_count_dict[t1]
    count_t2_t1 = 0
    t1_indexes = df_train_word_tag[df_train_word_tag['tag']== t1].index
    t2_indexes = df_train_word_tag[df_train_word_tag['tag']== t2].index
    for i in t1_indexes:
      if(i+1 in t2_indexes):
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [9]:
# creating t x t transition matrix of tags, t= no of tags
# Matrix(i, j) represents P(jth tag after the ith tag)
tagSetLen = len(tagSet)
tagSetList = list(tagSet)
tags_matrix = np.zeros((tagSetLen, tagSetLen), dtype='float32')
for i, t1 in enumerate(tagSetList):
    for j, t2 in enumerate(tagSetList): 
        count_t2_t1, count_t1 = t2_given_t1(t2, t1)
        tags_matrix[i, j] = count_t2_t1/count_t1

In [10]:
tags_df = pd.DataFrame(tags_matrix, columns = list(tagSet), index=list(tagSet))
display(tags_df)

Unnamed: 0,V_PA,ADV_COMP,PRO,V_AUX,P,ADV_I,V_SUB,INT,ADV_NEG,ADJ_VOC,...,DET,SYM,FW,CON,ADV_LOC,N_VOC,CLITIC,N_PL,V_IMP,DEt
V_PA,0.002403,0.000527,0.00493,0.000729,0.040476,0.001287,0.001101,0.000279,0.000698,0.0,...,0.003814,0.0,0.000171,0.251926,0.000279,0.0,0.000419,0.005286,0.000124,0.0
ADV_COMP,0.003268,0.000363,0.057734,0.000726,0.034495,0.0,0.001452,0.0,0.000363,0.0,...,0.028322,0.000726,0.0,0.01053,0.001089,0.0,0.000363,0.123457,0.0,0.0
PRO,0.051148,0.001421,0.01527,0.009238,0.216473,0.002132,0.017369,0.000331,0.001934,0.0,...,0.008313,1.7e-05,0.000165,0.098395,0.001058,0.0,0.123531,0.033465,0.001339,0.0
V_AUX,0.431439,0.001016,0.018633,8.5e-05,0.204878,0.000423,0.019226,0.0,0.000508,0.0,...,0.014822,0.0,0.0,0.0249,0.000339,0.0,0.000169,0.031253,0.000423,0.0
P,0.001035,6.5e-05,0.049232,0.000229,0.033348,0.001257,0.000988,0.0,0.001707,0.0,...,0.063696,0.000105,0.000142,0.002463,0.002608,0.0,0.000174,0.132139,3.6e-05,0.0
ADV_I,0.036004,0.0012,0.036304,0.042904,0.078608,0.0012,0.023102,0.0003,0.005701,0.0,...,0.019802,0.0,0.0,0.089709,0.0027,0.0,0.0003,0.084908,0.0003,0.0
V_SUB,0.002753,0.000776,0.006988,0.003353,0.054032,0.002965,0.00187,0.000106,0.001376,0.0,...,0.006141,0.0,0.000176,0.229539,0.000529,3.5e-05,0.000424,0.008364,0.000141,0.0
INT,0.008576,0.001715,0.020583,0.0,0.051458,0.006861,0.0,0.024014,0.0,0.0,...,0.001715,0.0,0.003431,0.039451,0.0,0.0,0.0,0.039451,0.001715,0.0
ADV_NEG,0.004179,0.001672,0.049875,0.006408,0.106715,0.000836,0.002229,0.0,0.00613,0.0,...,0.018947,0.0,0.000279,0.006687,0.000836,0.0,0.000279,0.025077,0.001115,0.0
ADJ_VOC,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
pi = {}

for tag in list(tagSet):
    pi[tag] = 0
    
for start_tag in train_df['tags'].str[0]: # train_df['tags'].str[0] : first elemen of list in all row
        if(start_tag in list(tagSet)):
            pi[start_tag] +=1 
 
for tag in list(tagSet):
    pi[tag] = pi[tag]/len(train_df['tags']) 

In [12]:
# compute Emission Probability
def word_given_tag(word, tag, df_train_word_tag = df_train_word_tag ):
    df_contain_tag = df_train_word_tag[df_train_word_tag['tag']==tag]
    count_tag = len(df_contain_tag)
    w_given_tag_list = df_contain_tag[df_contain_tag['word']==word]
    count_w_given_tag = len(w_given_tag_list)
    return (count_w_given_tag, count_tag)

In [13]:
def Viterbi(words, tagSet=tagSet, tags_df=tags_df):
    state = []
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = {}
        for tag in tagSet:
            if key == 0:
                transition_p = pi[tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                 
            # compute emission and state probabilities
            count_w_given_tag, count_tag = word_given_tag(words[key], tag)
            emission_p = count_w_given_tag / count_tag
            state_probability = emission_p * transition_p    
            p[tag] = (state_probability)
             
        # getting state for which probability is maximum
        state_max = max(p, key=p.get)
        state.append(state_max)
    return state

In [14]:
test = test_df.iloc[1] # test one sentence
test_words = list(test['sentence'])
test_tags = list(test['tags'])

In [15]:
#Here We will only test 10 sentences to check the accuracy
#as testing the whole training set takes huge amount of time
start = time.time()
test_predict = Viterbi(test_words)
end = time.time()
difference = end-start
 
print("Time taken in seconds: ", difference)
 
# accuracy
check = [i for i, j in zip(test_predict, test_tags) if i == j] 
 
accuracy = len(check)/len(test_predict)
print('Viterbi Algorithm Accuracy: ',accuracy*100)

Time taken in seconds:  604.3981673717499
Viterbi Algorithm Accuracy:  96.29629629629629


In [16]:
for token, pred_tag, actual_tag in zip(test_words, test_predict, test_tags):
    correct = '✔' if pred_tag == actual_tag else '✘'
    print(f"{pred_tag}\t\t{actual_tag}\t\t{correct}\t\t{token}")

ADJ_SUP		ADJ_SUP		✔		نخستین
N_SING		N_SING		✔		آهنگ
P		P		✔		از
N_SING		N_SING		✔		آلبوم
N_SING		N_SING		✔		جاده
N_SING		N_SING		✔		ابریشم
CON		CON		✔		که
P		P		✔		به
DET		DET		✔		همین
N_SING		N_SING		✔		نام
P		P		✔		به
N_SING		N_SING		✔		سال
NUM		NUM		✔		۱۹۸۱
P		P		✔		برای
N_SING		N_SING		✔		تیتراژ
NUM		NUM		✔		یک
N_SING		N_SING		✔		سریال
DELM		DELM		✔		-
V_PP		V_PP		✔		ساخته
V_PA		V_PA		✔		شد
DELM		DELM		✔		،
ADV		ADV		✔		نه‌تنها
ADV		ADV		✔		کاملاً
P		P		✔		با
DET		DET		✔		این
N_SING		N_SING		✔		مسیر
DELM		DELM		✔		،
N_SING		N_SING		✔		نقش
ADJ		ADJ		✔		تاریخی
PRO		PRO		✔		آن
CON		CON		✔		و
N_SING		N_SING		✔		حماسه
V_PA		N_PL		✘		رهنوردانش
ADJ		ADJ		✔		هماهنگ
V_PRS		V_PRS		✔		است
DELM		DELM		✔		-
CON		CON		✔		بلکه
ADV		ADV		✔		حتی
CON		CON		✔		اگر
DET		DET		✔		این
N_SING		N_SING		✔		سریال
N_SING		N_SING		✔		مستند
ADJ		ADJ		✔		زیبا
CLITIC		CLITIC		✔		را
CON		CON		✔		هم
V_PP		V_PP		✔		ندیده
V_SUB		V_SUB		✔		باشیم
DELM		DELM		✔		،
P		P		✔		طی
ADJ		ADJ		✔		چند
N_SING		N_SING		✔		دقیقه
P	