In [25]:
import string
from collections import defaultdict
import numpy as np
import pandas as pd

In [None]:
#load the data->from the data(get words,count freq,delete,sort) to get a vocabulary->Processing new text sources(apply assign_unk to label those not in vocab

In [2]:
#Read Text Data
with open("WSJ_02-21.pos", 'r') as f:
    lines = f.readlines()

In [3]:
# Print columns for reference
print("\t\tWord", "\tTag\n")

# Print first five lines of the dataset
for i in range(5):
    print(f'line number {i+1}: {lines[i]}')

		Word 	Tag

line number 1: In	IN

line number 2: an	DT

line number 3: Oct.	NNP

line number 4: 19	CD

line number 5: review	NN



In [4]:
# Print first line (unformatted)
lines[0]

'In\tIN\n'

In [6]:
#Creating a vocabulary
# Get the words from each line in the dataset
words = [line.split('\t')[0] for line in lines]

In [12]:
# Define defaultdict of type 'int'
freq = defaultdict(int)#return the "zero" value of a type if you try to access a key that does not exist.

# Count frequency of ocurrence for each word in the dataset
for word in words:
    freq[word] += 1

In [14]:
#filter out words that appeared only once and also words that are just a newline character
#Create the vocabulary by filtering the 'freq' dictionary
vocab = [k for k, v in freq.items() if (v > 1 and k != '\n')]

In [16]:
# Sort the vocabulary
vocab.sort()

# Print some random values of the vocabulary
for i in range(4000, 4005):
    print(vocab[i])

Early
Earnings
Earth
Earthquake
East


In [None]:
########Processing new text sources

In [None]:
#unknown word tokens
#This function will do the following checks and return an appropriate token:

#Check if the unknown word contains any character that is a digit
#return --unk_digit--
#Check if the unknown word contains any punctuation character
#return --unk_punct--
#Check if the unknown word contains any upper-case character
#return --unk_upper--
#Check if the unknown word ends with a suffix that could indicate it is a noun, verb, adjective or adverb
#return --unk_noun--, --unk_verb--, --unk_adj--, --unk_adv-- respectively

In [23]:
def assign_unk(word):
    """
    Assign tokens to unknown words
    """
    
    # Punctuation characters
    # Try printing them out in a new cell!
    punct = set(string.punctuation)
    
    # Suffixes
    noun_suffix = ["action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood", "ion", "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry", "scape", "ship", "ty"]
    verb_suffix = ["ate", "ify", "ise", "ize"]
    adj_suffix = ["able", "ese", "ful", "i", "ian", "ible", "ic", "ish", "ive", "less", "ly", "ous"]
    adv_suffix = ["ward", "wards", "wise"]

    # Loop the characters in the word, check if any is a digit
    if any(char.isdigit() for char in word):
        return "--unk_digit--"

    # Loop the characters in the word, check if any is a punctuation character
    elif any(char in punct for char in word):
        return "--unk_punct--"

    # Loop the characters in the word, check if any is an upper case character
    elif any(char.isupper() for char in word):
        return "--unk_upper--"

    # Check if word ends with any noun suffix
    elif any(word.endswith(suffix) for suffix in noun_suffix):
        return "--unk_noun--"

    # Check if word ends with any verb suffix
    elif any(word.endswith(suffix) for suffix in verb_suffix):
        return "--unk_verb--"

    # Check if word ends with any adjective suffix
    elif any(word.endswith(suffix) for suffix in adj_suffix):
        return "--unk_adj--"

    # Check if word ends with any adverb suffix
    elif any(word.endswith(suffix) for suffix in adv_suffix):
        return "--unk_adv--"
    
    # If none of the previous criteria is met, return plain unknown
    return "--unk--"

In [20]:
#Getting the correct tag for a word
def get_word_tag(line, vocab):
    # If line is empty return placeholders for word and tag
    if not line.split():
        word = "--n--"
        tag = "--s--"
    else:
        # Split line to separate word and tag
        word, tag = line.split()
        # Check if word is not in vocabulary
        if word not in vocab: 
            # Handle unknown word
            tag = assign_unk(word)
    return word, tag

In [21]:
get_word_tag('In\tIN\n', vocab)

('In', 'IN')

In [24]:
get_word_tag('scrutinize\tVB\n', vocab)#This line includes a verb that is not present in the vocabulary

('scrutinize', '--unk_verb--')

In [None]:
#################

In [27]:
#toy model tags
# Define tags for Adverb, Noun and To (the preposition) , respectively
tags = ['RB', 'NN', 'TO']

In [28]:
# Define 'transition_counts' dictionary
# Note: values are the same as the ones in the assignment
transition_counts = {
    ('NN', 'NN'): 16241,
    ('RB', 'RB'): 2263,
    ('TO', 'TO'): 2,
    ('NN', 'TO'): 5256,
    ('RB', 'TO'): 855,
    ('TO', 'NN'): 734,
    ('NN', 'RB'): 2431,
    ('RB', 'NN'): 358,
    ('TO', 'RB'): 200
}

In [38]:
#create transition matrix
sorted_tags = sorted(tags)
transition_matrix = np.zeros((len(tags), len(tags)))

# Loop rows
for i in range(len(tags)):
    # Loop columns
    for j in range(len(tags)):
        # Define tag pair
        tag_tuple = (sorted_tags[i], sorted_tags[j])
        # Get frequency from transition_counts dict and assign to (i, j) position in the matrix
        transition_matrix[i, j] = transition_counts.get(tag_tuple)

# Print matrix
transition_matrix

array([[1.6241e+04, 2.4310e+03, 5.2560e+03],
       [3.5800e+02, 2.2630e+03, 8.5500e+02],
       [7.3400e+02, 2.0000e+02, 2.0000e+00]])

In [39]:
# Define 'print_matrix' function
def print_matrix(matrix):
    print(pd.DataFrame(matrix, index=sorted_tags, columns=sorted_tags))

In [40]:
print_matrix(transition_matrix)

         NN      RB      TO
NN  16241.0  2431.0  5256.0
RB    358.0  2263.0   855.0
TO    734.0   200.0     2.0


In [44]:
transition_matrix = transition_matrix/10

In [None]:
#matrix manipulation

In [45]:
# Compute sum of row for each row
rows_sum = transition_matrix.sum(axis=1, keepdims=True)

# Print sum of rows
rows_sum

array([[2392.8],
       [ 347.6],
       [  93.6]])

In [46]:
# Normalize transition matrix
transition_matrix = transition_matrix / rows_sum

# Print normalized matrix
print_matrix(transition_matrix)

          NN        RB        TO
NN  0.678745  0.101596  0.219659
RB  0.102992  0.651036  0.245972
TO  0.784188  0.213675  0.002137


In [47]:
#######alternative
smoothed_matrix = np.zeros_like(transition_matrix)
e = 0.001
for i in range(len(tags)):
    row_sum = np.sum(transition_matrix[i, :])  # every row's sum
    for j in range(len(tags)):
        smoothed_matrix[i, j] = (transition_matrix[i, j] + e) / (row_sum + len(tags) * e)


In [48]:
print_matrix(smoothed_matrix)

          NN        RB        TO
NN  0.677711  0.102290  0.219999
RB  0.103681  0.650085  0.246234
TO  0.782840  0.214033  0.003127


In [49]:
transition_matrix.sum(axis=1, keepdims=True)

array([[1.],
       [1.],
       [1.]])

In [50]:
import math

# Copy transition matrix for for-loop example
t_matrix_for = np.copy(transition_matrix)

# Copy transition matrix for numpy functions example
t_matrix_np = np.copy(transition_matrix)

In [52]:
# Loop values in the diagonal
for i in range(len(tags)):
    t_matrix_for[i, i] =  t_matrix_for[i, i] + math.log(rows_sum[i])

# Print matrix
print_matrix(t_matrix_for)

          NN        RB        TO
NN  8.458964  0.101596  0.219659
RB  0.102992  6.502088  0.245972
TO  0.784188  0.213675  4.541167


  t_matrix_for[i, i] =  t_matrix_for[i, i] + math.log(rows_sum[i])


In [53]:
# Save diagonal in a numpy array
d = np.diag(t_matrix_np)

# Print shape of diagonal
d.shape

(3,)

In [54]:
# Reshape diagonal numpy array
d = np.reshape(d, (3,1))

# Print shape of diagonal
d.shape

(3, 1)

In [55]:
d#diagonal of the matrix

array([[0.67874457],
       [0.65103567],
       [0.00213675]])

In [56]:
#by applying the math.log() function to the rows_sum array and adding the diagonal
# Perform the vectorized operation
d = d + np.vectorize(math.log)(rows_sum)

# Use numpy's 'fill_diagonal' function to update the diagonal
np.fill_diagonal(t_matrix_np, d)

# Print the matrix
print_matrix(t_matrix_np)

          NN        RB        TO
NN  8.458964  0.101596  0.219659
RB  0.102992  6.502088  0.245972
TO  0.784188  0.213675  4.541167


In [57]:
# Check for equality
t_matrix_for == t_matrix_np

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])