In [23]:
import numpy as np
import pandas as pd

In [24]:
# Part-of-speech tags used below: RB (adverb), NN (noun), TO (the preposition "to")
tags = [
    'RB',
    'NN',
    'TO',
]

In [25]:
# Counts keyed by a pair of tags
transition_counts = {
    # pairs where both tags are the same
    ('NN', 'NN'): 16241, ('RB', 'RB'): 2263, ('TO', 'TO'): 2,
    # pairs of two different tags
    ('NN', 'TO'): 5256, ('RB', 'TO'): 855, ('TO', 'NN'): 734,
    ('NN', 'RB'): 2431, ('RB', 'NN'): 358, ('TO', 'RB'): 200,
}

### Numpy for matrix creation

In [26]:
# Number of tags, kept in 'num_tags'; it sets both dimensions of the matrix
num_tags = len(tags)

# Square num_tags x num_tags array of zeros that will receive the counts
transition_matrix = np.zeros(shape=(num_tags, num_tags))

# Display the (still empty) matrix
transition_matrix

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [27]:
# Display the matrix dimensions as a (rows, columns) tuple
np.shape(transition_matrix)

(3, 3)

In [28]:
# Work on a copy so the original 'tags' list keeps its order,
# then sort the copy in place
sorted_tags = list(tags)
sorted_tags.sort()

# Display the sorted tag list
sorted_tags

['NN', 'RB', 'TO']

In [29]:
# Fill the matrix: position (i, j) receives the count stored for the
# tag pair (sorted_tags[i], sorted_tags[j])
for i, row_tag in enumerate(sorted_tags):
  for j, col_tag in enumerate(sorted_tags):
    # Define tag pair
    tag_tuple = (row_tag, col_tag)
    # Default to 0 when the pair is absent from the dict: a bare .get()
    # returns None, which ends up as NaN in the float matrix and would
    # then propagate into the row sums used for normalization
    transition_matrix[i,j] = transition_counts.get(tag_tuple, 0)

# print matrix
transition_matrix

array([[1.6241e+04, 2.4310e+03, 5.2560e+03],
       [3.5800e+02, 2.2630e+03, 8.5500e+02],
       [7.3400e+02, 2.0000e+02, 2.0000e+00]])

In [30]:
# Helper that prints a matrix with tag labels on both axes
def print_matrix(matrix,sorted_tags):
  """Print `matrix` as a table whose rows and columns are both labelled with `sorted_tags`."""
  labelled = pd.DataFrame(matrix, index=sorted_tags, columns=sorted_tags)
  print(labelled)

In [31]:
# Show the raw counts with tag labels on rows and columns
print_matrix(matrix=transition_matrix, sorted_tags=sorted_tags)

         NN      RB      TO
NN  16241.0  2431.0  5256.0
RB    358.0  2263.0   855.0
TO    734.0   200.0     2.0


In [32]:
# Divide every entry of the matrix by 10
transition_matrix = np.divide(transition_matrix, 10)

# Print scaled matrix
print_matrix(transition_matrix, sorted_tags)

        NN     RB     TO
NN  1624.1  243.1  525.6
RB    35.8  226.3   85.5
TO    73.4   20.0    0.2


In [33]:
# Sum along axis 1 (one total per row); keepdims=True keeps the result
# two-dimensional with a single column
rows_sum = np.sum(transition_matrix, axis=1, keepdims=True)

# Display the row totals
rows_sum

array([[2392.8],
       [ 347.6],
       [  93.6]])

In [34]:
# Divide each row by its own total (rows_sum broadcasts across the columns)
transition_matrix = np.divide(transition_matrix, rows_sum)

# Show the normalized matrix
print_matrix(transition_matrix, sorted_tags)

          NN        RB        TO
NN  0.678745  0.101596  0.219659
RB  0.102992  0.651036  0.245972
TO  0.784188  0.213675  0.002137


In [35]:
# Recompute the per-row totals of the normalized matrix
np.sum(transition_matrix, axis=1, keepdims=True)

array([[1.],
       [1.],
       [1.]])

In [36]:
import math

# Independent copy that the for-loop example will modify
t_matrix_for = transition_matrix.copy()

# Independent copy that the numpy-functions example will modify
t_matrix_np = transition_matrix.copy()

In [37]:
# Add log(row sum) to each diagonal entry, one element at a time
for i in range(num_tags):
  # rows_sum was built with keepdims=True, so rows_sum[i] is a 1-element
  # array; index the scalar explicitly. Passing the array to math.log
  # relies on the implicit array-to-scalar conversion that NumPy has
  # deprecated (DeprecationWarning since NumPy 1.25)
  t_matrix_for[i,i] = t_matrix_for[i,i] + math.log(rows_sum[i,0])

# Print matrix
print_matrix(t_matrix_for,sorted_tags)

          NN        RB        TO
NN  8.458964  0.101596  0.219659
RB  0.102992  6.502088  0.245972
TO  0.784188  0.213675  4.541167


  t_matrix_for[i,i] = t_matrix_for[i,i] + math.log(rows_sum[i])


In [38]:
# Vectorized approach
# Pull the main diagonal of the matrix out as a 1-D array
d = np.diagonal(t_matrix_np)

# Display the shape of the diagonal
np.shape(d)

(3,)

In [39]:
# Turn the 1-D diagonal into a column vector. -1 lets NumPy infer the
# number of rows, so this keeps working if the number of tags changes
# (the previous hard-coded (3,1) only fit exactly three tags)
d = np.reshape(d,(-1,1))

# Print shape of diagonal
d.shape

(3, 1)

In [41]:
# Vectorized counterpart of the loop: apply math.log to every row total
# at once and add the result to the diagonal column vector
d = np.vectorize(math.log)(rows_sum) + d

# Write the updated values back onto the diagonal of the matrix (in place)
np.fill_diagonal(t_matrix_np, d)

# Show the resulting matrix
print_matrix(t_matrix_np, sorted_tags)

          NN        RB        TO
NN  8.458964  0.101596  0.219659
RB  0.102992  6.502088  0.245972
TO  0.784188  0.213675  4.541167


In [42]:
# Element-wise comparison of the loop result and the vectorized result
np.equal(t_matrix_for, t_matrix_np)

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])