In [1]:
# Install all required packages
!pip install hmmlearn



In [0]:
# All imports
import pandas as pd
from collections import defaultdict, Counter
from google.colab import drive
import numpy as np
from hmmlearn import hmm
from tqdm import tqdm

In [3]:
# Connect to Google Drive. We retrieve our data from Google Drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Load the data.
# NOTE: Data must be uploaded to your Google Drive to be used;
# It should be available on the GitHub repo.
raw_data = pd.read_csv("drive/My Drive/types_dataset_utf8.csv")
raw_data = raw_data.fillna(method="ffill")
print(raw_data.tail(5))

  interactivity=interactivity, compiler=compiler, result=result)


               Sentence #    Word  POS Tag
12190865  Sentence: 13342     's'  NaN   O
12190866  Sentence: 13342       ;  NaN   O
12190867  Sentence: 13342  import  NaN   O
12190868  Sentence: 13342     's'  NaN   O
12190869  Sentence: 13342       ;  NaN   O


In [5]:
token_vocab = set()
tag_vocab = set()

def parse_data(data):
    """
    Parses table into an pandas series (1D array) of
    (list-of-words, list-of-tags) pairs.
    """
    construct_word_tag_pair = lambda df: (df['Word'].values.tolist(), df['Tag'].values.tolist())
    return data.groupby('Sentence #').apply(construct_word_tag_pair)

sentence_data = parse_data(raw_data)
sentence_data = sentence_data.sample(frac=1) # shuffle the dataset

test_portion = int(len(sentence_data) * 0.05)

# train on first 95% of the dataset
train_data = sentence_data[:-test_portion]

# test on last 5% of dataset
test_data = sentence_data[-test_portion:]

# temporarily count all tags in order to filter out the rare ones
tmp_tag_counts = Counter()

# collect all tokens and tags
for tokens, tags in tqdm(train_data):
    assert(len(tokens) == len(tags))
    token_vocab |= set(tokens)
    tag_vocab |= set(tags)
    tmp_tag_counts.update(tags)

# restrict number of tags in order to make things compute faster
MAX_TAGS = 100
tag_vocab = set([tag for tag, count in tmp_tag_counts.most_common(MAX_TAGS)])
tag_vocab.add('<unk>') # necessary to cutting down on number of tags in model
token_vocab.add('<unk>') # necessary for inferring new unseen tokens

token_vocab = list(sorted(token_vocab))
tag_vocab = list(sorted(tag_vocab))
print(tag_vocab)

token_map = {token: i for i, token in enumerate(token_vocab)}
tag_map = {tag: i for i, tag in enumerate(tag_vocab)}

num_tokens = len(token_vocab)
num_tags = len(tag_vocab)

# Definition of matrices:
# start_tags[i] = prob(sentence starts with tag i)
# transitions[j][i] = prob(tag i given tag j was previous)
# emissions[j][i] = prob(token i given tag j)

start_tags = np.zeros((num_tags,))
transitions = np.zeros((num_tags, num_tags))
emissions = np.zeros((num_tags, num_tokens))

for tokens, tags in tqdm(train_data):
    tags = [tag if tag in tag_vocab else '<unk>'
            for tag in tags]
    start_tags[tag_map[tags[0]]] += 1
    for i in range(len(tags) - 1):
        transitions[tag_map[tags[i]]][tag_map[tags[i + 1]]] += 1
        emissions[tag_map[tags[i]]][token_map[tokens[i]]] += 1
    emissions[tag_map[tags[len(tags) - 1]]][token_map[tokens[len(tags) - 1]]] += 1

print('token vocab length:', len(token_vocab))
print('tag vocab length:', len(tag_vocab))

100%|██████████| 12676/12676 [00:01<00:00, 10132.31it/s]
  0%|          | 59/12676 [00:00<00:24, 511.71it/s]

['$ArrayConstructor$', '$Assertion$', '$BigNumber$', '$ComponentFixture$', '$Config$', '$Console$', '$ContentServicesPage$', '$DartDebugClient$', '$Date$', '$DateConstructor$', '$Document$', '$EditorState<any>$', '$Element$', '$Equal$', '$Error$', '$ErrorConstructor$', '$Event$', '$ExchangeContract$', '$ExpectStatic$', '$FileModel$', '$FormFieldModel$', '$Function$', '$FunctionConstructor$', '$GomlNode$', '$HTMLDivElement$', '$HTMLElement$', '$HTMLInputElement$', '$IArguments$', '$IFilteringOperation$', '$IPosition$', '$IServiceContainer$', '$IServiceManager$', '$IWorkspaceService$', '$IgxColumnComponent$', '$IgxGridCellComponent$', '$IgxGridComponent$', '$IgxHierarchicalGridComponent$', '$IgxTreeGridComponent$', '$JSON$', '$KeyboardEvent$', '$Location$', '$Math$', '$Object$', '$ObjectConstructor$', '$Partial$', '$Position$', '$Promise$', '$Promise<T>$', '$Promise<any>$', '$Promise<any[]>$', '$Promise<boolean>$', '$Promise<string>$', '$Promise<unknown>$', '$Promise<void>$', '$PromiseCo

100%|██████████| 12676/12676 [00:33<00:00, 373.93it/s]

token vocab length: 94314
tag vocab length: 101





In [6]:
# general constant to prevent zeros for P(prev_tag|tag) or P(token|tag)
SMOOTHING_CONSTANT = 0.01

# add-k smoothing
print('Performing add-k smoothing...')
start_tags += SMOOTHING_CONSTANT
transitions += SMOOTHING_CONSTANT
emissions += SMOOTHING_CONSTANT

# constant to fix class imbalance
SCALE_HYPERPARAM = 0.1

# reduce probabilities of transitioning to 'O'
o_idx = tag_map['O']
print('Scaling O tag probabilities to reduce class imbalance...')
start_tags[o_idx] *= SCALE_HYPERPARAM
transitions[:, o_idx] *= SCALE_HYPERPARAM

# normalize HMM probability tables
print('Normalizing start tag probabilities...')
start_tags = start_tags / np.linalg.norm(start_tags, ord=1)

print('Normalizing previous tag probabilities...')
for i in tqdm(range(num_tags)):
    transitions[i, :] /= np.linalg.norm(transitions[i], ord=1)

print('Normalizing tag emission probabilities...')
for i in tqdm(range(num_tags)):
    emissions[i, :] /= np.linalg.norm(emissions[i], ord=1)

assert(abs(sum(start_tags) - 1) < 0.01)
assert(all([abs(sum(row) - 1) < 0.01 for row in transitions]))
assert(all([abs(sum(row) - 1) < 0.01 for row in emissions]))

100%|██████████| 101/101 [00:00<00:00, 63246.45it/s]
100%|██████████| 101/101 [00:00<00:00, 3025.24it/s]


Performing add-k smoothing...
Scaling O tag probabilities to reduce class imbalance...
Normalizing start tag probabilities...
Normalizing previous tag probabilities...
Normalizing tag emission probabilities...


In [7]:
model = hmm.MultinomialHMM(n_components=len(tag_vocab))
model.startprob_ = start_tags
model.transmat_ = transitions
model.emissionprob_ = emissions

unk_index = token_vocab.index('<unk>')
total_items_to_predict = 0.001  # prevent div by 0 if there is no test data
correctly_predicted = 0
o_idx = tag_vocab.index('O')
for i, data in tqdm(enumerate(test_data)):
    tokens, tags = data
    token_indices = [token_vocab.index(token) if token in token_vocab else unk_index for token in tokens]
    model_input = np.array([token_indices]).T
    pred_indices = model.predict(model_input, )
    pred_tags = [tag_vocab[pred_indices[i]] for i in range(len(pred_indices))]
    total_items_to_predict += sum([tag != 'O' for tag in tags])
    correctly_predicted += sum([real_tag == pred_tag for real_tag, pred_tag in zip(tags, pred_tags) if real_tag != 'O'])
    if i % 100 == 0:
        print(i, 'current accuracy:', correctly_predicted / total_items_to_predict)

print('final accuracy', correctly_predicted / total_items_to_predict)

1it [00:00,  3.48it/s]

0 current accuracy: 0.33329630041106545


101it [03:47,  1.34s/it]

100 current accuracy: 0.6354669106990555


201it [13:31,  1.55it/s]

200 current accuracy: 0.7612161006022585


301it [15:28,  3.43s/it]

300 current accuracy: 0.7482386859890838


401it [19:57,  2.02it/s]

400 current accuracy: 0.7478168611033773


501it [21:38,  2.31it/s]

500 current accuracy: 0.7406736127728282


601it [26:15,  1.22it/s]

600 current accuracy: 0.735260758267766


667it [27:34,  1.09s/it]

final accuracy 0.7312153416187809



