In [32]:
import nltk
from sklearn.metrics import classification_report, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
import torch
import torch.nn as nn

#from torch_model_base import TorchModelBase
#from torch_shallow_neural_classifier import TorchShallowNeuralClassifier
from torch_rnn_classifier import TorchRNNDataset, TorchRNNClassifier, TorchRNNModel
import utils

import pandas as pd
from collections import Counter

import json

from torchcrf import CRF
from sklearn.utils import shuffle

In [3]:
# Load the annotated examples: one JSON object per line (JSON Lines).
# note: after running data-preprocessing.ipynb this file already has token-level labels
with open('annotations2.jsonl', encoding='utf-8') as jsonl_file:
    # Skip blank lines (e.g. an empty line at the end of the file); json.loads rejects them.
    lines = [line for line in jsonl_file if line.strip()]
annot = [json.loads(line) for line in lines]

In [4]:
# Reshape the annotations into the parallel-list layout that TorchRNN expects:
# X[k] holds the token texts of example k, y[k] the label of each of those tokens.
X, y = [], []
for example in annot:
    tokens = example['tokens']
    if example['spans'] == []:
        continue  # no annotations for this example -> leave it out
    X.append([token['text'] for token in tokens])
    y.append([token['label'] for token in tokens])
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
#X_train, X_test, y_train, y_test = X[:120], X[120:], y[:120], y[120:]


In [11]:
# Build the token -> id mapping: every distinct token receives the next free
# integer, in order of first appearance in X.
word_to_ix = {}
for token_seq in X:
    for token in token_seq:
        # setdefault leaves an existing id untouched and only assigns new tokens
        word_to_ix.setdefault(token, len(word_to_ix))

In [12]:
# Build the label -> id mapping. dict.fromkeys keeps the first occurrence of each
# label in encounter order, so ids are handed out in order of first appearance in y.
label_to_ix = {
    label: ix
    for ix, label in enumerate(dict.fromkeys(lab for label_seq in y for lab in label_seq))
}

In [18]:
# Replace every token / label by its integer id, keeping the sentence structure.
X_ix = [list(map(word_to_ix.__getitem__, token_seq)) for token_seq in X]
y_ix = [list(map(label_to_ix.__getitem__, label_seq)) for label_seq in y]

In [33]:
# Shuffled train/test split of the index-encoded examples (fixed seed, so repeatable).
train_ratio = 0.75
# int() floors a non-negative float. The previous round(x - 0.5) is not a floor:
# Python rounds halves to the nearest even number, so round(2.5) == 2 and a data set
# of 4 examples would get 2 training examples instead of 3.
# The count has its own name so that it no longer overwrites the train_test_split
# function imported from sklearn.
n_train = int(train_ratio * len(X_ix))
idx = list(range(len(X_ix)))
idx_shuffle = shuffle(idx,random_state=0)
X_shuffle = [X_ix[auxIdx] for auxIdx in idx_shuffle]
y_shuffle = [y_ix[auxIdx] for auxIdx in idx_shuffle]
X_train, X_test = X_shuffle[:n_train], X_shuffle[n_train:]
y_train, y_test = y_shuffle[:n_train], y_shuffle[n_train:]

In [44]:
# Distinct label ids that occur in the training split, in ascending order.
# (The comprehension variable is not called `y`, to avoid shadowing the data list `y`.)
labels = sorted({label for label_seq in y_train for label in label_seq})
# The CRF needs one tag slot per label id. Ids are assigned over the whole data set
# (label_to_ix), so size the tag set by that mapping: counting only the labels seen
# in the training split would leave ids >= numTags whenever a label is missing from it.
numTags = len(label_to_ix)
# note: only the training inputs are measured, because the mask is built for them
seq_lengths = [len(token_seq) for token_seq in X_train]
seq_length_max = max(seq_lengths)

In [25]:
def create_mask(seq_length):
    """Build a padding mask for a batch of variable-length sequences.

    Args:
        seq_length: sequence of non-negative ints, one length per example.

    Returns:
        A torch.uint8 tensor of shape (len(seq_length), max(seq_length)).
        Entry [i, t] is 1 when t < seq_length[i] and 0 otherwise. An empty
        ``seq_length`` gives an empty (0, 0) tensor.

    Raises:
        ValueError: if any length is negative.
    """
    if len(seq_length) == 0:
        return torch.zeros((0, 0), dtype=torch.uint8)
    lengths = torch.as_tensor(seq_length, dtype=torch.long)
    if int(lengths.min()) < 0:
        raise ValueError("sequence lengths must be non-negative")
    # Compare every position index against every length in one broadcasted operation.
    positions = torch.arange(int(lengths.max()))
    return (positions.unsqueeze(0) < lengths.unsqueeze(1)).to(torch.uint8)

In [107]:
def fillList_ofLists(y,seq_length_max,pad_value=0):
    """Right-pad every row of a list of lists to a common length.

    Args:
        y: list of sequences (e.g. token ids or label ids per sentence).
        seq_length_max: target row length.
        pad_value: value appended to short rows (default 0, the original behavior).

    Returns:
        A new list of new lists; the input rows are not modified. Rows that are
        already longer than ``seq_length_max`` are copied unchanged (not truncated).
    """
    # A negative repeat count yields an empty list, so over-long rows get no padding.
    return [list(row) + [pad_value] * (seq_length_max - len(row)) for row in y]

In [108]:
# Pad the training token ids and label ids with 0 up to the longest training
# sequence. Both results are still plain lists of lists, not tensors.
y_train_forCRF = fillList_ofLists(y_train, seq_length_max)
X_train_forCRF = fillList_ofLists(X_train, seq_length_max)
# Matching mask: 1 for real positions, 0 for the padded tail of each sequence.
mask = create_mask(seq_lengths)

In [125]:
# Spot-check one padded label row (example index 1); the zeros at the end are padding.
print(y_train_forCRF[1])

[0, 1, 4, 1, 1, 2, 1, 0, 1, 1, 3, 1, 1, 4, 1, 1, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 1, 1, 1, 1, 1, 10, 10, 1, 1, 1, 1, 1, 1, 11, 11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [137]:
def convert_to_Tensor_andStack(y_train_forCRF):
    """Turn equal-length rows of numbers into one 2-D tensor, one row per example.

    Args:
        y_train_forCRF: list of equal-length lists, e.g. the padded rows
            produced by fillList_ofLists.

    Returns:
        A tensor of shape (len(y_train_forCRF), row_length). A single row also
        gives a 2-D (1, row_length) tensor, and an empty list gives an empty
        (0, 0) int64 tensor.
    """
    if len(y_train_forCRF) == 0:
        return torch.empty((0, 0), dtype=torch.long)
    # One vstack call over all rows: stacking row by row re-copies the growing
    # matrix on every iteration (quadratic in the number of rows).
    return torch.vstack([torch.tensor(row) for row in y_train_forCRF])

# Stack the padded training label rows into a single tensor of tags.
tags = convert_to_Tensor_andStack(y_train_forCRF)

In [None]:
# The mask and the tags are ready.
# TODO: build the emissions tensor - see the description in the notebook

In [None]:
# Set up the CRF layer; the whole training split is treated as one batch.
torch.manual_seed(1)
#seq_length_max = 3  # maximum sequence length in a batch
# X_train_length is not defined anywhere in the visible notebook (NameError on a
# fresh kernel), so the number of training examples is taken from X_train directly.
batch_size = len(X_train)  # number of samples in the batch
model = CRF(numTags,batch_first=True)

#emissions = torch.randn(batch_size, seq_length_max, numTags)
#model(emissions, tags, mask=mask)
#model.decode(emissions,mask=mask)

In [109]:
def convertList_ofLists_bivariate(X_train,y_train): #stitches together 2 separate list of lists w/ identical lengths
    """Zip two parallel lists of lists into one list of lists of pairs.

    Row k of the result is ``list(zip(X_train[k], y_train[k]))``. The number of
    rows follows ``X_train``; within a row, zip stops at the shorter sequence.
    """
    return [
        list(zip(X_train[row], y_train[row]))
        for row in range(len(X_train))
    ]

In [110]:
# Pair each padded token id with the padded label id at the same position, row by row.
a = convertList_ofLists_bivariate(X_train_forCRF,y_train_forCRF)

In [112]:
# Inspect the (token id, label id) pairs of example 1.
print(a[1])

[(362, 0), (392, 1), (99, 4), (4, 1), (2, 1), (1089, 2), (4, 1), (364, 0), (284, 1), (285, 1), (1094, 3), (9, 1), (71, 1), (133, 4), (6, 1), (12, 1), (1095, 9), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (58, 1), (101, 1), (31, 1), (979, 1), (1096, 1), (21, 1), (221, 1), (26, 1), (35, 1), (39, 1), (2, 1), (103, 1), (40, 1), (41, 1), (42, 1), (43, 1), (91, 8), (21, 1), (47, 1), (54, 1), (53, 1), (55, 1), (405, 10), (245, 10), (399, 1), (136, 1), (32, 1), (59, 1), (60, 1), (27, 1), (225, 11), (96, 11), (21, 1), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0,

In [None]:
# Next step: convert into a 3-D tensor of shape (noExamples, maxNoTokensInSentence, noLabels)


In [120]:
# Scratch check of tuple unpacking and Counter on a toy list of pairs.
# The toy data uses its own names: the earlier names (a, c, d) are also used for
# the real token/label pairs and counts, and reusing them here made the results
# of other cells depend on the order in which cells were run.
toy_pairs=[(1,2),(2,3)]
toy_firsts = [first for first,_ in toy_pairs]
print(toy_firsts)
toy_seconds = [second for _,second in toy_pairs]
print(toy_seconds)
toy_counts=Counter(toy_pairs)
print(toy_counts)
toy_second_pair = toy_pairs[1]

[1, 2]
[2, 3]
Counter({(1, 2): 1, (2, 3): 1})


In [None]:
# Open question: is the calculation of emission probabilities below actually necessary, or does the CRF do this for us?

In [50]:
# naive emissions as simple likelihood of each word being classified to each label
# Look at one unpadded training example first: its token ids, then its label ids.
print(X_train[2])
print(y_train[2])

[362, 64, 129, 363, 4, 364, 53, 98, 106, 27, 69, 365, 15, 366, 9, 43, 367, 17, 118, 21, 26, 102, 24, 39, 141, 142, 27, 69, 270, 89, 91, 21, 47, 14, 141, 142, 123, 368, 17, 21, 52, 53, 2, 54, 55, 346, 81, 21, 58, 59, 60, 27, 126, 62, 21]
[0, 1, 1, 2, 1, 0, 1, 1, 4, 1, 1, 1, 1, 3, 1, 1, 9, 1, 1, 1, 1, 1, 1, 1, 4, 4, 4, 4, 4, 1, 8, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 1, 1, 1, 11, 11, 1]


In [98]:
# Count how often each (token id, label id) pair occurs in the training split.
# The pairs are rebuilt here from the unpadded X_train / y_train instead of being
# read from `a`: `a` is assigned in more than one cell (padded pairs, a toy list),
# so its contents depend on execution order, and padded rows would add (0, 0) pairs.
c = Counter(
    pair
    for token_ids, label_ids in zip(X_train, y_train)
    for pair in zip(token_ids, label_ids)
)
d=c.values()
print(sum(d))
print(sum(seq_lengths)) # should match: one pair per real (unpadded) training token
print(c)

5397
5397
Counter({(21, 1): 455, (2, 1): 175, (27, 1): 143, (17, 1): 119, (53, 1): 110, (9, 1): 95, (7, 1): 92, (4, 1): 85, (58, 1): 80, (43, 1): 78, (59, 1): 78, (54, 1): 77, (26, 1): 76, (55, 1): 75, (60, 1): 75, (52, 1): 71, (6, 1): 70, (39, 1): 64, (47, 1): 64, (18, 1): 59, (32, 1): 56, (14, 1): 54, (31, 1): 53, (15, 1): 53, (19, 1): 51, (20, 1): 51, (41, 1): 46, (42, 1): 46, (11, 1): 45, (40, 1): 44, (50, 1): 44, (49, 1): 43, (1, 1): 43, (96, 11): 41, (11, 4): 38, (29, 1): 38, (13, 1): 37, (136, 1): 35, (98, 1): 35, (69, 1): 35, (38, 1): 34, (12, 1): 32, (106, 4): 31, (62, 11): 31, (10, 1): 31, (141, 1): 30, (24, 1): 29, (64, 1): 28, (89, 1): 27, (91, 8): 26, (219, 1): 26, (216, 1): 24, (46, 1): 24, (35, 1): 23, (72, 7): 22, (37, 8): 22, (118, 1): 21, (48, 1): 21, (385, 1): 19, (37, 7): 19, (142, 1): 18, (72, 8): 18, (5, 0): 17, (27, 4): 16, (69, 4): 16, (105, 1): 16, (88, 1): 16, (22, 1): 16, (23, 1): 16, (101, 1): 15, (217, 1): 15, (861, 1): 14, (123, 1): 14, (91, 7): 14, (107, 

In [None]:
# now convert this count dict into a dict w/ the probabilities of each word given each label

In [None]:

# Training vocabulary: the distinct token ids seen in X_train plus an unknown-word marker.
vocab = sorted({w for seq in X_train for w in seq}) + ["$UNK"]
# Size of that vocabulary (previously len(X_train), which is the number of sentences).
vocab_length = len(vocab)
# Number of distinct labels in the training split.
y_train_length = len(labels)

def convertCounter_toMatrix_rawCounts(c,X_train_length,y_train_length):
    """Lay out (row, column) pair counts as a dense matrix.

    Args:
        c: mapping from (row index, column index) tuples to counts, e.g. a
            Counter of (token id, label id) pairs.
        X_train_length: number of rows of the result.
        y_train_length: number of columns of the result.

    Returns:
        A tensor of shape (X_train_length, y_train_length) whose entry [i, j]
        is c[(i, j)] when that key exists and 0 otherwise. Keys that are not
        2-tuples or that fall outside the matrix are ignored.
    """
    auxMatrix=torch.zeros(X_train_length,y_train_length)
    valid_rows, valid_cols = range(X_train_length), range(y_train_length)
    # Walk the stored keys instead of probing every (row, column) cell: the work is
    # then proportional to the number of distinct pairs, not to rows * columns.
    for key, count in c.items():
        if isinstance(key, tuple) and len(key) == 2 and key[0] in valid_rows and key[1] in valid_cols:
            auxMatrix[int(key[0]), int(key[1])] = count
    return auxMatrix
# Rows are indexed by token id and the ids come from word_to_ix, so the matrix gets
# one row per entry of that mapping. (X_train_length, used here before, is not
# defined anywhere in the visible notebook and fails on a fresh kernel.)
e = convertCounter_toMatrix_rawCounts(c,len(word_to_ix),y_train_length)

In [94]:
# Sanity check: the width of a row of e should equal the requested number of columns.
print(len(e[0]))
print(y_train_length)

99
99


In [95]:
# Shape of the padding mask: (number of training sequences, longest training sequence).
print(mask.shape)

torch.Size([99, 117])
