In [100]:
import os
import nltk
import numpy as np
import copy
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch.optim as optim
import torch.utils.tensorboard
from torch.utils.tensorboard import SummaryWriter
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [101]:
nltk.download('punkt')

documents = {}
labels = {}

# open main data
for file in os.listdir("/home/valeriya/Desktop/UMD/Computational_linguistic/Project/Hulth2003/Hulth2003/Training"):
    if file.endswith(".abstr"):
        content = open(("%s/%s" % ('/home/valeriya/Desktop/UMD/Computational_linguistic/Project/Hulth2003/Hulth2003/Training', file)), "r").read()
        documents[file.split('.')[0]] = content.split('. ')
        
# open labels        
for file in os.listdir("/home/valeriya/Desktop/UMD/Computational_linguistic/Project/Hulth2003/Hulth2003/Training"):
    if file.endswith(".uncontr"):
        content = open(("%s/%s" % ('/home/valeriya/Desktop/UMD/Computational_linguistic/Project/Hulth2003/Hulth2003/Training', file)), "r").read()
        labels[file.split('.')[0]] = content.split("; ")       

[nltk_data] Downloading package punkt to /home/valeriya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [102]:
# tokenize document
tokenized_documents = {}
for num, ctt in documents.items():
    tokenized_documents[num] = []
    for sentence in ctt:
        words = nltk.word_tokenize(sentence.lower())
        tokenized_documents[num].append(words)

# tokenize labels
tokenized_labels = {}
for num, ctt in labels.items():
    tokenized_labels[num] = []
    for sentence in ctt:
        words = nltk.word_tokenize(sentence.lower())
        tokenized_labels[num].append(words)        

In [103]:
tokenized_labels['1346']

[['automatic', 'multilevel', 'thresholding'],
 ['image', 'segmentation'],
 ['growing', 'time', 'adaptive', 'self-organizing', 'map'],
 ['growing', 'tasom'],
 ['gtasom'],
 ['peak', 'finding', 'process']]

In [104]:
# function find index of element in list of lists
def index(lab, target):
    for i,phr in enumerate(lab):
        for j, w in enumerate(phr):
            if w == target:
                return (j)
    return (None, None)

# create dictionary of labels associated to words
class_labels = copy.deepcopy(tokenized_documents)
for document in tokenized_documents:
    text = tokenized_documents[document]
    lab = tokenized_labels[document]
    lab_flattened = [val for sublist in lab for val in sublist]
    for i, sentence in enumerate(text): 
        for j, word in enumerate(sentence): 
            is_keyphrase = word in lab_flattened
            if is_keyphrase:
                if index(tokenized_labels[document], word) == 0:
                    class_labels[document][i][j] = "first"
                else:
                    class_labels[document][i][j] = 'inside'
            else:
                class_labels[document][i][j] = 'no'

# GLOVE embeddings

In [108]:
import numpy as np
embeddings = dict()
embed_size = 100
f = open('glove.6B/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings[word] = coefs
f.close()

In [109]:
# our vocab
target_vocab = set([item for sublist in X_padded for item in sublist])

#create matrix of glove vectors + random vectors for the words which are in vocab but not in glove
matrix_len = len(target_vocab)
weights_glove = np.zeros((matrix_len, 100))
words_found = 0

for i, word in enumerate(target_vocab):
    if word == 'PAD':
        print(i)
    try: 
        weights_glove[i] = embeddings[word]
        words_found += 1
    except KeyError:
        weights_glove[i] = np.random.normal(scale=0.6, size=(100, ))

10602


# Data padding

In [106]:
# for data we use sentences individually, not documents
copy.deepcopy(tokenized_documents).values()
X = [sent for doc in copy.deepcopy(tokenized_documents).values() for sent in doc]
y = [sent for doc in copy.deepcopy(class_labels).values() for sent in doc]

# padding the data
def Padding(data):
    len_max = len(max(X, key=lambda coll: len(coll)))
    for i, sentence in enumerate(data):
        len_sent = len(sentence)
        len_pad = len_max - len_sent
        sentence.extend(['PAD' for i in range(len_pad)])
    return(data)

X_padded = Padding(X)
y_padded = Padding(y)
X_lengths = len(X_padded[0])

In [115]:
for i, sent in enumerate(X_padded):
    for j, word in enumerate(sent):
        X_padded[i][j] = list(target_vocab).index(word)

In [114]:
X_padded[sent][word]

TypeError: list indices must be integers or slices, not list

# BiLSTM Net

In [89]:
len(target_vocab)

12498

In [98]:
class LSTM(nn.Module):
    def __init__(self, nb_layers = 1, nb_lstm_units=150, embedding_dim=100, batch_size=1):
        super(LSTM, self).__init__()
        self.vocab = target_vocab
        self.tags = {'PAD': 0, 'first': 1, 'inside': 2, 'no': 3}

        self.nb_lstm_layers = nb_layers
        self.nb_lstm_units = nb_lstm_units
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size

        # don't count the padding tag for the classifier output
        self.nb_tags = len(self.tags) - 1


        nb_vocab_words = len(self.vocab)

        # whenever the embedding sees the padding index it'll make the whole vector zeros
        padding_idx = 10602 #(?????)
        print(padding_idx)
        self.word_embedding = nn.Embedding(
            num_embeddings=nb_vocab_words,
            embedding_dim=self.embedding_dim,
            padding_idx=padding_idx
        )
        
        self.word_embedding.load_state_dict({'weight': torch.Tensor(weights_glove)})
        self.word_embedding.weight.requires_grad = False

        # design LSTM
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.nb_lstm_units,
            num_layers=self.nb_lstm_layers,
            batch_first=True,
        )

        # output layer which projects back to tag space
        self.lin1 = nn.Linear(self.nb_lstm_units, nb_lstm_units)
        self.lin2 = nn.Linear(self.nb_lstm_units, self.nb_tags)

    def init_hidden(self):
        # the weights are of the form (nb_layers, batch_size, nb_lstm_units)
        hidden_a = torch.randn(self.nb_lstm_layers, self.batch_size, self.nb_lstm_units).to(device)
        hidden_b = torch.randn(self.nb_lstm_layers, self.batch_size, self.nb_lstm_units).to(device)
        hidden_a = Variable(hidden_a)
        hidden_b = Variable(hidden_b)

        return (hidden_a, hidden_b)

    def forward(self, X, X_lengths = 125):
        # reset the LSTM hidden state. Must be done before you run a new batch. Otherwise the LSTM will treat
        # a new batch as a continuation of a sequence
        #self.hidden = self.init_hidden()
        

        # 1. embed the input
        X = self.word_embedding(X)
        batch_size, seq_len, _ = X.size()

        # 2. Run through RNN
        # pack_padded_sequence so that padded items in the sequence won't be shown to the LSTM
        X = torch.nn.utils.rnn.pack_padded_sequence(x, X_lengths, batch_first=True)

        # now run through LSTM
        X, self.hidden = self.lstm(X, self.hidden)

        # undo the packing operation
        X, _ = torch.nn.utils.rnn.pad_packed_sequence(X, batch_first=True)

        # 3. Project to tag space
        # this one is a bit tricky as well. First we need to reshape the data so it goes into the linear layer
        X = X.contiguous()
        X = X.view(-1, X.shape[2])

        # run through actual linear layer
        X = F.relu(self.lin1(X))
        X = F.relu(self.lin2(X))

        # I like to reshape for mental sanity so we're back to (batch_size, seq_len, nb_tags)
        X = X.view(batch_size, seq_len, self.nb_tags)

        Y_hat = X
        return Y_hat

In [99]:
net = LSTM()
criterion= torch.nn.CrossEntropyLoss(size_average=True, ignore_index=0)
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

10602




In [None]:
# we do not understand what to feed to the neural network (embedding layer)

In [62]:
# Artificial example from https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html

In [52]:
import torch
import torchtext
from torchtext.datasets import text_classification
NGRAMS = 2
import os
if not os.path.isdir('./.data'):
    os.mkdir('./.data')
train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](
    root='./.data', ngrams=NGRAMS, vocab=None)
BATCH_SIZE = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

ag_news_csv.tar.gz: 11.8MB [00:00, 15.0MB/s]
120000lines [00:07, 16624.30lines/s]
120000lines [00:14, 8161.90lines/s]
7600lines [00:00, 7819.62lines/s]


In [53]:
import torch.nn as nn
import torch.nn.functional as F
class TextSentiment(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)


In [54]:
VOCAB_SIZE = len(train_dataset.get_vocab())
EMBED_DIM = 32
NUN_CLASS = len(train_dataset.get_labels())
model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(device)

In [55]:
def generate_batch(batch):
    label = torch.tensor([entry[0] for entry in batch])
    text = [entry[1] for entry in batch]
    offsets = [0] + [len(entry) for entry in text]
    # torch.Tensor.cumsum returns the cumulative sum
    # of elements in the dimension dim.
    # torch.Tensor([1.0, 2.0, 3.0]).cumsum(dim=0)

    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = torch.cat(text)
    return text, offsets, label


In [61]:
data = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                      collate_fn=generate_batch)
for i, (text, offsets, cls) in enumerate(data):
    print(text)

data = 

tensor([   552,  11636,    634,  ..., 383665,  13244,  17096])
tensor([    52,    224,    117,  ..., 357480, 140625, 179648])
tensor([ 3693,  9570,   149,  ...,  5759,  6022, 34682])
tensor([38988,  1009, 11940,  ..., 25334,  9809,  3806])
tensor([   289,    557,     17,  ...,     23, 162050, 370064])
tensor([  516,   683,   303,  ...,  8579, 29744, 19464])
tensor([   27,  8557,  3463,  ..., 60397,   193,   558])
tensor([   3,  919,   24,  ..., 1259,  715,  211])
tensor([  1494,      5,   1030,  ..., 160295, 180869,    850])
tensor([   536,   2710,      8,  ..., 702777,   2085,   3629])
tensor([   1789,     377,       4,  ...,  886456, 1105467,    1204])
tensor([  599,   195,     8,  ..., 75663,   777,   682])
tensor([  1435,  27958,  28323,  ..., 127842, 296434,  26847])
tensor([41477, 24213,  2126,  ..., 91683,    37,    37])
tensor([    64,      2,    601,  ..., 723749, 769934,  13798])
tensor([   492, 103724,  20795,  ...,  88971, 124458,  56987])
tensor([ 6686,  1320,     5,  ...,

tensor([  1494,     13, 254721,  ...,  22237,   2431,    126])
tensor([  206,   566,    11,  ..., 46346,  8666, 13022])
tensor([10919,    13,    10,  ...,    26,   143,  3139])
tensor([   387,    378,   2756,  ..., 133734,   4809,  10951])
tensor([     3,    919,     24,  ...,  85345,  31405, 235372])
tensor([ 11805,    291,      5,  ...,  53772, 147732,  51085])
tensor([   1335,     210,     611,  ...,    6097, 1275846,   85598])
tensor([ 774, 5010,  537,  ...,  735,  360,  790])
tensor([   415,  19425,     17,  ..., 294265, 266240, 158765])
tensor([  3812,      4,   2672,  ...,  91310, 198619, 101482])
tensor([   2399,    1855,    6123,  ...,   18147, 1077549, 1251618])
tensor([13208,   320,     8,  ...,  1318, 20570, 15149])
tensor([1804, 2098, 1035,  ..., 5976, 6643,  211])
tensor([  2683,   6413,    537,  ...,   4735, 192129,    508])
tensor([15774,     2,   232,  ..., 67230, 68750, 68208])
tensor([     45,      78,   19501,  ...,   88925, 1237561,   16601])
tensor([  719,   966, 

tensor([ 1434,  2678, 21501,  ..., 16120,  9942,  1204])
tensor([ 39021,   1605,   1900,  ..., 161286,  13627,  41088])
tensor([  3324,    881,    812,  ...,     26, 458855, 738771])
tensor([  1666,   3812,    537,  ..., 410589, 379156,  28056])
tensor([   651,   1369,    615,  ..., 260122,  16606,   6620])
tensor([   2904,      22,   32727,  ...,   47992,  497216, 1094561])
tensor([113173, 202316,      5,  ..., 314768,   4989,   3965])
tensor([   567,   2435,   7630,  ...,  98439, 288009,  11183])
tensor([47866,    17,    10,  ..., 15019, 17416,   508])
tensor([ 15259,   5243,     91,  ..., 194869, 576273, 349883])
tensor([ 25730,      5,    615,  ...,   3671, 434083,  23884])
tensor([  4667,   1371, 799844,  ...,    925, 209353,  13708])
tensor([  2790,      4,   7036,  ...,   5061, 777412,     89])
tensor([    45,   3516,   6412,  ...,   1419, 828660, 736595])
tensor([    259,    3974,     978,  ...,  195435, 1012474,  246775])
tensor([ 983, 3567,  473,  ...,  141,  788, 2668])
tens

tensor([    34,    874,      3,  ...,  80935, 224565, 102437])
tensor([ 4341,  9369,   271,  ..., 31547,  2167,  3598])
tensor([  5413,    783,     45,  ...,   1620, 100158,  25475])
tensor([  101,  7170,   429,  ..., 36636, 34164, 17315])
tensor([ 19843,  15234,  27272,  ..., 272599,  97708,   3082])
tensor([ 10590,   2864,  42409,  ..., 576143, 712438,  49066])
tensor([  2029, 148632,    322,  ...,  19930,  12653, 980962])
tensor([  863, 84824,  1005,  ...,   108, 16135,  6090])
tensor([ 1409,   537,  1969,  ..., 99720, 60737, 11810])
tensor([  516,  5510,  1338,  ..., 90706, 38838,  9117])
tensor([127205,  30571,   2153,  ...,   4641, 464222, 408425])
tensor([16369,  1478,    21,  ...,  5055, 49829, 12291])
tensor([   102,     61,    234,  ..., 461046, 473948, 250146])
tensor([ 2733,   886,   535,  ...,  8952, 15122, 11869])
tensor([  1472,   2654,      5,  ...,  53264,  24317, 169101])
tensor([ 6534,     9,   719,  ..., 71150, 25850, 27480])
tensor([2962,   17,  266,  ..., 6673,  3

tensor([  4967,  46424,   7694,  ...,  38569, 160086,  79927])
tensor([25814,  1431,  3274,  ..., 41562,  5870, 83061])
tensor([    536,      22,    1376,  ..., 1230614,  260767,  103144])
tensor([ 512,  212,    5,  ...,   26, 2389, 3806])
tensor([  7878,   1329,    939,  ..., 304297, 195064, 126295])
tensor([  2274,    371,   2649,  ...,  32515, 376072,   6623])
tensor([  3250,   2719,  24983,  ..., 316757, 184095, 389723])
tensor([    78,     53,     11,  ..., 101286,   3453, 113224])
tensor([ 1413,    30,    47,  ..., 33709, 25851,  6623])
tensor([  3474,   4424,    759,  ...,  15827, 256631,  64162])
tensor([  206,   595,    11,  ...,  3816, 40890, 54744])
tensor([167733,  30445,    305,  ...,    462, 137864, 850525])
tensor([   727,   9024,  10110,  ..., 107250, 142395, 148543])
tensor([  8114,     13,     10,  ..., 491401, 522484, 138814])
tensor([  3647,   1425,   3555,  ...,  10609,   1440, 845637])
tensor([   8829,      17,  239635,  ..., 1189612,  119980,   31363])
tensor([26

tensor([   3, 3360,    7,  ..., 4370, 2210, 2200])
tensor([ 8621, 47586,     3,  ..., 94033, 47553,  2605])
tensor([  5774,     20,  25199,  ..., 183739, 111365,  59944])
tensor([ 3038,  1170,     5,  ..., 12788, 13589, 37154])
tensor([   271,   6902,    877,  ..., 217072,  42157, 105023])
tensor([   1031,    2799,      53,  ...,    8130,  222394, 1003280])
tensor([69255,   169,    53,  ..., 52566, 21853,    29])
tensor([  438,   628, 62817,  ...,  2755,  1354,   558])
tensor([  802,    14,    83,  ...,   428, 62861,   211])
tensor([  4601,    254,    150,  ..., 122267,    162,    508])
tensor([  368, 93775,  1369,  ..., 22013,  1101,   211])
tensor([    58,    900,  17876,  ...,  19584, 530973,    549])
tensor([1707,  720,    6,  ...,   23, 9324, 3742])
tensor([  512,   112,    45,  ...,  3793,  3878, 21672])
tensor([    8, 22715,     7,  ..., 78974,   616,   682])
tensor([3236,    4,  495,  ...,  204, 3216, 4818])
tensor([  8278,   1136,      5,  ..., 329347,   2958,   9574])
tensor(

tensor([   101,      5,  14662,  ..., 543083, 734701, 132039])
tensor([ 2918,  3339,  2696,  ..., 11874,  4196,  6479])
tensor([  9374,   3477,    763,  ..., 524104, 523402, 521733])
tensor([2966,   13,   10,  ...,  177,  162,  508])
tensor([   1944,    4314,    3259,  ...,     207, 1209404,  761916])
tensor([  656,  1429,   328,  ..., 81759, 10029, 17713])
tensor([  101,   794,  3835,  ...,  3052, 47306, 20356])
tensor([ 3477, 76477,    11,  ...,  1564,  1877,  4261])
tensor([  278,  5123,  6400,  ..., 82087,  7376, 19934])
tensor([612665,  10063,   1427,  ...,   4597,    457,   2491])
tensor([ 2373,     2,  5088,  ..., 50490,   435,   949])
tensor([  5737,   2132,    387,  ...,  79965, 824963,  79980])
tensor([  5655,   1124,   7516,  ..., 703562, 348010,   8778])
tensor([   175,   3587,  14730,  ..., 224672,   5445,    211])
tensor([  2945,   1522,      7,  ..., 230222, 697311,  77302])
tensor([ 5080,  4394,  3942,  ..., 21715,   360,   790])
tensor([11768,  4410,     5,  ...,  7093

tensor([    64,      2,     10,  ..., 281802,     37,     37])
tensor([  719,    13,    10,  ...,  8415, 16837,   850])
tensor([   505,  11065,  21516,  ..., 331926,  63657, 628081])
tensor([ 1362,    13,    10,  ..., 10817, 18312,   211])
tensor([  10,    2,  941,  ..., 1387, 1306, 3705])
tensor([  1655,   1815,   6621,  ...,   1104, 351532,  98954])
tensor([  592, 22469,  1086,  ...,  5266,  2224, 11081])
tensor([  2196,   7022,     34,  ..., 191905, 517515,    790])
tensor([  16880,    1175,     883,  ..., 1305987,   11159,      89])
tensor([  5041,   1285,    343,  ...,    786, 201405,  11387])
tensor([10207,  1509,  2683,  ...,    29,  1011,   682])
tensor([   7695,    1946,     789,  ..., 1135531, 1186927,   16390])
tensor([ 3287,     6, 14679,  ...,  7865,    37,    37])
tensor([12520,  2060,   476,  ..., 14322, 12731, 11152])
tensor([  6129,   4059,    885,  ...,    180, 191571, 330062])
tensor([  835,   454,  1135,  ...,  4881, 20268, 30909])
tensor([ 20353,   1880,    987,  .

tensor([  528,   273,   110,  ...,    26, 23359, 64431])
tensor([   401,   2894,   1218,  ..., 201002,  84837,  96754])
tensor([  4876,   1192,   2229,  ..., 126948,  11101,   8409])
tensor([  1019,   2505,      5,  ..., 423969,     37,     37])
tensor([ 29784,  12005,      8,  ..., 128825,   1542, 188634])
tensor([103391,    390,     13,  ..., 950254, 427398,  46293])
tensor([  368,   212, 15935,  ..., 27073, 10288,  1938])
tensor([18063,  1136,   198,  ...,  1844,  1929, 10759])
tensor([616365,   3526,   4029,  ...,  33385,  41504,   2862])
tensor([    45,   2961,      5,  ..., 366227, 330149,   4226])
tensor([719, 811,   4,  ..., 177, 193, 558])
tensor([    24,     30,    320,  ...,  83436,    152, 104083])
tensor([    913,  605028,     313,  ...,  817388, 1041919,     534])
tensor([   505,    539,    137,  ...,  51977, 439635,  32095])
tensor([ 84258, 323933,      4,  ...,  31022,  28984,  30953])
tensor([    913,     667,    3731,  ..., 1083332,  932939,   57222])
tensor([   433, 

TypeError: 'DataLoader' object does not support indexing