In [86]:
#import necessary libraries 
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from pandas.core.common import random_state

In [87]:
# Load the raw training file as a list of lines (line terminators removed).
with open('/data/train', 'r') as f:
    data = f.read().splitlines()

# Keep only the lines that split into exactly three whitespace-separated
# fields; everything else (e.g. blank separator lines) is dropped.
rows = [cols for cols in map(str.split, data) if len(cols) == 3]

# Build the frame and turn the per-sentence position column into integers.
df = pd.DataFrame(rows, columns = ["index", "token", "tag"]).astype({"index": int})

In [88]:
# Display the parsed training frame (columns: index, token, tag).
df

Unnamed: 0,index,token,tag
0,1,EU,B-ORG
1,2,rejects,O
2,3,German,B-MISC
3,4,call,O
4,5,to,O
...,...,...,...
204562,1,Swansea,B-ORG
204563,2,1,O
204564,3,Lincoln,B-ORG
204565,4,2,O


In [89]:
# Fixed lookup from NER tag string to class id.
ner2idx = {
    'O': 0,
    'B-MISC': 1,
    'I-MISC': 2,
    'I-PER': 3,
    'B-LOC': 4,
    'I-ORG': 5,
    'B-PER': 6,
    'I-LOC': 7,
    'B-ORG': 8,
}

# Ids 0 and 1 are reserved for the padding and unknown-word entries; every
# distinct training token then receives the next free id in order of first
# appearance (the next free id always equals the current dictionary size).
word2idx = {'<PAD>': 0, 'unk': 1}
for word in df['token']:
    word2idx.setdefault(word, len(word2idx))

idx = len(word2idx)  # next unused id

# Vocabulary entries in id order (dicts preserve insertion order).
word_list = list(word2idx)

In [90]:
# Parse the GloVe text file: on every line the first whitespace-separated
# field is the word and the remaining fields are its float32 vector.
glove_embeddings = {}
with open('glove.6B.100d', encoding='utf-8') as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        glove_embeddings[fields[0]] = np.asarray(fields[1:], dtype='float32')

In [91]:
# One row per vocabulary entry is appended to this list later on.
word_embed_matrix = []

# Shared 100-d fallback vector for words without a GloVe entry:
# 50 draws from [-1, 0) followed by 50 draws from [0, 1), each multiplied by
# a random sign. No seed is set, so the vector differs between kernel runs.
negative_half = np.random.uniform(-1, 0, size=50)
positive_half = np.random.uniform(0, 1, size=50)
random_signs = np.random.choice([-1, 1], size=100)
random_array = np.concatenate((negative_half, positive_half)) * random_signs

def is_capitalized(word):
    """Return True when the first character of ``word`` is an uppercase/titlecase letter.

    Args:
        word: the token to inspect (a ``str``).

    Returns:
        bool: ``True`` if the first character is title-cased, ``False``
        otherwise, including for digits, punctuation and the empty string.
    """
    # Slicing instead of indexing: an empty string yields '' (whose
    # ``istitle()`` is False) rather than raising IndexError.
    return word[:1].istitle()

# Build one row per vocabulary entry (in id order): the 100-d vector of the
# lowercased word, or the shared random fallback when the lowercased word has
# no GloVe entry, followed by a single capitalization flag (1 = first letter
# capitalized, 0 = not). The special entries '<PAD>' and 'unk' go through the
# same lookup as ordinary tokens.
for word in word_list:
    capital_flag = np.asarray([1 if is_capitalized(word) else 0])
    base_vector = glove_embeddings.get(word.lower(), random_array)
    word_embedding = np.concatenate([base_vector, capital_flag], axis=0)
    word_embed_matrix.append(word_embedding)

In [92]:
# Containers for the id-encoded sentences and their tag sequences, plus the
# buffers for the sentence currently being assembled.
encoded_sentences = []
encoded_ner_tags = []
sentence = []
ner_tags = []

unk_index = word2idx['unk']

# Walk the frame row by row; a position value of 1 marks the first token of a
# new sentence, at which point the finished buffers are flushed.
for i, (token_position, token, tag) in enumerate(zip(df['index'], df['token'], df['tag'])):
    if token_position == 1:
        if i > 0:
            # nothing to flush before the very first row
            encoded_sentences.append(sentence)
            encoded_ner_tags.append(ner_tags)
        sentence, ner_tags = [], []
    # tokens missing from the vocabulary fall back to the 'unk' id
    sentence.append(word2idx.get(token, unk_index))
    ner_tags.append(ner2idx[tag])

# flush the final sentence, which has no following "position 1" row
encoded_sentences.append(sentence)
encoded_ner_tags.append(ner_tags)

# Right-pad every sequence to the longest one: words with id 0 ('<PAD>') and
# tags with -1.
pad_sequence = nn.utils.rnn.pad_sequence
x = pad_sequence(list(map(torch.tensor, encoded_sentences)), batch_first=True, padding_value=0)
y = pad_sequence(list(map(torch.tensor, encoded_ner_tags)), batch_first=True, padding_value=-1)

# report the resulting tensor shapes
print('Encoded Sentences Shape:', x.shape)
print('Encoded NER Tags Shape:', y.shape)

Encoded Sentences Shape: torch.Size([14987, 113])
Encoded NER Tags Shape: torch.Size([14987, 113])


In [126]:
# define the model architecture
class BiLSTM(nn.Module):
    """Bidirectional LSTM token tagger on top of a pretrained embedding table.

    Pipeline: embedding -> dropout(0.2) -> BiLSTM -> dropout(p=dropout)
    -> Linear(2*hidden_dim, 128) -> dropout(0.2) -> ELU -> Linear(128, output_size).

    Args:
        vocab_size: kept for interface compatibility; the number of embedding
            rows is taken from the pretrained matrix, not from this value.
        embedding_dim: width of one embedding row (LSTM input size).
        hidden_dim: hidden size of each LSTM direction.
        output_size: number of tag classes produced per token.
        num_layers: number of stacked LSTM layers.
        dropout: drop probability applied to the LSTM output.
        pretrained_embeddings: optional 2-D array-like of shape
            (num_words, embedding_dim). When omitted, the notebook-level
            ``word_embed_matrix`` is used, which is what the original
            constructor always did.

    Raises:
        ValueError: if the embedding matrix is not 2-D with ``embedding_dim``
            columns.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size, num_layers, dropout,
                 pretrained_embeddings=None):
        super(BiLSTM, self).__init__()
        if pretrained_embeddings is None:
            # backward-compatible default: fall back to the notebook global
            pretrained_embeddings = word_embed_matrix
        # Stack into one ndarray first (converting a list of ndarrays directly
        # is very slow); torch.tensor copies, so the caller's array is never
        # aliased by the trainable parameter.
        embedding_weights = torch.tensor(np.asarray(pretrained_embeddings), dtype=torch.float32)
        if embedding_weights.dim() != 2 or embedding_weights.shape[1] != embedding_dim:
            raise ValueError(
                "pretrained embeddings must have shape (num_words, %d), got %s"
                % (embedding_dim, tuple(embedding_weights.shape))
            )
        # freeze=False: the vectors are fine-tuned; the row at padding_idx
        # receives no gradient updates.
        self.embedding = nn.Embedding.from_pretrained(embedding_weights, freeze=False, padding_idx=0)
        self.embedding_dropout = nn.Dropout(0.2)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, num_layers=num_layers, bidirectional=True)
        # hidden_dim * 2 because forward and backward states are concatenated
        self.linear = nn.Linear(hidden_dim * 2, 128)
        self.linear_dropout = nn.Dropout(0.2)
        self.dropout = nn.Dropout(p = dropout)
        self.activation = nn.ELU()
        self.classifier = nn.Linear(128, output_size)

    def forward(self, x):
        """Return per-token class scores for a tensor of word ids.

        Args:
            x: integer tensor of word ids; the LSTM is batch-first, so batched
                input is (batch, seq_len).

        Returns:
            Float tensor with the same leading dimensions as ``x`` and a last
            dimension of ``output_size`` (unnormalized scores).
        """
        lstm_out = self.embedding(x)
        lstm_out = self.embedding_dropout(lstm_out)
        lstm_out, _ = self.lstm(lstm_out)
        lstm_out = self.dropout(lstm_out)
        lstm_out = self.linear(lstm_out)
        lstm_out = self.linear_dropout(lstm_out)
        lstm_out = self.activation(lstm_out)
        lstm_out = self.classifier(lstm_out)

        return lstm_out

In [127]:
# define the hyperparameters
num_epochs = 50  # full passes over the training tensors
batch_size = 32  # sentences per optimizer step

output_size = len(ner2idx)  # one output score per NER tag
vocab_size = len(word2idx)  # includes the '<PAD>' and 'unk' entries
embedding_dim = 101  # 100 embedding values + 1 capitalization flag per word
hidden_dim = 256  # hidden size of each LSTM direction
num_layers = 1  # stacked LSTM layers
dropout = 0.33  # drop probability applied to the LSTM output
learning_rate = 0.5  # SGD learning rate, also used as the OneCycleLR max_lr
# per-class loss weights, ordered by ner2idx id (position 0 is 'O', position 8 is 'B-ORG')
weight = [0.4 , 1, 1, 1.5, 1, 1.4, 1.5, 1, 1.2]

In [128]:
# Pick the device once; the model, the loss weights and every batch follow it,
# so the cell also runs on a CPU-only machine (the hard-coded .cuda() calls
# made the CPU fallback unreachable).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# instantiate the model
model = BiLSTM(vocab_size, embedding_dim, hidden_dim, output_size, num_layers, dropout)
model.to(device)

# set up the optimizer and loss function
optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=1e-5)
# ignore_index=-1 skips the padded tag positions; `weight` rescales each class
loss_fn = nn.CrossEntropyLoss(ignore_index=-1, weight=torch.tensor(weight)).to(device)

# The scheduler is stepped once per batch, so it needs the exact number of
# batches: ceil(len(x) / batch_size) per epoch. (`len(x) // batch_size + 1`
# over-counts by one whenever the dataset size is a multiple of batch_size.)
steps_per_epoch = (len(x) + batch_size - 1) // batch_size
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=learning_rate,
                                                total_steps=num_epochs * steps_per_epoch,
                                                anneal_strategy='linear')

model.train()

# loop through the data for the specified number of epochs
for epoch in range(num_epochs):
    epoch_loss = 0
    num_batches = 0

    # Draw a fresh permutation each epoch and index through it, leaving the
    # global x / y tensors untouched so the cell can be re-run safely.
    indices = torch.randperm(len(x))

    # loop through the data in batches
    for i in range(0, len(x), batch_size):
        batch_indices = indices[i:i+batch_size]
        x_batch = x[batch_indices].to(device)
        y_batch = y[batch_indices].to(device)

        # zero out the gradients
        optimizer.zero_grad()

        # run the model on the current batch: (batch, seq_len, num_tags)
        outputs = model(x_batch)

        # flatten to (batch * seq_len, num_tags) vs (batch * seq_len,) for the loss
        loss = loss_fn(outputs.reshape(-1, outputs.shape[-1]), y_batch.reshape(-1))
        loss.backward()
        optimizer.step()
        scheduler.step()

        epoch_loss += loss.item()
        num_batches += 1

    print('Epoch:', epoch+1 ,'Loss:', epoch_loss / num_batches)

Epoch: 1 Loss: 0.6368294587966475
Epoch: 2 Loss: 0.2861291265913418
Epoch: 3 Loss: 0.22020957263103172
Epoch: 4 Loss: 0.17746143159009756
Epoch: 5 Loss: 0.1535319134132313
Epoch: 6 Loss: 0.13279101976564825
Epoch: 7 Loss: 0.11078787096607279
Epoch: 8 Loss: 0.09852053387078649
Epoch: 9 Loss: 0.08399453629125982
Epoch: 10 Loss: 0.07613281777966568
Epoch: 11 Loss: 0.06904096802823674
Epoch: 12 Loss: 0.05686044940121297
Epoch: 13 Loss: 0.053445063213676786
Epoch: 14 Loss: 0.04569810239780845
Epoch: 15 Loss: 0.04133638273577081
Epoch: 16 Loss: 0.040145924974626095
Epoch: 17 Loss: 0.03351839942220193
Epoch: 18 Loss: 0.03047951704935173
Epoch: 19 Loss: 0.026297738981385554
Epoch: 20 Loss: 0.024992812153593356
Epoch: 21 Loss: 0.021948009636550983
Epoch: 22 Loss: 0.02131553989105812
Epoch: 23 Loss: 0.01746241953896621
Epoch: 24 Loss: 0.016528831303009448
Epoch: 25 Loss: 0.015911366991932813
Epoch: 26 Loss: 0.01393965713015553
Epoch: 27 Loss: 0.014316197254321178
Epoch: 28 Loss: 0.01167311320628

In [129]:
# Save only the learned parameters (the state_dict, not the module object)
# under a relative file name, i.e. in the current working directory.
model_name = 'blstm2.pt'
torch.save(model.state_dict(), model_name)

In [130]:
# Load the raw dev file as a list of lines (line terminators removed).
with open('/data/dev', 'r') as f:
    data = f.read().splitlines()

# Collect the lines that split into exactly three whitespace-separated
# fields; anything else (e.g. blank separator lines) is skipped.
rows = []
for cols in map(str.split, data):
    if len(cols) != 3:
        continue
    rows.append(cols)

# Build the dev frame and make the per-sentence position column an integer.
df_dev = pd.DataFrame(rows, columns = ["index", "token", "tag"])
df_dev['index'] = df_dev['index'].astype(int)


In [131]:
def _predict_sentence_tags(tagger, token_ids, id_to_tag):
    """Return the predicted tag string for every word id of one sentence.

    Args:
        tagger: the model to run; it is called on a (1, seq_len) id tensor.
        token_ids: list of integer word ids for a single sentence.
        id_to_tag: mapping from class id to tag string.

    Returns:
        list[str]: one tag per input id; an empty list for an empty sentence.
    """
    if not token_ids:
        return []
    with torch.no_grad():
        # add an explicit batch dimension (the LSTM is batch_first) and drop
        # it again, leaving scores of shape (seq_len, num_tags)
        scores = tagger(torch.LongTensor(token_ids).unsqueeze(0)).squeeze(0)
    return [id_to_tag[tag_id] for tag_id in scores.argmax(dim=1).tolist()]


# Rebuild the architecture and restore the weights from the same relative path
# they were saved to. map_location='cpu' lets a checkpoint written from a GPU
# model load on a CPU-only host; model_2 itself stays on the CPU.
model_2 = BiLSTM(vocab_size, embedding_dim, hidden_dim, output_size, num_layers, dropout)
model_2.load_state_dict(torch.load(model_name, map_location='cpu'))
model_2.eval()

idx2ner = {v: k for k, v in ner2idx.items()}

# predicted NER tag for every row of df_dev, in row order
predicted_tags = []

# word ids of the sentence currently being assembled
dev_encoded_sentence = []

for token_position, token in zip(df_dev['index'], df_dev['token']):

    # a position of 1 starts a new sentence: tag the finished one first
    # (nothing has been collected yet when this is the very first row)
    if token_position == 1:
        if dev_encoded_sentence:
            predicted_tags.extend(_predict_sentence_tags(model_2, dev_encoded_sentence, idx2ner))
        dev_encoded_sentence = []

    # an ALL-CAPS token whose Title-cased form is in the vocabulary is looked
    # up under that form; otherwise use the token itself, falling back to 'unk'
    title_form = token.title()
    if token.isupper() and title_form in word2idx:
        dev_encoded_sentence.append(word2idx[title_form])
    else:
        dev_encoded_sentence.append(word2idx.get(token, word2idx['unk']))

# tag the last sentence, which has no following "position 1" row
predicted_tags.extend(_predict_sentence_tags(model_2, dev_encoded_sentence, idx2ner))

# add the predicted tags to the dataframe
df_dev['pred'] = predicted_tags


In [132]:
# Write one "position token gold predicted" line per row, with a blank line
# between consecutive sentences. Positions restart at 1 for every sentence
# after the first.
with open('dev1_1.out', 'w') as f:
    pieces = []
    seen_first_sentence = False
    count = 1
    columns = zip(df_dev['index'], df_dev['token'], df_dev['tag'], df_dev['pred'])
    for token_position, token, gold_tag, predicted_tag in columns:
        if token_position == 1:
            if seen_first_sentence:
                # sentence boundary: restart numbering and emit the separator
                count = 1
                pieces.append("\n")
            else:
                seen_first_sentence = True
        pieces.append(" ".join([str(count), token, gold_tag, predicted_tag]) + "\n")
        count += 1
    f.write("".join(pieces))

In [116]:
# Show the first rows of the dev frame with gold tags next to the predictions.
df_dev.head(30)

Unnamed: 0,index,token,tag,pred
0,1,CRICKET,O,O
1,2,-,O,O
2,3,LEICESTERSHIRE,B-ORG,B-ORG
3,4,TAKE,O,O
4,5,OVER,O,O
5,6,AT,O,O
6,7,TOP,O,O
7,8,AFTER,O,O
8,9,INNINGS,O,O
9,10,VICTORY,O,O
