In [22]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [3]:
# Load the training file: one whitespace-separated record per line.
with open('/data/train', 'r') as f:
    data = f.read().splitlines()

# Keep only lines that split into exactly three fields (index, token, tag);
# anything else - blank lines included - is dropped.
rows = [cols for cols in (line.split() for line in data) if len(cols) == 3]

# Build the frame and cast the within-sentence position to an integer so it
# can be compared numerically later on.
df = pd.DataFrame(rows, columns = ["index", "token", "tag"]).astype({"index": int})

In [23]:
# Inspect the parsed training frame (columns: index, token, tag).
df

Unnamed: 0,index,token,tag
0,1,EU,B-ORG
1,2,rejects,O
2,3,German,B-MISC
3,4,call,O
4,5,to,O
...,...,...,...
204562,1,Swansea,B-ORG
204563,2,1,O
204564,3,Lincoln,B-ORG
204565,4,2,O


In [24]:
# Vocabulary: ids 0 and 1 are reserved for padding and unknown words; every
# distinct training token then receives the next free id in order of first
# appearance.
word2idx = {'<PAD>': 0, '<UNK>': 1}

for token in df['token']:
    # len(word2idx) is evaluated before a possible insert, so it is exactly
    # the next unused id; tokens already present keep their id.
    word2idx.setdefault(token, len(word2idx))

idx = len(word2idx)  # next free id

In [25]:
# Tag inventory -> class id.  The ids are used as class indices for the loss,
# so the ordering must stay in sync with the per-class `weight` list.
ner2idx = {'O': 0, 'B-MISC':1, 'I-MISC':2, 'I-PER':3, 'B-LOC':4, 'I-ORG':5, 'B-PER':6, 'I-LOC':7, 'B-ORG': 8}

# Finished sentences (as id lists) plus the sentence currently being built.
encoded_sentences = []
encoded_ner_tags = []
sentence = []
ner_tags = []

# Walk the three columns directly instead of df.iterrows(): iterrows creates
# a Series object for every row, which is far slower on a frame of this size
# and adds nothing here.
for position, token, tag in zip(df['index'], df['token'], df['tag']):
    if position == 1:  # first token of a new sentence
        # Flush the previous sentence.  Testing the buffer (rather than the
        # row label) skips the very first row without assuming the frame
        # carries the default 0..n-1 index.
        if sentence:
            encoded_sentences.append(sentence)
            encoded_ner_tags.append(ner_tags)
        sentence = []
        ner_tags = []
    # Words missing from the vocabulary fall back to <UNK>.
    sentence.append(word2idx.get(token, word2idx['<UNK>']))
    ner_tags.append(ner2idx[tag])

# The last sentence has no following "index == 1" row to trigger its flush.
encoded_sentences.append(sentence)
encoded_ner_tags.append(ner_tags)

# Length of the longest sentence.  pad_sequence below always pads to the
# longest sequence it is given, so this is the resulting second dimension;
# it is kept for inspection only and does not control the padding.
max_sentence_length = max(len(sent) for sent in encoded_sentences)
max_len = max_sentence_length

# One LongTensor per sentence, then stack into (num_sentences, max_len).
# Tokens are padded with 0 (<PAD>), tags with -1, which the loss ignores via
# ignore_index=-1.
x = [torch.LongTensor(sent) for sent in encoded_sentences]
y = [torch.LongTensor(tags) for tags in encoded_ner_tags]
x = nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=0)
y = nn.utils.rnn.pad_sequence(y, batch_first=True, padding_value=-1)

# Tokens and tags must line up position by position.
assert x.shape == y.shape, "token and tag tensors are misaligned"

# print the shape of the tensors
print('Encoded Sentences Shape:', x.shape)
print('Encoded NER Tags Shape:', y.shape)

Encoded Sentences Shape: torch.Size([14987, 113])
Encoded NER Tags Shape: torch.Size([14987, 113])


## Model Training

In [26]:
# Sequence tagger: embedding -> BiLSTM -> dropout -> linear -> ELU -> classifier
class BiLSTM(nn.Module):
    """Bidirectional LSTM that emits one score vector per input token.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_size: number of classes scored for every token.
        num_layers: number of stacked LSTM layers.
        dropout: drop probability applied to the LSTM outputs.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size, num_layers, dropout):
        super().__init__()
        # Row 0 is the padding embedding (padding_idx=0).
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=0,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )
        # Forward and backward states are concatenated, hence hidden_dim * 2.
        self.linear = nn.Linear(in_features=hidden_dim * 2, out_features=128)
        self.dropout = nn.Dropout(p=dropout)
        self.activation = nn.ELU()
        self.classifier = nn.Linear(in_features=128, out_features=output_size)

    def forward(self, x):
        """Return per-token class scores for the token-id tensor ``x``."""
        hidden_states, _ = self.lstm(self.embedding(x))
        projected = self.linear(self.dropout(hidden_states))
        return self.classifier(self.activation(projected))

In [29]:
# --- optimisation schedule ---
num_epochs = 120
batch_size = 8
learning_rate = 0.5  # SGD learning rate, also the peak of the one-cycle schedule
dropout = 0.33

# --- model dimensions ---
embedding_dim = 100
hidden_dim = 256  # per LSTM direction
num_layers = 1
vocab_size = len(word2idx)   # includes <PAD> and <UNK>
output_size = len(ner2idx)   # one logit per tag

# --- per-class loss weights, listed in ner2idx id order ---
weight = [
    0.7,  # O
    1,    # B-MISC
    1,    # I-MISC
    1.5,  # I-PER
    1,    # B-LOC
    1,    # I-ORG
    1.5,  # B-PER
    1,    # I-LOC
    1.2,  # B-ORG
]

In [30]:
# Pick the device once, up front, and route every tensor through it.  Calling
# .cuda() directly would crash on a machine without a GPU even though a CPU
# fallback is computed here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# instantiate the model and move it to the device before building the optimizer
model = BiLSTM(vocab_size, embedding_dim, hidden_dim, output_size, num_layers, dropout)
model.to(device)

# set up the optimizer and loss function; padded tag positions (-1) are ignored
optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=1e-5)
loss_fn = nn.CrossEntropyLoss(ignore_index=-1, weight=torch.tensor(weight)).to(device)

# The scheduler is stepped once per batch, so it needs the exact number of
# optimizer steps: ceil(len(x) / batch_size) batches per epoch.
num_sentences = len(x)
batches_per_epoch = (num_sentences + batch_size - 1) // batch_size
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=learning_rate,
                                                total_steps=num_epochs * batches_per_epoch,
                                                anneal_strategy='linear')

model.train()

# loop through the data for the specified number of epochs
for epoch in range(num_epochs):
    print('Epoch:', epoch+1)
    epoch_loss = 0
    num_batches = 0

    # Fresh shuffle every epoch.  x and y are indexed through the permutation
    # rather than overwritten, so re-running this cell starts from the same
    # tensors.
    indices = torch.randperm(num_sentences)

    # loop through the data in batches
    for i in range(0, num_sentences, batch_size):
        batch_indices = indices[i:i+batch_size]
        x_batch = x[batch_indices].to(device)
        y_batch = y[batch_indices].to(device)

        # zero out the gradients
        optimizer.zero_grad()

        # run the model on the current batch
        outputs = model(x_batch)

        # Flatten the batch and sequence dimensions so every token is one
        # classification example, then update the parameters.
        loss = loss_fn(outputs.view(-1, len(ner2idx)), y_batch.view(-1))
        loss.backward()
        optimizer.step()
        scheduler.step()

        epoch_loss += loss.item()
        num_batches += 1

    # mean batch loss for the epoch
    print('Loss:', epoch_loss / num_batches)

Epoch: 1
Loss: 0.7162939658668787
Epoch: 2
Loss: 0.3788956966501608
Epoch: 3
Loss: 0.24828953139648924
Epoch: 4
Loss: 0.1837484407208347
Epoch: 5
Loss: 0.14175209745817163
Epoch: 6
Loss: 0.11480066830825593
Epoch: 7
Loss: 0.10024724021532126
Epoch: 8
Loss: 0.09020604242494537
Epoch: 9
Loss: 0.08094813601388308
Epoch: 10
Loss: 0.0752748068520918
Epoch: 11
Loss: 0.06962458150826248
Epoch: 12
Loss: 0.06404324987229346
Epoch: 13
Loss: 0.06288156728777573
Epoch: 14
Loss: 0.06140028297850251
Epoch: 15
Loss: 0.059183932386798664
Epoch: 16
Loss: 0.049913060853970494
Epoch: 17
Loss: 0.048876101991341105
Epoch: 18
Loss: 0.043591474900831415
Epoch: 19
Loss: 0.04079056744365392
Epoch: 20
Loss: 0.03821214825235099
Epoch: 21
Loss: 0.032324126045963804
Epoch: 22
Loss: 0.029830537009650125
Epoch: 23
Loss: 0.023686499797772875
Epoch: 24
Loss: 0.025547799429054958
Epoch: 25
Loss: 0.020433832301120575
Epoch: 26
Loss: 0.02220685251581328
Epoch: 27
Loss: 0.021666032454547554
Epoch: 28
Loss: 0.0177911392100

In [31]:
# Persist only the learned parameters (the state_dict) under the file name
# that the evaluation cell loads again.
model_name = 'blstm1.pt'
trained_weights = model.state_dict()
torch.save(trained_weights, model_name)

## Predictions on Dev

In [32]:
# Load the dev file: one whitespace-separated record per line.
with open('/data/dev', 'r') as f:
    data = f.read().splitlines()

# Keep only lines that split into exactly three fields (index, token, tag);
# anything else - blank lines included - is dropped.
rows = [cols for cols in (line.split() for line in data) if len(cols) == 3]

# Build the frame and cast the within-sentence position to an integer.
df_dev = pd.DataFrame(rows, columns = ["index", "token", "tag"]).astype({"index": int})

In [33]:
# Rebuild the architecture and restore the trained weights.  map_location
# makes the load succeed on a CPU-only machine even when the checkpoint was
# written from a model that lived on the GPU; model_2 itself runs on the CPU.
model_2 = BiLSTM(vocab_size, embedding_dim, hidden_dim, output_size, num_layers, dropout)
model_2.load_state_dict(torch.load(model_name, map_location='cpu'))
model_2.eval()

idx2ner = {v: k for k, v in ner2idx.items()}


def predict_sentence_tags(token_ids):
    """Return the predicted tag string for every token id in one sentence.

    Args:
        token_ids: list of vocabulary ids for a single sentence.

    Returns:
        A list of tag strings, one per input id (empty for an empty sentence).
    """
    if not token_ids:
        return []
    # Add an explicit batch dimension: the LSTM is built with batch_first=True,
    # and not every PyTorch release accepts unbatched 1-D input.
    batch = torch.LongTensor(token_ids).unsqueeze(0)
    with torch.no_grad():
        logits = model_2(batch)
    # Highest-scoring class per token, with the batch dimension removed again.
    best_ids = logits.argmax(dim=-1).squeeze(0).tolist()
    return [idx2ner[tag_id] for tag_id in best_ids]


# predicted tag for every row of df_dev, in row order
predicted_tags = []

# ids of the sentence currently being collected
encoded_sentence = []

for position, token in zip(df_dev['index'], df_dev['token']):
    # A position of 1 starts a new sentence: tag the one collected so far.
    # Checking the buffer instead of the row label skips the very first row
    # without assuming the frame carries the default 0..n-1 index.
    if position == 1 and encoded_sentence:
        predicted_tags.extend(predict_sentence_tags(encoded_sentence))
        encoded_sentence = []

    # Words missing from the training vocabulary fall back to <UNK>.
    encoded_sentence.append(word2idx.get(token, word2idx['<UNK>']))

# The last sentence has no following "index == 1" row to trigger its flush.
predicted_tags.extend(predict_sentence_tags(encoded_sentence))

# add the predicted tags to the dataframe
df_dev['pred'] = predicted_tags

In [34]:
# Inspect the dev frame: gold tags ("tag") next to the model's predictions ("pred").
df_dev

Unnamed: 0,index,token,tag,pred
0,1,CRICKET,O,O
1,2,-,O,O
2,3,LEICESTERSHIRE,B-ORG,O
3,4,TAKE,O,O
4,5,OVER,O,O
...,...,...,...,...
51573,1,--,O,O
51574,2,Dhaka,B-ORG,B-ORG
51575,3,Newsroom,I-ORG,I-ORG
51576,4,880-2-506363,O,O


In [35]:
# Write "<position> <token> <gold tag> <predicted tag>" lines to dev1.out,
# with a blank line between consecutive sentences.
with open('dev1.out', 'w') as f:
    chunks = []
    first_ex = True   # stays True until the first sentence start has been seen
    count = 1         # running position of the token inside its sentence
    columns = zip(df_dev['index'], df_dev['token'], df_dev['tag'], df_dev['pred'])
    for position, token, gold, pred in columns:
        if position == 1:
            if not first_ex:
                # Every sentence start after the first restarts the numbering
                # and is preceded by an empty separator line.
                count = 1
                chunks.append("\n")
            else:
                first_ex = False
        chunks.append(" ".join((str(count), token, gold, pred)) + "\n")
        count += 1
    f.write("".join(chunks))