In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training CSV; the path is relative to the notebook's working directory.
data = pd.read_csv('train_data.csv')

In [3]:
# Preview the first rows to check the column layout before the columns are renamed.
data.head()

Unnamed: 0,ID,UTTERANCES,IOB SLOT TAGS
0,0,who plays luke on star wars new hope,O O B_char O B_movie I_movie I_movie I_movie
1,1,show credits for the godfather,O O O B_movie I_movie
2,2,who was the main actor in the exorcist,O O O O O O B_movie I_movie
3,3,find the female actress from the movie she's t...,O O O O O O O B_movie I_movie I_movie I_movie
4,4,who played dory on finding nemo,O O B_char O B_movie I_movie


In [4]:
# Switch to short column names. Assigning three names requires the frame to have
# exactly three columns (the preview shows ID, UTTERANCES, IOB SLOT TAGS).
data.columns = ['ID', 'input', 'labels']

In [5]:
# Inspect one row: 'input' and 'labels' are both space-separated strings.
data.iloc[0]

ID                                                   0
input             who plays luke on star wars new hope
labels    O O B_char O B_movie I_movie I_movie I_movie
Name: 0, dtype: object

In [6]:
def split_input(data):
    """Split every utterance and its tag string into aligned token/tag lists.

    Args:
        data: DataFrame with string columns 'input' (the utterance) and
            'labels' (the IOB tags). Both are split on a single space.

    Returns:
        A list of ``(tokens, tags)`` tuples in row order. Rows whose token
        count and tag count differ are skipped, because the tagger needs
        exactly one tag per token.
    """
    output = []
    # Walk the two columns in lockstep rather than calling ``data.iloc[i]`` for
    # every row (which materialises a Series per row). This also avoids
    # shadowing the builtin ``input``.
    for utterance, tag_string in zip(data['input'], data['labels']):
        tokens = utterance.split(" ")
        tags = tag_string.split(" ")
        if len(tokens) != len(tags):
            # Misaligned annotation: drop the row instead of guessing.
            continue
        output.append((tokens, tags))
    return output

# Aligned (tokens, tags) pairs for every training row that passed the length check.
all_data = split_input(data)

In [7]:
# Collect the distinct tags that occur anywhere in the training pairs.
# NOTE(review): the printed set contains both 'I_movie' and 'I-movie'; the
# hyphenated form is presumably an annotation typo in the CSV - confirm before
# treating it as a separate class.
labels = [tags for _, tags in all_data]
unique_labels = {tag for tags in labels for tag in tags}

print(unique_labels)
print(len(unique_labels))

{'B_mpaa_rating', 'O', 'I_mpaa_rating', 'B_subject', 'I_producer', 'I_director', 'B_language', 'B_location', 'I_char', 'B_person', 'I_genre', 'B_director', 'I_subject', 'B_genre', 'B_cast', 'B_country', 'I_country', 'I_person', 'I_cast', 'B_char', 'B_producer', 'I_movie', 'B_release_year', 'I-movie', 'B_movie', 'I_language', 'I_release_year'}
27


In [8]:
import torch

def prepare_sequence(seq, to_ix):
    """Encode ``seq`` as a 1-D ``torch.long`` tensor of indices.

    Each element of ``seq`` is looked up in the mapping ``to_ix``; an element
    that is missing from the mapping raises ``KeyError``.
    """
    indices = []
    for symbol in seq:
        indices.append(to_ix[symbol])
    return torch.tensor(indices, dtype=torch.long)

def word_to_idx(data):
    """Build a vocabulary that maps each distinct word to an integer id.

    ``data`` is an iterable of ``(tokens, tags)`` pairs; the tags are ignored.
    Ids are handed out in order of first appearance, starting at 0.
    """
    vocab = {}
    for tokens, _tags in data:
        for token in tokens:
            # setdefault inserts only when the token is new, so len(vocab)
            # is always the next unused id.
            vocab.setdefault(token, len(vocab))
    return vocab

def label_to_idx(labels):
    """Map each distinct label to a contiguous integer id.

    The labels are de-duplicated and sorted before numbering. The notebook
    passes a ``set`` here, and string-set iteration order changes between
    interpreter runs, so numbering in iteration order would give a different
    label-to-id mapping every time the kernel restarts. Sorting makes the
    mapping reproducible, and de-duplicating keeps the ids gap-free
    (``len(result)`` equals the number of classes).

    Args:
        labels: iterable of label strings (set, list, ...).

    Returns:
        dict mapping label -> id, with ids ``0 .. n_labels - 1`` assigned in
        sorted label order.
    """
    return {label: idx for idx, label in enumerate(sorted(set(labels)))}

In [9]:
# Lookup tables used by prepare_sequence: token -> id and tag -> id.
word_to_index = word_to_idx(all_data)
label_to_index = label_to_idx(unique_labels)

In [10]:
# Load the unlabelled test CSV; assigning two names requires exactly two columns.
data_test = pd.read_csv('test_data.csv')
data_test.columns = ['ID', 'input']

In [11]:
def split_input_test(data):
    """Split every test utterance into tokens on single spaces.

    Args:
        data: DataFrame with a string column 'input'.

    Returns:
        A list with one token list per row, in row order.
    """
    # Iterate the column directly: avoids a per-row ``data.iloc[i]`` lookup and
    # does not shadow the builtin ``input``.
    return [utterance.split(" ") for utterance in data['input']]
    
# Token lists for every test utterance (no tags are available for the test set).
test_data = split_input_test(data_test)

In [12]:
# Extend the vocabulary with words that appear in the test utterances so that
# prepare_sequence does not raise KeyError at prediction time.
# NOTE(review): words that occur only in the test set never appear in a training
# batch, so their embedding rows presumably stay at their initial values -
# consider a shared unknown-word token instead.
for text in test_data:
    for word in text:
        word_to_index.setdefault(word, len(word_to_index))

In [13]:
import torch.nn as nn
import torch.nn.functional as F

class myBiLSTM(nn.Module):
    """Bidirectional LSTM tagger that scores every token of a single sentence.

    Pipeline: embedding lookup -> stacked bidirectional LSTM -> linear layer
    -> log-softmax over the tag set.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, num_layers=2):
        """Create the embedding, LSTM and output layers.

        Args:
            embedding_dim: size of each word embedding.
            hidden_dim: hidden size of the LSTM, per direction.
            vocab_size: number of rows in the embedding table.
            tagset_size: number of output classes.
            num_layers: number of stacked LSTM layers.
        """
        super().__init__()

        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            self.num_layers,
            bidirectional=True,
        )
        # The forward and backward hidden states are concatenated, so the
        # output layer sees 2 * hidden_dim features per token.
        self.fc = nn.Linear(2 * hidden_dim, tagset_size)

    def forward(self, sentence):
        """Return log-probabilities over tags, one row per element of ``sentence``.

        ``sentence`` is a tensor of word ids; the result is reshaped to
        ``(len(sentence), tagset_size)``.
        """
        seq_len = len(sentence)
        # Insert a batch dimension of size 1 for the LSTM call.
        embedded = self.embeddings(sentence).view(seq_len, 1, -1)
        lstm_output, _ = self.lstm(embedded)
        # Drop the batch dimension again before the per-token projection.
        tag_space = self.fc(lstm_output.view(seq_len, -1))
        return F.log_softmax(tag_space, dim=1)

In [14]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the (tokens, tags) pairs for validation; the fixed random_state
# keeps the split identical across runs.
train_data, val_data = train_test_split(all_data, test_size=0.2, random_state=69)

In [15]:
from torch import optim

# Model sizes: embedding width and per-direction LSTM hidden width.
embed_dim = 256
hidden_dim = 256

# Seed PyTorch's RNG before the model is built so that weight initialisation
# is repeatable after a kernel restart (same value as the data split seed).
torch.manual_seed(69)

my_lstm = myBiLSTM(embed_dim, hidden_dim, len(word_to_index), len(label_to_index))
# NLLLoss expects log-probabilities, which is what myBiLSTM.forward returns.
loss_function = nn.NLLLoss()
optimizer = optim.Adam(my_lstm.parameters(), lr=0.001)
# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [19]:
# Number of full passes over the training data made by train_func.
# NOTE(review): the execution counter jumps from In [15] to In [19], so this
# notebook was not executed top-to-bottom in one pass; confirm the reported
# numbers with Restart Kernel -> Run All.
num_epochs = 10

In [20]:
def train_func(training_data):
    """Train the global ``my_lstm`` on ``training_data`` for ``num_epochs`` epochs.

    Relies on the notebook-level ``my_lstm``, ``optimizer``, ``loss_function``,
    ``device``, ``num_epochs``, ``word_to_index`` and ``label_to_index``.
    Each ``(tokens, tags)`` pair is one optimisation step (no batching), and
    the mean per-sentence loss is printed after every epoch.

    Args:
        training_data: sized collection of ``(tokens, tags)`` pairs.
    """
    my_lstm.to(device)

    for _epoch in range(num_epochs):
        my_lstm.train()
        epoch_loss = 0.0

        for tokens, tags in training_data:
            # Clear gradients left over from the previous sentence.
            my_lstm.zero_grad()

            token_ids = prepare_sequence(tokens, word_to_index).to(device)
            tag_ids = prepare_sequence(tags, label_to_index).to(device)

            loss = loss_function(my_lstm(token_ids), tag_ids)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        print('Training loss: ', (epoch_loss/len(training_data)))

In [21]:
# Run training on the 80% split; one loss line is printed per epoch.
# NOTE(review): the first printed epoch loss (~0.0127) is already very low, which
# suggests the model had presumably been trained in an earlier execution of this
# cell - verify from a fresh kernel.
train_func(train_data)

Training loss:  0.012700999989651282
Training loss:  0.006812812718138694
Training loss:  0.004840471948209886
Training loss:  0.0036334216063554805
Training loss:  0.014289893415402223
Training loss:  0.008143151310229056
Training loss:  0.011621099210026789
Training loss:  0.002379686983594412
Training loss:  0.0013637299515717538
Training loss:  0.0013774915250204063


In [22]:
def fix_labels(labels):
    """Return a new list where every '_' in each label is replaced by '-'.

    For example 'B_movie' becomes 'B-movie'; labels without an underscore
    (such as 'O') are returned unchanged.
    """
    return [tag.replace("_", "-") for tag in labels]

In [23]:
from seqeval.metrics import classification_report

def evaluate_func(dataset):
    """Print a seqeval classification report for ``my_lstm`` on ``dataset``.

    Relies on the notebook-level ``my_lstm``, ``device``, ``word_to_index``,
    ``label_to_index`` and ``fix_labels``.

    Args:
        dataset: iterable of ``(tokens, tags)`` pairs to score. The pairs are
            read from this argument, not from the global ``val_data``.
    """
    my_lstm.eval()

    # Inverse lookup id -> tag; built once instead of once per sentence.
    index_to_label = {idx: label for label, idx in label_to_index.items()}

    all_predictions = []
    all_targets = []

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for tokens, tags in dataset:
            inputs = prepare_sequence(tokens, word_to_index).to(device)
            scores = my_lstm(inputs)

            # Highest-scoring tag id for every token.
            preds = scores.argmax(dim=1).tolist()
            predicted_labels = [index_to_label[p] for p in preds]

            # Labels are converted from 'B_x' to 'B-x' before scoring.
            all_predictions.append(fix_labels(predicted_labels))
            all_targets.append(fix_labels(list(tags)))

    # Keep plain lists of lists: sentences have different lengths, and wrapping
    # ragged lists in np.array() warns on older NumPy and raises on NumPy >= 1.24.
    print(classification_report(all_targets, all_predictions))

In [24]:
# Print the seqeval classification report for the held-out validation split.
evaluate_func(val_data)

              precision    recall  f1-score   support

        cast       0.81      0.76      0.79        17
        char       0.60      0.60      0.60         5
     country       0.83      0.91      0.87        32
    director       0.57      0.86      0.69        28
       genre       0.73      0.73      0.73        15
    language       0.75      0.82      0.78        22
    location       0.00      0.00      0.00         1
       movie       0.85      0.89      0.87       197
 mpaa-rating       1.00      0.85      0.92        26
      person       0.76      0.62      0.68        42
    producer       0.79      0.85      0.82        40
release-year       0.00      0.00      0.00         1
     subject       0.81      0.94      0.87        18

   micro avg       0.80      0.84      0.82       444
   macro avg       0.65      0.68      0.66       444
weighted avg       0.81      0.84      0.82       444



  all_predictions = np.array(all_predictions)
  all_targets = np.array(all_targets)
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
# Re-load the test CSV (same statements as the earlier In [10] cell), giving a
# fresh frame for building the submission.
data_test = pd.read_csv('test_data.csv')
data_test.columns = ['ID', 'input']

In [26]:
def split_input_test(data):
    """Split every test utterance into tokens on single spaces.

    NOTE: this re-defines the function of the same name from the earlier
    In [11] cell; it is kept so this cell still runs on its own.

    Args:
        data: DataFrame with a string column 'input'.

    Returns:
        A list with one token list per row, in row order.
    """
    # Iterate the column directly: avoids a per-row ``data.iloc[i]`` lookup and
    # does not shadow the builtin ``input``.
    return [utterance.split(" ") for utterance in data['input']]
    
# Token lists for the re-loaded test frame, in the same row order as data_test.
test_data = split_input_test(data_test)

In [27]:
def get_predictions_tag(model, test_data):
    """Predict a tag sequence for every tokenised utterance in ``test_data``.

    Relies on the notebook-level ``word_to_index``, ``label_to_index`` and
    ``device``.

    Args:
        model: the tagger to run. This argument is used for the forward pass,
            not the global ``my_lstm``.
        test_data: sized sequence of token lists.

    Returns:
        A 1-D NumPy object array with one list of tag strings per utterance,
        in input order.
    """
    # Inverse lookup id -> tag; built once instead of once per sentence.
    ix_to_tag = {idx: tag for tag, idx in label_to_index.items()}

    # Pre-allocate an object array and fill it element by element. Calling
    # np.array() on lists of different lengths warns on older NumPy and raises
    # on NumPy >= 1.24; on equal-length lists it would build a 2-D string array
    # instead of one list per row.
    all_predictions = np.empty(len(test_data), dtype=object)

    model.eval()
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for i, tokens in enumerate(test_data):
            inputs = prepare_sequence(tokens, word_to_index).to(device)
            scores = model(inputs)
            # Highest-scoring tag id for every token, mapped back to its name.
            all_predictions[i] = [ix_to_tag[p] for p in scores.argmax(dim=1).tolist()]

    return all_predictions


In [29]:
# One predicted tag list per test utterance, in the same order as test_data.
test_preds = get_predictions_tag(my_lstm, test_data)

  all_predictions = np.array(all_predictions)


In [30]:
# Take an explicit copy of the ID column so that adding a column below writes to
# an independent frame (avoids pandas' SettingWithCopyWarning on a selection).
sub_lstm = data_test[['ID']].copy()
# Join each predicted tag list back into one space-separated string per row.
sub_lstm['IOB Slot tags'] = [" ".join(tags) for tags in test_preds]

In [31]:
# Preview the submission frame: one ID and one space-separated tag string per row.
sub_lstm.head()

Unnamed: 0,ID,IOB Slot tags
0,0,O O B_movie
1,1,O O O O O B_movie I_movie
2,2,O O O O O O B_movie I_movie
3,3,O O O B_movie
4,4,O O O B_language


In [32]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
sub_lstm.to_csv('submission_lstm.csv', index=False)