In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('train_data.csv')

In [3]:
data.head()

Unnamed: 0,ID,UTTERANCES,IOB SLOT TAGS
0,0,who plays luke on star wars new hope,O O B_char O B_movie I_movie I_movie I_movie
1,1,show credits for the godfather,O O O B_movie I_movie
2,2,who was the main actor in the exorcist,O O O O O O B_movie I_movie
3,3,find the female actress from the movie she's t...,O O O O O O O B_movie I_movie I_movie I_movie
4,4,who played dory on finding nemo,O O B_char O B_movie I_movie


In [4]:
data.columns = ['ID', 'input', 'labels']

In [5]:
data.iloc[0]

ID                                                   0
input             who plays luke on star wars new hope
labels    O O B_char O B_movie I_movie I_movie I_movie
Name: 0, dtype: object

In [6]:
def split_input(data):
    output = []
    for i in range(len(data)):
      input = data.iloc[i]['input']
      labels = data.iloc[i]['labels']
      input_list = input.split(" ")
      labels_list = labels.split(" ")
      if len(input_list)!= len(labels_list):
        continue
      output.append((input_list, labels_list))
    return output

all_data = split_input(data)

In [7]:
labels = [x[1] for x in all_data]
unique_labels = set([item for sublist in labels for item in sublist])

print(unique_labels)
print(len(unique_labels))

{'I_cast', 'B_cast', 'B_subject', 'I_director', 'B_genre', 'B_country', 'B_person', 'I_genre', 'O', 'B_language', 'B_char', 'I_char', 'I-movie', 'B_producer', 'I_mpaa_rating', 'B_mpaa_rating', 'I_person', 'I_movie', 'B_location', 'I_country', 'I_release_year', 'I_producer', 'I_language', 'B_director', 'I_subject', 'B_release_year', 'B_movie'}
27


In [8]:
import torch

def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

def word_to_idx(data):
    w_dict = {}
    for text, label in data:
        for word in text:
            if word not in w_dict:
                w_dict[word] = len(w_dict)
    return w_dict

def label_to_idx(labels):
    l_dict = {}
    for idx, label in enumerate(labels):
        l_dict[label] = idx
    return l_dict    

In [9]:
word_to_index = word_to_idx(all_data)
label_to_index = label_to_idx(unique_labels)

In [10]:
data_test = pd.read_csv('test_data.csv')
data_test.columns = ['ID', 'input']

In [11]:
def split_input_test(data):
    output = []
    for i in range(len(data)):
      input = data.iloc[i]['input']
      input_list = input.split(" ")
      output.append((input_list))
    return output
    
test_data = split_input_test(data_test)

In [13]:
# update vocab
for text in test_data:
        for word in text:
            if word not in word_to_index:
                word_to_index[word] = len(word_to_index)

In [14]:
import torch.nn as nn
import torch.nn.functional as F

class myRNN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, output_dim):
        super(myRNN, self).__init__()

        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
        
    def forward(self, sentence):
        embeddings = self.embeddings(sentence)

        rnn_output, hidden = self.rnn(embeddings.view(len(sentence), 1, -1))
        
        score = F.log_softmax(self.fc(rnn_output.view(len(sentence), -1)), dim=1)
        
        return score

In [15]:
from sklearn.model_selection import train_test_split
train_data, val_data = train_test_split(all_data, test_size=0.2, random_state=69)

In [16]:
from torch import optim

embed_dim = 64
hidden_dim = 64

my_rnn = myRNN(embed_dim, hidden_dim, len(word_to_index), len(label_to_index))
loss_function = nn.NLLLoss()
optimizer = optim.Adam(my_rnn.parameters(), lr=0.001)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [17]:
num_epochs = 30

In [18]:
def train_func(training_data):
    my_rnn.to(device)

    for epoch in range(num_epochs):
        my_rnn.train()
        train_loss = 0.0
        for text, label in training_data:
            my_rnn.zero_grad()
            
            input_sentence = prepare_sequence(text, word_to_index)
            input_sentence = input_sentence.to(device)

            targets = prepare_sequence(label, label_to_index)
            targets = targets.to(device)

            tag_scores = my_rnn(input_sentence)

            loss = loss_function(tag_scores, targets)
            
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            
        print('Training loss: ', (train_loss/len(training_data)))

In [19]:
train_func(train_data)

Training loss:  0.8790209114401147
Training loss:  0.3925950536130011
Training loss:  0.2526272682777959
Training loss:  0.17488529658142465
Training loss:  0.1238094436763654
Training loss:  0.09067300626338692
Training loss:  0.06862343872638461
Training loss:  0.056923023102178574
Training loss:  0.048615219359148025
Training loss:  0.04260647532446274
Training loss:  0.037840300829996704
Training loss:  0.03337463187706784
Training loss:  0.03177672836982078
Training loss:  0.031130641928134167
Training loss:  0.028470795618132472
Training loss:  0.026794783963932697
Training loss:  0.025830966847667627
Training loss:  0.02689362830190015
Training loss:  0.024247856972061083
Training loss:  0.022752538463858503
Training loss:  0.023747960575094044
Training loss:  0.02286595752062894
Training loss:  0.022105558740689653
Training loss:  0.022511497536909
Training loss:  0.022883285509003686
Training loss:  0.02308952137284272
Training loss:  0.020184105498665356
Training loss:  0.021

In [20]:
def fix_labels(labels):
    new_labels = []
    for i in range(len(labels)):
      curr_label = labels[i]
      new_label = curr_label.replace("_", "-")
      new_labels.append(new_label)
    return new_labels

In [21]:
from seqeval.metrics import classification_report

def evaluate_func(dataset):
    my_rnn.eval()

    all_predictions = []
    all_targets = []

    for i in range(len(dataset)):
      my_rnn.eval()

      inputs = prepare_sequence(val_data[i][0], word_to_index)
      inputs = inputs.to(device)

      scores = my_rnn(inputs)

      index_to_label = dict((v, k) for k, v in label_to_index.items())

      preds = [torch.max(x, 0)[1].item() for x in scores]
      correct = prepare_sequence(val_data[i][1], label_to_index)

      original_sentence = val_data[i][0]
      correct_labels = [index_to_label[c.item()] for c in correct]
      predicted_labels = [index_to_label[p] for p in preds]

      correct_labels = fix_labels(correct_labels)
      predicted_labels = fix_labels(predicted_labels)

      all_predictions.append(predicted_labels)
      all_targets.append(correct_labels)

    all_predictions = np.array(all_predictions)
    all_targets = np.array(all_targets)
    # print(all_predictions.shape)
    # print(all_targets.shape)
    # print(all_predictions[0])
    # print(all_targets[0])
    
    print(classification_report(all_targets, all_predictions))

In [22]:
evaluate_func(val_data)

              precision    recall  f1-score   support

        cast       0.41      0.65      0.50        17
        char       1.00      0.40      0.57         5
     country       0.84      0.66      0.74        32
    director       0.40      0.61      0.49        28
       genre       0.83      0.67      0.74        15
    language       0.67      0.82      0.73        22
    location       0.00      0.00      0.00         1
       movie       0.63      0.79      0.70       197
 mpaa-rating       1.00      0.85      0.92        26
      person       0.58      0.60      0.59        42
    producer       0.74      0.70      0.72        40
release-year       0.00      0.00      0.00         1
     subject       0.79      0.83      0.81        18

   micro avg       0.64      0.73      0.68       444
   macro avg       0.61      0.58      0.58       444
weighted avg       0.67      0.73      0.69       444



  all_predictions = np.array(all_predictions)
  all_targets = np.array(all_targets)
  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
data_test = pd.read_csv('test_data.csv')
data_test.columns = ['ID', 'input']

In [24]:
def split_input_test(data):
    output = []
    for i in range(len(data)):
      input = data.iloc[i]['input']
      input_list = input.split(" ")
      output.append((input_list))
    return output
    
test_data = split_input_test(data_test)

In [25]:
def get_predictions_tag(model, test_data):
    all_predictions = []
    for i in range(len(test_data)):
      inputs = prepare_sequence(test_data[i], word_to_index)
      inputs = inputs.to(device)

      scores = my_rnn(inputs)

      ix_to_tag = dict((v, k) for k, v in label_to_index.items())

      preds = [torch.max(x, 0)[1].item() for x in scores]
      
      predicted_labels = [ix_to_tag[p] for p in preds]


      all_predictions.append(predicted_labels)
   
    all_predictions = np.array(all_predictions)
    return all_predictions


In [26]:
test_preds = get_predictions_tag(my_rnn, test_data)

  all_predictions = np.array(all_predictions)


In [27]:
sub_rnn = data_test[['ID']]
sub_rnn['IOB Slot tags'] = test_preds
sub_rnn['IOB Slot tags'] = sub_rnn['IOB Slot tags'].apply(lambda x: " ".join(x))

In [28]:
sub_rnn.head()

Unnamed: 0,ID,IOB Slot tags
0,0,B_movie O B_movie
1,1,O O O O O B_movie I_movie
2,2,O O O O O O B_movie I_movie
3,3,O O O B_movie
4,4,O O O B_movie


In [29]:
sub_rnn.to_csv('submission_rnn.csv', index=False)