### Download meta.json and test set 

In [1]:
import gdown


# Optional: to fetch the test file from a Google Drive link, uncomment the two
# lines below and replace the URL with your own.

#valid4test_url = 'https://drive.google.com/uc?id=1T5UFbIWq8IA5ox0upGcpxtTRyJwakxwI'
#gdown.download(valid4test_url, './test.json')

# Fetch meta.json (hosted on the author's Drive) into the working directory.
meta_url = 'https://drive.google.com/uc?id=15Z_ziRMWthc2GMjeg612wAHbPA451CBu'
gdown.download(meta_url, output='./meta.json')

Downloading...
From: https://drive.google.com/uc?id=1T5UFbIWq8IA5ox0upGcpxtTRyJwakxwI
To: /content/test.json
100%|██████████| 112k/112k [00:00<00:00, 48.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=15Z_ziRMWthc2GMjeg612wAHbPA451CBu
To: /content/meta.json
100%|██████████| 147k/147k [00:00<00:00, 82.0MB/s]


'./meta.json'

### Download pretrained embedding and unzip

In [2]:
# -nc: skip the download (~822 MB) when glove.6B.zip is already present, so
#      re-running the notebook does not fetch a second copy.
# -n : never overwrite files that were already extracted, so unzip does not
#      stop to ask for confirmation on a re-run.
!wget -nc http://nlp.stanford.edu/data/wordvecs/glove.6B.zip
!unzip -n glove.6B.zip

--2022-12-21 12:18:44--  http://nlp.stanford.edu/data/wordvecs/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/wordvecs/glove.6B.zip [following]
--2022-12-21 12:18:44--  https://nlp.stanford.edu/data/wordvecs/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/wordvecs/glove.6B.zip [following]
--2022-12-21 12:18:45--  https://downloads.cs.stanford.edu/nlp/data/wordvecs/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182753 (822M) [app

### Import needed libraries

In [3]:
import json
import torch
import pandas as pd
from torch import nn
import numpy as np
from torch.utils.data import Dataset, DataLoader 
import time
import torch.nn.functional as F
import itertools
from collections import Counter

### Load embedding, meta data and set device

In [4]:
# Parse the GloVe text file: column 0 (used as the index) holds the token and the
# remaining columns hold its vector. quoting=3 is csv.QUOTE_NONE, so quote
# characters that appear as tokens are read literally.
glove = pd.read_csv('glove.6B.100d.txt', sep=" ", quoting=3, header=None, index_col=0)
# token -> NumPy vector; every row of `glove` becomes one dictionary entry.
glove_embedding = {key: val.values for key, val in glove.T.items()}


vocabulary_path = './meta.json'
# Use a context manager so the file handle is closed as soon as parsing finishes
# instead of being left open for the rest of the kernel session.
with open(vocabulary_path, 'r') as meta_file:
    input_data = json.load(meta_file)
vocabulary = input_data['tokens']

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Create embedding matrix and index dictionary

In [5]:
def create_embedding_matrix(vocabulary, embedding_dict, dimension):
    """Build a pretrained-embedding table and a word -> position lookup.

    Args:
        vocabulary: iterable of words (must support ``len``).
        embedding_dict: mapping from word to a vector of length ``dimension``.
        dimension: width of every embedding vector.

    Returns:
        A tuple ``(embedding_matrix, dictionary)``:

        * ``embedding_matrix`` is a NumPy array of shape
          ``(len(vocabulary) + 1, dimension)``. Row 0 is left as zeros; the
          vectors of the vocabulary words found in ``embedding_dict`` are
          written to rows 1, 2, ... in vocabulary order. Remaining rows stay
          zero.
        * ``dictionary`` maps every vocabulary word to its 0-based position in
          ``vocabulary`` (for duplicated words the last position wins).

    NOTE(review): matrix rows are packed consecutively for *matched* words only,
    while ``dictionary`` is keyed by vocabulary position. The two numbering
    schemes agree only if every vocabulary word has an entry in
    ``embedding_dict``. The packing is kept as-is to stay compatible with
    existing checkpoints - confirm against the training code before changing it.
    """
    embedding_matrix = np.zeros((len(vocabulary) + 1, dimension))

    # The lookup does not depend on the loop below, so build it exactly once.
    # (It used to be rebuilt on every matched word and was left undefined when
    # no word matched or the vocabulary was empty.)
    dictionary = {word: position for position, word in enumerate(vocabulary)}

    index = 1  # row 0 is reserved (all zeros)
    for word in vocabulary:
        if word in embedding_dict:
            embedding_matrix[index] = embedding_dict[word]
            index += 1

    return embedding_matrix, dictionary

# Build the pretrained weight table (100-d vectors) and the word -> position lookup.
embedding_matrix, dictionary = create_embedding_matrix(
    vocabulary,
    embedding_dict=glove_embedding,
    dimension=100,
)
embedding_matrix = torch.tensor(embedding_matrix)

# Shift every vocabulary position up by one so that index 0 is never assigned to
# a word (text_pipeline pads with 0). The lookup is kept as a Counter.
dictionary = Counter({word: position + 1 for word, position in dictionary.items()})


### Load test dataset

In [6]:
class test_text_Dataset(Dataset):
    """Map-style dataset that serves unlabeled test items by position."""

    def __init__(self, x):
        # Keep a reference to the items and cache how many there are.
        self.x = x
        self.num_data = len(x)

    def __len__(self):
        """Return the number of items captured at construction time."""
        return self.num_data

    def __getitem__(self, idx):
        """Return the item stored at position ``idx``."""
        return self.x[idx]

def get_text(input_path):
    """Read the sentences from a JSON file.

    Args:
        input_path: path to a JSON file containing a list of objects, each with
            a ``'sentence'`` key.

    Returns:
        A list with the ``'sentence'`` value of every object, in file order.

    Raises:
        OSError: if the file cannot be opened.
        KeyError: if an object has no ``'sentence'`` key.
    """
    # The context manager closes the handle deterministically; the explicit
    # encoding avoids depending on the platform default when sentences contain
    # non-ASCII characters.
    with open(input_path, 'r', encoding='utf-8') as source:
        records = json.load(source)

    return [record['sentence'] for record in records]

def load_test_data():
    """Load ``./test.json`` and wrap its sentences in a dataset.

    Returns:
        ``(dataset, sentences)`` where ``sentences`` is the list returned by
        ``get_text`` and ``dataset`` is a ``test_text_Dataset`` over that list.
    """
    sentences = get_text('./test.json')
    return test_text_Dataset(sentences), sentences

### Preprocess text data (tokenization and indexing for embedding)

In [7]:
def label_pipeline(y):
    """Convert a raw label to ``int``."""
    return int(y)


def text_pipeline(text):
    """Encode a sentence as a fixed-length list of vocabulary indices.

    The sentence is split on whitespace. Each of the first 50 tokens that is
    present in the module-level ``dictionary`` is replaced by its index; tokens
    that are not in ``dictionary`` and unused trailing positions stay 0.

    Returns:
        A list of exactly 50 integers.
    """
    max_length = 50     # max length of sentences in data
    indices = [0] * max_length

    # Tokens past max_length can never be stored, so only the first ones are visited.
    for position, token in enumerate(text.split()[:max_length]):
        if token in dictionary:
            indices[position] = dictionary[token]

    return indices


### Build the test DataLoader

In [8]:
def test_collate_batch(batch):

    text_list, len_sentence = [], []

    for _text in batch:
    
         processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
         text_list.append(processed_text)
         len_sen = len(_text.split())
         if(len_sen>50):
            len_sen = 50
         len_sentence.append(len_sen)

    len_sentence = torch.tensor(len_sentence)

    text_list= torch.stack(text_list,dim=0)
        
    return text_list.to(device), len_sentence.to(device)

# Read the test sentences once, then batch them in their original order
# (shuffle=False) so predictions can be matched to `test_sentence` by position.
test_iter, test_sentence = load_test_data()

test_dataloader = DataLoader(
    test_iter,
    batch_size=256,
    shuffle=False,
    collate_fn=test_collate_batch,
)

#sample = next(iter(test_dataloader))
#print(sample)

### Attention module

In [9]:
class Attention(nn.Module):
    """Multi-hop self-attention over the outputs of a bidirectional LSTM.

    Two linear layers (``2 * hidden_size -> 350 -> 30``) score every time step;
    the scores are normalized with a softmax over the time axis, giving 30
    attention distributions ("hops") per sequence.
    """

    def __init__(self, hidden_size):
        super(Attention, self).__init__()

        self.W_s1 = nn.Linear(2 * hidden_size, 350)
        self.W_s2 = nn.Linear(350, 30)

    def forward(self, lstm_output, lenghts):
        """Compute masked attention weights.

        Args:
            lstm_output: tensor of shape ``(batch, max_len, 2 * hidden_size)``.
            lenghts: per-sequence lengths (iterable of ints or 0-d tensors).

        Returns:
            ``(attn_weight_matrix, 0)`` where ``attn_weight_matrix`` has shape
            ``(batch, 30, max_len)``. The second element is a constant
            placeholder.
        """
        max_len = lstm_output.size()[1]

        # torch.tanh computes the same values as the F.tanh functional alias.
        attn_weight_matrix = self.W_s2(torch.tanh(self.W_s1(lstm_output)))
        attn_weight_matrix = attn_weight_matrix.permute(0, 2, 1)
        attn_weight_matrix = F.softmax(attn_weight_matrix, dim=2)

        # Create the mask on the same device as the activations instead of
        # hard-coding .cuda(), so the module also runs on CPU. The mask is a
        # constant, so it does not need to track gradients.
        mask = torch.ones(attn_weight_matrix.size(), device=attn_weight_matrix.device)

        for row, valid_len in enumerate(lenghts):
            valid_len = int(valid_len)
            if valid_len < max_len:
                # NOTE(review): after the permute, axis 1 is the hop axis (30)
                # and axis 2 is time, so this zeroes hops `valid_len:` rather
                # than padded time steps (`mask[row, :, valid_len:]`). Left
                # unchanged because existing checkpoints were presumably
                # trained with this behaviour - confirm before changing.
                mask[row, valid_len:] = 0

        attn_weight_matrix = attn_weight_matrix * mask

        return attn_weight_matrix, 0

### LSTM model

In [10]:
class RNN(torch.nn.Module):
    """Bidirectional LSTM classifier with multi-hop self-attention.

    Pipeline: embedding lookup -> dropout -> packed BiLSTM -> ``Attention`` ->
    attention-weighted sum of the LSTM outputs -> dropout -> linear classifier.

    The embedding weights are initialized from the module-level
    ``embedding_matrix``.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        """
        Args:
            input_dim: number of rows of the embedding table.
            embedding_dim: width of every embedding vector.
            hidden_dim: LSTM hidden size per direction.
            output_dim: number of output classes.
        """
        super(RNN, self).__init__()

        self.embedding = torch.nn.Embedding(input_dim, embedding_dim, padding_idx=0)
        # Copy the pretrained table as float32. detach().clone() avoids the
        # "copy construct from a tensor" warning that torch.tensor(tensor)
        # raises, and keeps the parameter from aliasing the global matrix.
        pretrained = torch.as_tensor(embedding_matrix).detach().clone().to(torch.float32)
        self.embedding.weight = nn.Parameter(pretrained)

        # NOTE(review): per the PyTorch docs, LSTM dropout is only applied
        # between stacked layers, so with a single layer it should have no
        # effect. Kept as-is to mirror the original configuration - confirm.
        self.rnn = torch.nn.LSTM(embedding_dim,
                                 hidden_dim, bidirectional=True, dropout=0.1)

        self.atten1 = Attention(hidden_size=hidden_dim)

        self.dropout = nn.Dropout(0.2)

        # The classifier sees one (2 * hidden_dim) vector per attention hop.
        # Deriving the width keeps the layer valid for any hidden_dim; for
        # hidden_dim=200 and 30 hops this is the previous constant 12000.
        attention_hops = self.atten1.W_s2.out_features
        self.fc = torch.nn.Linear(attention_hops * 2 * hidden_dim, output_dim)

    def forward(self, text, len_sentence):
        """Return class logits of shape ``(batch, output_dim)``.

        Args:
            text: integer tensor of token indices, ``(batch, seq_len)``.
            len_sentence: tensor with the number of valid tokens per row.
        """
        embedded = self.dropout(self.embedding(text))

        # pack_padded_sequence requires the lengths as a CPU int64 tensor.
        length = len_sentence.cpu().to(dtype=torch.int64)

        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, length, batch_first=True, enforce_sorted=False)
        packed_output, _ = self.rnn(packed)

        out, lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)

        attention, _ = self.atten1(out, lengths)
        # (batch, hops, seq) x (batch, seq, 2 * hidden) -> (batch, hops, 2 * hidden)
        hidden_matrix = torch.bmm(attention, out)

        flat = hidden_matrix.view(-1, hidden_matrix.size()[1] * hidden_matrix.size()[2])
        flat = self.dropout(flat)

        return self.fc(flat)

### Set model

In [11]:
num_class = 8
# Vocabulary size and embedding width are read from the pretrained matrix;
# the LSTM hidden size is twice the embedding width.
vocab_size, emsize = embedding_matrix.shape
hidden_size = 2 * emsize

model = RNN(
    input_dim=vocab_size,
    embedding_dim=emsize,
    hidden_dim=hidden_size,
    output_dim=num_class,
).to(device)


  self.embedding.weight=nn.Parameter(torch.tensor(embedding_matrix,dtype=torch.float32))


### Count parameters

In [12]:
def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters.

    Parameters whose ``requires_grad`` flag is off are not counted.
    """
    total = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            total += parameter.numel()
    return total

# Report the trainable-parameter count in millions.
trainable_millions = count_parameters(model) / 1000000
print("the total number of parameters in Millions is:  ", trainable_millions)

the total number of parameters in Millions is:   2.067088


### Test and save results

In [13]:
### Load the trained weights from model.pth,
### predict a label for every test sentence,
### and save the predictions to ./result.json

def predicting_labels(model, data_loader, device):
    """Predict a class index for every example served by ``data_loader``.

    The model is switched to evaluation mode and run without gradient tracking.

    Args:
        model: callable module invoked as ``model(text, len_sentence)`` that
            returns logits with one row per example.
        data_loader: iterable of ``(text, len_sentence)`` batches.
        device: device the ``text`` tensor is moved to before the forward pass.

    Returns:
        A flat list with the arg-max class index of each example (NumPy
        integer scalars), in loader order.
    """
    predictions = []

    model.eval()
    with torch.no_grad():
        for text, len_sentence in data_loader:
            logits = model(text.to(device), len_sentence)
            _, batch_labels = torch.max(logits, 1)
            predictions.extend(batch_labels.detach().cpu().numpy())

    return predictions

def save_resulst(predicted_labels_list, test_sentence):
    """Write the predictions to ``./result.json``.

    The output is a JSON list (indent 2) with one object per sentence:
    ``{"sentence": <text>, "user_id": <predicted label as int>}``.

    Args:
        predicted_labels_list: predicted labels, indexable by position and
            convertible with ``int``; needs at least one entry per sentence.
        test_sentence: sequence of sentences, aligned with the labels.

    Raises:
        IndexError: if there are fewer labels than sentences (nothing is
            written in that case).
    """
    results = [
        {'sentence': sentence, 'user_id': int(predicted_labels_list[position])}
        for position, sentence in enumerate(test_sentence)
    ]

    # The context manager flushes and closes the file deterministically instead
    # of relying on garbage collection of an anonymous handle.
    with open('./result.json', 'w') as result_file:
        json.dump(results, result_file, indent=2)

# map_location remaps the stored tensors onto the active device, so a checkpoint
# saved on a GPU can still be loaded on a CPU-only machine.
model.load_state_dict(torch.load('model.pth', map_location=device))
model.eval()
predicted_labels_list = predicting_labels(model, test_dataloader, device)

save_resulst(predicted_labels_list, test_sentence)



