In [1]:
import torch
from torch import nn
import pandas as pd
from torch import optim
from torchtext import data,vocab
import random
import numpy as np

SEED = 1111

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
device = ('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
lang='tamil'

import pandas as pd

df_train = pd.read_csv("data1/"+lang+"/"+lang+"_train.tsv",sep="\t")
df_dev = pd.read_csv("data1/"+lang+"/"+lang+"_dev.tsv",sep="\t")
df_test = pd.read_csv("data1/"+lang+"/"+lang+"_test.tsv",sep="\t")

import pickle
file = open("data1/elmo_embeddings/elmo_train_"+lang+".pickle",'rb')
elmo_array = pickle.load(file)
file.close()
print(elmo_array.shape)

elmo_train = elmo_array[:len(df_train),:]
print(elmo_train.shape)
elmo_valid = elmo_array[len(df_train):len(df_train)+len(df_dev),:]
print(elmo_valid.shape)

elmo_train_valid = elmo_array[:len(df_train)+len(df_dev),:]
print(elmo_train_valid.shape)

elmo_test = elmo_array[len(df_train)+len(df_dev):,:]
print(elmo_test.shape)

print(df_test.head())

train_sentences = list(df_train['text'].values)
dev_sentences = list(df_dev['text'].values)

train_labels = list(df_train['category'].values)
dev_labels = list(df_dev['category'].values)

test_sentences = list(df_test['text'].values)


total_sentences = []
total_sentences.extend(train_sentences)
total_sentences.extend(dev_sentences)
total_sentences.extend(test_sentences)

total_labels = []
total_labels.extend(train_labels)
total_labels.extend(dev_labels)

len(total_sentences)

fout = open(lang+"_total_input_sentences.txt","w+",encoding='utf-8')

for line in total_sentences:
    fout.write(line+"\n")
fout.close()

(15744, 1024)
(11335, 1024)
(1260, 1024)
(12595, 1024)
(3149, 1024)
          id                                               text  \
0  ta_sent_1         Yarayellam FDFS ppga ippove ready agitinga   
1  ta_sent_2  Ennada viswasam mersal sarkar madhri time la l...   
2  ta_sent_3  yuvan vera level ya .... valuable script. SK i...   
3  ta_sent_4  70 vayasulayum thanoda rasigargala sandhosapad...   
4  ta_sent_5      all the best anna...Telugu makkal selvan fans   

         category  
0        Positive  
1        Positive  
2        Positive  
3  Mixed_feelings  
4        Positive  


In [3]:
import sentencepiece as spm
spm.SentencePieceTrainer.train(input=lang+"_total_input_sentences.txt", model_prefix='t_m')

sp = spm.SentencePieceProcessor(model_file='t_m.model')

tokenized_file = sp.encode(train_sentences, out_type=str)

valid_tokenized_file = sp.encode(dev_sentences,out_type=str)

test_tokenized_file = sp.encode(test_sentences,out_type=str)

def join_token_to_line(token_file):
    new_sentence_tokenized_file = []
    
    for line in token_file:
        new_sentence_tokenized_file.append(" ".join(line))
        
    return new_sentence_tokenized_file

new_tokenized_train = join_token_to_line(tokenized_file)
new_tokenized_valid = join_token_to_line(valid_tokenized_file)
new_tokenized_test = join_token_to_line(test_tokenized_file)

In [4]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(sublinear_tf = True,
                             use_idf = True)

train_vectors = vectorizer.fit_transform(new_tokenized_train)
valid_vectors = vectorizer.transform(new_tokenized_valid)
test_vectors = vectorizer.transform(new_tokenized_test)

train_vectors = np.array(train_vectors.todense())
valid_vectors = np.array(valid_vectors.todense())
test_vectors = np.array(test_vectors.todense())

train_elmo_tf_idf = np.concatenate((elmo_train,train_vectors),axis=1)
valid_elmo_tf_idf = np.concatenate((elmo_valid,valid_vectors),axis=1)
test_elmo_tf_idf = np.concatenate((elmo_test,test_vectors),axis=1)

concatenated_column_size = train_elmo_tf_idf.shape[1]

print(train_elmo_tf_idf.shape)

(11335, 6398)


In [5]:
t_v_columns = ['text','label']

df_train1 = pd.DataFrame(zip(new_tokenized_train,train_labels),columns=t_v_columns)
df_valid1 = pd.DataFrame(zip(new_tokenized_valid,dev_labels),columns=t_v_columns)

df_test1 = pd.DataFrame(new_tokenized_test,columns=['text'])

df_train1.to_csv(lang+"_df_train1.tsv",sep="\t")
df_valid1.to_csv(lang+"_df_valid1.tsv",sep="\t")
df_test1.to_csv(lang+"_df_test1.tsv",sep="\t")

In [6]:
print(len(df_train1),len(df_valid1),len(df_test1))

11335 1260 3149


In [7]:
tokenize = lambda x: x.split()
TEXT= data.Field(sequential=True,tokenize=tokenize,use_vocab=True,batch_first=True,init_token='<sos>', eos_token='<eos>')
LABEL = data.LabelField()
INDEX = data.LabelField()
train_valid_fields = [('index',INDEX),('text',TEXT),('label',LABEL)]
test_field = [('index',INDEX),('text',TEXT)]

train_data = data.TabularDataset(lang+"_df_train1.tsv",format="tsv",skip_header=True,fields=train_valid_fields)
valid_data = data.TabularDataset(lang+"_df_valid1.tsv",format='tsv',skip_header=True,fields=train_valid_fields)
test_data = data.TabularDataset(lang+"_df_test1.tsv",format='tsv',skip_header=True,fields=test_field)

#print(vars(train_data.examples[len(df_train)-1]))

print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing examples: {len(test_data)}")

BATCH_SIZE = 32

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
     batch_size = BATCH_SIZE,
     device = device,sort=False)

INDEX.build_vocab(train_data,valid_data,test_data)
TEXT.build_vocab(train_data,valid_data,test_data)
LABEL.build_vocab(train_data,valid_data)

from gensim.models import FastText
model_ft = FastText.load("fast_text_models/"+lang+"_300/model_ft_15")

W2V_SIZE=300
from tqdm.notebook import tqdm
word2vec_vectors = []
for token, idx in tqdm(TEXT.vocab.stoi.items()):
    if token in model_ft.wv.vocab.keys():
        word2vec_vectors.append(torch.FloatTensor(model_ft[token]))
    else:
        print(token)
        word2vec_vectors.append(torch.zeros(W2V_SIZE))
TEXT.vocab.set_vectors(TEXT.vocab.stoi, word2vec_vectors, W2V_SIZE)

Number of training examples: 11335
Number of validation examples: 1260
Number of testing examples: 3149


HBox(children=(FloatProgress(value=0.0, max=8211.0), HTML(value='')))

<unk>
<pad>
<sos>
<eos>





In [8]:
class Encoder(nn.Module):
    def __init__(self,src_vocab_size,output_dim,embedding_dim,hidden_dim,n_layers,n_heads,pf_dim,dropout,device,bidirectional=True,max_length=100):
        super().__init__()
        self.device = device
        self.tok_embed_part1 = nn.Embedding(src_vocab_size,embedding_dim)
        self.tok_embed_part2 = nn.Linear(embedding_dim,hidden_dim)
        self.pos_embed = nn.Embedding(max_length,hidden_dim)
        self.layers = nn.ModuleList([EncoderLayer(hidden_dim,n_heads,pf_dim,dropout,device) for _ in range(n_layers)])
        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([hidden_dim])).to(device)
        
        self.rnn = nn.GRU(hidden_dim,
                          hidden_dim,
                          num_layers = 2,
                          bidirectional = bidirectional,
                          batch_first = True,
                          dropout = 0 if n_layers < 2 else dropout)
        
        self.out1 = nn.Linear(hidden_dim * 2 + concatenated_column_size,output_dim)
    
    def forward(self,src,src_mask,elmo_embed):
        batch_size = src.shape[0]
        seq_length = src.shape[1]
        position = torch.arange(0,seq_length).unsqueeze(0).repeat(batch_size,1).to(device)
        src = self.tok_embed_part1(src)
        embedded = self.dropout(self.tok_embed_part2(src) * self.scale +self.pos_embed(position))
        
        for layer in self.layers:
            encoded = layer(embedded,src_mask)
        
        _, hidden = self.rnn(encoded)
        
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
        
        hidden_with_elmo = torch.cat((hidden,elmo_embed),dim=1)
        
        #print(hidden_with_elmo.shape)
                
        output = self.out1(hidden_with_elmo)
            
        return output

In [9]:
class EncoderLayer(nn.Module):
    def __init__(self,hidden_dim,n_heads,pf_dim,dropout,device):
        super().__init__()
        self.attention_norm = nn.LayerNorm(hidden_dim)
        self.pf_norm = nn.LayerNorm(hidden_dim)
        self.attention = MultiHeadedAttention(hidden_dim,n_heads,dropout,device)
        self.pf = PositionwiseFeedForward(hidden_dim,pf_dim,dropout)
        self.dropout = nn.Dropout(dropout)
    def forward(self,src,src_mask):
        _src,_ = self.attention(src,src,src,src_mask)
        src = self.attention_norm(src+self.dropout(_src))
        _src = self.pf(src)
        src = self.pf_norm(src+self.dropout(_src))
        return src

In [10]:
class MultiHeadedAttention(nn.Module):
    def __init__(self,hidden_dim,n_heads,dropout,device):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_heads = n_heads
        self.head_dim = hidden_dim//n_heads
        
        self.fc_q = nn.Linear(hidden_dim,hidden_dim)
        self.fc_k = nn.Linear(hidden_dim,hidden_dim)
        self.fc_v = nn.Linear(hidden_dim,hidden_dim)
        
        self.fc_o = nn.Linear(hidden_dim,hidden_dim)
        
        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([self.hidden_dim])).to(device)
        
    def forward(self,query,key,value,mask=None):
        batch_size = query.shape[0]
                
        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)
        
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2))
        energy = energy/self.scale
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e5)
        
        attention = torch.softmax(energy, dim = -1)
        x = torch.matmul(self.dropout(attention), V)
        x = x.permute(0, 2, 1, 3).contiguous()
        x = x.view(batch_size, -1, self.hidden_dim)
        x = self.fc_o(x)
       
        return x, attention

In [11]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self,hidden_dim,pf_dim,dropout):
        super().__init__()
        self.pf1 = nn.Linear(hidden_dim,pf_dim)
        self.pf2 = nn.Linear(pf_dim,hidden_dim)
        self.dropout = nn.Dropout(dropout)
    def forward(self,x):
        x = self.dropout(torch.relu(self.pf1(x)))
        x = self.pf2(x)
        return x

In [12]:
class seq2seq(nn.Module):
    def __init__(self,encoder,src_pad_idx,device):
        super().__init__()
        self.encoder = encoder
        self.src_pad_idx = src_pad_idx
        self.device = device
        
    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask

    def forward(self,src,elmo_embed):
        src_mask = self.make_src_mask(src)
        enc_src = self.encoder(src, src_mask,elmo_embed)
        return enc_src

In [13]:
INPUT_DIM = len(TEXT.vocab)
OUTPUT_DIM = 5
EMBEDDING_DIM = TEXT.vocab.vectors.shape[1]
HID_DIM = 256
ENC_LAYERS = 1
ENC_HEADS = 8
ENC_PF_DIM = 512
ENC_DROPOUT = 0.1
BIDIRECIONAL = True

BOS_TAG_ID = TEXT.vocab.stoi['<sos>']
EOS_TAG_ID = TEXT.vocab.stoi['<eos>']
PAD_TAG_ID = TEXT.vocab.stoi['<pad>']


encoder = Encoder(INPUT_DIM,OUTPUT_DIM,EMBEDDING_DIM,HID_DIM, 
              ENC_LAYERS, 
              ENC_HEADS, 
              ENC_PF_DIM, 
              ENC_DROPOUT,
              device)

PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
model = seq2seq(encoder, PAD_IDX, device).to(device)

print(model)

LEARNING_RATE = 0.0005

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

seq2seq(
  (encoder): Encoder(
    (tok_embed_part1): Embedding(8211, 300)
    (tok_embed_part2): Linear(in_features=300, out_features=256, bias=True)
    (pos_embed): Embedding(100, 256)
    (layers): ModuleList(
      (0): EncoderLayer(
        (attention_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (pf_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (attention): MultiHeadedAttention(
          (fc_q): Linear(in_features=256, out_features=256, bias=True)
          (fc_k): Linear(in_features=256, out_features=256, bias=True)
          (fc_v): Linear(in_features=256, out_features=256, bias=True)
          (fc_o): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (pf): PositionwiseFeedForward(
          (pf1): Linear(in_features=256, out_features=512, bias=True)
          (pf2): Linear(in_features=512, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inpla

In [14]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

The model has 5,099,839 trainable parameters


In [15]:
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability
    correct = max_preds.squeeze(1).eq(y)
    return correct.sum() / torch.FloatTensor([y.shape[0]])

In [16]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    epoch_acc= 0.0
    
    for i, batch in enumerate(iterator):
        text = batch.text
        label = batch.label
        index_array = batch.index
        
        batch_elmo_vector = []
        
        for index_value in index_array:
            batch_elmo_vector.append(train_elmo_tf_idf[index_value])
        
        batch_elmo_vector = torch.Tensor(batch_elmo_vector).to(device)
        
        optimizer.zero_grad()
        
        output = model(text,batch_elmo_vector)
        
        loss = criterion(output, label)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
        acc = categorical_accuracy(output,label)
        
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc /len(iterator)

In [17]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    epoch_acc= 0.0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):
            text = batch.text
            label = batch.label
            
            index_array = batch.index
        
            batch_elmo_vector = []

            for index_value in index_array:
                batch_elmo_vector.append(valid_elmo_tf_idf[index_value])

            batch_elmo_vector = torch.Tensor(batch_elmo_vector).to(device)

            output = model(text,batch_elmo_vector)

            loss = criterion(output, label)

            epoch_loss += loss.item()

            acc = categorical_accuracy(output,label)

            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc /len(iterator)

In [18]:
import time
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [19]:
CLIP = 1
epochs = 15

best_valid_loss = float('inf')

for epoch in range(epochs):
    
    start_time = time.time()
    
    train_loss,train_acc = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss,valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
        
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model,'model_'+lang+'.pt')
        print("model saved")

Epoch: 01 | Epoch Time: 0m 16s
	Train Loss: 0.747 | Train Acc: 67.31%
	 Val. Loss: 0.658 |  Val. Acc: 68.78%
model saved


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


Epoch: 02 | Epoch Time: 0m 16s
	Train Loss: 0.679 | Train Acc: 68.10%
	 Val. Loss: 0.650 |  Val. Acc: 68.78%
model saved
Epoch: 03 | Epoch Time: 0m 16s
	Train Loss: 0.642 | Train Acc: 68.59%
	 Val. Loss: 0.666 |  Val. Acc: 68.31%
Epoch: 04 | Epoch Time: 0m 15s
	Train Loss: 0.595 | Train Acc: 69.12%
	 Val. Loss: 0.661 |  Val. Acc: 68.70%
Epoch: 05 | Epoch Time: 0m 15s
	Train Loss: 0.555 | Train Acc: 70.06%
	 Val. Loss: 0.688 |  Val. Acc: 68.46%
Epoch: 06 | Epoch Time: 0m 15s
	Train Loss: 0.510 | Train Acc: 71.43%
	 Val. Loss: 0.650 |  Val. Acc: 67.21%
Epoch: 07 | Epoch Time: 0m 15s
	Train Loss: 0.450 | Train Acc: 73.02%
	 Val. Loss: 0.682 |  Val. Acc: 65.31%
Epoch: 08 | Epoch Time: 0m 15s
	Train Loss: 0.401 | Train Acc: 74.63%
	 Val. Loss: 0.744 |  Val. Acc: 64.74%
Epoch: 09 | Epoch Time: 0m 15s
	Train Loss: 0.336 | Train Acc: 76.59%
	 Val. Loss: 0.832 |  Val. Acc: 62.16%
Epoch: 10 | Epoch Time: 0m 16s
	Train Loss: 0.285 | Train Acc: 78.41%
	 Val. Loss: 0.864 |  Val. Acc: 64.71%
Epoch: 

In [20]:
import gc
del model
gc.collect()

9

In [21]:
from sklearn.metrics import accuracy_score,classification_report,f1_score

def metrics_test(y_pred, y):
    accuracy = accuracy_score(y_pred,y)
    print("accuracy score is: ",accuracy)
    print("weighted_f1 is: ",f1_score(y_pred,y,average='weighted'))
    print("classification metric")
    print(classification_report(y_pred,y))
    
    return accuracy

def return_y_predictions(predictions):
    max_preds = predictions.argmax(dim = 1, keepdim = True)
    max_preds = max_preds.squeeze().cpu()
    #print(max_preds.shape)
    return max_preds

def evaluate_f1_accuracy_valid(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    epoch_acc= 0.0
    
    y_preds_list=[]
    y_true_list=[]
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):
            text = batch.text
            label = batch.label
            
            index_array = batch.index
        
            batch_elmo_vector = []

            for index_value in index_array:
                batch_elmo_vector.append(valid_elmo_tf_idf[index_value])

            batch_elmo_vector = torch.Tensor(batch_elmo_vector).to(device)

            output = model(text,batch_elmo_vector)

            loss = criterion(output, label)

            epoch_loss += loss.item()
            
            for true_value in label.squeeze().cpu():
                y_true_list.append(true_value.item())
            
            max_preds_tensor = return_y_predictions(output)
            for max_value in max_preds_tensor:
                y_preds_list.append(max_value.item())
            
        
    acc = metrics_test(y_preds_list,y_true_list)    
        
    return epoch_loss/len(iterator),y_preds_list

In [22]:
print(LABEL.vocab.stoi)
model = torch.load('model_'+lang+'.pt')
model = model.to(device)
valid_loss,y_preds_list = evaluate_f1_accuracy_valid(model, valid_iterator, criterion)
print(f'\t Val. Loss: {valid_loss:.3f}')

from collections import Counter
print(Counter(list(df_valid1['label'].values)))
print(Counter(y_preds_list))

defaultdict(None, {'Positive ': 0, 'Negative ': 1, 'Mixed_feelings ': 2, 'unknown_state ': 3, 'not-Tamil ': 4})
accuracy score is:  0.6880952380952381
weighted_f1 is:  0.8072992396997715
classification metric
              precision    recall  f1-score   support

           0       1.00      0.69      0.82      1235
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.03      0.40      0.05         5
           4       0.41      0.60      0.49        20

   micro avg       0.69      0.69      0.69      1260
   macro avg       0.29      0.34      0.27      1260
weighted avg       0.98      0.69      0.81      1260

	 Val. Loss: 0.650
Counter({'Positive ': 857, 'Negative ': 165, 'Mixed_feelings ': 141, 'unknown_state ': 68, 'not-Tamil ': 29})
Counter({0: 1235, 4: 20, 3: 5})


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [23]:
def evaluate_test(model, iterator):
    
    model.eval()
    
    y_preds_list=[]
    index_list = []
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):
            text = batch.text
            #label = batch.label
            
            index_array = batch.index
            #print(index_array)
        
            batch_elmo_vector = []

            for index_value in index_array:
                batch_elmo_vector.append(test_elmo_tf_idf[index_value])

            batch_elmo_vector = torch.Tensor(batch_elmo_vector).to(device)

            output = model(text,batch_elmo_vector)
            
            max_preds_tensor = return_y_predictions(output)
            
            for index_value in index_array:
                index_list.append(index_value.cpu().item())
            
            for max_value in max_preds_tensor:
                y_preds_list.append(max_value.item())
        
    return index_list,y_preds_list

In [24]:
from collections import OrderedDict
index_list,y_preds_list = evaluate_test(model, test_iterator)

dict_index_y_value = {}

for index,y_value in zip(index_list,y_preds_list):
    dict_index_y_value[index] = LABEL.vocab.itos[y_value]

od = OrderedDict(sorted(dict_index_y_value.items()))

In [25]:
sorted_y_predicts = []
for k,v in od.items():
    sorted_y_predicts.append(v)

In [26]:
test_sentences = list(df_test['text'].values)
test_id = list(df_test['id'].values)
test_final = pd.DataFrame(zip(test_id,test_sentences,sorted_y_predicts),columns=['id','text','label'])

In [27]:
test_final.to_csv("CMSA11-One_"+lang+".tsv",sep="\t",index=False)