In [1]:
!pip install transformers
!pip install SpaCy tokenizer 
!pip install ftfy



In [0]:
import numpy as np
import pandas as pd
import transformers
from transformers import AdamW, XLMTokenizer, XLMModel, GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import Dataset
import torch
import re
from torch import nn
from sklearn.model_selection import train_test_split

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
f = open("/content/drive/My Drive/rus.txt", "r")
english = []
russian = []
for line in f:
    words = line.split('\t')
    russian.append(words[1])
    english.append(words[0])

In [0]:
data = pd.DataFrame(russian, columns = ['russian'])
data['english'] = english

In [0]:
data = data[:20000]

In [0]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [0]:
class TranslateDataset(Dataset):
    def __init__(self, dataset, suftrain=True):
        self.df = dataset
        X_train, X_test = train_test_split(self.df, test_size=0.2, random_state=42)
        if suftrain == True:
          self.df = X_train.reset_index(drop=True)
        else:
          self.df = X_test.reset_index(drop=True)
        
        self.tokenizer_rus = XLMTokenizer.from_pretrained("xlm-mlm-tlm-xnli15-1024")
        self.tokenizer_eng = GPT2Tokenizer.from_pretrained('gpt2')

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        sentence_rus = self.df.loc[index, 'russian']
        sentence_eng = self.df.loc[index, 'english']
        tokens_ids_rus = self.tokenizer_rus.encode(sentence_rus)
        tokens_ids_eng = self.tokenizer_eng.encode(sentence_eng)
        tokens_ids_tensor_rus = torch.tensor(tokens_ids_rus)
        tokens_ids_tensor_eng = torch.tensor(tokens_ids_eng)
        
        return tokens_ids_tensor_rus, tokens_ids_tensor_eng

In [0]:
train_dataset = TranslateDataset(data, True)
test_dataset = TranslateDataset(data, False)

In [0]:
def pad_sequences(x, max_len, targ):
    if len(x) <max_len:
        if targ == False:
          padded = list(x.numpy()) + [2]*(max_len - len(x))
        else:
          padded = list(x.numpy()) + [50256]*(max_len - len(x))
    else:
       padded = list(x.numpy())[:(max_len)]
    return padded    


class Collator(object):
    def __init__(self, percentile=100):
        self.percentile = percentile
        
    def __call__(self, batch):

        
        #inp, targ, inp3, lens, lens_targ
        
        inp, targ = zip(*batch)
        
        lens = [len(x) for x in inp] 
        max_len = int(np.percentile(lens, self.percentile))
        
        lens_targ = [len(x) for x in targ]
        max_len_targ = int(np.percentile(lens_targ, self.percentile))

        inp = torch.from_numpy(np.array([pad_sequences(sentence, max_len, False) for sentence in inp]))
        targ = torch.from_numpy(np.array([pad_sequences(sentence, max_len, True) for sentence in targ]))
        
        
        return inp, targ

In [11]:
device

device(type='cuda')

In [0]:
#settings
batch_size = 100
epochs_dec = 15
learning_rate_dec = 0.00001


In [0]:
collate = Collator(percentile=100)

In [0]:
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,batch_size=batch_size, shuffle=True, drop_last=True, collate_fn=collate)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,batch_size=batch_size, shuffle=False, drop_last=True, collate_fn=collate)

In [0]:
class ModelTranslate(nn.Module):
    def __init__(self):
        super(ModelTranslate, self).__init__()
        
        self.enc = XLMModel.from_pretrained("xlm-mlm-tlm-xnli15-1024")
        self.dec = GPT2LMHeadModel.from_pretrained('gpt2')
        self.fc = nn.Linear(1024, 768)
        
    def forward(self,seq, labels, suf= 'train'):

          outputs = self.enc(seq)[0]
          outputs = self.fc(outputs)
          self.dec.transformer.wte.parametres = outputs
          for block in range(7,11):
            self.dec.transformer.h[block].requires_grad_=True
          self.dec.lm_head.requires_grad_=True
          self.dec.resize_token_embeddings(95000)
          logits = self.dec(seq, labels = labels)[1]  
          if suf == 'test':
            for step in range(1,labels.size(1)):
              seq_tr = seq[:, :(step+1)]
              if step == 1:
                targets = labels[:,:(step+1)]
                logits = self.dec(seq_tr, labels = targets)[1]            
              if step > 1:
                targets = torch.cat([torch.argmax(logits, dim=-1), labels[:, step].view(-1,1)], dim = -1)
                logits = self.dec(seq_tr, labels = targets)[1]
          return logits

In [0]:
model =  ModelTranslate()

In [0]:
m = GPT2LMHeadModel.from_pretrained('gpt2')

In [18]:
m.resize_token_embeddings()

Embedding(50257, 768)

In [0]:
def loss_function(pred, real):
    """ Only consider non-zero inputs in the loss; mask needed """
    total_loss = 0
  
    for i in range(len(real[0])):
        
      mask = real[:,i].ge(1).cuda()
      crit = nn.CrossEntropyLoss()
      loss_ = crit(pred[:,i].cuda(), real[:,i].long()) * mask 
      total_loss  =  total_loss + torch.mean(loss_)
    return total_loss/len(real[0])

In [0]:
def evaluate(model, test_loader, device):
  model.eval()
  total_loss = 0
  for i, (seq_test, label_test) in enumerate(test_loader):
    seq_test, label_test = seq_test.to(device), label_test.to(device)
    
    with torch.no_grad():

      test_output = model(seq_test, label_test, 'test')
    
    test_loss = loss_function(test_output, label_test)
    total_loss = total_loss + test_loss
    return total_loss/(i+1)

In [0]:
def train(model, train_loader, test_loader, device, epochs_dec):
  model.to(device)
  model.train()
  
  optimizer = AdamW(model.parameters(), lr=learning_rate_dec)
  scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)
  for epoch in range(epochs_dec):
    
    total_loss = 0
    for i , (seq, labels) in enumerate(train_loader):
      optimizer.zero_grad()
      
      seq, labels = seq.to(device), labels.to(device)

      logits = model(seq, labels)

      loss = loss_function(logits, labels)
      loss.backward()
      total_loss = total_loss + loss
     
      nn.utils.clip_grad_norm_(model.parameters(),0.5)

      optimizer.step()
      
    val_loss = evaluate(model, test_loader, device)
    print(f'Epoch {epoch}, Train_loss: {total_loss/(i+1)}, Val_loss: {val_loss}')
     
  return model

In [22]:
model = train(model, train_loader, test_loader, device, epochs_dec)

Epoch 0, Train_loss: 10.439287185668945, Val_loss: 1.6872849464416504
Epoch 1, Train_loss: 1.6729850769042969, Val_loss: 1.4561740159988403
Epoch 2, Train_loss: 1.5588891506195068, Val_loss: 1.3970799446105957
Epoch 3, Train_loss: 1.4921631813049316, Val_loss: 1.3572216033935547
Epoch 4, Train_loss: 1.437381625175476, Val_loss: 1.3161941766738892
Epoch 5, Train_loss: 1.4011157751083374, Val_loss: 1.2981739044189453
Epoch 6, Train_loss: 1.3514660596847534, Val_loss: 1.2747669219970703
Epoch 7, Train_loss: 1.312738060951233, Val_loss: 1.2599109411239624
Epoch 8, Train_loss: 1.2797749042510986, Val_loss: 1.242605447769165
Epoch 9, Train_loss: 1.2446924448013306, Val_loss: 1.2330267429351807
Epoch 10, Train_loss: 1.20681631565094, Val_loss: 1.2241095304489136
Epoch 11, Train_loss: 1.177462100982666, Val_loss: 1.2641935348510742
Epoch 12, Train_loss: 1.1502277851104736, Val_loss: 1.225440263748169
Epoch 13, Train_loss: 1.1105057001113892, Val_loss: 1.2147396802902222
Epoch 14, Train_loss: 1

In [23]:
test_dataset[0]

(tensor([    1,   435, 37372,  4300,    15,     1]),
 tensor([  40, 1101,  262, 4870,   13]))

In [0]:
for i, (seq_test, label_test) in enumerate(test_loader):
    seq_test, label_test = seq_test.to(device), label_test.to(device)
    
    with torch.no_grad():

      test_output = model(seq_test, label_test, 'test')
    if i>=1:
      break

In [0]:
tokenizer_eng = GPT2Tokenizer.from_pretrained('gpt2')

In [48]:
tokenizer_eng.decode(torch.argmax(test_output[18], dim=1))

'I crying.<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>'

In [49]:
tokenizer_eng.decode(label_test[18])

'Stop yelling.<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>'