In [1]:
!pip install transformers==2.5.0
!pip install SpaCy tokenizer 
!pip install ftfy



In [0]:
import numpy as np
import pandas as pd
import transformers
from transformers import AdamW, XLMTokenizer, XLMModel, GPT2Tokenizer, GPT2LMHeadModel, XLMWithLMHeadModel
from torch.utils.data import Dataset
import torch
import re
from torch import nn
from sklearn.model_selection import train_test_split
import re

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
f = open("/content/drive/My Drive/rus.txt", "r")
english = []
russian = []
for line in f:
    words = line.split('\t')
    russian.append(words[1])
    english.append(words[0])

In [0]:
data = pd.DataFrame(russian, columns = ['russian'])
data['english'] = english

In [0]:
pattern = re.compile('[A-Za-zА-Яа-яЁёЙй?-]|\s')
data['russian'] = [''.join(re.findall(pattern, x)) for x in data['russian']]
data['english'] = [''.join(re.findall(pattern, x)) for x in data['english']]
data['russian'] = data['russian'] + [' <to_eng>']*len(data)
data['lang'] = data['russian'] + ' '+ data['english']

In [0]:
data = data[:50000]

In [8]:
data

Unnamed: 0,russian,english,lang
0,Марш <to_eng>,Go,Марш <to_eng> Go
1,Иди <to_eng>,Go,Иди <to_eng> Go
2,Идите <to_eng>,Go,Идите <to_eng> Go
3,Здравствуйте <to_eng>,Hi,Здравствуйте <to_eng> Hi
4,Привет <to_eng>,Hi,Привет <to_eng> Hi
...,...,...,...
49995,У меня есть работа которую нужно сделать <to_eng>,I have a job to do,У меня есть работа которую нужно сделать <to_e...
49996,Мне ещё нужно сделать работу <to_eng>,I have a job to do,Мне ещё нужно сделать работу <to_eng> I have a...
49997,Мне ещё нужно делать работу <to_eng>,I have a job to do,Мне ещё нужно делать работу <to_eng> I have a ...
49998,У меня тут список <to_eng>,I have a list here,У меня тут список <to_eng> I have a list here


In [0]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [0]:
class TranslateDataset(Dataset):
    def __init__(self, dataset, suftrain=True):
        self.df = dataset
        self.suftrain = suftrain
        X_train, X_test = train_test_split(self.df, test_size=0.2, random_state=42)
        if suftrain == True:
          self.df = X_train[['russian','english']].reset_index(drop=True)
        else:
          self.df = X_test[['russian','english']].reset_index(drop=True)
        
        self.tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-tlm-xnli15-1024")
        self.tokenizer.add_tokens(['<to_eng>'])
        # self.tokenizer_eng = GPT2Tokenizer.from_pretrained('gpt2')

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
    
          sentence_lang = self.df.loc[index, 'english']
          sentence_rus = self.df.loc[index, 'russian']
          tokens_ids_lang = self.tokenizer.encode(sentence_lang)
          tokens_ids_rus = self.tokenizer.encode(sentence_rus)
          tokens_ids_tensor_lang = torch.tensor(tokens_ids_lang)
          tokens_ids_tensor_rus = torch.tensor(tokens_ids_rus)

          return tokens_ids_tensor_rus, tokens_ids_tensor_lang[1:]

In [0]:
train_dataset = TranslateDataset(data, True)
test_dataset = TranslateDataset(data, False)

In [0]:
tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-tlm-xnli15-1024")

In [13]:
train_dataset[1]

(tensor([    1,  2248,  1572, 12794, 26079,   457,   998, 95000,     1]),
 tensor([ 2323, 42118, 53124,     1]))

In [14]:
tokenizer.convert_ids_to_tokens(range(15))

['<s>',
 '</s>',
 '<pad>',
 '<unk>',
 '<special0>',
 '<special1>',
 '<special2>',
 '<special3>',
 '<special4>',
 '<special5>',
 '<special6>',
 '<special7>',
 '<special8>',
 '<special9>',
 ',</w>']

In [0]:
def pad_sequences(x, max_len):
    x = x[:-1]
    if len(x) <max_len:
        # if targ == False:
        padded = list(x.numpy()) + [2]*(max_len - len(x))
        # else:
        #   padded = list(x.numpy()) + [50256]*(max_len - len(x))
    else:
       padded = list(x.numpy())[:(max_len)]
    return padded    


class Collator(object):
    def __init__(self, percentile=100):
        self.percentile = percentile
        
    def __call__(self, batch):
        
        inp, targ = zip(*batch)
        
        lens = [len(x) for x in inp] 
        max_len = int(np.percentile(lens, self.percentile))
        
        lens_targ = [len(x) for x in targ]
        max_len_targ = int(np.percentile(lens_targ, self.percentile))

        inp = torch.from_numpy(np.array([pad_sequences(sentence, max_len) for sentence in inp]))
        targ = torch.from_numpy(np.array([pad_sequences(sentence, max_len) for sentence in targ]))
        
        
        return inp, targ


In [16]:
device

device(type='cuda')

In [0]:
#settings
batch_size = 64
#epoch_dec = 10
epochs_dec = 3
# learning_rate_dec = 0.0008
learning_rate_dec = 0.0005


In [0]:
collate = Collator(percentile=100)

In [0]:
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,batch_size=batch_size, shuffle=True, drop_last=True, collate_fn=collate)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,batch_size=batch_size, shuffle=False, drop_last=True, collate_fn=collate)

In [20]:
tokenizer.vocab_size

95000

In [0]:
class ModelTranslate(nn.Module):
    def __init__(self):
        super(ModelTranslate, self).__init__()
        
        self.enc = XLMModel.from_pretrained("xlm-mlm-tlm-xnli15-1024")
        # self.dec = XLMWithLMHeadModel.from_pretrained("xlm-mlm-tlm-xnli15-1024")
        self.dec = GPT2LMHeadModel.from_pretrained("gpt2")
        self.fc = nn.Linear(1024, 768)
        # self.fc1 = nn.Linear(1024, 95001)

    def forward(self,seq, labels, suf= 'train'):

          self.enc.requires_grad_=False
          outputs = self.enc(seq)[0]
          outputs = self.fc(outputs)
          self.dec.resize_token_embeddings(95001)
          self.dec.transformer.wte.parametres = outputs
          # for block in range(7,11):
          #   self.dec.transformer.h[block].requires_grad_=True
          # self.dec.lm_head.requires_grad_=True
          self.dec.requires_grad_=True
          self.dec.transformer.wte.parametres.requires_grad_=False
          # self.dec.pred_layer.requires_grad_=True
          # self.dec.transformer.layer_norm_emb.requires_grad_=True
          attention_mask = (seq > 4)*1
          # inputs_embeds = outputs
          
          logits = self.dec(seq, labels = seq, attention_mask=attention_mask)[1]
          # logits = self.dec(seq, labels=labels,attention_mask=attention_mask)[1]
       
          
          if suf == 'test':
            logits = self.dec(seq, attention_mask=attention_mask)[0]
          
            # for step in range(1,labels.size(1)):
            #   seq_tr = seq[:, :(step+1)]
            #   attention_mask = (seq_tr == 1)*1
            #   if step == 0:
            #     targets = labels[:,:(step+1)]
             
            #     logits = self.dec(seq_tr, labels = targets, attention_mask=attention_mask)[1]            
            #   if step > 0:
                
            #     targets = torch.cat([torch.argmax(logits, dim=-1), labels[:, step].view(-1,1)], dim = -1)
            #     logits = self.dec(seq_tr, labels = targets, attention_mask=attention_mask)[1]
         
          return logits

In [0]:
model =  ModelTranslate()

In [0]:
def loss_function(pred, real):
    """ Only consider non-zero inputs in the loss; mask needed """
    total_loss = 0
    crit = nn.CrossEntropyLoss()
    
    for i in range(len(real[0])):
      mask = real[:,i].ge(4).cuda()
      loss_ = crit(pred[:,i].cuda(), real[:,i].long())*mask
      total_loss  =  total_loss + torch.mean(loss_)
    return total_loss/len(real[0])

In [0]:
def evaluate(model, test_loader, device):
  model.eval()
  total_loss = 0
  for i, (seq_test, label_test) in enumerate(test_loader):
    seq_test, label_test = seq_test.to(device), label_test.to(device)
    
    with torch.no_grad():

      test_output = model(seq_test, label_test, 'test')
    
    test_loss = loss_function(test_output, label_test)
    total_loss = total_loss + test_loss
    return total_loss/(i+1)

In [0]:
def train(model, train_loader, test_loader, device, epochs_dec):
  model.to(device)
  model.train()
  
  optimizer = AdamW(model.parameters(), lr=learning_rate_dec)
  for epoch in range(epochs_dec):
    
    total_loss = 0
    for i , (seq, labels) in enumerate(train_loader):
      optimizer.zero_grad()
      
      seq, labels = seq.to(device), labels.to(device)

      logits = model(seq, labels)
     

      loss = loss_function(logits, labels)
      loss.backward()
      total_loss = total_loss + loss
     
      nn.utils.clip_grad_norm_(model.parameters(),1)
      
      optimizer.step()
      
      
    val_loss = evaluate(model, test_loader, device)
    print(f'Epoch {epoch}, Train_loss: {total_loss/(i+1)}, Val_loss: {val_loss}')
     
  return model

In [0]:
#firts 7 epochs
# Epoch 0, Train_loss: 2.0162532329559326, Val_loss: 1.8534671068191528
# Epoch 1, Train_loss: 1.5938363075256348, Val_loss: 1.6540483236312866
# Epoch 2, Train_loss: 1.4494097232818604, Val_loss: 1.5349314212799072
# Epoch 3, Train_loss: 1.3332741260528564, Val_loss: 1.439836859703064
# Epoch 4, Train_loss: 1.2357085943222046, Val_loss: 1.421500325202942
# Epoch 5, Train_loss: 1.1564744710922241, Val_loss: 1.4078181982040405
# Epoch 6, Train_loss: 1.0948387384414673, Val_loss: 1.433040976524353

# Epoch 0, Train_loss: 0.8229318857192993, Val_loss: 1.0416773557662964
# Epoch 1, Train_loss: 0.6625553965568542, Val_loss: 1.025605320930481
# Epoch 2, Train_loss: 0.6321516633033752, Val_loss: 1.0261223316192627

In [43]:
model = train(model, train_loader, test_loader, device, epochs_dec)

Epoch 0, Train_loss: 0.7088356614112854, Val_loss: 1.0548229217529297
Epoch 1, Train_loss: 0.6433326005935669, Val_loss: 0.9938598275184631
Epoch 2, Train_loss: 0.6314703822135925, Val_loss: 1.007136583328247


In [27]:
test_dataset[0]

(tensor([    1,   673, 41287,   906, 95000,     1]),
 tensor([ 6723,  1739, 51587,   202,     1]))

In [0]:
model.eval()

for i, (seq_test, label_test) in enumerate(test_loader):
    seq_test, label_test = seq_test.to(device), label_test.to(device)
    
    with torch.no_grad():

      test_output = model(seq_test, label_test, 'test')
    if i>=10:
      break

In [29]:
tokenizer_eng = XLMTokenizer.from_pretrained("xlm-mlm-tlm-xnli15-1024")
tokenizer_eng.add_tokens(['<to_eng>'])

1

In [47]:
for i in range(64):
  print('predict:', tokenizer_eng.decode(torch.argmax(test_output[i], dim=1)), ' true:', tokenizer_eng.decode(label_test[i]))

predict: take this one <pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>  true: take that one <pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
predict: tom is still sick <pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>  true: tom is still up <pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
predict: tom is thai <pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>  true: tom gloated <pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
predict: did was hear me me <pad><pad>me me am am anyone me <pad>  true: can anyone hear me? <pad><pad><pad><pad><pad><pad><pad><pad><pad>
predict: it is be <pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>  true: that was pathetic <pad><pad><pad><pad><pad><pad><pad><pad><pad>
predict: what did you hide? <pad><pad><pad><pad><pad><pad><pad><pad><pad>  true: what did you hide? <pad><pad><pad><pad><pad><pad><pad><pad><pad>
predict: i was with <pad><pad><pad><pad><pad><pad><pad><pad>i <pad><pad>  true: i agree with tom <pad><pad><pa

In [31]:
tokenizer_eng.decode(label_test[45])

'tom isnt old <pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [32]:
tokenizer_eng.decode(torch.argmax(test_output[45], dim=1))

'tom is ont old <pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [33]:
tokenizer.encode([','])

[1, 24406, 1]