In [114]:
!pip install transformers
!pip install tensorboardX
import glob
import logging
import math
import os
import random
import numpy as np
import pandas as pd
import torch
from scipy.stats import mode
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from tqdm.auto import tqdm, trange
logger = logging.getLogger(__name__)
from scipy.stats import pearsonr, spearmanr
from sklearn import preprocessing
import csv
from multiprocessing import cpu_count
from google.colab import drive
drive.mount('/content/drive')
from torch.nn import functional as F
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm, trange
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch import nn
from torch.nn import MSELoss
import torch
import math
import itertools

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
class config():
    def __init__(self):
        self.num_heads = 16
        self.pad_index = 2
        self.num_layers = 12
        self.unk_index = 3
        self.vocab_size = 30145
        self.pad_token = 2
        self.norm_eps = 1e-12
        self.init_std = 0.03
        self.emb_dim = 2048
        self.eos_index = 1
        self.max_pos_em = 512
        self.num_words = 30145
configuration = config()

In [None]:
class MultiHeadAtt(nn.Module):
    NEW_ID = itertools.count()
    def __init__(self, heads, dim, attention_dropout):
        super().__init__()
        self.layer_id = next(MultiHeadAtt.NEW_ID)
        self.dim = dim
        self.heads = heads
        self.dropout = attention_dropout
        self.q_lin = nn.Linear(dim, dim)
        self.k_lin = nn.Linear(dim, dim)
        self.v_lin = nn.Linear(dim, dim)
        self.out_lin = nn.Linear(dim, dim)
    def forward(self, input, mask, kv=None, cache=None, head_mask=None, output_attentions=False):
        def shape(x):
            return x.view(batch_size, -1, self.heads, dim_per_head).transpose(1, 2)
        def unshape(x):
            return x.transpose(1, 2).contiguous().view(batch_size, -1, self.heads * dim_per_head)
        batch_size, q_len, dim = input.size()
        if kv is None:
            k_len = q_len
        else:
            k_len = kv.size(1)
        n_heads = self.n_heads
        dim_per_head = self.dim // n_heads
        mask_reshape = (batch_size, 1, q_len, k_len) if mask.dim() == 3 else (batch_size, 1, 1, k_len)
        q = shape(self.q_lin(input)) 
        if kv is None:
            k = shape(self.k_lin(input)) 
            v = shape(self.v_lin(input)) 
        elif self.layer_id not in cache:
            k = v = kv
            k = shape(self.k_lin(k)) 
            v = shape(self.v_lin(v)) 
        sqrt = np.sqrt(self.heads)
        q = q / sqrt
        k_t = k.transpose(2,3)
        scores = torch.matmul(q, k_t) 
        mask = (mask == 0).reshape(mask_reshape).expand_as(scores) 
        scores.masked_fill_(mask, -1 * np.inf)
        weights = nn.functional.softmax(scores.float(), dim=-1)
        weights = weights.type_as(scores).dropout(weights, p=self.dropout)
        context = torch.matmul(weights, v).transpose(1, 2).contiguous().view(batch_size, -1, heads * dim_per_head)
        outputs = (self.out_lin(context),)
        if output_attentions:
            outputs = outputs + (weights,)
        return outputs

In [None]:
class FeedForward(nn.Module):
    def __init__(self, in_dim, dim_hidden, out_dim, chunk_size_feed_forward,dropout):
        super().__init__()
        self.dropout = dropout
        self.lin1 = nn.Linear(in_dim, dim_hidden)
        self.lin2 = nn.Linear(dim_hidden, out_dim)
        self.act = nn.functional.relu
        self.size_feed_forward = chunk_size_feed_forward
        self.seq_len_dim = 1

    def forward(self, input):
        tensor_shape = input[0].shape[self.chunk_size_feed_forward]
        num_chunks = input[0].shape[self.seq_len_dim]
        num_chunks = num_chunks // self.size_feed_forward
        input_tensors_chunks = tuple(input.chunk(num_chunks, dim=self.seq_len_dim) for input_tensor in input)
        output_chunks = tuple(self.ff_chunk(*input_tensors_chunk) for input_tensors_chunk in zip(*input_tensors_chunks))
        return torch.cat(output_chunks,dim=self.seq_len_dim)

    def ff_chunk(self, input):
        x = self.lin1(input)
        x = self.act(x)
        x = self.lin2(x)
        x = nn.functional.dropout(x, p=self.dropout)
        return x

In [None]:
class XLMPreTrainedModel():
    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)
    def init_weights(self, module):
        if isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0, std=self.config.embed_init_std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        if isinstance(module, nn.Linear):
            if self.config is not None and self.config.init_std is not None:
                nn.init.normal_(module.weight, mean=0, std=self.config.init_std)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0.0)
        if isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()
            

In [None]:
def get_masks(slen, lengths, causal, padding_mask=None):
    alen = torch.arange(slen, dtype=torch.long, device=lengths.device)
    mask = padding_mask
    bs = lengths.size(0)
    attn_mask = mask
    return mask, attn_mask

In [None]:
class XLMModel():
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, config):
        super().__init__(config)
        self.n_langs = 2
        self.n_words = 30145
        self.eos_index = 1
        self.pad_index = 2
        self.dim = 2048  
        self.hidden_dim = 2048 * 4
        self.n_heads = 16
        self.dropout = 0.02
        self.attention_dropout = 0.02
        self.max_pos_em = nn.Embedding(512,2048)
        self.lang_embeddings = nn.Embedding(2,2048)
        self.embeddings = nn.Embedding(self.n_words, 2048, padding_idx=2)
        self.layer_norm_emb = nn.LayerNorm(2048, eps=1e-12)
        self.attentions = nn.ModuleList()
        self.layer_norm1 = nn.ModuleList()
        self.ffns = nn.ModuleList()
        self.layer_norm2 = nn.ModuleList()
        for _ in range(12):
            self.attentions.append(MultiHeadAtt(self.n_heads, self.dim, config=config))
            self.layer_norm1.append(nn.LayerNorm(self.dim, eps=config.norm_eps))
            self.ffns.append(FeedForward(self.dim, self.hidden_dim, self.dim, config=config))
            self.layer_norm2.append(nn.LayerNorm(self.dim, eps=config.norm_eps))
        self.post_init()
        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
    def get_input_embeddings(self):
        return self.embeddings
    def set_input_embeddings(self, new_embeddings):
        self.embeddings = new_embeddings
    def forward(self,input_ids,attention_mask,langs,token_type_ids,position_ids,lens,cache,head_mask,inputs_embeds,output_attentions,output_hidden_states,return_dict,):
        batch_size, sentence_length = input_ids.size()
        device = input_ids.device if input_ids is not None else inputs_embeds.device
        mask, attn_mask = get_masks(sentence_length, lens, True, padding_mask=attention_mask)
        head_mask = self.get_head_mask(head_mask, 12)
        tensor = inputs_embeds + self.position_embeddings(position_ids).expand_as(inputs_embeds)
        tensor = self.layer_norm_emb(tensor)
        tensor = nn.functional.dropout(tensor, p=self.dropout, training=self.training)
        tensor *= mask.unsqueeze(-1).to(tensor.dtype)
        hidden_states = () if output_hidden_states else None
        attentions = () if output_attentions else None
        for i in range(12):
            if output_hidden_states:
                hidden_states = hidden_states + (tensor,)
            attn_outputs = self.attentions[i](
                tensor,
                attn_mask,
                cache=cache,
                head_mask=head_mask[i],
                output_attentions=output_attentions,
            )
            attn = attn_outputs[0]
            if output_attentions:
                attentions = attentions + (attn_outputs[1],)
            attn = nn.functional.dropout(attn, p=self.dropout, training=self.training)
            tensor = tensor + attn
            tensor = self.layer_norm1[i](tensor)
            tensor = tensor + self.ffns[i](tensor)
            tensor = self.layer_norm2[i](tensor)
            tensor *= mask.unsqueeze(-1).to(tensor.dtype)
        return (tensor, hidden_states, attentions)

In [None]:
class XMLModel__():
    def __init__(self,config):
        super().__init__(config)
        self.transformer = XLMModel(config)
        self.mlp = nn.Sequential(
            nn.Linear(128*2048,128),
            nn.ReLU(),
            nn.Linear(128,64),
            nn.ReLU(),
            nn.Linear(64,1)
        )
    def forward(
                self,
            input_ids=None,
            attention_mask=None,
            langs=None,
            token_type_ids=None,
            position_ids=None,
            lengths=None,
            cache=None,
            head_mask=None,
            inputs_embeds=None,
            labels = None
    ):        
        transformer_outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            langs=langs,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            lengths=lengths,
            cache=cache,
            head_mask=head_mask,
        )
        output = transformer_outputs[0]

        logit = self.mlp(output.reshape(-1,128 * 2048))
        
        loss_fn = MSELoss()
        loss = loss_fn(logit.view(-1),labels.view(-1))
        return loss,logit

In [18]:
def read_annotated_file(path, index="index"):
    indices = []
    originals = []
    translations = []
    z_means = []
    with open(path, mode="r", encoding="utf-8-sig") as csvfile:
        reader = csv.DictReader(csvfile, delimiter="\t", quoting=csv.QUOTE_NONE)
        for row in reader:
            indices.append(row[index])
            originals.append(row["original"])
            translations.append(row["translation"])
            z_means.append(float(row["z_mean"]))

    return pd.DataFrame(
        {'index': indices,
         'original': originals,
         'translation': translations,
         'z_mean': z_means
         })


def read_test_file(path, index="index"):
    indices = []
    originals = []
    translations = []
    with open(path, mode="r", encoding="utf-8-sig") as csvfile:
        reader = csv.DictReader(csvfile, delimiter="\t", quoting=csv.QUOTE_NONE)
        for row in reader:
            indices.append(row[index])
            originals.append(row["original"])
            translations.append(row["translation"])

    return pd.DataFrame(
        {'index': indices,
         'original': originals,
         'translation': translations,
         })

min_max_scaler = preprocessing.MinMaxScaler()


def fit(df, label):
    x = df[[label]].values.astype(float)
    x_scaled = min_max_scaler.fit_transform(x)
    df[label] = x_scaled
    return df
TRAIN_FILE = "/content/drive/MyDrive/Transquest/data/train.ende.df.short.tsv"
DEV_FILE = "/content/drive/MyDrive/Transquest/data/dev.ende.df.short.tsv"
TEST_FILE = "/content/drive/MyDrive/Transquest/data/test20.ende.df.short.tsv"
train = read_annotated_file(TRAIN_FILE)
dev = read_annotated_file(DEV_FILE)
test = read_test_file(TEST_FILE)
train = train[['original', 'translation', 'z_mean']]
dev = dev[['original', 'translation', 'z_mean']]
test = test[['index', 'original', 'translation']]

index = test['index'].to_list()
train = train.rename(columns={'original': 'text_a', 'translation': 'text_b', 'z_mean': 'labels'}).dropna()
dev = dev.rename(columns={'original': 'text_a', 'translation': 'text_b', 'z_mean': 'labels'}).dropna()
test = test.rename(columns={'original': 'text_a', 'translation': 'text_b'}).dropna()

test_sentence_pairs = list(map(list, zip(test['text_a'].to_list(), test['text_b'].to_list())))

train = fit(train, 'labels')
dev = fit(dev, 'labels')
train_df, eval_df = train_test_split(train, test_size=0.1, random_state=777)
train

Unnamed: 0,text_a,text_b,labels
0,José Ortega y Gasset visited Husserl at Freibu...,1934 besuchte José Ortega y Gasset Husserl in ...,0.842105
1,"However, a disappointing ninth in China meant ...",Eine enttäuschende Neunte in China bedeutete j...,0.685766
2,"In his diary, Chase wrote that the release of ...","In seinem Tagebuch, Chase schrieb, dass die Ve...",0.518692
3,Heavy arquebuses mounted on wagons were called...,Schwere Arquebuses auf Waggons montiert wurden...,0.655494
4,Once North Pacific salmon die off after spawni...,Sobald der nordpazifische Lachs nach dem Laich...,0.770374
...,...,...,...
6995,Some may also discourage or disallow unsanitar...,Einige können auch unhygienische Praktiken wie...,0.786245
6996,"In the late 1860s, the crinolines disappeared ...",In den späten 1860er Jahren verschwanden die K...,0.759626
6997,"Disco was criticized as mindless, consumerist,...","Disco wurde als geistlos, konsumistisch, überp...",0.625879
6998,Planters would then fill large hogsheads with ...,Die Pflanzer würden dann große Heuschrecken mi...,0.713105


In [None]:
train_df

Unnamed: 0,text_a,text_b,labels
4548,Unite in a single struggle against the fascist...,Eint euch in einem einzigen Kampf gegen den fa...,0.778310
6141,Harlan’s hawks usually have faint streaks on t...,Harlan Funde Falken haben in der Regel schwach...,0.723191
2420,Todd blames Patrick for the miscarriage and fr...,Todd beschuldigt Patrick für die Fehlgeburt un...,0.786648
5165,"Muskie retired to his home in Bethesda, Maryla...","Muskie zog sich 1981 nach Bethesda, Maryland, ...",0.774550
5294,Whereas the human spinal cord ends at the firs...,Während das menschliche Rückenmark beim ersten...,0.738736
...,...,...,...
2119,"""Easter, 1916"", a poem by the poet and playwri...","""Ostern 1916"", Gedicht des Dichters und Dramat...",0.791697
4017,He exhibits both Superman's abilities as well ...,Er zeigt sowohl Superman 's Fähigkeiten als au...,0.684787
2982,He commanded a squadron of obsolescent biplane...,Während der griechischen Kampagne im Zweiten W...,0.716973
6959,"When the Red Third Regiment defected, he order...","Als das Rote Dritte Regiment ausbrach, ordnete...",0.784999


In [None]:
class Inputobj(object):
    def __init__(self,guid,text_a,text_b,label):
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label
class InputFeatures(object):
    def __init__(self,input_id,input_mask,label_id):
        self.input_id = input_id
        self.input_mask = input_mask
        self.label_id = label_id

def convert(inputobj,tokenizer):
    token_a = tokenizer.tokenize(inputobj.text_a)
    token_b = tokenizer.tokenize(inputobj.text_b)
    token = token_a + [0,0,0,0] + token_b
    input_id = tokenizer.convert_tokens_to_ids(token)
    iput_mask = [1] * len(input_id)

    maxlength = 128
    pad_lenght = maxlength - len(input_id)
    input_id += [0]*pad_lenght
    iput_mask += [0]*pad_lenght
    return InputFeatures(
        input_id = input_id,
        input_mask = iput_mask,
        label_id = inputobj.label
    )
def load(input_objs):
    features = [convert(input_obj,tokenizer) for input_obj in tqdm(input_objs)]
    all_input_id = torch.tensor([f.input_id for f in features],dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],dtype=torch.long)
    all_label = torch.tensor([f.label_id for f in features],dtype=torch.float)
    dataset = TensorDataset(all_input_id,all_input_mask,all_label)
    return dataset

In [None]:
train_examples = [
                        Inputobj(i, text_a, text_b, label)
                        for i, (text_a, text_b, label) in enumerate(
                            zip(train_df["text_a"].astype(str), train_df["text_b"].astype(str), train_df["labels"])
                        )
                    ]

In [None]:
eval_examples = [
                        Inputobj(i, text_a, text_b, label)
                        for i, (text_a, text_b, label) in enumerate(
                            zip(eval_df["text_a"].astype(str), eval_df["text_b"].astype(str), eval_df["labels"])
                        )
                    ]

In [None]:
load([train_examples[0]])

  0%|          | 0/1 [00:00<?, ?it/s]

<torch.utils.data.dataset.TensorDataset at 0x7fb2123c3890>

# Make dataset

In [None]:
num_train_epochs = 0
epoch_to_train = 3
train_examples = [
    Inputobj(i, text_a, text_b, label)
    for i, (text_a, text_b, label) in enumerate(
        zip(train_df["text_a"].astype(str), train_df["text_b"].astype(str), train_df["labels"])
    )
]
train_dataset = load(train_examples)

  0%|          | 0/6300 [00:00<?, ?it/s]

In [None]:
eval_examples = [
    Inputobj(i, text_a, text_b, label)
    for i, (text_a, text_b, label) in enumerate(
        zip(eval_df["text_a"].astype(str), eval_df["text_b"].astype(str), eval_df["labels"])
    )
]
eval_dataset = load(eval_examples)

  0%|          | 0/700 [00:00<?, ?it/s]

In [None]:
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=8,
    num_workers=0,
)


In [None]:
eval_sampler = RandomSampler(eval_dataset)
eval_dataloader = DataLoader(
    eval_dataset,
    sampler=eval_sampler,
    batch_size=8,
    num_workers=0,
)


# Train

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
model = XMLModel__(configuration)
model.to(device)

In [None]:
def get_inputs_dict_cpu(batch):
    if isinstance(batch[0], dict):
        inputs = {key: value.squeeze() for key, value in batch[0].items()}
        inputs["labels"] = batch[1]
    else:
        batch = tuple(t for t in batch)

        inputs = {"input_ids": batch[0], "attention_mask": batch[1],"labels":batch[2]}
    
    return inputs

In [None]:
def get_inputs_dict(batch):
    if isinstance(batch[0], dict):
        inputs = {key: value.squeeze().to(device) for key, value in batch[0].items()}
        inputs["labels"] = batch[1].to(device)
    else:
        batch = tuple(t.to(device) for t in batch)

        inputs = {"input_ids": batch[0], "attention_mask": batch[1],"labels":batch[2]}
    
    return inputs

In [None]:
optimizer_grouped_parameters = []
custom_parameter_names = set()
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters.extend([{
            "params": [p for n, p in model.named_parameters() if n not in any(nd in n for nd in no_decay)],
            "weight_decay": 0,
        },
        {
            "params": [p for n, p in model.named_parameters() if n not in any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
)

In [None]:
train_iterator = trange(3, desc="Epoch", mininterval=0)
optimizer = AdamW(params=optimizer_grouped_parameters,lr=1e-5,eps=1e-8)
loss_logger = []
for _ in train_iterator:
    num_train_epochs += 1
    batch_iterator = tqdm(
                train_dataloader,
                desc=f"Running Epoch {num_train_epochs} of {epoch_to_train}",
                mininterval=0,
            )
    acc_loss=0
    for step, batch in enumerate(batch_iterator):
        inputs = get_inputs_dict(batch)
        optimizer.zero_grad()
        loss,pred = model(**inputs)
        acc_loss += loss.item()
        if step % 50 == 0 and step != 0:
            print("Loss for last 50 step: ",acc_loss / 50)
            loss_logger.append(acc_loss)
            acc_loss = 0
        loss.backward()
        optimizer.step()

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]



Running Epoch 4 of 3:   0%|          | 0/788 [00:00<?, ?it/s]

Loss for last 50 step:  0.005179809213150293
Loss for last 50 step:  0.007271254730876535
Loss for last 50 step:  0.007669006767100655
Loss for last 50 step:  0.00767713611247018
Loss for last 50 step:  0.006467962909955532
Loss for last 50 step:  0.01228592068888247
Loss for last 50 step:  0.006824234834057279
Loss for last 50 step:  0.009953202055767179
Loss for last 50 step:  0.005639701124164276
Loss for last 50 step:  0.005487351508345455
Loss for last 50 step:  0.005000545899383724
Loss for last 50 step:  0.006893630739068612
Loss for last 50 step:  0.00681942532537505
Loss for last 50 step:  0.0074210032820701596
Loss for last 50 step:  0.0065467971132602545


Running Epoch 5 of 3:   0%|          | 0/788 [00:00<?, ?it/s]

Loss for last 50 step:  0.006742433729814365
Loss for last 50 step:  0.00656072256504558
Loss for last 50 step:  0.0062048958439845595
Loss for last 50 step:  0.007020108090946451
Loss for last 50 step:  0.007529025145340711
Loss for last 50 step:  0.005570421252923552
Loss for last 50 step:  0.009573434856720268
Loss for last 50 step:  0.009024281345773488
Loss for last 50 step:  0.005046434628311545
Loss for last 50 step:  0.007048257152782753
Loss for last 50 step:  0.006724091008072719
Loss for last 50 step:  0.006851620989618823
Loss for last 50 step:  0.008397618449525907
Loss for last 50 step:  0.007971607851795853
Loss for last 50 step:  0.007953798378584906


Running Epoch 6 of 3:   0%|          | 0/788 [00:00<?, ?it/s]

Loss for last 50 step:  0.010259291696129366
Loss for last 50 step:  0.0046569944033399225
Loss for last 50 step:  0.010399358139839023
Loss for last 50 step:  0.00783132437383756
Loss for last 50 step:  0.003822579066618346
Loss for last 50 step:  0.007697078639175743
Loss for last 50 step:  0.006537603976321407
Loss for last 50 step:  0.005673880093963817
Loss for last 50 step:  0.007742077191360295
Loss for last 50 step:  0.004723515888908878
Loss for last 50 step:  0.00584861867595464
Loss for last 50 step:  0.006261139345588163
Loss for last 50 step:  0.0053042378986719995
Loss for last 50 step:  0.007031543590128422
Loss for last 50 step:  0.009382630471372977


In [None]:
model = XMLModel__(configuration)
model.load_state_dict(torch.load('/content/drive/MyDrive/Transquest/model/model.pth'))

In [None]:
model.to(device)
model.eval()

In [None]:
from scipy.stats import pearsonr, spearmanr
global_pred = []
global_labels = []
batch_iterator = tqdm(
            eval_dataloader,
            desc=f"Running eval",
            mininterval=0,
        )
with torch.no_grad():
    for step, batch in enumerate(batch_iterator):
        inputs = get_inputs_dict(batch)
        _,pred = model(**inputs)
        label = inputs["labels"]
        global_pred.append(pred)
        global_labels.append(label)



Running eval:   0%|          | 0/88 [00:00<?, ?it/s]

In [None]:
torch.save(model.state_dict(), "/content/drive/MyDrive/Transquest/model/model.pth")

In [None]:
!sudo fuser /dev/nvidia*


/dev/nvidia0:         3099m
/dev/nvidiactl:       3099m
/dev/nvidia-uvm:      3099m


In [None]:
!kill -9 3099

In [50]:
tokenier_de(train_df.iloc[0]["text_b"].lower)

Eint euch in einem einzigen Kampf gegen den faschistischen Eindringling.