In [1]:
!pip install openkiwi
!pip install torchtext==0.8.0 torch==1.7.0 pytorch-lightning==0.8.5


Collecting openkiwi
  Downloading openkiwi-2.1.0-py3-none-any.whl (186 kB)
[?25l[K     |█▊                              | 10 kB 20.0 MB/s eta 0:00:01[K     |███▌                            | 20 kB 9.9 MB/s eta 0:00:01[K     |█████▎                          | 30 kB 8.4 MB/s eta 0:00:01[K     |███████                         | 40 kB 7.8 MB/s eta 0:00:01[K     |████████▉                       | 51 kB 4.7 MB/s eta 0:00:01[K     |██████████▌                     | 61 kB 5.6 MB/s eta 0:00:01[K     |████████████▎                   | 71 kB 5.5 MB/s eta 0:00:01[K     |██████████████                  | 81 kB 5.4 MB/s eta 0:00:01[K     |███████████████▉                | 92 kB 6.0 MB/s eta 0:00:01[K     |█████████████████▋              | 102 kB 5.4 MB/s eta 0:00:01[K     |███████████████████▎            | 112 kB 5.4 MB/s eta 0:00:01[K     |█████████████████████           | 122 kB 5.4 MB/s eta 0:00:01[K     |██████████████████████▉         | 133 kB 5.4 MB/s eta 0:00:01[K

In [2]:
from transformers import BertModel
bert = BertModel.from_pretrained('bert-base-multilingual-cased')

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/714M [00:00<?, ?B/s]

In [3]:
from transformers import BertTokenizer, BertModel
model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
bert = BertModel.from_pretrained(model_name)

batch_sentence = ['This.', 'This.', 'Thissentecne.']

input_enc = tokenizer(batch_sentence, padding=True, truncation=True, return_tensors='pt')
print(input_enc)

output = bert(**input_enc)
last_hidden_state = output[0]
feature = last_hidden_state.mean(dim=1)
print(feature.shape)

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

{'input_ids': tensor([[  101, 10747,   119,   102,     0,     0,     0],
        [  101, 10747,   119,   102,     0,     0,     0],
        [  101, 10747, 30832, 15953, 10238,   119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1]])}
torch.Size([3, 768])


In [4]:
print(output[0].shape) ## only use output[0][0]

torch.Size([3, 7, 768])


In [5]:
print(output[0][0])

tensor([[ 0.3251, -0.1474,  0.5211,  ..., -0.2515,  0.1768,  0.1284],
        [ 0.0993, -0.1326,  0.9122,  ..., -0.2344,  0.2030,  0.1319],
        [ 0.3425, -0.1206,  0.7036,  ..., -0.2463, -0.1433,  0.0766],
        ...,
        [ 0.1638, -0.0412,  0.9428,  ..., -0.3129,  0.1093,  0.0201],
        [ 0.3387, -0.0936,  0.8877,  ..., -0.3077,  0.0774,  0.1036],
        [ 0.3463, -0.1507,  0.7943,  ..., -0.2378,  0.1089,  0.1300]],
       grad_fn=<SelectBackward>)


In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
import csv
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import pandas as pd
from torch import nn
from transformers.optimization import AdamW
from torch import optim

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

Using device: cuda



In [9]:
def read_annotated_file(path, index="index"):
    indices = []
    originals = []
    translations = []
    z_means = []
    with open(path, mode="r", encoding="utf-8-sig") as csvfile:
        reader = csv.DictReader(csvfile, delimiter="\t", quoting=csv.QUOTE_NONE)
        for row in reader:
            indices.append(row[index])
            originals.append(row["original"])
            translations.append(row["translation"])
            z_means.append(float(row["z_mean"]))

    return pd.DataFrame(
        {'index': indices,
         'original': originals,
         'translation': translations,
         'z_mean': z_means
         })

def read_test_file(path, index="index"):
    indices = []
    originals = []
    translations = []
    with open(path, mode="r", encoding="utf-8-sig") as csvfile:
        reader = csv.DictReader(csvfile, delimiter="\t", quoting=csv.QUOTE_NONE)
        for row in reader:
            indices.append(row[index])
            originals.append(row["original"])
            translations.append(row["translation"])

    return pd.DataFrame(
        {'index': indices,
         'original': originals,
         'translation': translations,
         })

In [10]:
TRAIN_FILE = "/content/drive/MyDrive/en-de/train.ende.df.short.tsv"
DEV_FILE = "/content/drive/MyDrive/en-de/dev.ende.df.short.tsv"
TEST_FILE = "/content/drive/MyDrive/en-de/test20.ende.df.short.tsv"
train_data = read_annotated_file(TRAIN_FILE)
dev_data = read_annotated_file(DEV_FILE)
test_data = read_annotated_file(TEST_FILE)
train_data = train_data[['original', 'translation', 'z_mean']]
dev_data = dev_data[['original', 'translation', 'z_mean']]
test_data = test_data[['original', 'translation', 'z_mean']]

In [11]:
class QEDataset(Dataset):
    def __init__(self, myDataset):
        self.dataset = myDataset
        self.original = myDataset['original']
        self.translation = myDataset['translation']
        self.z_mean = myDataset['z_mean']

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        return (self.original[index], self.translation[index], self.z_mean[index])


class QEDataLoader(DataLoader):
    def __init__(self, data_dir, batch_size):
        dataset = QEDataset(data_dir)
        super().__init__(dataset, batch_size=batch_size, collate_fn=self.collate_fn)
        return

    def collate_fn(self, batch):
        src_batch, trg_batch, score_batch = [], [], []
        for src_sample, trg_sample, score_sample in batch:
            src_batch.append(src_sample)
            trg_batch.append(trg_sample)
            score_batch.append(score_sample)
        # src_batch = pad_sequence(
        #     src_batch, padding_value=PAD_IDX, batch_first=True)
        # trg_batch = pad_sequence(
        #     trg_batch, padding_value=PAD_IDX, batch_first=True)
        # score_batch = torch.tensor(score_batch).unsqueeze(-1)
        return src_batch, trg_batch, score_batch


In [12]:
class Mlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x

class BertQE(nn.Module):
    def __init__(self, hidden_dim=768, model_name="bert-base-multilingual-cased"):
        super().__init__()
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.bert = BertModel.from_pretrained(model_name)
        self.linear1 = nn.Linear(hidden_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.output = Mlp(hidden_dim*2, hidden_dim*2, 1, drop=0.1)

    @staticmethod
    def merge_input(source, target):
        merge_sentence = source + target
        return merge_sentence
    
    def forward(self, batch_sentence):
        merge_sentence = self.merge_input(**batch_sentence)
        input_enc = self.tokenizer(merge_sentence, padding=True, truncation=True, return_tensors='pt')
        input_enc = input_enc.to(device)
        output = self.bert(**input_enc)

        last_hidden_state = output[0] # bs*2 L C
        batch_size = last_hidden_state.shape[0] // 2
        source_hidden_state, target_hidden_state = torch.split(last_hidden_state, batch_size, dim=0)
        source_feature = self.linear1(source_hidden_state).mean(dim=1, keepdim=True) # bs L C -> bs 1 C
        target_feature = self.linear1(target_hidden_state).mean(dim=1, keepdim=True) # bs L C -> bs 1 C
        feature = torch.cat([source_feature, target_feature], dim=-1) # bs 1 2C
        output = self.output(feature).reshape(-1) # bs 1 1
        return output


In [31]:
batch_size = 32

# Create data loaders.
train_dataloader = QEDataLoader(train_data, batch_size=batch_size)
dev_dataloader = QEDataLoader(dev_data, batch_size=batch_size)
test_dataloader = QEDataLoader(test_data, batch_size=batch_size)
loss_fn = nn.MSELoss()
model = BertQE().to(device)
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
optimizer = optim.Adam(model.parameters(), lr=0.0001)


In [14]:
torch.cuda.empty_cache()

In [16]:
epoc = 1

torch.cuda.empty_cache()

def train():
    for i in range(epoc):
        epoch_loss = 0.0
        count = 0
        for batch, (src_batch, trg_batch, score_batch) in enumerate(train_dataloader):
            batch_sentence = dict()
            batch_sentence['source'] = src_batch
            batch_sentence['target'] = trg_batch
            output = model(batch_sentence)
            # print(output)
            # print(type(output))
            # print(type(score_batch))
            score_batch = torch.Tensor(score_batch)
            score_batch = score_batch.to(device)
            loss = loss_fn(output, score_batch)
            epoch_loss = epoch_loss + loss.item()
            count = count + 1
            if count % 100 == 0:
                print("Loss in the " + str(count) + " batch is " + str(loss.item()))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print("Loss in the " + str(count) + " batch is " + str(epoch_loss / count))            

if __name__ == '__main__':
    train()


Loss in the 100 batch is tensor(2.0223, device='cuda:0', grad_fn=<MseLossBackward>)
Loss in the 200 batch is tensor(0.6614, device='cuda:0', grad_fn=<MseLossBackward>)
Loss in the 219 batch is 0.7008488316037883


In [38]:
def test(dataloader, model, loss_fn):
    model.eval()
    epoch_loss = 0
    count = 0
    res = torch.ones(1).to(device)
    
    with torch.no_grad():
        for batch, (src_batch, trg_batch, score_batch) in enumerate(dataloader):
          batch_sentence = dict()
          batch_sentence['source'] = src_batch
          batch_sentence['target'] = trg_batch
          output = model(batch_sentence)
          res = torch.cat((res,output)) 
          # print(output)
          # print(type(output))
          # print(type(score_batch))
          score_batch = torch.Tensor(score_batch).to(device)
          loss = loss_fn(output, score_batch)
          epoch_loss = epoch_loss + loss.item()
          count = count + 1
    print("Loss in the " + str(count) + " batch is " + str(epoch_loss / count))
    return res

if __name__ == '__main__':
    res = test(test_dataloader, model,loss_fn)

Loss in the 32 batch is 0.4840969294309616


In [39]:
from scipy import stats

res = res[1:]
res = res.cpu()
res = res.detach().numpy()

pr = stats.pearsonr(res,test_data['z_mean'])
sp = stats.spearmanr(res,test_data['z_mean'])
pr = pr[0]
sp = sp[0]

print(pr)
print(sp)

(1000,)
(1000,)
-0.038087580409256326
-0.11010945035542923


In [None]:
from kiwi.lib.pretrain import pretrain_from_file
import kiwi

In [None]:
path = "MyDrive/Datasets/shared"
model_path = path + "/estimator_en_de.torch"
data_path = "/content/drive/MyDrive/Datasets/shared/en-de/train.ende.df.short.tsv"
en_de_df = pd.read_csv(data_path, sep='\t')


In [None]:
ori = en_de_df.get("original").to_numpy()
trans = en_de_df.get("translation").to_numpy()
tar_mean = en_de_df.get("mean").to_numpy()

In [None]:
def save_config(yaml_config, name):
    """ Writes yaml config to file"""
    with open(name, 'w') as outfile:
        yaml.dump(yaml_config, outfile, default_flow_style=False)
save_config("/content/drive/MyDrive/Datasets/shared/nuqe.yaml",'nuqe_config.yml')

In [None]:
config = '/content/drive/MyDrive/Datasets/shared/nuqe.yaml'
run_info = kiwi.lib.train.train_from_file(config)