In [3]:
from pathlib import Path
import json
import random

from itertools import product, permutations

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
tqdm.pandas()

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
IV127 = Path('/content/drive/My Drive/IV127')
filenames = list((IV127 / 'english').iterdir())

In [5]:
def get_object_from_string(row):
    try:
        return eval(row)['sentences']
    except SyntaxError:
        return None

system_ps_problem = filenames[0]
system_ps = filenames[1]
stavba_vet = filenames[2]

system_ps_df = pd.read_csv(system_ps, sep=';')
system_ps_df = system_ps_df[system_ps_df.exercise == 60].loc[:, ['id', 'url']].rename(columns={"id": "ps"})
system_ps_problem_df = pd.read_csv(system_ps_problem, sep=';')
stavba_vet_df = pd.read_csv(stavba_vet, sep=';', converters={'sentence': get_object_from_string}, usecols=['id', 'sentence'])

In [6]:
mapping = system_ps_df.merge(system_ps_problem_df, how='inner', on='ps')
df = mapping.merge(stavba_vet_df, how='inner', left_on='problem', right_on='id').drop(['id_x', 'id_y', 'ps'],axis=1)
df['level'] = df['url'].str.extract(pat=r'-(\d)-*')
df['topic'] = df['url'].str.replace('doplnovani-slovicek-', '').str.replace('-\d-*', '').str.replace('uroven', '')
df['fullname'] = df['url'].str.replace('doplnovani-slovicek-', '')
df = df.drop('url', axis=1)

In [7]:
original_size = len(df)
df.head()

Unnamed: 0,problem,sentence,level,topic,fullname
0,42,"[{'sentence': 'Small children are very _ .', '...",2,pridavna-jmena,pridavna-jmena-2-uroven
1,43,"[{'sentence': 'Petrol is no longer a _ fuel.',...",2,pridavna-jmena,pridavna-jmena-2-uroven
2,44,"[{'sentence': 'He is a very _ swimmer.', 'tran...",2,pridavna-jmena,pridavna-jmena-2-uroven
3,45,"[{'sentence': 'He used to be a _ man.', 'trans...",2,pridavna-jmena,pridavna-jmena-2-uroven
4,46,"[{'sentence': 'Iron is more _ than gold.', 'tr...",2,pridavna-jmena,pridavna-jmena-2-uroven


In [8]:
df = df.dropna(axis=1, how='all').dropna()
cleaned_size = len(df)
print(f'Cannot process {original_size - cleaned_size} rows from {original_size} rows.')

Cannot process 3 rows from 656 rows.


In [9]:
df_exploded = df.explode('sentence')
df_exploded['solutions'] = df_exploded.apply(lambda row: row.sentence['solution'][0], axis=1)
df_exploded['sentence'] = df_exploded.apply(lambda row: row.sentence['sentence'], axis=1)
solutions = df_exploded.groupby('problem').agg(list)['solutions'].to_frame()
merged = solutions.merge(df_exploded, on='problem', left_index=True, suffixes=('_agg', None))
df = merged.copy(deep=True)

In [10]:
df.head()

Unnamed: 0,problem,solutions_agg,sentence,level,topic,fullname,solutions
18,13,"[bathroom, kitchen, garden, carpet]",The _ is at the end of the hall.,2,dum-casti-domu,dum-casti-domu-2-uroven,bathroom
18,13,"[bathroom, kitchen, garden, carpet]","Tom is in the _ , cutting up some vegetables.",2,dum-casti-domu,dum-casti-domu-2-uroven,kitchen
18,13,"[bathroom, kitchen, garden, carpet]",He grows tomatoes in the _ .,2,dum-casti-domu,dum-casti-domu-2-uroven,garden
18,13,"[bathroom, kitchen, garden, carpet]",The dog is sitting on the _ .,2,dum-casti-domu,dum-casti-domu-2-uroven,carpet
19,14,"[mirror, floor, chair, shower]",Look at yourself in the _ .,2,dum-casti-domu,dum-casti-domu-2-uroven,mirror


In [11]:
grouped = df.groupby('problem')[['problem', 'sentence', 'solutions', 'fullname']].agg({'problem': list, 'sentence': list, 'solutions': list, 'fullname': 'first'})

### Random assigner


In [None]:
from numpy import random as rnd

In [None]:
df = merged.copy(deep=True)
df['pred'] =  df.apply(lambda row: row.solutions_agg[random.randint(0, len(row.solutions_agg) - 1)], axis=1)
df.to_csv(IV127 / 'random.csv', index=False)
(df['pred'] == df.solutions).sum() / len(df) * 100

28.61247947454844

In [None]:
df = grouped.copy(deep=True)
df['pred'] = df.apply(lambda row: rnd.permutation(row.solutions), axis=1)
exploded = df.apply(pd.Series.explode)
exploded.to_csv(IV127 / 'random_bijection.csv', index=False)
(exploded['pred'] == exploded.solutions).sum() / len(exploded) * 100

26.888341543513956

In [None]:
df.apply(lambda row: all([pred == sol for pred, sol in zip(row.pred, row.solutions)]),axis=1).sum() / len(df) * 100

8.422664624808576

## word2vec-google-news-300

In [None]:
!pip install autocorrect

Collecting autocorrect
[?25l  Downloading https://files.pythonhosted.org/packages/a0/71/eb8c1f83439dfe6cbe1edb03be1f1110b242503b61950e7c292dd557c23e/autocorrect-2.2.2.tar.gz (621kB)
[K     |▌                               | 10kB 9.9MB/s eta 0:00:01[K     |█                               | 20kB 2.8MB/s eta 0:00:01[K     |█▋                              | 30kB 3.4MB/s eta 0:00:01[K     |██                              | 40kB 3.6MB/s eta 0:00:01[K     |██▋                             | 51kB 3.2MB/s eta 0:00:01[K     |███▏                            | 61kB 3.6MB/s eta 0:00:01[K     |███▊                            | 71kB 3.8MB/s eta 0:00:01[K     |████▏                           | 81kB 4.2MB/s eta 0:00:01[K     |████▊                           | 92kB 4.6MB/s eta 0:00:01[K     |█████▎                          | 102kB 4.3MB/s eta 0:00:01[K     |█████▉                          | 112kB 4.3MB/s eta 0:00:01[K     |██████▎                         | 122kB 4.3MB/s eta 0:00:

In [None]:
import string

import gensim.downloader as api
from scipy.spatial.distance import cosine

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from autocorrect import Speller

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
spell = Speller('en')
preprocessor = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
wv = api.load('word2vec-google-news-300')

gb2us = {
    'neighbour': 'neighbor',
    'favourite': 'favorite',
    'theatre' : 'theater',
    'colour': 'color',
    'occupate': 'occupy',
    'moustache': 'mustache',
    'pyjamas': 'pajamas',
    'travelling': 'traveling',
    'cosy': 'cozy',
    'neighbourhood': 'neighborhood',
    'apologised':'apologized',
}

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
def process_sentence(sentence):
    tokens = word_tokenize(sentence.lower().translate(preprocessor))
    filtered = [token for token in tokens if not token in stop_words]
    americized = [gb2us[token] if token in gb2us.keys() else token for token in filtered]
    vectors = np.array([wv[token] for token in americized if token in wv.vocab])
    if vectors.size > 0:
        return vectors.mean(axis=0)
    
    vectors = np.array([wv[token] for token in tokens if token in wv.vocab])
    if vectors.size > 0:
        return vectors.mean(axis=0)
    
    tokens = [spell(token) if token not in wv.vocab and spell(token) in wv.vocab else token for token in tokens]
    vectors = np.array([wv[token] for token in tokens if token in wv.vocab])
    if vectors.size > 0:
        return vectors.mean(axis=0)
    return np.nan
    

def get_predictions(row):
    sentence_vector = process_sentence(row.sentence)
    solutions = np.array([cosine(process_sentence(sol), sentence_vector) for sol in row.solutions_agg])
    return row.solutions_agg[solutions.argmin()]

In [None]:
df = merged.copy(deep=True)
df['pred'] = df.progress_apply(get_predictions, axis=1)
df.to_csv(IV127 / 'word2vec.csv', index=False)

HBox(children=(FloatProgress(value=0.0, max=2436.0), HTML(value='')))




In [None]:
(df['pred'] == df.solutions).sum() / len(df) * 100

61.16584564860427

### Bijection

In [None]:
def get_bijection(row):
    sentences_vectors = [process_sentence(s) for s in row.sentence]
    solutions_vectors = [process_sentence(s) for s in row.solutions]

    assert len(solutions_vectors) == len(sentences_vectors)
    num = len(solutions_vectors)
    matrix = np.zeros((num, num))
    for row_ in range(num):
        for col in range(num):
            matrix[row_][col] = cosine(sentences_vectors[row_], solutions_vectors[col])
    
    rows = list(range(num))
    minimum_cols = None
    minimum_value = float('inf')
    for perm in list(permutations(range(num))):
        current_value = sum([matrix[row_, col] for row_, col in zip(rows, perm)])
        if minimum_value >= current_value:
            minimum_value = current_value
            minimum_cols = perm

    predictions = [row.solutions[col] for col in minimum_cols]
    return predictions

In [None]:
df = grouped.copy(deep=True)
df['pred'] = df.progress_apply(get_bijection, axis=1)
exploded = df.progress_apply(pd.Series.explode)
exploded.to_csv(IV127 / 'word2vec_bijection.csv', index=False)
(exploded['pred'] == exploded.solutions).sum() / len(exploded) * 100

HBox(children=(FloatProgress(value=0.0, max=653.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




71.26436781609196

In [None]:
df.apply(lambda row: all([pred == sol for pred, sol in zip(row.pred, row.solutions)]),axis=1).sum() / len(df) * 100

56.04900459418071

In [None]:
del wv

## Install Transformers package

In [12]:
!pip install transformers
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

device='cuda'

def load_model(model_name):
    model = AutoModelForMaskedLM.from_pretrained(model_name).to(device)
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return tokenizer, model

def score(sentence, word):
    num_masks = len(tokenizer.tokenize(word))
    labels = sentence.replace('_', word)
    mask = sentence.replace('_', ' [MASK] ' * num_masks)

    tensor_labels = torch.tensor(tokenizer.encode(labels)).unsqueeze(0).to(device)
    tensor_mask = torch.tensor(tokenizer.encode(mask)).unsqueeze(0).to(device)
    with torch.no_grad():
        loss = model(tensor_mask, labels = tensor_labels)[0]
    return np.exp(loss.item())

def get_prediction(row):
    return row.solutions_agg[np.array([score(row.sentence, sol) for sol in row.solutions_agg]).argmin()]

from itertools import permutations

def get_bijection(row):
    assert len(row.sentence) == len(row.solutions)
    num = len(row.solutions)
    matrix = np.zeros((num, num))
    for row_ in range(num):
        for col in range(num):
            matrix[row_][col] = score(row.sentence[row_], row.solutions[col])    
    rows = list(range(num))
    minimum_cols = None
    minimum_value = float('inf')
    for perm in list(permutations(range(num))):
        current_value = sum([matrix[row_, col] for row_, col in zip(rows, perm)])
        if minimum_value >= current_value:
            minimum_value = current_value
            minimum_cols = perm

    predictions = [row.solutions[col] for col in minimum_cols]
    return predictions

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/2c/4e/4f1ede0fd7a36278844a277f8d53c21f88f37f3754abf76a5d6224f76d4a/transformers-3.4.0-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 12.6MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 51.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 49.6MB/s 
Collecting tokenizers==0.9.2
[?25l  Downloading https://files.pythonhosted.org/packages/7c/a5/78be1a55b2ac8d6a956f0a211d372726e2b1dd2666bb537fea9b03abd62c/tokenizers-0.9.2-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K    

## DistilBert

In [None]:
model_name = 'distilbert-base-uncased'
tokenizer, model = load_model(model_name)
df = merged.copy(deep=True)
df['pred'] = df.progress_apply(get_prediction, axis=1)
df.to_csv(IV127 / f'{model_name}.csv', index=False)
(df['pred'] == df.solutions).sum() / len(df) * 100

HBox(children=(FloatProgress(value=0.0, max=2436.0), HTML(value='')))




79.55665024630541

#### Bijection

In [None]:
df = grouped.copy(deep=True)
df['pred'] = df.progress_apply(get_bijection, axis=1)
exploded = df.progress_apply(pd.Series.explode)
exploded.to_csv(IV127 / f'{model_name}_bijection.csv', index=False)
(exploded['pred'] == exploded.solutions).sum() / len(exploded) * 100

HBox(children=(FloatProgress(value=0.0, max=653.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




85.71428571428571

In [None]:
df.apply(lambda row: all([pred == sol for pred, sol in zip(row.pred, row.solutions)]),axis=1).sum() / len(df) * 100

76.5696784073507

## base Bert

In [None]:
model_name = 'bert-base-uncased'
tokenizer, model = load_model(model_name)
df = merged.copy(deep=True)
df['pred'] = df.progress_apply(get_prediction, axis=1)
df.to_csv(IV127 / f'{model_name}.csv', index=False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=2436.0), HTML(value='')))




In [None]:
(df['pred'] == df.solutions).sum() / len(df) * 100

78.7766830870279

#### Bijection

In [None]:
df = grouped.copy(deep=True)
df['pred'] = df.progress_apply(get_bijection, axis=1)
exploded = df.progress_apply(pd.Series.explode)
exploded.to_csv(IV127 / f'{model_name}_bijection.csv', index=False)
(exploded['pred'] == exploded.solutions).sum() / len(exploded) * 100

HBox(children=(FloatProgress(value=0.0, max=653.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




84.97536945812809

In [None]:
df.apply(lambda row: all([pred == sol for pred, sol in zip(row.pred, row.solutions)]),axis=1).sum() / len(df) * 100

76.41653905053599

## large Bert

In [None]:
model_name = 'bert-large-uncased'
tokenizer, model = load_model(model_name)
df = merged.copy(deep=True)
df['pred'] = df.progress_apply(get_prediction, axis=1)
df.to_csv(IV127 / f'{model_name}.csv', index=False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=434.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1344997306.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, max=2436.0), HTML(value='')))




In [None]:
(df['pred'] == df.solutions).sum() / len(df) * 100

81.32183908045977

#### Bijection

In [None]:
df = grouped.copy(deep=True)
df['pred'] = df.progress_apply(get_bijection, axis=1)
exploded = df.progress_apply(pd.Series.explode)
exploded.to_csv(IV127 / f'{model_name}_bijection.csv', index=False)
(exploded['pred'] == exploded.solutions).sum() / len(exploded) * 100

HBox(children=(FloatProgress(value=0.0, max=653.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




85.71428571428571

In [None]:
df.apply(lambda row: all([pred == sol for pred, sol in zip(row.pred, row.solutions)]),axis=1).sum() / len(df) * 100

77.6416539050536

## Bart

In [None]:
model_name = 'facebook/bart-large'
tokenizer, model = load_model(model_name)
df = merged.copy(deep=True)
df['pred'] = df.progress_apply(get_prediction, axis=1)
df.to_csv(IV127 / f'{model_name.split("/")[-1]}.csv', index=False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1525.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1018571383.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, max=2436.0), HTML(value='')))




In [None]:
(df['pred'] == df.solutions).sum() / len(df) * 100

73.23481116584564

#### Bijection

In [None]:
df = grouped.copy(deep=True)
df['pred'] = df.progress_apply(get_bijection, axis=1)
exploded = df.progress_apply(pd.Series.explode)
exploded.to_csv(IV127 / f'{model_name.split("/")[-1]}_bijection.csv', index=False)
(exploded['pred'] == exploded.solutions).sum() / len(exploded) * 100

HBox(children=(FloatProgress(value=0.0, max=653.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




87.5615763546798

In [None]:
df.apply(lambda row: all([pred == sol for pred, sol in zip(row.pred, row.solutions)]),axis=1).sum() / len(df) * 100

79.93874425727412

## Roberta tokenization initialize

In [13]:
def score(sentence, word):
    sentence = " ".join(sentence.split())
    w = len(tokenizer.encode(sentence.replace('_', word)))
    wout = len(tokenizer.encode(sentence))
    num_masks = w - wout + 1
    labels = sentence.replace('_', word)
    mask = sentence.replace('_', '<mask>' * num_masks)
    
    tensor_labels = torch.tensor(tokenizer.encode(labels)).unsqueeze(0).to(device)
    tensor_mask = torch.tensor(tokenizer.encode(mask)).unsqueeze(0).to(device)
    with torch.no_grad():
        loss = model(tensor_mask, labels = tensor_labels)[0]
    return np.exp(loss.item())

def get_prediction(row):
    return row.solutions_agg[np.array([score(row.sentence, sol) for sol in row.solutions_agg]).argmin()]

torch.cuda.empty_cache()
model = None
tokenizer = None
!nvidia-smi

Sun Nov  1 15:08:07 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.32.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8     7W /  75W |     10MiB /  7611MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Roberta

In [None]:
model_name = 'roberta-base'
tokenizer, model = load_model(model_name)
df = merged.copy(deep=True)
df['pred'] = df.progress_apply(get_prediction, axis=1)
df.to_csv(IV127 / f'{model_name}.csv', index=False)

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HBox(children=(FloatProgress(value=0.0, max=2436.0), HTML(value='')))




In [None]:
(df['pred'] == df.solutions).sum() / len(df) * 100

82.63546798029556

#### Bijection

In [None]:
df = grouped.copy(deep=True)
df['pred'] = df.progress_apply(get_bijection, axis=1)
exploded = df.progress_apply(pd.Series.explode)
exploded.to_csv(IV127 / f'{model_name}_bijection.csv', index=False)
(exploded['pred'] == exploded.solutions).sum() / len(exploded) * 100

HBox(children=(FloatProgress(value=0.0, max=653.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




92.03612479474549

In [None]:
df.apply(lambda row: all([pred == sol for pred, sol in zip(row.pred, row.solutions)]),axis=1).sum() / len(df) * 100

86.98315467075038

## Roberta Large

In [None]:
model_name = 'roberta-large'
tokenizer, model = load_model(model_name)

In [None]:
df = merged.copy(deep=True)
df['pred'] = df.progress_apply(get_prediction, axis=1)
df.to_csv(IV127 / f'{model_name}.csv', index=False)

HBox(children=(FloatProgress(value=0.0, max=2436.0), HTML(value='')))




In [None]:
(df['pred'] == df.solutions).sum() / len(df) * 100

85.09852216748769

#### Bijection

In [None]:
df = grouped.copy(deep=True)
df['pred'] = df.progress_apply(get_bijection, axis=1)
exploded = df.progress_apply(pd.Series.explode)
exploded.to_csv(IV127 / f'{model_name}_bijection.csv', index=False)
(exploded['pred'] == exploded.solutions).sum() / len(exploded) * 100

HBox(children=(FloatProgress(value=0.0, max=653.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




94.33497536945814

In [None]:
df.apply(lambda row: all([pred == sol for pred, sol in zip(row.pred, row.solutions)]),axis=1).sum() / len(df) * 100

90.50535987748852

## Longformer

In [15]:
torch.cuda.empty_cache()
model = None
tokenizer = None
!nvidia-smi

Sun Nov  1 15:08:55 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.32.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8     7W /  75W |     10MiB /  7611MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [16]:
model_name = 'allenai/longformer-large-4096'
tokenizer, model = load_model(model_name)
df = merged.copy(deep=True)
df['pred'] = df.progress_apply(get_prediction, axis=1)
df.to_csv(IV127 / f'{model_name.split("/")[-1]}.csv', index=False)
(df['pred'] == df.solutions).sum() / len(df) * 100

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=803.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1742910431.0, style=ProgressStyle(descr…




Some weights of LongformerForMaskedLM were not initialized from the model checkpoint at allenai/longformer-large-4096 and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, max=2436.0), HTML(value='')))




83.00492610837439

#### Bijection

In [17]:
df = grouped.copy(deep=True)
df['pred'] = df.progress_apply(get_bijection, axis=1)
exploded = df.progress_apply(pd.Series.explode)
exploded.to_csv(IV127 / f'{model_name.split("/")[-1]}-bijection.csv', index=False)
(exploded['pred'] == exploded.solutions).sum() / len(exploded) * 100

HBox(children=(FloatProgress(value=0.0, max=653.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




92.32348111658456

In [18]:
df.apply(lambda row: all([pred == sol for pred, sol in zip(row.pred, row.solutions)]),axis=1).sum() / len(df) * 100

87.4425727411945

## GPT-2

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

device = 'cuda'
model_id = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
tokenizer = GPT2Tokenizer.from_pretrained(model_id)

def score(sentence, word):
    sentence = " ".join(sentence.split())
    input_ids = tokenizer(sentence.replace("_", word), return_tensors='pt')['input_ids'].to(device)

    with torch.no_grad():
        index = [w for w in sentence.split()].index("_")
        target_ids = input_ids.clone()
        # target_ids[:, :index] = -100
        # target_ids[:, index + 1:] = -100
        loss = model(input_ids, labels=target_ids)[0]
    return np.exp(loss.item())

def get_prediction(row):
    return row.solutions_agg[np.array([score(row.sentence, sol) for sol in row.solutions_agg]).argmin()]

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=548118077.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [None]:
df = merged.copy(deep=True)
df['pred'] = df.progress_apply(get_prediction, axis=1)
df.to_csv(IV127 / f'gpt2.csv', index=False)

HBox(children=(FloatProgress(value=0.0, max=2436.0), HTML(value='')))




In [None]:
(df['pred'] == df.solutions).sum() / len(df) * 100

80.95238095238095

### Bijection

In [None]:
df = grouped.copy(deep=True)
df['pred'] = df.progress_apply(get_bijection, axis=1)
exploded = df.progress_apply(pd.Series.explode)
exploded.to_csv(IV127 / f'gpt2-bijection.csv', index=False)
(exploded['pred'] == exploded.solutions).sum() / len(exploded) * 100

NameError: ignored

In [None]:
df.apply(lambda row: all([pred == sol for pred, sol in zip(row.pred, row.solutions)]),axis=1).sum() / len(df) * 100