In [None]:
!pip install spacy bert-score tqdm
!python -m spacy download en_core_web_sm

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never store an access token in the notebook itself: notebooks are shared and
# versioned together with their source. The token is taken from the HF_TOKEN
# environment variable when it is set, otherwise it is requested interactively
# (getpass does not echo the value into the cell output).
# NOTE(review): a literal token used to be committed on this line; treat it as
# leaked and revoke it in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

In [None]:
import spacy
from bert_score import score as bert_score
import numpy as np
import math
import json
from statistics import mean
from tqdm.auto import tqdm


nlp = spacy.load("en_core_web_sm")


def build_entity_grid(doc):
    """
    Build an entity grid for a parsed document.

    Rows correspond to the sentences of ``doc.sents`` and columns to the
    distinct entity strings of ``doc.ents`` (in order of first appearance).
    Each cell holds one of three markers:

    * ``'-'`` - the entity text is not a substring of the sentence text;
    * ``'S'`` - the entity is present and the text of the sentence's first
      token is a substring of the entity text (a crude stand-in for
      "the entity is the subject");
    * ``'O'`` - the entity is present in any other position.

    Returns a list of rows, each row being a list of markers.
    """
    # dict.fromkeys keeps insertion order, so duplicates are dropped while the
    # first-appearance order of the entity strings is preserved.
    columns = list(dict.fromkeys(ent.text for ent in doc.ents))

    def role(entity, sentence, opener):
        # Presence is a plain substring test on the sentence text.
        if entity not in sentence.text:
            return '-'
        return 'S' if opener in entity else 'O'

    grid = []
    for sentence in doc.sents:
        opener = sentence[0].text
        grid.append([role(entity, sentence, opener) for entity in columns])
    return grid


def cohere_score(grid):
    """
    Average log-probability of entity transitions between adjacent sentences.

    ``grid`` is a list of rows (one per sentence) whose cells are ``'S'``,
    ``'O'`` or ``'-'``. A transition is counted for every column in which the
    entity is present (cell != ``'-'``) in two consecutive rows; each counted
    transition contributes ``log(1 + 1e-12)``.

    Returns ``0.0`` when no transition is counted (empty grid, a single
    sentence, or no entity carried over between sentences). A ``-inf``
    sentinel here would turn every downstream average into ``-inf`` as soon
    as one story has no recurring entity.

    NOTE(review): every counted transition is assigned probability ~1, so the
    result is ~0 for any input; the metric does not yet discriminate between
    texts and the transition probabilities probably need to be estimated.
    """
    transitions = []
    for i in range(len(grid) - 1):
        for j in range(len(grid[0])):
            r1, r2 = grid[i][j], grid[i + 1][j]
            if r1 != '-' and r2 != '-':
                transitions.append(math.log(1.0 + 1e-12))
    if not transitions:
        # Neutral value instead of -inf so that dataset-level means stay finite.
        return 0.0
    return sum(transitions) / len(transitions)


def compute_cohere_score(context, generated):
    """
    Score the coherence of ``generated`` when appended to ``context``.

    Both strings are joined with a single space, parsed with the module-level
    spaCy pipeline ``nlp``, turned into an entity grid and scored with
    ``cohere_score``. The grid score is divided by the number of sentences
    spaCy found in the joined text.
    """
    doc = nlp(" ".join((context, generated)))
    raw_score = cohere_score(build_entity_grid(doc))
    return raw_score / len(list(doc.sents))


def evaluate_dataset(path, desc="Dataset"):
    """
    Compute the mean CohereScore and mean BERTScore F1 for a results file.

    Parameters
    ----------
    path : str
        JSON file whose top-level ``"rocstories"`` key holds a list of items
        with ``"TRG"`` (reference text) and ``"GEN"`` (generated text) fields;
        a missing field is treated as an empty string.
    desc : str
        Label shown in the progress bar.

    Returns
    -------
    tuple[float, float]
        ``(mean CohereScore, mean BERTScore F1)``.

    Raises
    ------
    statistics.StatisticsError
        If the file contains no items.

    Notes
    -----
    The coherence context is built from the first four '.'-separated,
    non-empty fragments of the reference text. BERTScore is computed with a
    single call over all (generated, reference) pairs: calling it once per
    item pays the scorer's set-up cost on every iteration and cannot batch
    the pairs.
    """
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)['rocstories']

    cs_list, candidates, references = [], [], []
    for item in tqdm(data, desc=f"Evaluating {desc}", unit="item"):
        trg = item.get('TRG', '')
        gen = item.get('GEN', '')
        sentences = [s.strip() for s in trg.split('.') if s.strip()]
        context = '. '.join(sentences[:4]) + '.'
        # CohereScore
        cs_list.append(compute_cohere_score(context, gen))
        candidates.append(gen)
        references.append(trg)

    # Raises StatisticsError on an empty dataset before any model is loaded.
    cs_mean = mean(cs_list)

    # BERTScore F1, one batched call for the whole dataset.
    _, _, F1 = bert_score(
        candidates, references,
        lang='en',
        model_type='microsoft/deberta-v3-small',
        rescale_with_baseline=False,
        device='cpu'
    )
    return cs_mean, mean(F1.tolist())

# Result files of the two training runs compared below.
# NOTE(review): absolute /content paths only exist inside a Colab session
# where these files were uploaded; consider a configurable data directory.
full_path = '/content/150000-N=50-len=1.json'
light_path = '/content/10000-N=25-len=1.json'

# Each call returns (mean CohereScore, mean BERTScore F1) for one file.
full_cs, full_bert = evaluate_dataset(full_path, desc="Full (150k)")
light_cs, light_bert = evaluate_dataset(light_path, desc="Light (10k)")

# Report both runs with four decimal places.
print(f"\nМетрики для полного обучения (150k итераций):\n  CohereScore = {full_cs:.4f}, BERTScore F1 = {full_bert:.4f}")
print(f"\nМетрики для облегченного обучения (10k итераций):\n  CohereScore = {light_cs:.4f}, BERTScore F1 = {light_bert:.4f}")

Evaluating Full (150k):   0%|          | 0/5000 [00:00<?, ?item/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/286M [00:00<?, ?B/s]

Evaluating Light (10k):   0%|          | 0/1000 [00:00<?, ?item/s]


Метрики для полного обучения (150k итераций):
  CohereScore = -inf, BERTScore F1 = 0.7209

Метрики для облегченного обучения (10k итераций):
  CohereScore = -inf, BERTScore F1 = 0.6996


In [None]:
import spacy
from bert_score import score as bert_score
import numpy as np
import math
import json
from statistics import mean
from tqdm.auto import tqdm

nlp = spacy.load("en_core_web_sm")


def build_entity_grid(doc):
    """
    Build an entity grid for a parsed document.

    Rows correspond to the sentences of ``doc.sents`` and columns to the
    distinct entity strings of ``doc.ents`` (in order of first appearance).
    Each cell holds one of three markers:

    * ``'-'`` - the entity text is not a substring of the sentence text;
    * ``'S'`` - the entity is present and the text of the sentence's first
      token is a substring of the entity text (a crude stand-in for
      "the entity is the subject");
    * ``'O'`` - the entity is present in any other position.

    Returns a list of rows, each row being a list of markers.
    """
    # Collect unique entity strings, keeping their first-appearance order.
    columns = list(dict.fromkeys(ent.text for ent in doc.ents))

    def role(entity, sentence, opener):
        # Presence is a plain substring test on the sentence text.
        if entity not in sentence.text:
            return '-'
        return 'S' if opener in entity else 'O'

    grid = []
    for sentence in doc.sents:
        opener = sentence[0].text
        grid.append([role(entity, sentence, opener) for entity in columns])
    return grid


def cohere_score(grid, allow_partial=False, partial_penalty=0.1):
    """
    Average log-likelihood of entity-role transitions between adjacent rows.

    ``grid`` is a list of rows (one per sentence) whose cells are ``'S'``,
    ``'O'`` or ``'-'``. For every column and every pair of consecutive rows:

    * the entity is present in both rows -> contributes ``log(1 + 1e-12)``;
    * ``allow_partial`` is true and the entity is present in exactly one of
      the two rows -> contributes ``log(partial_penalty)``;
    * otherwise the pair is ignored.

    ``partial_penalty`` is the probability assigned to a partial match.
    Returns the mean contribution, or ``0.0`` when nothing was counted.
    """
    log_probs = []
    for i in range(1, len(grid)):
        above, below = grid[i - 1], grid[i]
        for j in range(len(grid[0])):
            in_prev = above[j] != '-'
            in_next = below[j] != '-'
            if in_prev and in_next:
                log_probs.append(math.log(1.0 + 1e-12))
            elif allow_partial and (in_prev or in_next):
                log_probs.append(math.log(partial_penalty))
    return sum(log_probs) / len(log_probs) if log_probs else 0.0

def compute_cohere_score(context, generated, allow_partial=False, verbose=False):
    """
    Score the coherence of ``generated`` when appended to ``context``.

    Both strings are joined with a single space, parsed with the module-level
    spaCy pipeline ``nlp``, turned into an entity grid and scored with
    ``cohere_score``; the grid score is divided by the number of sentences
    found in the joined text.

    Parameters
    ----------
    context, generated : str
        Story prefix and the continuation to evaluate.
    allow_partial : bool
        Forwarded to ``cohere_score``.
    verbose : bool
        When true, print the entities spaCy detected. Off by default: the
        unconditional debug print produced one output line per evaluated item
        and flooded the cell output when the function was used in a loop.
    """
    text = context + " " + generated
    doc = nlp(text)
    if verbose:
        print(f"Entities: {[ent.text for ent in doc.ents]}")
    grid = build_entity_grid(doc)
    score = cohere_score(grid, allow_partial=allow_partial)
    return score / len(list(doc.sents))

# Smoke test of the partial-transition scoring on placeholder text.
context = "Story sentence 1. Story sentence 2. Story sentence 3. Story sentence 4."
generated = "Proposed ending sentence."
# allow_partial=True also counts entities present in only one of two adjacent sentences.
cs = compute_cohere_score(context, generated, allow_partial=True)
print("CohereScore (partial):", cs)

Eval Full(150k):   0%|          | 0/5000 [00:00<?, ?it/s]

Entities: ['Jill', '20 pounds', 'three months', 'James', 'James', 'James']
Entities: ['Jim', 'three', 'Indiana', 'A year later', 'Joey', 'Cole', '1993']
Entities: ['first']
Entities: ['Jason', 'Jason', 'Jason']
Entities: ['Mary', 'Rita', 'Gina', 'Bill', '20 years', 'Bill']
Entities: ['Betsy', 'One', 'ten', 'one hour', 'months', 'Tim', 'Sunday', 'Serbian', 'Tim']
Entities: ['Judy', 'Jason', 'many months', 'Jason', 'Jason']
Entities: ['Bobby', 'F', 'Bobby', 'Bobby', 'Bobby', 'second', 'John', 'One', 'John']
Entities: ['first', 'Tina', 'Tina', 'Tina']
Entities: []
Entities: ['David', 'David', 'the next two lengthy weeks', 'A week later', 'Fred']
Entities: ['One day', 'one day']
Entities: ['last summer', 'Jill', 'Jill', 'Jill', 'Jill']
Entities: ['Barry', 'Barry', 'Anna', 'hours']
Entities: ['this week', 'Jake', 'Hanna', 'Hanna']
Entities: ['John', 'greenery', 'Joe', 'all day', 'Joe', 'that weekend']
Entities: ['Nick', 'one day', 'Nick', 'Nick', 'Kyle', 'first', 'One day', 'Kurt', 'Craig',

KeyboardInterrupt: 

In [None]:
#!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113

In [None]:
!git clone https://github.com/facebookresearch/faiss.git
%cd faiss

Cloning into 'faiss'...
remote: Enumerating objects: 65322, done.[K
remote: Counting objects: 100% (32725/32725), done.[K
remote: Compressing objects: 100% (769/769), done.[K
remote: Total 65322 (delta 32415), reused 31956 (delta 31956), pack-reused 32597 (from 2)[K
Receiving objects: 100% (65322/65322), 229.48 MiB | 33.19 MiB/s, done.
Resolving deltas: 100% (59748/59748), done.
/content/faiss


In [None]:
!pip install faiss-gpu-cu11

Collecting faiss-gpu-cu11
  Downloading faiss_gpu_cu11-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting nvidia-cuda-runtime-cu11>=11.8.89 (from faiss-gpu-cu11)
  Downloading nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cublas-cu11>=11.11.3.6 (from faiss-gpu-cu11)
  Downloading nvidia_cublas_cu11-11.11.3.6-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Downloading faiss_gpu_cu11-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (47.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.8/47.8 MB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_cublas_cu11-11.11.3.6-py3-none-manylinux2014_x86_64.whl (417.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m417.9/417.9 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux2014_x86_64.whl (875 kB)
[2K   [9

In [None]:
#!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
#!pip uninstall -y tensorflow && pip install tensorflow-cpu -q

In [None]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=6be9b253662e72dfb8bc96aafc156b0f10f988d4e9acbb7a239b185b62a84f5c
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
!pip install evaluate -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/487.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install bert-score -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m118.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m89.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m61.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch
import pandas as pd
import requests
import gc
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

import json
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import evaluate
from torch.nn import CrossEntropyLoss

import numpy as np
from collections import Counter
from transformers import GPT2LMHeadModel, GPT2Tokenizer

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# tencdm

In [None]:
!git clone https://github.com/M0RJIQUE/tencdm.git
%cd tencdm

Cloning into 'tencdm'...
remote: Enumerating objects: 38, done.[K
remote: Counting objects:   2% (1/38)[Kremote: Counting objects:   5% (2/38)[Kremote: Counting objects:   7% (3/38)[Kremote: Counting objects:  10% (4/38)[Kremote: Counting objects:  13% (5/38)[Kremote: Counting objects:  15% (6/38)[Kremote: Counting objects:  18% (7/38)[Kremote: Counting objects:  21% (8/38)[Kremote: Counting objects:  23% (9/38)[Kremote: Counting objects:  26% (10/38)[Kremote: Counting objects:  28% (11/38)[Kremote: Counting objects:  31% (12/38)[Kremote: Counting objects:  34% (13/38)[Kremote: Counting objects:  36% (14/38)[Kremote: Counting objects:  39% (15/38)[Kremote: Counting objects:  42% (16/38)[Kremote: Counting objects:  44% (17/38)[Kremote: Counting objects:  47% (18/38)[Kremote: Counting objects:  50% (19/38)[Kremote: Counting objects:  52% (20/38)[Kremote: Counting objects:  55% (21/38)[Kremote: Counting objects:  57% (22/38)[Kremote: Counting o

In [None]:
!pip install -r requirements.txt

Collecting config==0.5.1 (from -r requirements.txt (line 6))
  Downloading config-0.5.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting datasets==2.16.1 (from -r requirements.txt (line 7))
  Downloading datasets-2.16.1-py3-none-any.whl.metadata (20 kB)
Collecting evaluate==0.4.0 (from -r requirements.txt (line 8))
  Downloading evaluate-0.4.0-py3-none-any.whl.metadata (9.4 kB)
Collecting huggingface_hub==0.21.4 (from -r requirements.txt (line 9))
  Downloading huggingface_hub-0.21.4-py3-none-any.whl.metadata (13 kB)
Collecting ml_collections==0.1.1 (from -r requirements.txt (line 10))
  Downloading ml_collections-0.1.1.tar.gz (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nltk==3.8.1 (from -r requirements.txt (line 11))
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting numpy==1.24.3 (from -r requirements.txt (line 

In [None]:
!pip install faiss-gpu

[31mERROR: Could not find a version that satisfies the requirement faiss-gpu (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for faiss-gpu[0m[31m
[0m

In [None]:
!pip install ml_collections -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install datasets -q
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets

In [None]:
!python -m data.load --dataset_name='rocstories'

roc_stories.py: 100% 6.51k/6.51k [00:00<00:00, 33.8MB/s]
dataset_infos.json: 100% 1.33k/1.33k [00:00<00:00, 9.34MB/s]
The repository for wza/roc_stories contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/wza/roc_stories.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y
ROCStories__spring2016.csv: 100% 13.1M/13.1M [00:00<00:00, 48.3MB/s]
ROCStories_winter2017.csv: 100% 14.5M/14.5M [00:00<00:00, 78.5MB/s]
Generating train split: 98161 examples [00:09, 10030.80 examples/s]
Loading... (num_proc=30): 100% 98161/98161 [00:00<00:00, 262269.74 examples/s]
Saving the dataset (1/1 shards): 100% 88161/88161 [00:00<00:00, 311551.85 examples/s]
Saving the dataset (1/1 shards): 100% 10000/10000 [00:00<00:00, 268903.56 examples/s]


In [None]:
!python -m data.make_statistics --dataset_name='rocstories' --encoder_name='roberta-base'

2025-03-16 10:14:02.337973: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742120042.360251    1737 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742120042.366911    1737 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
config.json: 100% 570/570 [00:00<00:00, 4.28MB/s]
config.json: 100% 570/570 [00:00<00:00, 4.48MB/s]
tokenizer_config.json: 100% 49.0/49.0 [00:00<00:00, 337kB/s]
vocab.txt: 100% 213k/213k [00:00<00:00, 6.47MB/s]
tokenizer.json: 100% 436k/436k [00:00<00:00, 4.75MB/s]
model.safetensors: 100% 436M/436M [00:03<00:00, 133MB/s]
Dataset preprocessing (num_proc=30): 100% 88161/88161 [00:05<00:00, 16766.84 examples/s]
mean: ['0.098', '0.096', 

In [None]:
!python -m model.train_decoder --dataset_name='rocstories' --encoder_name='roberta-base'

2025-03-16 10:15:44.725824: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742120144.747152    2411 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742120144.753705    2411 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice: 2
[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy 

In [None]:
!torchrun --nproc_per_node=1 train_diffusion.py --dataset_name='rocstories' --encoder_name='roberta-base'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Dataset preprocessing (num_proc=30):  73% 64657/88161 [00:06<00:02, 10924.61 examples/s][A
Dataset preprocessing (num_proc=30):  77% 67595/88161 [00:06<00:01, 10889.52 examples/s][A
Dataset preprocessing (num_proc=30):  80% 70533/88161 [00:06<00:01, 10890.62 examples/s][A
Dataset preprocessing (num_proc=30):  83% 73471/88161 [00:06<00:01, 10953.89 examples/s][A
Dataset preprocessing (num_proc=30):  87% 76409/88161 [00:07<00:01, 10907.09 examples/s][A
Dataset preprocessing (num_proc=30):  90% 79347/88161 [00:07<00:00, 10972.57 examples/s][A
Dataset preprocessing (num_proc=30):  93% 82285/88161 [00:07<00:00, 10956.17 examples/s][A
Dataset preprocessing (num_proc=30):  97% 85223/88161 [00:08<00:00, 10968.34 examples/s][A
Dataset preprocessing (num_proc=30): 100% 88161/88161 [00:08<00:00, 10416.86 examples/s]
Dataset length: 88161
loss_x_0: 0.4475, grad_norm: 0.1323, :  74% 148433/200000 [17:13:43<4:23:53,  3.26it/s]


In [None]:
#import torch
#from transformers import T5ForConditionalGeneration, T5Tokenizer

#model_path = "/content/tencdm/datasets/rocstories/decoder-t5-base-128-transformer.pth"
#tokenizer_name = "t5-base"

#tokenizer = T5Tokenizer.from_pretrained(tokenizer_name)

#model = T5ForConditionalGeneration.from_pretrained(tokenizer_name)

#state_dict = torch.load(model_path, map_location="cpu")

#new_state_dict = {}
#for key, value in state_dict.items():
#    if key.startswith("module."):
#        key = key[len("module."):]
#    new_state_dict[key] = value

#model.load_state_dict(new_state_dict, strict=False)

#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model.to(device)
#model.eval()

#print("Урааааа")

In [None]:
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
#model = AutoModel.from_pretrained('/content/tencdm/datasets/rocstories/decoder-t5-base-128-transformer.pth').to(device)

In [None]:
# Path to a decoder checkpoint (presumably produced by the tencdm training run above).
# NOTE(review): this binds `model` to a plain string, while evaluate_model()
# calls `model(input_ids=...)`. A loaded model object (and a `device`) must be
# assigned before the evaluation cell runs; confirm which cell is meant to do that.
model = '/content/tencdm/datasets/rocstories/decoder-t5-base-128-transformer.pth'

In [None]:
def download_csv(url, filename, timeout=30):
    """
    Download ``url`` and store the response body in ``filename``.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET request.
    filename : str
        Destination path; written in binary mode only when the server
        answers with status 200.
    timeout : float
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` may block indefinitely on an unresponsive host.

    A non-200 status is reported with a printed message and no file is
    written. Network errors (including timeouts) propagate to the caller.
    The body is saved as-is; its content type is not checked.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        with open(filename, "wb") as f:
            f.write(response.content)
        print(f"Файл сохранен: {filename}")
    else:
        print(f"Ошибка загрузки {url}: {response.status_code}")

In [None]:
def calculate_perplexity(logits, inputs):
    """
    Per-sequence perplexity of ``inputs["input_ids"]`` under ``logits``.

    Parameters
    ----------
    logits : torch.Tensor
        Model outputs indexed as (batch, position, vocabulary).
    inputs : mapping
        Must contain ``"input_ids"`` (batch, position). When it also contains
        ``"attention_mask"``, positions whose mask is 0 are excluded from the
        average.

    Returns
    -------
    list[float]
        One perplexity value per sequence in the batch.

    The logits at position ``t`` are scored against the token at ``t + 1``.
    Padding has to be masked out: with ``padding=True`` shorter sequences are
    filled with pad tokens, and averaging the loss over those positions makes
    the perplexity depend on how much padding a sequence received rather than
    on its content.
    """
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = inputs["input_ids"][..., 1:].contiguous()
    loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
    token_loss = loss_fct(
        shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
    ).view(shift_labels.size())

    if "attention_mask" in inputs:
        # The mask is shifted the same way as the labels it belongs to.
        keep = inputs["attention_mask"][..., 1:].to(token_loss.dtype)
        # clamp avoids 0/0 for a sequence with no scored position.
        seq_loss = (token_loss * keep).sum(dim=1) / keep.sum(dim=1).clamp(min=1)
    else:
        seq_loss = token_loss.mean(dim=1)
    return torch.exp(seq_loss).cpu().tolist()

In [None]:
import pandas as pd
import torch
import gc
from tqdm import tqdm

def evaluate_model(filepath, batch_size=4, is_test=False):
    """
    Evaluate the current model on a two-ending story CSV and print the metrics.

    For every row the context (``InputSentence3`` + ``InputSentence4``) is
    paired with each candidate ending (``RandomFifthSentenceQuiz1`` /
    ``RandomFifthSentenceQuiz2``); the ending with the lower perplexity is the
    prediction.

    Parameters
    ----------
    filepath : str
        CSV file to read with ``pd.read_csv``.
    batch_size : int
        Number of rows scored per forward pass.
    is_test : bool
        When true, labels are not read and no metric is accumulated; the
        forward passes still run and only the row count is reported.

    Raises
    ------
    ValueError
        If ``is_test`` is false and the ``AnswerRightEnding`` column is
        missing. The check runs before any forward pass.

    Printed metrics: accuracy (ties predict ending 2), mean perplexity of the
    labelled-correct ending, and rank accuracy (strictly lower perplexity for
    the correct ending). An empty file prints a notice instead of dividing by
    zero.

    NOTE(review): relies on module-level ``tokenizer``, ``model``, ``device``
    and ``calculate_perplexity`` being defined by earlier cells.
    """
    df = pd.read_csv(filepath)
    total = len(df)

    # Fail fast when labels are required but absent.
    if not is_test and "AnswerRightEnding" not in df.columns:
        raise ValueError("Поле AnswerRightEnding отсутствует в DataFrame")

    correct = 0
    total_perplexity = 0
    rank_correct = 0

    with tqdm(total=total, desc="Оценка модели", unit="примеров") as pbar:
        for start in range(0, total, batch_size):
            batch = df.iloc[start : start + batch_size]

            # Column-wise access instead of three separate iterrows() passes.
            contexts = [
                f"{third} {fourth}"
                for third, fourth in zip(batch["InputSentence3"], batch["InputSentence4"])
            ]
            endings1 = batch["RandomFifthSentenceQuiz1"].tolist()
            endings2 = batch["RandomFifthSentenceQuiz2"].tolist()

            inputs1 = tokenizer(contexts, endings1, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
            inputs2 = tokenizer(contexts, endings2, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

            with torch.no_grad():
                logits1 = model(input_ids=inputs1["input_ids"], attention_mask=inputs1["attention_mask"], decoder_input_ids=inputs1["input_ids"]).logits
                logits2 = model(input_ids=inputs2["input_ids"], attention_mask=inputs2["attention_mask"], decoder_input_ids=inputs2["input_ids"]).logits

            perplexity1 = calculate_perplexity(logits1, inputs1)
            perplexity2 = calculate_perplexity(logits2, inputs2)

            if not is_test:
                answers = batch["AnswerRightEnding"].tolist()
                for ppl1, ppl2, correct_answer in zip(perplexity1, perplexity2, answers):
                    pred = 1 if ppl1 < ppl2 else 2
                    if pred == correct_answer:
                        correct += 1
                    total_perplexity += ppl1 if correct_answer == 1 else ppl2

                    if (correct_answer == 1 and ppl1 < ppl2) or \
                            (correct_answer == 2 and ppl2 < ppl1):
                        rank_correct += 1

            # Release per-batch tensors before the next forward pass.
            del inputs1, inputs2, logits1, logits2
            torch.cuda.empty_cache()
            gc.collect()

            pbar.update(len(batch))

    if is_test:
        print(f"Обработка тестового набора завершена. Всего примеров: {total}")
        return

    if total == 0:
        # Nothing to average; avoid ZeroDivisionError.
        print("Набор данных пуст: метрики не вычислены.")
        return

    accuracy = correct / total * 100
    avg_perplexity = total_perplexity / total
    rank_accuracy = rank_correct / total * 100

    print(f"Точность (Accuracy): {accuracy:.2f}%")
    print(f"Средняя перплексия (Perplexity): {avg_perplexity:.2f}")
    print(f"Rank Accuracy: {rank_accuracy:.2f}%")

In [None]:
VAL_URL = "https://goo.gl/XWjas1"
TEST_URL = "https://goo.gl/BcTtB4"

In [None]:
download_csv(VAL_URL, "val_set.csv")
download_csv(TEST_URL, "test_set.csv")

Файл сохранен: val_set.csv
Файл сохранен: test_set.csv


In [None]:
# Run both evaluations; inside a notebook kernel __name__ is "__main__",
# so this guard only matters if the code is exported to a module.
if __name__ == "__main__":
    print("Оценка на валидационном наборе:")
    evaluate_model("val_set.csv")

    # is_test=True: labels are not read, only the row count is reported.
    print("Оценка на тестовом наборе:")
    evaluate_model("test_set.csv", is_test=True)

Оценка на валидационном наборе:


Оценка модели:   0%|          | 0/1571 [00:00<?, ?примеров/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Оценка модели: 100%|██████████| 1571/1571 [02:38<00:00,  9.93примеров/s]


Точность (Accuracy): 49.40%
Средняя перплексия (Perplexity): 97601.31
Rank Accuracy: 49.40%
Оценка на тестовом наборе:


Оценка модели: 100%|██████████| 1571/1571 [02:37<00:00,  9.96примеров/s]

Обработка тестового набора завершена. Всего примеров: 1571





# ruGPT-3 medium (GPT-2 based)

In [None]:
#from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
#tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruGPT-3.5-13B")
#model = AutoModelForCausalLM.from_pretrained("ai-forever/ruGPT-3.5-13B").to(device)

In [None]:
#from transformers import AutoTokenizer, AutoModelForCausalLM

#tokenizer = AutoTokenizer.from_pretrained("jondurbin/airoboros-gpt-3.5-turbo-100k-7b")
#model = AutoModelForCausalLM.from_pretrained("jondurbin/airoboros-gpt-3.5-turbo-100k-7b")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("ai-forever/rugpt3medium_based_on_gpt2")
model = AutoModelForCausalLM.from_pretrained("ai-forever/rugpt3medium_based_on_gpt2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/574 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/761 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.73G [00:00<?, ?B/s]

In [None]:
model.to(device)

model.safetensors:   0%|          | 0.00/1.73G [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(2048, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=3072, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=1024)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=4096, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=4096)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

In [None]:
def download_csv(url, filename, timeout=30):
    """
    Download ``url`` and store the response body in ``filename``.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET request.
    filename : str
        Destination path; written in binary mode only when the server
        answers with status 200.
    timeout : float
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` may block indefinitely on an unresponsive host.

    A non-200 status is reported with a printed message and no file is
    written. Network errors (including timeouts) propagate to the caller.
    The body is saved as-is; its content type is not checked.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        with open(filename, "wb") as f:
            f.write(response.content)
        print(f"Файл сохранен: {filename}")
    else:
        print(f"Ошибка загрузки {url}: {response.status_code}")

In [None]:
def calculate_perplexity(logits, inputs):
    """
    Per-sequence perplexity of ``inputs["input_ids"]`` under ``logits``.

    Parameters
    ----------
    logits : torch.Tensor
        Model outputs indexed as (batch, position, vocabulary).
    inputs : mapping
        Must contain ``"input_ids"`` (batch, position). When it also contains
        ``"attention_mask"``, positions whose mask is 0 are excluded from the
        average.

    Returns
    -------
    list[float]
        One perplexity value per sequence in the batch.

    The logits at position ``t`` are scored against the token at ``t + 1``.
    Padding has to be masked out: with ``padding=True`` shorter sequences are
    filled with pad tokens, and averaging the loss over those positions makes
    the perplexity depend on how much padding a sequence received rather than
    on its content.
    """
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = inputs["input_ids"][..., 1:].contiguous()
    loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
    token_loss = loss_fct(
        shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
    ).view(shift_labels.size())

    if "attention_mask" in inputs:
        # The mask is shifted the same way as the labels it belongs to.
        keep = inputs["attention_mask"][..., 1:].to(token_loss.dtype)
        # clamp avoids 0/0 for a sequence with no scored position.
        seq_loss = (token_loss * keep).sum(dim=1) / keep.sum(dim=1).clamp(min=1)
    else:
        seq_loss = token_loss.mean(dim=1)
    return torch.exp(seq_loss).cpu().tolist()

In [None]:
def evaluate_model(filepath, batch_size=4, is_test=False):
    """
    Evaluate the current model on a two-ending story CSV and print the metrics.

    For every row the context (``InputSentence3`` + ``InputSentence4``) is
    paired with each candidate ending (``RandomFifthSentenceQuiz1`` /
    ``RandomFifthSentenceQuiz2``); the ending with the lower perplexity is the
    prediction.

    Parameters
    ----------
    filepath : str
        CSV file to read with ``pd.read_csv``.
    batch_size : int
        Number of rows scored per forward pass.
    is_test : bool
        When true, labels are not read and no metric is accumulated; the
        forward passes still run and only the row count is reported.

    Raises
    ------
    ValueError
        If ``is_test`` is false and the ``AnswerRightEnding`` column is
        missing. The check runs before any forward pass.

    Printed metrics: accuracy (ties predict ending 2), mean perplexity of the
    labelled-correct ending, and rank accuracy (strictly lower perplexity for
    the correct ending). An empty file prints a notice instead of dividing by
    zero.

    NOTE(review): relies on module-level ``tokenizer``, ``model``, ``device``
    and ``calculate_perplexity`` being defined by earlier cells.
    NOTE(review): ``decoder_input_ids`` is an encoder-decoder argument; confirm
    that the causal LM loaded in this section accepts (or ignores) it.
    """
    df = pd.read_csv(filepath)
    total = len(df)

    # Fail fast when labels are required but absent.
    if not is_test and "AnswerRightEnding" not in df.columns:
        raise ValueError("Поле AnswerRightEnding отсутствует в DataFrame")

    correct = 0
    total_perplexity = 0
    rank_correct = 0

    with tqdm(total=total, desc="Оценка модели", unit="примеров") as pbar:
        for start in range(0, total, batch_size):
            batch = df.iloc[start : start + batch_size]

            # Column-wise access instead of three separate iterrows() passes.
            contexts = [
                f"{third} {fourth}"
                for third, fourth in zip(batch["InputSentence3"], batch["InputSentence4"])
            ]
            endings1 = batch["RandomFifthSentenceQuiz1"].tolist()
            endings2 = batch["RandomFifthSentenceQuiz2"].tolist()

            inputs1 = tokenizer(contexts, endings1, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
            inputs2 = tokenizer(contexts, endings2, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

            with torch.no_grad():
                logits1 = model(input_ids=inputs1["input_ids"], attention_mask=inputs1["attention_mask"], decoder_input_ids=inputs1["input_ids"]).logits
                logits2 = model(input_ids=inputs2["input_ids"], attention_mask=inputs2["attention_mask"], decoder_input_ids=inputs2["input_ids"]).logits

            perplexity1 = calculate_perplexity(logits1, inputs1)
            perplexity2 = calculate_perplexity(logits2, inputs2)

            if not is_test:
                answers = batch["AnswerRightEnding"].tolist()
                for ppl1, ppl2, correct_answer in zip(perplexity1, perplexity2, answers):
                    pred = 1 if ppl1 < ppl2 else 2
                    if pred == correct_answer:
                        correct += 1
                    total_perplexity += ppl1 if correct_answer == 1 else ppl2

                    if (correct_answer == 1 and ppl1 < ppl2) or \
                            (correct_answer == 2 and ppl2 < ppl1):
                        rank_correct += 1

            # Release per-batch tensors before the next forward pass.
            del inputs1, inputs2, logits1, logits2
            torch.cuda.empty_cache()
            gc.collect()

            pbar.update(len(batch))

    if is_test:
        print(f"Обработка тестового набора завершена. Всего примеров: {total}")
        return

    if total == 0:
        # Nothing to average; avoid ZeroDivisionError.
        print("Набор данных пуст: метрики не вычислены.")
        return

    accuracy = correct / total * 100
    avg_perplexity = total_perplexity / total
    rank_accuracy = rank_correct / total * 100

    print(f"Точность (Accuracy): {accuracy:.2f}%")
    print(f"Средняя перплексия (Perplexity): {avg_perplexity:.2f}")
    print(f"Rank Accuracy: {rank_accuracy:.2f}%")

# Short links to the validation and test CSV files.
# NOTE(review): shortened URLs can be retired by the provider; confirm they
# still resolve and consider pinning the final download locations.
VAL_URL = "https://goo.gl/XWjas1"
TEST_URL = "https://goo.gl/BcTtB4"

# Fetch both files into the current working directory.
download_csv(VAL_URL, "val_set.csv")
download_csv(TEST_URL, "test_set.csv")

# Inside a notebook kernel __name__ is "__main__", so both evaluations run.
if __name__ == "__main__":
    print("Оценка на валидационном наборе:")
    evaluate_model("val_set.csv")

    # is_test=True: labels are not read, only the row count is reported.
    print("Оценка на тестовом наборе:")
    evaluate_model("test_set.csv", is_test=True)

Файл сохранен: val_set.csv
Файл сохранен: test_set.csv
Оценка на валидационном наборе:


Оценка модели: 100%|██████████| 1571/1571 [02:21<00:00, 11.07примеров/s]


Точность (Accuracy): 53.60%
Средняя перплексия (Perplexity): 261025.30
Rank Accuracy: 53.60%
Оценка на тестовом наборе:


Оценка модели: 100%|██████████| 1571/1571 [02:23<00:00, 10.96примеров/s]

Обработка тестового набора завершена. Всего примеров: 1571





# BLEU ROUGE

In [None]:
# Generation results to score; the JSON is expected to hold its records under
# a top-level "rocstories" key.
# NOTE(review): absolute /content path - only valid inside the Colab session
# where the file was uploaded.
file_path = "/content/150000-N=50-len=1.json"
with open(file_path, "r") as f:
    data = json.load(f)["rocstories"]

In [None]:
def compute_metrics(gen_texts, ref_texts):
    """Average sentence-level BLEU and ROUGE F-measures over aligned text pairs.

    Args:
        gen_texts: iterable of generated (hypothesis) strings.
        ref_texts: iterable of reference strings, aligned one-to-one with
            ``gen_texts``.

    Returns:
        dict with the keys "BLEU", "ROUGE-1", "ROUGE-2" and "ROUGE-L", each
        mapped to the mean score over all pairs. Every value is 0.0 when
        there are no pairs to score.

    Raises:
        ValueError: if the two inputs contain a different number of texts.
    """
    # Materialize once so that plain iterators are accepted and can be measured.
    gen_texts = list(gen_texts)
    ref_texts = list(ref_texts)

    # zip() would silently drop the unmatched tail and produce averages over
    # an unintended subset, so refuse misaligned inputs outright.
    if len(gen_texts) != len(ref_texts):
        raise ValueError(
            f"gen_texts and ref_texts must have the same length "
            f"(got {len(gen_texts)} and {len(ref_texts)})"
        )

    # Nothing to average: return zeros instead of dividing by zero.
    if not gen_texts:
        return {"BLEU": 0.0, "ROUGE-1": 0.0, "ROUGE-2": 0.0, "ROUGE-L": 0.0}

    bleu_scores = []
    rouge_1_scores = []
    rouge_2_scores = []
    rouge_l_scores = []

    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    smooth = SmoothingFunction().method1

    for gen, ref in zip(gen_texts, ref_texts):
        # Whitespace tokenization; smoothing keeps short texts with no
        # higher-order n-gram overlap from collapsing to exactly zero.
        bleu = sentence_bleu([ref.split()], gen.split(), smoothing_function=smooth)
        bleu_scores.append(bleu)

        # RougeScorer.score expects (target, prediction). Only the symmetric
        # F-measure is read below, but the correct order keeps precision and
        # recall meaningful should they ever be used.
        rouge_scores = scorer.score(ref, gen)
        rouge_1_scores.append(rouge_scores["rouge1"].fmeasure)
        rouge_2_scores.append(rouge_scores["rouge2"].fmeasure)
        rouge_l_scores.append(rouge_scores["rougeL"].fmeasure)

    n_pairs = len(gen_texts)
    return {
        "BLEU": sum(bleu_scores) / n_pairs,
        "ROUGE-1": sum(rouge_1_scores) / n_pairs,
        "ROUGE-2": sum(rouge_2_scores) / n_pairs,
        "ROUGE-L": sum(rouge_l_scores) / n_pairs,
    }

In [None]:
# Split every record into its generated text ("GEN") and the text it is
# scored against ("TRG"); both lists keep the order of `data`, so index i in
# one list pairs with index i in the other.
generated_texts = [entry["GEN"] for entry in data]
reference_texts = [entry["TRG"] for entry in data]

In [None]:
# Mean BLEU and ROUGE F-measures of the generated texts against their references.
metrics = compute_metrics(generated_texts, reference_texts)

In [None]:
# Report each averaged metric rounded to four decimal places.
print("BLEU:", round(metrics["BLEU"], 4))
print("ROUGE-1:", round(metrics["ROUGE-1"], 4))
print("ROUGE-2:", round(metrics["ROUGE-2"], 4))
print("ROUGE-L:", round(metrics["ROUGE-L"], 4))

BLEU: 0.0079
ROUGE-1: 0.1703
ROUGE-2: 0.0111
ROUGE-L: 0.1176


In [None]:
from bert_score import score

# Generated texts are passed first and references second. Three values come
# back (named P, R, F1 here); only the mean of F1 is printed.
P, R, F1 = score(generated_texts, reference_texts, lang="en", model_type="bert-base-uncased")
print("BERTScore F1:", F1.mean().item())

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERTScore F1: 0.47817757725715637


In [None]:
# Self-BLEU: score each of the first 1000 generated texts as a hypothesis
# against up to 100 *other* generated texts, then average the scores.
smooth = SmoothingFunction().method1
self_bleu_scores = []

# For any hypothesis, the (at most 100) references are always drawn from the
# first 101 texts. Tokenize that pool once instead of copying the whole corpus
# and re-splitting the same references on every iteration.
ref_pool = [text.split() for text in generated_texts[:101]]

for i, gen_text in enumerate(generated_texts[:1000]):
    # Leave the hypothesis itself out of its own reference set.
    references = (ref_pool[:i] + ref_pool[i + 1:])[:100]
    # Deliberately named `bleu`, not `score`: `score` is the bert_score
    # function imported earlier in the notebook and must not be rebound to a
    # float, or re-running the BERTScore call would fail.
    bleu = sentence_bleu(references, gen_text.split(), smoothing_function=smooth)
    self_bleu_scores.append(bleu)

self_bleu = np.mean(self_bleu_scores)

In [None]:
def distinct_n(texts, n):
    """Ratio of unique n-grams to total n-grams across all sequences.

    Args:
        texts: iterable of sequences (e.g. token lists); n-grams are taken
            as consecutive windows of length ``n`` within each sequence.
        n: n-gram size.

    Returns:
        ``unique / total`` as a float, or ``0`` when no n-gram can be formed.
    """
    seen = set()
    total = 0
    for tokens in texts:
        # A sequence shorter than n yields an empty range and contributes nothing.
        for start in range(len(tokens) - n + 1):
            seen.add(tuple(tokens[start:start + n]))
            total += 1

    if total == 0:
        return 0
    return len(seen) / total

# Lower-case and word-tokenize every generated text, then compute the share of
# unique unigrams and bigrams over the whole set of generations.
tokenized_texts = [nltk.word_tokenize(text.lower()) for text in generated_texts]
distinct_1 = distinct_n(tokenized_texts, 1)
distinct_2 = distinct_n(tokenized_texts, 2)

In [None]:
# Load the pretrained "gpt2" tokenizer and LM-head model used for perplexity.
# NOTE(review): this rebinds the module-level names `tokenizer` and `model`;
# if earlier cells read globals with these names, re-running them after this
# cell would pick up GPT-2 instead — confirm.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# `device` is defined elsewhere in the notebook.
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
# Put the model in evaluation mode before scoring.
model.eval()

def calculate_ppl(text):
    """Return the perplexity of ``text`` under the module-level ``model``.

    The text is encoded with the module-level ``tokenizer`` (truncated to at
    most 512 tokens) and moved to ``device``. The model is run without
    gradient tracking with the input ids also supplied as labels, and the
    exponential of the resulting loss is returned.

    Args:
        text: the string to score.

    Returns:
        ``np.exp`` of the scalar loss reported by the model.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        # Labels are the inputs themselves, so the model reports its own
        # language-modelling loss on this text.
        lm_loss = model(**encoded, labels=encoded["input_ids"]).loss.item()
    return np.exp(lm_loss)

# Perplexity is computed for the first 100 generated texts only; the reported
# figure is the arithmetic mean of the per-text perplexities.
ppl_scores = list(map(calculate_ppl, generated_texts[:100]))
avg_ppl = np.mean(ppl_scores)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


In [None]:
# Report the metrics computed in the preceding cells, four decimal places each.
print(f"Self-BLEU: {self_bleu:.4f}")
print(f"Distinct-1: {distinct_1:.4f}")
print(f"Distinct-2: {distinct_2:.4f}")
print(f"Perplexity: {avg_ppl:.4f}")

Self-BLEU: 0.1010
Distinct-1: 0.0290
Distinct-2: 0.2422
Perplexity: 33.6940
