# Text Quality Assessment

In [1]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

In [2]:
import torch
from tqdm import tqdm
from datasets import load_dataset, Features, Value

In [3]:
# Glob pattern matching the parquet files of the review dataset.
fp = "workspace/dev/dataset/01_dataprep/appvocai_discover-01_dataprep-02_feature-review-dataset.parquet/*.parquet"

# Column names grouped by dtype; the order here is the order of the schema.
_string_columns = ("id", "category", "content")
_bool_columns = (
    "tqa_has_adjective",
    "tqa_has_adverb",
    "tqa_has_determiner",
    "tqa_has_noun",
    "tqa_has_terminal_punctuation",
    "tqa_has_verb",
    "tqa_high_digit_ratio",
    "tqa_high_punctuation_ratio",
    "tqa_word_count_range",
    "tqa_readability_easy",
    "tqa_readability_std",
    "tqa_readability_difficult",
    "tqa_stop_word_match",
    "tqa_first_letter_cap",
    "tqa_no_all_caps",
    "tqa_high_word_repetition",
    "tqa_no_special_chars",
)

# Explicit schema handed to load_dataset: three string columns followed by
# the boolean tqa_* flag columns.
features = Features(
    {
        **{column: Value("string") for column in _string_columns},
        **{column: Value("bool") for column in _bool_columns},
    }
)

## Load GPT-2

In [4]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA instead of failing
# in .to(device).
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "openai-community/gpt2-large"
# Load the pretrained language model and move its weights to the chosen device.
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
# Matching tokenizer for the same checkpoint.
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

  state_dict = torch.load(resolved_archive_file, map_location="cpu")


## Load Dataset

In [7]:
# Request a single split explicitly: without `split=` load_dataset returns a
# DatasetDict keyed by split name, so ds["content"] would raise KeyError
# instead of returning the text column. Files passed through `data_files`
# without a split name are assigned to the "train" split.
ds = load_dataset("parquet", data_files=fp, features=features, split="train")

# Skip null entries so str.join does not raise TypeError on None.
texts = [text for text in ds["content"] if text is not None]

# Tokenize the whole corpus as one long sequence, with a blank line between
# consecutive entries.
encodings = tokenizer("\n\n".join(texts), return_tensors="pt")

ValueError: Instruction "train" corresponds to no data!

In [None]:
# Display the first entry of the tokenizer output built in the cell above.
encodings[0]

## Perplexity on Fixed Length Model

In [None]:
# Sliding-window perplexity over the tokenized corpus.
# Each window feeds the model up to `max_length` tokens, but only the tokens
# that were not already scored by the previous window contribute to the loss;
# the earlier tokens in the window serve purely as context.
max_length = model.config.n_positions
stride = 512
seq_len = encodings.input_ids.size(1)

nlls = []  # mean negative log-likelihood of every scored window
token_counts = []  # number of labels each of those means was averaged over
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100  # context-only positions are ignored by the loss

    # The model shifts the labels left by one internally, so position 0 of a
    # window is never used as a target. Count the labels that are actually
    # scored: trg_len - 1 when nothing is masked (first window), trg_len
    # otherwise.
    num_loss_tokens = int((target_ids[:, 1:] != -100).sum())

    # A window without any scored label would yield a NaN loss; skip it.
    if num_loss_tokens > 0:
        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)
        token_counts.append(num_loss_tokens)

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

if not nlls:
    raise ValueError(
        "Cannot compute perplexity: the encoded text has no token to score."
    )

# Windows score different numbers of tokens (the first one scores up to
# max_length - 1, the last one possibly fewer than stride), so a plain mean of
# the per-window means would be biased. Weight each window by its token count
# to obtain the true average negative log-likelihood per token.
weights = torch.tensor(token_counts, dtype=nlls[0].dtype, device=nlls[0].device)
ppl = torch.exp((torch.stack(nlls) * weights).sum() / weights.sum())