# Running Transformers
This notebook runs our fine-tuned transformer models.
All models can be found under `models/`.

## Imports and libraries

In [1]:
from transformers import AutoModelForSequenceClassification
import numpy as np
import pandas as pd
import torch
import os

from tqdm import tqdm
from transformers import AutoTokenizer
from datasets import Dataset

Below is our training data, merged with the corresponding prompts.

In [2]:
# Read the merged training table (summaries joined with their prompts)
# and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='./data/merged.csv')
data.head(n=5)


Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text
0,8a31b8cc1996,3b9047,In the social pyramid of ancient Egypt the pha...,-0.077267,0.424365,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
1,4387107feb4d,3b9047,The ancient Egyptian system of government was ...,1.376083,2.389443,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
2,3b784d0a5c8f,3b9047,Nobles were the only ont that could hold gover...,0.467722,-0.085653,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
3,1b2ff4d4edd9,3b9047,They were many different social classes. The p...,-0.012957,-0.40948,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
4,108049c01946,3b9047,The ancient Egyptian system of goverment is in...,2.20464,-0.645344,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...


## Selecting a transformer and tokenizer
Inside `models/`, choose the transformer and then the checkpoint to initialize it from.

Make sure to select the matching tokenizer as well.

In [3]:
# Point these two settings at the checkpoint you want to run and at the
# tokenizer of its base model; always change them together.
# Alternative (large variant):
#   TRANSFORMER_PATH = './models/deberta-v3-large-epoch-3/checkpoint-1504'
#   TOKENIZER_PATH = 'microsoft/deberta-v3-large'
TRANSFORMER_PATH = './models/deberta-v3-base/deberta-v3-base/checkpoint-4012'
TOKENIZER_PATH = 'microsoft/deberta-v3-base'
# Maximum number of tokens per encoded input.
MAX_LENGTH = 1024

# The classification head is configured with two labels.
transformer = AutoModelForSequenceClassification.from_pretrained(
    TRANSFORMER_PATH,
    num_labels=2,
)
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)


Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



## Tokenize a single data point


In [4]:
def tokenize_encode(text, prompt_question, prompt_text):
    """Build the scoring prompt for one summary and tokenize it.

    The summary, the prompt question and the original text are joined into a
    single string, separated by the tokenizer's separator token, and encoded
    with truncation and padding to MAX_LENGTH.

    Args:
        text: The summary to score.
        prompt_question: The question the summary has to answer.
        prompt_text: The original text the prompt refers to.

    Returns:
        The tokenizer's output for the assembled prompt, requested as
        PyTorch tensors (`return_tensors='pt'`).
    """
    sep = tokenizer.sep_token
    prompt = (
        f'Evaluate the content and wording score of this summary: {sep} {text} {sep} '
        f'The summary must answer the following prompt: {prompt_question} {sep} '
        f'The prompt is related towards the following original text: {prompt_text}'
    )

    tokenizer_options = dict(
        truncation=True,
        padding="max_length",
        # Adjust max length to fitted model
        max_length=MAX_LENGTH,
        return_tensors='pt',
    )
    return tokenizer(prompt, **tokenizer_options)

# Example inputs: the first row of the merged frame.
# Swap in your own summary, prompt question and prompt text here.
text = data.loc[0, 'text']
prompt_question = data.loc[0, 'prompt_question']
prompt_text = data.loc[0, 'prompt_text']

# Show the encoded tensors for the example.
encoded_example = tokenize_encode(text, prompt_question, prompt_text)
print(encoded_example)

{'input_ids': tensor([[    1, 38081,   262,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]])}


## Running the transformer for a single data point

In [5]:
# Inference only: disable autograd so the forward pass does not record a
# graph for gradients that are never used (saves memory and time).
with torch.no_grad():
    output = transformer(**tokenize_encode(text, prompt_question, prompt_text))
# Flatten the logits into a plain Python list of scores.
output.logits.reshape(-1).tolist()

[0.10300207883119583, 0.34845003485679626]

## Running the transformer on a pandas DataFrame
Adapt the code to your needs.

In [7]:
# Replace `rows_to_score` with your own data. Here we use the first 5 rows from merged.csv.
rows_to_score = data[:5]

content_preds = []
wording_preds = []

# Inference only: without no_grad every forward pass would record an autograd
# graph, costing memory and time for gradients that are never used.
with torch.no_grad():
    for index, row in tqdm(rows_to_score.iterrows(), total=rows_to_score.shape[0]):
        inputs = tokenize_encode(row.text, row.prompt_question, row.prompt_text)
        # A single forward pass yields both scores: index 0 is stored as the
        # content prediction, index 1 as the wording prediction.
        outputs = transformer(**inputs).logits.reshape(-1).tolist()

        content_preds.append(outputs[0])
        wording_preds.append(outputs[1])

# One row per scored summary, in the same order as `rows_to_score`.
submission_df = pd.DataFrame({'content': content_preds, 'wording': wording_preds})
submission_df.head()

100%|██████████| 5/5 [00:12<00:00,  2.58s/it]


Unnamed: 0,content,wording
0,0.103002,0.34845
1,1.332854,1.27206
2,0.460646,-0.344012
3,-0.159789,-0.498712
4,1.436758,-0.587502
