In [29]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoConfig, AutoTokenizer, AutoModel
#from transformers import DebertaTokenizer, DebertaModel
import torch.optim as optim
import matplotlib.pyplot as plt
%matplotlib inline
from argparse import Namespace
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneGroupOut

from tqdm import tqdm as tq
from tqdm import notebook

In [30]:
import random

seed = 123
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)

In [31]:
sum_train = pd.read_csv(
    "Dataset/commonlit-evaluate-student-summaries/summaries_train.csv"
)
sum_train

Unnamed: 0,student_id,prompt_id,text,content,wording
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757
...,...,...,...,...,...
7160,ff7c7e70df07,ebad26,They used all sorts of chemical concoctions to...,0.205683,0.380538
7161,ffc34d056498,3b9047,The lowest classes are slaves and farmers slav...,-0.308448,0.048171
7162,ffd1576d2e1b,3b9047,they sorta made people start workin...,-1.408180,-0.493603
7163,ffe4a98093b2,39c16e,An ideal tragety has three elements that make ...,-0.393310,0.627128


In [32]:
sum_train.prompt_id.nunique()

4

In [33]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")
sep = tokenizer.sep_token

In [34]:
import pickle

with open("Models/bert_tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

In [35]:
tkn = tokenizer.tokenize("Hello, student 2023!")
tkn

['Hello', ',', 'Ġstudent', 'Ġ20', '23', '!']

In [36]:
sum_train["len_text"] = sum_train.text.apply(
    lambda x: len(tokenizer.tokenize(x)))
max_len = sum_train.len_text.max()
q_97 = sum_train.len_text.quantile(q=0.97)
sum_train.head(3)

Token indices sequence length is longer than the specified maximum sequence length for this model (596 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,student_id,prompt_id,text,content,wording,len_text
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,69
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,56
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,291


In [37]:
prompts_train = pd.read_csv(
    "Dataset/commonlit-evaluate-student-summaries/prompts_train.csv"
)
prompts_train["prompt_text_token"] = prompts_train.prompt_text.apply(
    lambda x: tokenizer.tokenize(x)[:1200]
)
prompts_train["prompt_text_ids"] = prompts_train.prompt_text_token.apply(
    lambda x: [1] + tokenizer.convert_tokens_to_ids(x) + [2]
)
# prompts_train['prompt_q_token']=prompts_train.prompt_question.apply(lambda x: tokenizer.tokenize(x)[:1200])
# prompts_train['prompt_q_ids']=prompts_train.prompt_q_token.apply(lambda x: [1]+tokenizer.convert_tokens_to_ids(x)+[2])
prompts_train

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,prompt_text_token,prompt_text_ids
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,"[Chapter, Ġ13, Ġ, č, Ċ, As, Ġthe, Ġsequel, Ġto...","[1, 45642, 508, 1437, 50121, 50118, 1620, 5, 1..."
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,"[Egypt, ian, Ġsociety, Ġwas, Ġstructured, Ġlik...","[1, 37552, 811, 2313, 21, 16697, 101, 10, 3334..."
2,814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,"[Background, Ġ, č, Ċ, The, ĠThird, ĠWave, Ġexp...","[1, 48277, 1437, 50121, 50118, 133, 7470, 2118..."
3,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...","[With, Ġone, Ġmember, Ġtrim, ming, Ġbeef, Ġin,...","[1, 3908, 65, 919, 10723, 7059, 6829, 11, 10, ..."


In [38]:
prompts_train["len_prompt"] = prompts_train.prompt_text.apply(
    lambda x: len(tokenizer.tokenize(x))
)
prompts_train.len_prompt.max(), prompts_train.len_prompt.min()

(1199, 700)

In [39]:
model = AutoModel.from_pretrained("microsoft/deberta-base")

In [40]:
with open("Models/deberta_model.pkl", "wb") as f:
    pickle.dump(model, f)

In [41]:
def make_sentence(x):
    # model.eval()
    model.eval
    text = torch.LongTensor(x).unsqueeze(dim=0)
    with torch.no_grad():
        # enc_layer, _=model(text)
        last_layer = model(text).last_hidden_state
        # poll_out=model(text).pooler_output
    # return torch.mean(enc_layer[11], 1).squeeze(dim=0)
    # return(poll_out.squeeze(dim=0))
    return torch.mean(last_layer, 1).squeeze(dim=0)
    # return model_seq(text).logits.squeeze()

In [42]:
prompts_train["prompt_text_sent"] = prompts_train["prompt_text_ids"].apply(
    make_sentence
)
# prompts_train['prompt_q_sent']=prompts_train['prompt_q_ids'].apply(make_sentence)

In [43]:
train_content = pd.merge(
    sum_train, prompts_train, how="left", left_on="prompt_id", right_on="prompt_id"
)
train_content.isnull().sum()

student_id           0
prompt_id            0
text                 0
content              0
wording              0
len_text             0
prompt_question      0
prompt_title         0
prompt_text          0
prompt_text_token    0
prompt_text_ids      0
len_prompt           0
prompt_text_sent     0
dtype: int64

In [44]:
train_content["len_ratio"] = train_content.len_text / train_content.len_prompt

In [45]:
train_c = train_content.drop(
    [
        "prompt_text",
        "prompt_title",
        "prompt_question",
        "prompt_text_ids",
        "prompt_text_token",
        "len_text",
        "len_prompt",
    ],
    axis=1,
)
train_c.head(3)

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_text_sent,len_ratio
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,"[tensor(-0.1649), tensor(0.2333), tensor(-0.01...",0.098571
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,"[tensor(-0.6038), tensor(0.0009), tensor(-0.00...",0.046706
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,"[tensor(0.0025), tensor(0.3075), tensor(-0.016...",0.392713


In [46]:
from sklearn.model_selection import train_test_split

SPLIT_SEED = 92

# data_train, data_val = train_test_split(train_c, test_size=0.20, random_state=SPLIT_SEED)
prompt_list = list(prompts_train["prompt_id"])
data_train = train_c[
    (train_c.prompt_id == prompt_list[0])
    | (train_c.prompt_id == prompt_list[1])
    | (train_c.prompt_id == prompt_list[3])
].copy()

data_val = train_c[(train_c.prompt_id == prompt_list[2])].copy()

data_train["split"] = "train"
data_val["split"] = "val"
data_with_split = pd.concat([data_train, data_val], ignore_index=True)
data_with_split

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_text_sent,len_ratio,split
0,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,"[tensor(-0.6038), tensor(0.0009), tensor(-0.00...",0.046706,train
1,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,"[tensor(0.0025), tensor(0.3075), tensor(-0.016...",0.392713,train
2,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,"[tensor(0.0025), tensor(0.3075), tensor(-0.016...",0.053981,train
3,0071d51dab6d,ebad26,They would use chemicals and substances to cha...,0.205683,0.380538,"[tensor(-0.6038), tensor(0.0009), tensor(-0.00...",0.039199,train
4,0072b649a88c,3b9047,The Egyptian society is really different from ...,0.205683,0.380538,"[tensor(0.0025), tensor(0.3075), tensor(-0.016...",0.121457,train
...,...,...,...,...,...,...,...,...
7160,fe1e3c528e24,814d6b,The third wave experiment developed quick...,3.020803,2.421200,"[tensor(-0.1649), tensor(0.2333), tensor(-0.01...",0.252857,val
7161,fe6fac61dc49,814d6b,Mr jones started the third wave as a expereme...,1.221089,2.269070,"[tensor(-0.1649), tensor(0.2333), tensor(-0.01...",0.120000,val
7162,fed33a5f383e,814d6b,The Third Wave gained over 200 members by the ...,2.141224,1.123777,"[tensor(-0.1649), tensor(0.2333), tensor(-0.01...",0.238571,val
7163,fefd4f143fbe,814d6b,The Third Wave developed over such a short tim...,-0.782641,-0.245970,"[tensor(-0.1649), tensor(0.2333), tensor(-0.01...",0.044286,val


In [47]:
class TextDataset(Dataset):
    def __init__(self, text_df, max_seq_length):

        self.text_df = text_df

        self._max_seq_length = max_seq_length

        self.train_df = self.text_df[self.text_df.split == "train"]
        self.train_size = len(self.train_df)

        self.val_df = self.text_df[self.text_df.split == "val"]
        self.validation_size = len(self.val_df)

        self._lookup_dict = {
            "train": (self.train_df, self.train_size),
            "val": (self.val_df, self.validation_size),
        }

        self.set_split("train")

    def set_split(self, split="train"):
        self._data_split = split
        self._data_df, self._data_size = self._lookup_dict[split]

    def __len__(self):
        return self._data_size

    def __getitem__(self, index):

        row = self._data_df.iloc[index]
        text = row["text"]
        tokens = tokenizer.tokenize(text)
        text_index = [1] + tokenizer.convert_tokens_to_ids(tokens) + [2]
        token_index = text_index

        if len(token_index) < self._max_seq_length:
            pad = [0] * (self._max_seq_length - len(token_index))
            token_index = token_index + pad

        else:
            token_index = token_index[: self._max_seq_length]

        data_vector = torch.LongTensor(token_index)

        target = row[["content", "wording"]]

        return {
            "x_data": data_vector,
            "attention_mask": (data_vector != 0).long(),
            "content_vector": row["prompt_text_sent"],
            "y_target": torch.squeeze(torch.FloatTensor([target])),
            "len_ratio": row["len_ratio"],
        }

    def get_num_batches(self, batch_size):

        return len(self) // batch_size

In [48]:
data = TextDataset(data_with_split, 350)

In [49]:
t = data.__getitem__(0)

  "y_target": torch.squeeze(torch.FloatTensor([target])),


In [50]:
t["x_data"].shape, t["content_vector"].shape, t["y_target"].shape, t[
    "attention_mask"
].shape

(torch.Size([350]), torch.Size([768]), torch.Size([2]), torch.Size([350]))

In [51]:
def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device="cpu"):
    """
    A generator function which wraps the PyTorch DataLoader. It will
      ensure each tensor is on the write device location.
    """
    dataloader = DataLoader(
        dataset=dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last
    )

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)

        yield out_data_dict

In [52]:
gen = generate_batches(data, 3)

In [53]:
g = next(gen)
g["x_data"].shape, g["y_target"].shape, g["len_ratio"].shape

  "y_target": torch.squeeze(torch.FloatTensor([target])),
  "y_target": torch.squeeze(torch.FloatTensor([target])),
  "y_target": torch.squeeze(torch.FloatTensor([target])),


(torch.Size([3, 350]), torch.Size([3, 2]), torch.Size([3]))

In [54]:
class BertForSequenceRegression(nn.Module):

    def __init__(self, num_marks=2):
        super(BertForSequenceRegression, self).__init__()
        self.num_marks = num_marks
        with open("Models/deberta_model.pkl", "rb") as f:
            self.bert = pickle.load(f)

        self.hidden_1 = nn.Linear(
            2 * config.hidden_size, 2 * config.hidden_size)
        self.notline_1 = nn.ReLU()
        self.dropout_1 = nn.Dropout(config.hidden_dropout_prob)
        self.hidden_2 = nn.Linear(2 * config.hidden_size, config.hidden_size)
        self.notline_2 = nn.ReLU()

        self.dropout_2 = nn.Dropout(config.hidden_dropout_prob)
        self.hidden = nn.Linear(config.hidden_size, 128)
        self.regres = nn.Linear(128, num_marks)

    def forward(
        self,
        input_ids,
        content_vector,
        token_type_ids=None,
        attention_mask=None,
        labels=None,
    ):
        output_bert = self.bert(
            input_ids, token_type_ids, attention_mask
        ).last_hidden_state.mean(1)
        h_concat = torch.cat((content_vector, output_bert), dim=1)
        hidden_vec_1 = self.hidden_1(h_concat)
        hidden_drop_1 = self.dropout_1(hidden_vec_1)
        hidden_vecn_1 = self.notline_1(hidden_drop_1)
        hidden_vec_2 = self.hidden_2(hidden_vecn_1)
        hidden_drop_2 = self.dropout_2(hidden_vec_2)
        hidden_vecn_2 = self.notline_2(hidden_drop_2)
        hidden = self.hidden(hidden_vecn_2)
        marks = self.regres(hidden)

        return marks

In [55]:
config = Namespace(hidden_dropout_prob=0.05, hidden_size=768)

In [56]:
model = BertForSequenceRegression()
model

BertForSequenceRegression(
  (bert): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=768, out_features=2304, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=768, out_features=768, bias=False)
              (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )
          (intermedia

In [57]:
import torch
import pickle
import numpy as np

# from transformers import BertTokenizer, BertForSequenceRegression

# Define the device to be CPU
device = torch.device("cpu")

# Load the tokenizer using pickle
with open("Models/bert_tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

# Load the trained model
model = BertForSequenceRegression().cpu()  # Ensure model is on CPU
model.load_state_dict(
    torch.load(
        "Models/deb_model_3.pt",
        map_location=device,
    )
)
model.eval()

# Sample text
text = "This is a test summary"

# Tokenize the text
tokens = tokenizer.tokenize(text)
token_index = (
    [tokenizer.cls_token_id]
    + tokenizer.convert_tokens_to_ids(tokens)
    + [tokenizer.sep_token_id]
)

# Pad or truncate the token index
max_length = 260
if len(token_index) < max_length:
    pad = [tokenizer.pad_token_id] * (max_length - len(token_index))
    token_index = token_index + pad
else:
    token_index = token_index[:max_length]

# Convert token index to tensor
data_vector = torch.LongTensor(token_index).unsqueeze(dim=0).to(device)

# Generate content vector (random for illustration)
content_vector = torch.randn(1, 768).to(device)

# Generate attention mask
attention_mask = (data_vector != 0).long()

# Perform prediction
with torch.no_grad():
    scores = model(data_vector, content_vector, attention_mask=attention_mask)

# Parse the output tensor to extract content and wording scores
content_score, wording_score = scores[:, 0], scores[:, 1]

# Print the predictions
print("Content Score:", content_score.item())
print("Wording Score:", wording_score.item())


def convert_to_positive(score):
    """
    Converts a score to a positive value between 0 and 1.

    Args:
        score: The score to convert.

    Returns:
        The converted score.
    """
    return 1 / (1 + np.exp(-score))


# Convert content and wording scores to positive values
positive_content_score = convert_to_positive(content_score.item())
positive_wording_score = convert_to_positive(wording_score.item())

# Print the converted scores
print("Positive Content Score:", positive_content_score)
print("Positive Wording Score:", positive_wording_score)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Content Score: -0.7733938694000244
Wording Score: -0.7640351057052612
Positive Content Score: 0.3157454018473356
Positive Wording Score: 0.31777084289582486
