## Importing libraries and data

In [1]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import pandas as pd
import torch

In [2]:
summaries_df = pd.read_csv('../data/v2/train/train_data.csv')
summaries_df.head()

Unnamed: 0,student_id,prompt_id,text,content,wording
0,8a31b8cc1996,3b9047,In the social pyramid of ancient Egypt the pha...,-0.077267,0.424365
1,8c9411cfc953,39c16e,Aristotle claims that an ideal tragedy should ...,0.55907,-0.634924
2,4387107feb4d,3b9047,The ancient Egyptian system of government was ...,1.376083,2.389443
3,d720eb53c270,ebad26,They put pickle in them to mask the smell of r...,0.297031,-0.168734
4,e887883b946c,ebad26,"""whenever meat was so spoiled that it could no...",-0.093814,0.503833


In [3]:
prompts_df = pd.read_csv('../kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv')
prompts_df.head()

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
2,814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
3,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."


In [4]:
tokenizer = BertTokenizer.from_pretrained("textattack/bert-base-uncased-QQP")


In [5]:
merged_df = pd.merge(summaries_df, prompts_df, on='prompt_id')
merged_df

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text
0,8a31b8cc1996,3b9047,In the social pyramid of ancient Egypt the pha...,-0.077267,0.424365,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
1,4387107feb4d,3b9047,The ancient Egyptian system of government was ...,1.376083,2.389443,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
2,3b784d0a5c8f,3b9047,Nobles were the only ont that could hold gover...,0.467722,-0.085653,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
3,1b2ff4d4edd9,3b9047,They were many different social classes. The p...,-0.012957,-0.409480,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
4,108049c01946,3b9047,The ancient Egyptian system of goverment is in...,2.204640,-0.645344,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
...,...,...,...,...,...,...,...,...
5727,d8ae24010bf9,814d6b,The Third Wave experiment was a huge success i...,3.005642,3.226292,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
5728,e32a76187192,814d6b,The Third Wave developed over such a short tim...,0.997243,1.880386,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
5729,2c6d06ca1eea,814d6b,The experiment developed over such a short per...,-0.693773,-0.490571,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
5730,47d4807beb66,814d6b,It was easy for the students to follow this be...,-0.093814,0.503833,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...


In [6]:
summaries_df.describe()

Unnamed: 0,content,wording
count,5732.0,5732.0
mean,-0.016807,-0.067199
std,1.04253,1.032348
min,-1.729859,-1.962614
25%,-0.799545,-0.87272
50%,-0.093814,-0.081769
75%,0.49943,0.503833
max,3.802722,4.310693


## Splitting data

In [7]:
train_df, test_df = train_test_split(merged_df, test_size=0.2, random_state=42)

## Tokenization

In [8]:
train_texts = train_df['text'].tolist()
train_tokenized = tokenizer(train_texts, padding=True, truncation=True, return_tensors='pt')

In [9]:
test_texts = test_df['text'].tolist()
test_tokenized = tokenizer(test_texts, padding=True, truncation=True, return_tensors='pt')


In [10]:
train_inputs = {
    'input_ids': train_tokenized['input_ids'],
    'attention_mask': train_tokenized['attention_mask']
}

test_inputs = {
    'input_ids': test_tokenized['input_ids'],
    'attention_mask': test_tokenized['attention_mask']
}

# Prepare the outputs (content and wording scores)
train_outputs = {
    'content_score': train_df['content'].tolist(),
    'wording_score': train_df['wording'].tolist()
}

test_outputs = {
    'content_score': test_df['content'].tolist(),
    'wording_score': test_df['wording'].tolist()
}

## Model selection

In [11]:
import torch
import torch.nn as nn
from transformers import BertForSequenceClassification, AdamW

# Initialize the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained("textattack/bert-base-uncased-QQP", num_labels=2)  # 2 labels: content score and wording score

# Define a custom PyTorch dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, outputs):
        self.inputs = inputs
        self.outputs = outputs

    def __len__(self):
        return len(self.inputs['input_ids'])

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.inputs.items()}, {key: val[idx] for key, val in self.outputs.items()}

# Create DataLoader instances for training and testing sets
batch_size = 32

train_dataset = CustomDataset(train_inputs, train_outputs)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = CustomDataset(test_inputs, test_outputs)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


## Model training

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define loss function and optimizer
criterion = nn.MSELoss()  # Mean Squared Error Loss
optimizer = AdamW(model.parameters(), lr=2e-5)

# Train the model
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        inputs = {key: val.to(device) for key, val in batch[0].items()}
        labels = {key: val.to(device) for key, val in batch[1].items()}

        outputs = model(**inputs)
        loss = criterion(outputs.logits, torch.stack([labels['content_score'], labels['wording_score']], dim=1).float())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}, Average Loss: {average_loss}")

# Save the trained model
model.save_pretrained('trained_model')




KeyboardInterrupt: 

In [19]:
model.eval()
total_mse_content = 0
total_mse_wording = 0
total_samples = len(test_dataloader.dataset)

with torch.no_grad():
    for batch in test_dataloader:
        inputs = {key: val.to(device) for key, val in batch[0].items()}
        labels = {key: val.to(device) for key, val in batch[1].items()}

        outputs = model(**inputs)
        mse_content = criterion(outputs.logits[:, 0], labels['content_score'])
        mse_wording = criterion(outputs.logits[:, 1], labels['wording_score'])

        total_mse_content += mse_content.item()
        total_mse_wording += mse_wording.item()

average_mse_content = total_mse_content / total_samples
average_mse_wording = total_mse_wording / total_samples

print(f"Mean Squared Error (Content): {average_mse_content}")
print(f"Mean Squared Error (Wording): {average_mse_wording}")

Mean Squared Error (Content): 0.006737831331947305
Mean Squared Error (Wording): 0.010979420961784522


In [24]:
from transformers import TextClassificationPipeline
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)

text = 'when the meat was actually spoiled they would just mix it with the other meat so none would know. They would also put pickles in the meat and put soda on the meat to give it a sweet taste. They would also use boarx and geltain to make it looked smoked'

pipe(text)



[[{'label': 'LABEL_0', 'score': 0.3680143356323242},
  {'label': 'LABEL_1', 'score': 0.6319857239723206}]]