In [19]:
# 1. Imports
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    DistilBertTokenizerFast, DistilBertModel,
    T5Tokenizer, T5ForConditionalGeneration,
    AdamW
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 2. Load and preprocess data
df = pd.read_csv("~/neuro140/sample - Sheet1.csv")
df['Score-Evidence (0-2)'] = df['Score-Evidence (0-2)'].clip(upper=2)
df['text'] = df['Question'].fillna('') + ' [SEP] ' + df['Response'].fillna('')

label_cols = [
    'Score-Thesis (0-1)',
    'Score-Contextualization (0-1)',
    'Score-Evidence (0-2)',
    'Score-Analysis and Reasoning (0-2)'
]

# 3. Tokenization and Dataset Preparation
scoring_tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

train_encodings = scoring_tokenizer(list(train_df['text']), truncation=True, padding=True, max_length=512, return_tensors='pt')
val_encodings = scoring_tokenizer(list(val_df['text']), truncation=True, padding=True, max_length=512, return_tensors='pt')
train_labels = train_df[label_cols].values
val_labels = val_df[label_cols].values

class EssayDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = EssayDataset(train_encodings, train_labels)
val_dataset = EssayDataset(val_encodings, val_labels)

# 4. Classifier Model Definition
class MultiOutputBertClassifier(nn.Module):
    def __init__(self, hidden_size=768):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.fc_thesis = nn.Linear(hidden_size, 2)
        self.fc_context = nn.Linear(hidden_size, 2)
        self.fc_evidence = nn.Linear(hidden_size, 3)
        self.fc_analysis = nn.Linear(hidden_size, 3)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_token = outputs.last_hidden_state[:, 0]
        return (
            self.fc_thesis(cls_token),
            self.fc_context(cls_token),
            self.fc_evidence(cls_token),
            self.fc_analysis(cls_token)
        )

model_classifier = MultiOutputBertClassifier().to(device)
optimizer = AdamW(model_classifier.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

def compute_accuracy(preds, labels):
    return (preds.argmax(dim=1) == labels).float().mean().item()

# 5. Classifier Training Loop
for epoch in range(3):
    model_classifier.train()
    total_loss, total_acc = 0, [0]*4

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} - Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        out_thesis, out_context, out_evidence, out_analysis = model_classifier(input_ids, attention_mask)

        loss = sum([
            criterion(out_thesis, labels[:, 0]),
            criterion(out_context, labels[:, 1]),
            criterion(out_evidence, labels[:, 2]),
            criterion(out_analysis, labels[:, 3])
        ])

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_acc = [total_acc[i] + compute_accuracy(out, labels[:, i]) for i, out in enumerate([out_thesis, out_context, out_evidence, out_analysis])]

    print(f"\nEpoch {epoch+1} Train Loss: {total_loss/len(train_loader):.4f}")
    print(f"Train Accuracy: {['{:.2%}'.format(a/len(train_loader)) for a in total_acc]}")

# 6. Feedback Generation Model Prep
t5_df = []
feedback_tokenizer = T5Tokenizer.from_pretrained('t5-small')

rubrics = [
    ("Thesis", "Score-Thesis (0-1)", "Feedback-Thesis"),
    ("Contextualization", "Score-Contextualization (0-1)", "Feedback-Contextualization"),
    ("Evidence", "Score-Evidence (0-2)", "Feedback-Evidence"),
    ("Analysis", "Score-Analysis and Reasoning (0-2)", "Feedback-Analysis")
]

for _, row in df.iterrows():
    for rubric, score_col, feedback_col in rubrics:
        if pd.notna(row[score_col]) and pd.notna(row[feedback_col]):
            prompt = f"Generate feedback for {rubric}. Question: {row['Question']} Response: {row['Response']} Score: {row[score_col]}"
            t5_df.append((prompt, row[feedback_col]))

fb_df = pd.DataFrame(t5_df, columns=["input_text", "target_text"])
train_fb, val_fb = train_test_split(fb_df, test_size=0.2, random_state=42)

class T5FeedbackDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.inputs = df['input_text'].tolist()
        self.targets = df['target_text'].tolist()
        self.tokenizer = tokenizer

    def __len__(self): return len(self.inputs)

    def __getitem__(self, idx):
        input_enc = self.tokenizer(self.inputs[idx], max_length=512, truncation=True, padding='max_length', return_tensors='pt')
        target_enc = self.tokenizer(self.targets[idx], max_length=128, truncation=True, padding='max_length', return_tensors='pt')
        return {
            'input_ids': input_enc['input_ids'].squeeze(),
            'attention_mask': input_enc['attention_mask'].squeeze(),
            'labels': target_enc['input_ids'].squeeze()
        }

train_fb_ds = T5FeedbackDataset(train_fb, feedback_tokenizer)
val_fb_ds = T5FeedbackDataset(val_fb, feedback_tokenizer)
train_fb_loader = DataLoader(train_fb_ds, batch_size=4, shuffle=True)
val_fb_loader = DataLoader(val_fb_ds, batch_size=4)

model_feedback = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)
optimizer = AdamW(model_feedback.parameters(), lr=3e-4)

def train_t5_epoch(model, dataloader):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc="Training"):
        inputs = {k: v.to(device) for k, v in batch.items()}
        inputs['labels'][inputs['labels'] == feedback_tokenizer.pad_token_id] = -100
        loss = model(**inputs).loss
        loss.backward()
        optimizer.step(); optimizer.zero_grad()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_t5(model, dataloader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            inputs = {k: v.to(device) for k, v in batch.items()}
            inputs['labels'][inputs['labels'] == feedback_tokenizer.pad_token_id] = -100
            total_loss += model(**inputs).loss.item()
    return total_loss / len(dataloader)

for epoch in range(15):
    train_loss = train_t5_epoch(model_feedback, train_fb_loader)
    val_loss = evaluate_t5(model_feedback, val_fb_loader)
    print(f"\nEpoch {epoch+1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")

# 7. Predict + Generate Feedback for New Samples
def predict_scores(text):
    model_classifier.eval()
    enc = scoring_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        o1, o2, o3, o4 = model_classifier(**enc)
    return {
        "Thesis": o1.argmax(dim=1).item(),
        "Contextualization": o2.argmax(dim=1).item(),
        "Evidence": o3.argmax(dim=1).item(),
        "Analysis": o4.argmax(dim=1).item()
    }

def generate_feedback_all(question, response, scores):
    model_feedback.eval()
    out = {}
    for rubric, score in scores.items():
        prompt = f"Give detailed feedback on the student's {rubric}. Question: {question} Response: {response} Score: {score}"
        inputs = feedback_tokenizer(prompt, return_tensors='pt', truncation=True, padding=True, max_length=512).to(device)
        with torch.no_grad():
            output = model_feedback.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=256, num_beams=4, early_stopping=True
            )
        out[rubric] = feedback_tokenizer.decode(output[0], skip_special_tokens=True)
    return out

# Example evaluation
test_data = [
    {
        "Question": "Evaluate the extent to which the growth of transatlantic trade changed British North American colonial society from 1607 to 1776.",
        "Response": """Since the beginning of the Era of Exploration, European nations had been funding groundbreaking navigation trips to never-before-explored areas, developing colonies, and establishing trade. Explorers such as Vasco De Gama, Ferdinand Magellen, Christopher Columbus were funded by monarchs such as Isabella of Spain and aided by technology such as carvel ships and astrolabes. Once landed, nations quickly established colonies, with those such as Spanish conquistadors using immense force to combat Native populations. The unique resources and immense wealth brought about by these colonies were exchanged throughout trade routes, one of the most notable being the transatlantic trade route. Although it furthered the British North American colonies' ties to England, transatlantic trade succeeded in drastically changing the British North American colonial society to large extent by leading to the development of distinct regions throughout the colonies and leading to the dependence of Southern society on slave labor. To begin with, the transatlantic trade involved many of the British North American colonies, but, at each port, different items were exchanged, leading to the development of distinct economical regions between these colonies; these unique economies, in turn, shaped British American colonial society. For example, the Massachussetts Bay Colony in the northern region had originally been founded on the principle of religious tolerance (for select religions excluding Catholicism and Judaism). Located along the coast, this colony quickly became apart of transatlantic trade, which led to the formation of the colony's economy. The economy of the Massachussetts Bay Colony became based around fishing, but, more importantly, shipbuilding, as a merchant class evolved within this economy and sought to partake in the transatlantic trade. As a result, this colony was more reliant on commerce than agriculture, leading to tightly spaced and closer knit communities, allowing these societies to become more connected and reliant on the church. In the South, the colony of Virginia was originally founded upon the principles of "God, Gold, and Glory". However, after harsh seasons and the discovery of the absence of gold, the colonists grew discontent. But, the transatlantic slave trade gradually increased the wealth of Virginian colonists by encouraging these colonists to grow and trade crops, such as tobacco, a cash crop, in the profitable transatlantic trade. As a result, the Virginian economy became reliant on agriculture, a lot of land was dedicated to farming, and societies became more spread out and less tightly knit than Northern colonies. In addition to creating distinct regions, the transatlantic trade introduced Southern colonies to slave labor and gradually increased these colonies' dependence on slave labor over time. For example, in 1607, the first Africans arrived in Virginia, but it was a very small amount. More importantly, these Africans arrived as indentured servants, with freedom promised to them after a designated number of years of service. However, the development of the transatlantic trade made the South reliant on slaves not only by demanding more and more agricultural output from the South, but also by supplying the South with the slaves through the transatlantic slave trade. In the transatlantic slave trade, Europeans in Africa would kidnap Africans or bribe African tribes to kidnap rival tribe members put on ships destined for British North America. Once these ships reached the South, the slaves were auctioned off. The increase of slave labor in the South also led the creation of Southern plantations, a unique characteristic of Southern society that represented the South even after the transatlantic slave trade ended (this is evident in media such as Gone with the Wind). While the transatlantic trade may have created regions and developed slavery, it did act as a continuity in furthering the British North American colonies' ties with England. As a crucial part of transatlantic trade, the colonies sold much of their raw materials to consumers in England. The profits from this trade constituted a large part of the colonial economy and made the colonies dependent on England. For example, when the colonists were angry at the Tea Act, which gave the British East India Company a monopoly over the tea trade, colonists rebelled through the Boston Tea Party, in which vast amounts of expensive of tea were dumped into the Boston Harbor. The colonists retaliated in this way because they were angry that England had the ability to grant monopoly rights to the British East India Company but they also acknowledged that by ruining the tea, they were harming the crucial economic tie between the colonies in England. Considering this example and others, it begins to be revealed that the transatlantic trade actually did cause change in relation to the colonies and England in that it stirred political activism in colonial society by creating a tie so strong that the colonists felt no choice but to retaliate and break this economic bond with England as part of a way to obtain independence in their economies, politics, and society."""
    },
    {
        "Question": "Evaluate the extent to which the growth of transatlantic trade changed British North American colonial society from 1607 to 1776.",
        "Response": """The arrival of Columbus in the Americas had set off a larrge and immense movment of people from all the major coutnries of the Netherlands, Spain, France, and the British to North and South America. The British, however, was the the main powera as thir navy was the storngest amongst any of his other opponents, and had the wealth and resoures to be able to take the area of land East of North America to form its colonies. They had established the city of Jamestown, the first permanent colonial settlement, and were able to expand with the ability of their large military power to form a total of 13 colonies connectign each other through the Atlantic Ocean. From the period of 1607 to 1776, the growth of the transatlantic slave trade changed British North American colonial society as it started to bring an exahnge of large amoutns of goods and reosurces while also prioriting the idea of mercantilism in the colonies. However, the trade had also establsihed the use of slavery in the predominant Southern colonies whose effect would last for years to come even after American independcne. The transatlatic slave trade had establsihed a network of network of exchange of resources, however heavily implented the idea of self-profit through mercantilism. Resoruces such as tomatoes, corn, and, and turkey was introdued to the British society while bringing resoruces such as potatoes and horses to the colonies. They had estabished the ability to exploit colonists and people born in the Americans for their own will and profit thir home nation of Britain. Although the colonists also profits from this from of trade, they were being used by the British to work for them as they also constructed the social classes of people to tkae charge of the ability, strenght, and upsrings of the others. The form of trade had also idtrodcted slavery into the nation, to replace Idians for a form of indentured servitude. Slaves were purchsed form Africa and brought to the Americas to work in the SOuthern palntations set up in the solonies. As the colonies in the South had better climate, they were able to use a form of labor that would not have to ask for any form of pay or reward. This had forced the southern economy to be enrooted towards the idea of slavery and run its economy off the proficts made from agriculture and slavery. The pattern of southern slavery is continued all throughout decades all the way to the idea of sectionalism, nullification, and the Civil War as the slave-establsuhed states of the new country had started to separated due to the issue of slavery, contaaining it, and abolishing it."""
    },
    {
        "Question": "Evaluate the extent to which the growth of transatlantic trade changed British North American colonial society from 1607 to 1776.",
        "Response": """In 1492, Christopher Columbus had fled from Europe founded the Americas caused a chain of events where he led several countries in Europe including the Spanish, French, and British to colonize the land and have and exchange of goods called The Columbian Exchange of which allowed them to have access to goods from the Americas but as a result brought in diseases of which killed millions of Native Americans which only gave them access to more land and caused more migration over to the New World causing several colonies to settle down continusly traded with Europe with their new goods from the Americas which helped grow the transatlantic trade route. The growth of this route allowed for a lot of change in the British North American colonial society like helping them realize their want for representation and ending up independent from the British."""
    }
]


results = []
for row in test_data:
    text = row['Question'] + " [SEP] " + row['Response']
    scores = predict_scores(text)
    feedback = generate_feedback_all(row['Question'], row['Response'], scores)
    results.append({**row, **scores, **{f"{k} Feedback": v for k, v in feedback.items()}})

import pandas as pd

# Ensure full column content is shown
pd.set_option('display.max_colwidth', None)

# Then display the DataFrame
display(results)



Epoch 1 - Training: 100%|██████████| 2/2 [00:00<00:00,  8.70it/s]



Epoch 1 Train Loss: 3.6254
Train Accuracy: ['12.50%', '87.50%', '25.00%', '18.75%']


Epoch 2 - Training: 100%|██████████| 2/2 [00:00<00:00,  8.59it/s]



Epoch 2 Train Loss: 3.2241
Train Accuracy: ['87.50%', '87.50%', '75.00%', '62.50%']


Epoch 3 - Training: 100%|██████████| 2/2 [00:00<00:00,  8.62it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.



Epoch 3 Train Loss: 3.0783
Train Accuracy: ['87.50%', '87.50%', '31.25%', '68.75%']


Training: 100%|██████████| 10/10 [00:00<00:00, 10.41it/s]



Epoch 1: Train Loss = 3.7673, Val Loss = 2.8405


Training: 100%|██████████| 10/10 [00:00<00:00, 10.30it/s]



Epoch 2: Train Loss = 2.8680, Val Loss = 2.5153


Training: 100%|██████████| 10/10 [00:00<00:00, 10.28it/s]



Epoch 3: Train Loss = 2.4864, Val Loss = 2.2867


Training: 100%|██████████| 10/10 [00:00<00:00, 10.39it/s]



Epoch 4: Train Loss = 2.1772, Val Loss = 2.1276


Training: 100%|██████████| 10/10 [00:00<00:00, 10.34it/s]



Epoch 5: Train Loss = 1.9462, Val Loss = 2.0767


Training: 100%|██████████| 10/10 [00:00<00:00, 10.30it/s]



Epoch 6: Train Loss = 1.7935, Val Loss = 1.9958


Training: 100%|██████████| 10/10 [00:00<00:00, 10.27it/s]



Epoch 7: Train Loss = 1.6150, Val Loss = 1.9668


Training: 100%|██████████| 10/10 [00:00<00:00, 10.29it/s]



Epoch 8: Train Loss = 1.4484, Val Loss = 1.9290


Training: 100%|██████████| 10/10 [00:00<00:00, 10.21it/s]



Epoch 9: Train Loss = 1.3739, Val Loss = 1.9387


Training: 100%|██████████| 10/10 [00:00<00:00, 10.19it/s]



Epoch 10: Train Loss = 1.2720, Val Loss = 1.9328


Training: 100%|██████████| 10/10 [00:00<00:00, 10.29it/s]



Epoch 11: Train Loss = 1.1895, Val Loss = 1.9517


Training: 100%|██████████| 10/10 [00:00<00:00, 10.28it/s]



Epoch 12: Train Loss = 1.0900, Val Loss = 1.9325


Training: 100%|██████████| 10/10 [00:00<00:00, 10.33it/s]



Epoch 13: Train Loss = 1.0211, Val Loss = 1.9598


Training: 100%|██████████| 10/10 [00:00<00:00, 10.28it/s]



Epoch 14: Train Loss = 0.9158, Val Loss = 1.9788


Training: 100%|██████████| 10/10 [00:00<00:00, 10.25it/s]



Epoch 15: Train Loss = 0.8233, Val Loss = 2.0293


[{'Question': 'Evaluate the extent to which the growth of transatlantic trade changed British North American colonial society from 1607 to 1776.',
  'Response': 'Since the beginning of the Era of Exploration, European nations had been funding groundbreaking navigation trips to never-before-explored areas, developing colonies, and establishing trade. Explorers such as Vasco De Gama, Ferdinand Magellen, Christopher Columbus were funded by monarchs such as Isabella of Spain and aided by technology such as carvel ships and astrolabes. Once landed, nations quickly established colonies, with those such as Spanish conquistadors using immense force to combat Native populations. The unique resources and immense wealth brought about by these colonies were exchanged throughout trade routes, one of the most notable being the transatlantic trade route. Although it furthered the British North American colonies\' ties to England, transatlantic trade succeeded in drastically changing the British North