In [None]:

import pandas as pd

# Read the essay CSV into a DataFrame.
file_path = "/content/activity5_essays.csv"
df = pd.read_csv(file_path)

# Show the schema and the first three records.
column_names = df.columns.tolist()
preview_rows = df.head(3)
print("✅ Dataset loaded successfully.\n")
print("Columns:", column_names)
print("\nSample rows:\n", preview_rows)

# Report the row count, then how many rows have no value in 'feedback'.
print("\nTotal essays:", df.shape[0])
feedback_is_missing = df['feedback'].isna()
print("Missing feedback entries:", feedback_is_missing.sum())


In [None]:
!pip install -q transformers torch

import pandas as pd
from transformers import AutoTokenizer

# Reload the CSV, keep only complete (essay_text, score) rows, and fix dtypes.
file_path = "/content/activity5_essays.csv"
df = (
    pd.read_csv(file_path)[['essay_text', 'score']]
    .dropna()
    .astype({'essay_text': str, 'score': float})
)

essay_column = df['essay_text']
score_column = df['score']

print("Cleaned dataset shape:", df.shape)
print("Example essay text:\n", essay_column.iloc[0][:300], "\n")
print("Score:", score_column.iloc[0])

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize every essay in one call: pad to a common length, cut at 256 tokens,
# and return PyTorch tensors.
tokenizer_options = dict(
    padding=True,
    truncation=True,
    max_length=256,
    return_tensors='pt',
)
tokens = tokenizer(essay_column.tolist(), **tokenizer_options)

print("\nTokenization complete.")
print("Tokenized tensor shapes:")
for field_name, tensor in tokens.items():
    print(f"{field_name}: {tensor.shape}")


In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

# DistilBERT backbone with a sequence-classification head configured for a
# single output (num_labels=1).
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=1,
)

class EssayDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized essays with float score labels.

    Args:
        texts: Iterable of essay strings (list, pandas Series, ...).
        labels: Iterable of numeric scores, one per essay.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, padding=True, max_length=...)``
            that returns a mapping of field name -> per-essay sequences.
        max_length: Maximum token length passed to the tokenizer.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=256):
        # Materialise both inputs as plain lists so that __getitem__ indexes
        # by position. A pandas Series would otherwise be indexed by label,
        # which breaks once rows have been dropped and the index has gaps.
        texts = list(texts)
        labels = list(labels)
        if len(texts) != len(labels):
            raise ValueError(
                "texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.encodings = tokenizer(
            texts, truncation=True, padding=True, max_length=max_length
        )
        self.labels = labels

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, with the score under 'labels'."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit float dtype so the label is float32 regardless of input type.
        item["labels"] = torch.tensor(float(self.labels[idx]), dtype=torch.float)
        return item

    def __len__(self):
        """Number of (essay, score) pairs in the dataset."""
        return len(self.labels)

# Pull the cleaned essay and score columns out as plain Python lists.
texts, labels = (df[column].tolist() for column in ("essay_text", "score"))

train_dataset = EssayDataset(texts, labels, tokenizer)

# Single-epoch run with batches of 4 per device, logging every 10 steps,
# save_strategy="no" and report_to="none".
training_config = dict(
    output_dir="./essay_model",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    logging_steps=10,
    save_strategy="no",
    report_to="none",
)
training_args = TrainingArguments(**training_config)

trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)

trainer.train()


In [None]:
from transformers import pipeline

# Summarization pipeline backed by the facebook/bart-large-cnn checkpoint.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

sample_essay = df["essay_text"].iloc[0]

# truncation=True clips essays that exceed the model's maximum input length
# instead of passing an over-long sequence to the model, which can fail.
summary = summarizer(
    sample_essay,
    max_length=80,
    min_length=30,
    do_sample=False,
    truncation=True,
)[0]["summary_text"]

print("Essay:\n", sample_essay[:400], "...")
print("\nGenerated Feedback:\n", summary)


In [None]:
from transformers import pipeline

# Make sure dropout is disabled for inference; the model was last used for training.
model.eval()

# The model has a single output and was fine-tuned on the raw `score` column,
# so its output is already on the dataset's score scale. Use
# function_to_apply="none" to read that value directly; applying a sigmoid
# (and rescaling by 10) would squash every prediction toward the top of the range.
scorer = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    function_to_apply="none",
)

essay_input = """Education is the foundation for individual and social development.
It empowers people with knowledge, fosters creativity, and shapes a better society."""

score = scorer(essay_input)[0]["score"]
summary = summarizer(essay_input, max_length=60, min_length=20, do_sample=False)[0]["summary_text"]

# Reported on the same scale as the training scores.
print("Predicted Essay Score:", round(score, 2))
print("\nGenerated Feedback:\n", summary)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of essay scores (10 bins) with a KDE overlay, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df["score"], bins=10, kde=True, ax=ax)
ax.set_title("Essay Score Distribution")
ax.set_xlabel("Score")
ax.set_ylabel("Frequency")
plt.show()
