In [None]:
!pip install transformers datasets evaluate accelerate



In [None]:
from datasets import load_dataset
import transformers
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
import numpy as np
!pip install evaluate
import evaluate



In [None]:
# Report which transformers release is active and where it was imported from.
print(transformers.__version__, transformers.__file__, sep="\n")

4.56.0
/usr/local/lib/python3.12/dist-packages/transformers/__init__.py


In [None]:
# Load the SciEntsBank dataset by its Hub identifier.
dataset = load_dataset("nkazi/SciEntsBank")

# Train on "train"; evaluate on the UA (unseen answers) test split.
train_ds, test_ds = dataset["train"], dataset["test_ua"]

In [None]:
# Checkpoint name shared by the tokenizer here and the classifier loaded later.
# The DistilBERT line below is a disabled alternative.
# model_name = "distilbert-base-uncased"
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name) # tokenization splits a sentence into unit tokens

In [None]:
# Preprocess function: tokenize each (question, student_answer) pair together.
def preprocess(data):
    """Encode question/answer pairs with the module-level ``tokenizer``.

    Args:
        data: Mapping with "question" and "student_answer" entries (a batch of
            column values when called via ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the pairs (input_ids, attention_mask,
        token_type_ids), truncated and padded to exactly 64 tokens.
    """
    questions = data["question"]
    answers = data["student_answer"]
    encode_options = {"truncation": True, "padding": "max_length", "max_length": 64}
    return tokenizer(questions, answers, **encode_options)

# Tokenize and encode both splits in batches (text -> tokens -> integer IDs).
encoded_train, encoded_test = (
    split.map(preprocess, batched=True) for split in (train_ds, test_ds)
)

Map:   0%|          | 0/540 [00:00<?, ? examples/s]

In [None]:
# define model
# Derive the label set from the dataset instead of hard-coding it, so the
# classification head always matches the data and the saved config carries
# human-readable label names.
# NOTE(review): the dataset card appears to name the five classes correct,
# contradictory, partially_correct_incomplete, irrelevant and non_domain --
# confirm by inspecting label_names.
label_names = list(getattr(train_ds.features["label"], "names", None) or [])
num_labels = len(label_names) or 5   # fall back to the 5-way scheme if names are unavailable

# Only pass the id/label maps when the dataset actually exposes class names.
label_maps = {}
if label_names:
    label_maps = {
        "id2label": dict(enumerate(label_names)),
        "label2id": {name: idx for idx, name in enumerate(label_names)},
    }

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    **label_maps,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Metrics: overall accuracy plus macro- and support-weighted F1.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Turn a Trainer evaluation prediction into a flat dict of scalar metrics.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the argmax
            of ``logits`` over the last axis.

    Returns:
        Dict with scalar values under "accuracy", "f1_macro" and "f1_weighted".
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    # Each ``compute`` call returns a one-entry dict ({"accuracy": x} / {"f1": x});
    # unwrap it so the Trainer reports plain numbers instead of nested dicts.
    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": f1.compute(predictions=preds, references=labels, average="macro")["f1"],
        "f1_weighted": f1.compute(predictions=preds, references=labels, average="weighted")["f1"],
    }

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive (Colab-only import).
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Directory on the mounted Drive used as the training output_dir and as the
# destination for the final model and tokenizer files.
save_path = "/content/drive/MyDrive/Misconception_Analysis/sci_bert_model"



In [None]:
# Training configuration; checkpoints are written under save_path.
args = TrainingArguments(
    output_dir=save_path,
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    # Without an explicit strategy the default is "no", so eval_steps would be
    # ignored and no evaluation would run during training.
    eval_strategy="steps",
    save_steps=500,                  # save a checkpoint every 500 steps
    save_total_limit=2,              # keep only the 2 most recent checkpoints
    eval_steps=500,                  # run evaluation every 500 steps
    logging_steps=100,               # log every 100 steps
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    report_to="none"
)


In [None]:
# trainer
# NOTE(review): evaluation uses encoded_test, so any metrics computed during
# training are measured on the test split, not on a held-out validation split.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_train,
    eval_dataset=encoded_test,
    # `tokenizer=` is deprecated in this transformers release and emits a
    # FutureWarning; `processing_class` is the replacement keyword.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
# Train & evaluate: fine-tune with the configured Trainer, then run one final
# evaluation on its eval_dataset and print the resulting metrics dict.
trainer.train()
results = trainer.evaluate()
print(results)



Step,Training Loss
100,1.3511
200,1.206
300,1.1447
400,0.9998
500,0.9782
600,0.931
700,0.8249
800,0.7904
900,0.7847




{'eval_loss': 1.0934464931488037, 'eval_accuracy': {'accuracy': 0.5370370370370371}, 'eval_f1_macro': {'f1': 0.3751773338853762}, 'eval_f1_weighted': {'f1': 0.5293392570382075}, 'eval_runtime': 104.1884, 'eval_samples_per_second': 5.183, 'eval_steps_per_second': 0.326, 'epoch': 3.0}


In [None]:

# Write the fine-tuned model and the tokenizer files to save_path.
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

('/content/drive/MyDrive/Misconception_Analysis/sci_bert_model/tokenizer_config.json',
 '/content/drive/MyDrive/Misconception_Analysis/sci_bert_model/special_tokens_map.json',
 '/content/drive/MyDrive/Misconception_Analysis/sci_bert_model/vocab.txt',
 '/content/drive/MyDrive/Misconception_Analysis/sci_bert_model/added_tokens.json',
 '/content/drive/MyDrive/Misconception_Analysis/sci_bert_model/tokenizer.json')

Model Evaluation and Analysis

The bert-base-uncased model was evaluated on the SciEntsBank dataset using accuracy and F1 scores. The results after three training epochs are summarized as follows:

Evaluation Loss: 1.09

Accuracy: 53.7%

Macro F1: 0.375

Weighted F1: 0.529

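These results suggest that the model performs substantially better than uniform random guessing (≈ 20% accuracy with five classes), indicating that it has learned meaningful patterns in student responses. Because the classes are imbalanced, a majority-class baseline (always predicting the most frequent label) would be a stricter point of comparison than uniform guessing. The individual metrics also differ in informative ways: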

Accuracy vs. Macro F1: While overall accuracy is moderate (53.7%), the lower macro F1 score (0.375) reveals that the model struggles with minority classes, such as irrelevant or contradictory responses. This reflects the class imbalance in the dataset.

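Weighted F1: The weighted F1 score (0.529) is close to accuracy because it averages the per-class F1 scores weighted by each class's number of examples, so the frequent classes dominate it. Together with the lower macro F1, this suggests that performance is stronger on the more frequent classes (correct, wrong) but weaker on the less represented ones.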

From an educational perspective, this indicates that the model is relatively reliable at identifying clearly correct or incorrect responses but has more difficulty distinguishing categories such as partially correct and contradictory. Improving performance on these minority categories is critical for building systems that can effectively diagnose student misconceptions.

In [None]:
# Disabled cell, kept for reference: reloads the saved model from Drive and
# continues training with resume_from_checkpoint=True.
# NOTE(review): the tokenizer below is loaded by hub name rather than from the
# saved directory, and `tokenizer=` is the deprecated Trainer keyword -- revisit
# both before re-enabling this cell.
# from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer

# model = AutoModelForSequenceClassification.from_pretrained(
#     "/content/drive/MyDrive/Misconception_Analysis/sci_bert_model"
# )
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# trainer = Trainer(
#     model=model,
#     args=args,
#     train_dataset=encoded_train,
#     eval_dataset=encoded_test,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
# )

# trainer.train(resume_from_checkpoint=True)
