This notebook loads the curriculum and probe questions, simulates students, and generates interaction logs.

In [1]:
# load the knowledge base
import pandas as pd
import numpy as np

# Read the four knowledge-base tables in one pass; the unpacking order matches
# the order of the paths below.
concepts, prereqs, questions, qc_map = map(
    pd.read_csv,
    (
        "../data/concepts.csv",
        "../data/prerequisites.csv",
        "../data/questions.csv",
        "../data/question_concept_map.csv",
    ),
)

Define Simulation Parameters

In [2]:
# Cohort size and number of question attempts simulated per student.
NUM_STUDENTS = 40
ATTEMPTS_PER_STUDENT = 80

# Learner-model parameters.
BASE_MASTERY = 0.2    # mean starting mastery assigned to every concept
LEARNING_RATE = 0.15  # mastery added to a concept after a correct answer
SLIP_PROB = 0.1       # chance an attempt is forced incorrect (slip)
GUESS_PROB = 0.2      # chance a non-slipped attempt is forced correct (guess)

# Seed NumPy's global RNG so that Restart & Run All regenerates the same
# dataset. pandas' `sample` falls back to this global state when no
# `random_state` is given, so question sampling is covered as well.
SEED = 42
np.random.seed(SEED)

Interpretation:
    Students start weak
    Learn gradually
    Sometimes make mistakes
    Sometimes guess correctly

In [3]:
# initialize student mastery states

# Standard deviation of the Gaussian noise added to BASE_MASTERY.
INIT_MASTERY_SD = 0.05

# student id ("S1", "S2", ...) -> {concept_id: initial mastery}
students = {}

for s in range(NUM_STUDENTS):
    # One independent draw per concept. The value is clipped to [0, 1] because
    # mastery is later used directly as a probability of answering correctly;
    # an unlucky draw must not yield a negative or >1 "probability".
    mastery = {
        c: float(np.clip(BASE_MASTERY + np.random.normal(0, INIT_MASTERY_SD), 0.0, 1.0))
        for c in concepts["concept_id"]
    }
    students[f"S{s+1}"] = mastery

Helper: prerequisite penalty

In [4]:
# if prerequisites aren't mastered, performance drops:

def prereq_penalty(concept, mastery, prereqs):
    """Return the multiplicative factor that prerequisites apply to `concept`.

    Args:
        concept: Concept id to look up in the "dependent" column of `prereqs`.
        mastery: Mapping from concept id to that student's mastery value.
        prereqs: DataFrame with "prerequisite" and "dependent" columns.

    Returns:
        1.0 when `concept` has no prerequisites listed; otherwise the mean
        mastery over all of its listed prerequisites.
    """
    depends_on_concept = prereqs["dependent"] == concept
    required = prereqs.loc[depends_on_concept, "prerequisite"]

    # No prerequisite rows for this concept: nothing to penalise.
    if len(required) == 0:
        return 1.0

    prerequisite_levels = [mastery[name] for name in required]
    return np.mean(prerequisite_levels)


Simulate interactions

In [5]:
# Build the question -> concept lookup once instead of filtering `qc_map` on
# every attempt. Keeping the first row per question matches taking
# `.values[0]` of the filtered frame.
concept_of = (
    qc_map.drop_duplicates("question_id")
    .set_index("question_id")["concept_id"]
    .to_dict()
)

# Fail fast with a readable message rather than an IndexError mid-simulation.
unmapped = set(questions["question_id"]) - concept_of.keys()
if unmapped:
    raise ValueError(
        f"Questions missing from question_concept_map: {sorted(unmapped, key=str)}"
    )

# One (student_id, question_id, concept_id, time_step, correct) tuple per attempt.
interactions = []
# student id -> mastery after all attempts; `students` keeps the initial state.
final_mastery = {}

for student_id, initial_mastery in students.items():
    # Simulate on a copy so that re-running this cell always starts from the
    # initial mastery instead of from the already-learned state.
    mastery = dict(initial_mastery)

    for t in range(ATTEMPTS_PER_STUDENT):
        # Pick one question uniformly at random and find its concept.
        question_id = questions.sample(1).iloc[0]["question_id"]
        concept = concept_of[question_id]

        # Chance of a genuinely correct answer: own mastery scaled by how well
        # the prerequisites are mastered.
        prereq_factor = prereq_penalty(concept, mastery, prereqs)
        p_correct = mastery[concept] * prereq_factor

        # Apply slip and guess: a slip forces a wrong answer, otherwise a
        # guess forces a right one, otherwise the outcome follows p_correct.
        if np.random.rand() < SLIP_PROB:
            correct = 0
        elif np.random.rand() < GUESS_PROB:
            correct = 1
        else:
            correct = int(np.random.rand() < p_correct)

        # Learning update: only correct answers raise mastery, capped at 1.0.
        if correct:
            mastery[concept] = min(1.0, mastery[concept] + LEARNING_RATE)

        interactions.append((student_id, question_id, concept, t, correct))

    final_mastery[student_id] = mastery

Create and export the dataset

In [6]:
# Column labels, in the same order as the tuples collected in `interactions`.
interaction_columns = ["student_id", "question_id", "concept_id", "time_step", "correct"]
interactions_df = pd.DataFrame(data=interactions, columns=interaction_columns)

# Save without the row index, then preview the first few rows.
interactions_df.to_csv("../data/interactions.csv", index=False)
interactions_df.head()


Unnamed: 0,student_id,question_id,concept_id,time_step,correct
0,S1,Q27,C7,0,1
1,S1,Q8,C2,1,0
2,S1,Q7,C2,2,0
3,S1,Q37,C10,3,0
4,S1,Q2,C1,4,1


The result is a realistic learner-interaction dataset with prerequisite effects, learning over time, and noise.
This dataset is suitable for CBM, BKT, and DKT, and all of the models see the same data.

In [None]:
# Number of rows in the interaction log (one row per simulated attempt).
interactions_df.shape[0]

In [8]:
# Fraction of all simulated attempts that were answered correctly.
interactions_df.loc[:, "correct"].mean()

np.float64(0.355)