In [1]:
import pandas as pd
import numpy as np

# Number of synthetic student records to generate.
num_students = 5000

# Course names; each one contributes three mark columns and one weak-flag column.
subjects = [
    "Algorithm_&_Program_Design",
    "Database_Management_Systems",
    "Programming_with_Java",
    "Digital_Logic_&_Computer_Architecture",
    "Software_Engineering",
    "Theoretical_Computer_Science",
    "Advanced_Data_Structures",
    "Data_Communication_&_Networks",
    "Operating_Systems_&_Shell_Programming",
    "AI_&_ML",
    "Software_Testing_&_QA",
    "Analysis_&_Design_of_Algorithms",
    "Data_Mining_&_Warehousing",
    "Information_&_Cybersecurity",
    "Cloud_Computing",
]

# Column layout: identity fields, then (Minor1, Minor2, EndSem) per subject,
# then one IsWeak flag per subject.
columns = ["student_id", "gender", "archetype"]
columns += [
    name
    for sub in subjects
    for name in (f"{sub}_Minor1", f"{sub}_Minor2", f"{sub}_EndSem")
]
columns += [f"{sub}_IsWeak" for sub in subjects]

# Seed the global NumPy RNG so repeated runs produce the same dataset.
np.random.seed(42)

# Accumulates one list per student, ordered to match `columns`.
rows = []

def mark_weak_label(m1, m2, es, archetype):
    """Return 1 if a subject is labelled weak for a student, otherwise 0.

    Args:
        m1: Minor-1 mark for the subject.
        m2: Minor-2 mark for the subject.
        es: End-semester mark for the subject.
        archetype: Student archetype name. "forced_weak" always yields 1 and
            "highachiever" always yields 0, regardless of the marks.

    Returns:
        1 when the subject is considered weak, 0 otherwise.
    """
    # Archetype overrides take precedence over every mark-based rule.
    if archetype == "forced_weak":
        return 1
    if archetype == "highachiever":
        return 0

    # Rule d: surprise recovery in the end-semester exam.
    # This must be evaluated before the "weak" rules: both minors below 10
    # also satisfy rule a (both below 12), so checking it afterwards made
    # this rule unreachable.
    if m1 < 10 and m2 < 10 and es > 55:
        return 0

    # Rule a: consistently low minors.
    if m1 < 12 and m2 < 12:
        return 1
    # Rule b: high variance between the two minors.
    if abs(m1 - m2) > 8:
        return 1
    # Rule c: combined minors near the threshold with one minor very low.
    if (m1 + m2 < 28) and (m1 < 10 or m2 < 10):
        return 1

    # Default: not weak.
    return 0

def _draw_mark(mean, spread, low, high):
    """Sample one mark from a normal distribution, clamp it, and truncate to int."""
    return int(np.clip(np.random.normal(mean, spread), low, high))


# Sampling parameters per exam as (mean, std-dev, min, max), ordered
# Minor1, Minor2, EndSem. These archetypes use one profile for every subject.
archetype_profiles = {
    "highachiever": ((17, 2, 15, 20), (16, 2, 14, 20), (54, 6, 45, 60)),
    "struggling": ((7, 3, 0, 12), (6, 3, 0, 12), (22, 7, 0, 35)),
    "forced_weak": ((5, 3, 0, 10), (5, 3, 0, 10), (20, 10, 0, 40)),
}
# Used by "specializedai" students in subjects whose name contains one of the tags.
specialist_profile = ((18, 1, 15, 20), (17, 2, 14, 20), (56, 4, 48, 60))
specialist_tags = ("AI", "Data", "Cloud")
# Fallback for "average" students and for "specializedai" outside tagged subjects.
default_profile = ((13, 4, 5, 20), (14, 4, 5, 20), (40, 10, 20, 60))

# Probability that a subject's three marks are blanked to simulate partial entry.
mask_prob = 0.4

for i in range(num_students):
    student_id = f"MCA23_{i+1:04d}"
    gender = np.random.choice(["Male", "Female"])
    archetype = np.random.choice(
        ["highachiever", "average", "struggling", "specializedai", "forced_weak"],
        p=[0.12, 0.40, 0.13, 0.15, 0.20]
    )

    marks = []
    weak_labels = []

    for sub in subjects:
        profile = archetype_profiles.get(str(archetype))
        if profile is None:
            in_specialty = archetype == "specializedai" and any(
                tag in sub for tag in specialist_tags
            )
            profile = specialist_profile if in_specialty else default_profile

        # Draw order (Minor1, Minor2, EndSem) matters for RNG reproducibility.
        m1, m2, es = [_draw_mark(*params) for params in profile]
        marks += [m1, m2, es]
        weak_labels.append(mark_weak_label(m1, m2, es, archetype))

    # Mask random subjects for partial entry simulation.
    # NOTE(review): the weak labels above were computed from the unmasked
    # marks, and masked entries are written as 0 (not as missing values).
    for start in range(0, len(marks), 3):
        if np.random.rand() < mask_prob:
            marks[start:start + 3] = [0, 0, 0]

    rows.append([student_id, gender, archetype, *marks, *weak_labels])

# Assemble the table and write it out without the index column.
df = pd.DataFrame(rows, columns=columns)
df.to_csv("MCA_Student_Performance_Balanced.csv", index=False)