In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import json

# Filesystem locations, relative to the notebook's working directory.
DATA = Path("../data/results.csv")  # input CSV read by the loading cell
OUT = Path("../outputs")            # destination folder for charts and summaries

# Create the output folder (and any missing parents) before later cells write into it;
# exist_ok=True makes re-running this cell harmless.
OUT.mkdir(exist_ok=True, parents=True)


In [2]:
# Load the raw results file into the working DataFrame.
df = pd.read_csv(DATA)

# df.info() prints its report and returns None, so call it as a plain statement
# instead of placing it inside a displayed tuple (which shows a stray `None`).
df.info()

# display() renders each object on its own instead of one plain-text tuple repr:
# the first rows, then the count of missing values per column.
display(df.head(), df.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   learner_id  50 non-null     object
 1   module      50 non-null     object
 2   screen      50 non-null     object
 3   completed   50 non-null     bool  
 4   quiz_score  50 non-null     int64 
 5   timestamp   50 non-null     object
dtypes: bool(1), int64(1), object(4)
memory usage: 2.1+ KB


(  learner_id module screen  completed  quiz_score         timestamp
 0         u1  Intro     s1      False          40  2025-08-01 09:00
 1         u2  Intro     s2       True          75  2025-08-01 09:05
 2         u3  Intro     s3       True          82  2025-08-01 09:15
 3         u4  Intro     s2      False          55  2025-08-01 09:20
 4         u5  Intro     s4       True          90  2025-08-01 09:30,
 None,
 learner_id    0
 module        0
 screen        0
 completed     0
 quiz_score    0
 timestamp     0
 dtype: int64)

In [3]:
# Normalise column names (trim whitespace, lowercase) so later cells can rely on fixed keys.
df.columns = df.columns.str.strip().str.lower()

# Parse the completion flag explicitly. A bare `.astype(bool)` turns every non-empty
# string (including "False", "no", "0") and NaN into True, which would silently
# inflate the completion rate if the CSV stores the flag as text or has blanks.
_TRUE_TOKENS = {"true", "t", "yes", "y", "1", "1.0"}
_flag = df["completed"]
if pd.api.types.is_bool_dtype(_flag):
    # Already boolean: only a nullable boolean column can hold missing values here.
    df["completed"] = _flag.fillna(False).astype(bool)
elif pd.api.types.is_numeric_dtype(_flag):
    # Numeric flags: non-zero means completed, missing counts as not completed.
    df["completed"] = _flag.fillna(0).astype(bool)
else:
    # Text flags: only recognised "true" tokens count as completed; anything else
    # (including missing values, which stringify to "nan") is not completed.
    df["completed"] = _flag.astype(str).str.strip().str.lower().isin(_TRUE_TOKENS)

# Quiz scores that cannot be parsed as numbers become NaN instead of raising.
df["quiz_score"] = pd.to_numeric(df["quiz_score"], errors="coerce")

In [4]:
# Headline KPIs: share of completed rows, mean quiz score, and the screen that
# occurs most often among rows that were not completed.
completion_rate = df["completed"].mean()
avg_quiz_score = df["quiz_score"].mean()

not_completed = ~df["completed"]
if not_completed.any():
    dropoff_screen = df.loc[not_completed, "screen"].value_counts().idxmax()
else:
    # Every row is completed, so there is no drop-off screen to report.
    dropoff_screen = None

# Cast to plain floats and round for readable reporting.
kpis = {
    "completion_rate": round(float(completion_rate), 3),
    "average_quiz_score": round(float(avg_quiz_score), 2),
    "top_dropoff_screen": dropoff_screen,
}
kpis

{'completion_rate': 0.7,
 'average_quiz_score': 72.84,
 'top_dropoff_screen': 's2'}

In [5]:
# completion by module: mean of the boolean flag per module, highest first
module_completion = (
    df.groupby("module")["completed"]
    .mean()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots()
module_completion.plot(kind="bar", ax=ax)
ax.set_title("Completion Rate by Module")
ax.set_xlabel("Module")
ax.set_ylabel("Completion Rate")
fig.tight_layout()
fig.savefig(OUT / "completion_by_module.png", dpi=150)
plt.close(fig)  # the chart is only written to disk, not shown inline

# quiz score distribution (missing scores are dropped before binning)
fig, ax = plt.subplots()
df["quiz_score"].dropna().plot(kind="hist", bins=10, ax=ax)
ax.set_title("Quiz Score Distribution")
ax.set_xlabel("Quiz Score")
ax.set_ylabel("Count")
fig.tight_layout()
fig.savefig(OUT / "quiz_score_hist.png", dpi=150)
plt.close(fig)

In [6]:
# Spread of the quiz scores: median plus standard deviation with ddof=1.
quiz_scores = df["quiz_score"]
median_quiz = float(quiz_scores.median())
std_quiz = float(quiz_scores.std(ddof=1))

# Append the rounded values to the KPI dictionary built earlier.
kpis["median_quiz_score"] = round(median_quiz, 1)
kpis["std_quiz_score"] = round(std_quiz, 2)
kpis

{'completion_rate': 0.7,
 'average_quiz_score': 72.84,
 'top_dropoff_screen': 's2',
 'median_quiz_score': 77.5,
 'std_quiz_score': 17.35}

In [7]:
# Render the KPIs collected so far as Markdown and write them to summary.md.
# The two trailing spaces on the metric lines are Markdown hard line breaks.
md = f"""# Course Analytics Summary

**Completion rate:** {kpis['completion_rate']*100:.1f}%  
**Average quiz score:** {kpis['average_quiz_score']:.1f}  
**Median quiz score:** {kpis['median_quiz_score']:.1f}  
**Std dev (quiz):** {kpis['std_quiz_score']:.2f}  
**Top drop-off screen:** {kpis['top_dropoff_screen']}
"""
summary_path = OUT / "summary.md"
summary_path.write_text(md, encoding="utf-8")
"Updated"

'Updated'

In [8]:
from scipy import stats

# Two-sample t-test with equal_var=False (Welch's test) on the quiz scores of the
# first two module values returned by unique(). The result stays None when there
# are fewer than two modules or either group has fewer than two scores.
mods = df["module"].dropna().unique()
ttest_result = None

if len(mods) >= 2:
    first_mod, second_mod = mods[0], mods[1]
    first_scores = df.loc[df["module"] == first_mod, "quiz_score"].dropna()
    second_scores = df.loc[df["module"] == second_mod, "quiz_score"].dropna()

    if min(len(first_scores), len(second_scores)) > 1:
        t_stat, p_value = stats.ttest_ind(first_scores, second_scores, equal_var=False)
        ttest_result = {
            "modules": [first_mod, second_mod],
            "t_stat": float(t_stat),
            "p_value": float(p_value),
        }

kpis["ttest_quiz_by_module"] = ttest_result
kpis


{'completion_rate': 0.7,
 'average_quiz_score': 72.84,
 'top_dropoff_screen': 's2',
 'median_quiz_score': 77.5,
 'std_quiz_score': 17.35,
 'ttest_quiz_by_module': {'modules': ['Intro', 'ModuleA'],
  't_stat': -0.34830995826198524,
  'p_value': 0.7306442212906292}}

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

# Feature engineering. quiz_score is deliberately left out to avoid leakage.
# screen_idx: first run of digits in the screen label, as a float (NaN if none).
df["screen_idx"] = (
    df["screen"].astype(str).str.extract(r"(\d+)", expand=False).astype(float)
)
# Unparseable timestamps become NaT; hour and weekday are derived from the parsed values.
df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
df["hour"] = df["timestamp"].dt.hour
df["weekday"] = df["timestamp"].dt.weekday

# One indicator column per module, plus one for missing modules (dummy_na=True).
X_cats = pd.get_dummies(df["module"], prefix="module", dummy_na=True)
numeric_part = df[["screen_idx", "hour", "weekday"]]
features = pd.concat([numeric_part, X_cats], axis=1).fillna(0)
target = df["completed"].astype(int)

# 70/30 split, stratified on the target, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    stratify=target,
    random_state=42,
)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]  # second probability column

acc = accuracy_score(y_test, y_pred)
# AUC is only computed when both classes are present in the test split.
if y_test.nunique() == 2:
    auc = roc_auc_score(y_test, y_prob)
else:
    auc = None

kpis["ml_completion_accuracy"] = round(float(acc), 3)
kpis["ml_completion_auc"] = None if auc is None else round(float(auc), 3)
kpis


{'completion_rate': 0.7,
 'average_quiz_score': 72.84,
 'top_dropoff_screen': 's2',
 'median_quiz_score': 77.5,
 'std_quiz_score': 17.35,
 'ttest_quiz_by_module': {'modules': ['Intro', 'ModuleA'],
  't_stat': -0.34830995826198524,
  'p_value': 0.7306442212906292},
 'ml_completion_accuracy': 0.667,
 'ml_completion_auc': 0.3}

In [10]:
# Confusion matrix of the completion classifier, saved as a PNG.
# labels=[0, 1] pins the matrix to 2x2 (rows = true class, columns = predicted class).
# Without it the matrix shrinks to 1x1 when the test split and the predictions contain
# a single class, and would no longer line up with the fixed tick labels below.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

fig, ax = plt.subplots()
ax.imshow(cm, interpolation="nearest")
ax.set_title("Completion Classifier — Confusion Matrix")
ax.set_xticks([0, 1])
ax.set_xticklabels(["Pred 0", "Pred 1"])
ax.set_yticks([0, 1])
ax.set_yticklabels(["True 0", "True 1"])

# Write each count in the centre of its cell (x = column index, y = row index).
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(j, i, cm[i, j], ha="center", va="center")

fig.tight_layout()
fig.savefig(OUT / "cm_completion.png", dpi=150)
plt.close(fig)


In [11]:
# Final report: statistics, ML demo metrics, and caveats, written to summary.md.


def _fmt_metric(value, spec):
    """Format a numeric KPI with `spec`; return "n/a" when the KPI is missing (None)."""
    return "n/a" if value is None else format(value, spec)


# Present the t-test as readable text instead of dumping the raw dict repr
# (full-precision floats and Python syntax) into the Markdown report.
_ttest = kpis.get("ttest_quiz_by_module")
if _ttest is None:
    ttest_line = "n/a"
else:
    _mod_a, _mod_b = _ttest["modules"]
    ttest_line = (
        f"{_mod_a} vs {_mod_b}: "
        f"t = {_ttest['t_stat']:.3f}, p = {_ttest['p_value']:.3f}"
    )

md = f"""# Course Analytics Summary

## Statistics
- Completion rate: {kpis['completion_rate']*100:.1f}%
- Average quiz score: {kpis['average_quiz_score']:.1f}
- Median quiz score: {kpis['median_quiz_score']:.1f}
- Std dev (quiz): {kpis['std_quiz_score']:.2f}
- Top drop-off screen: {kpis['top_dropoff_screen']}
- t-test (first two modules): {ttest_line}

## ML (demo)
- Model: Logistic Regression for completion (features: screen_idx, hour, weekday, module dummies)
- Accuracy: {_fmt_metric(kpis.get('ml_completion_accuracy'), '.3f')}
- AUC: {_fmt_metric(kpis.get('ml_completion_auc'), '.3f')}

## Notes & Caveats
- Dataset is small and synthetic — results are illustrative, not production-ready.
- Next steps: add richer features (e.g. time-on-task, attempts), try cross-validation, and tune hyperparameters.
"""
(OUT / "summary.md").write_text(md, encoding="utf-8")
"Updated"


'Updated'