In [1]:
# --- Drive mount (safe to run locally; Colab will prompt to auth) ---
# Sets IN_COLAB to True only when google.colab is importable AND the Drive
# mount succeeds. This cell never raises, so it stays safe to run anywhere.
IN_COLAB = False
try:
    from google.colab import drive  # type: ignore
    drive.mount("/content/drive", force_remount=False)
    IN_COLAB = True
except ImportError:
    # google.colab is not importable: expected on a local run, stay silent.
    pass
except Exception as mount_error:
    # Importable but the mount itself failed. Keep the best-effort contract
    # (IN_COLAB stays False) but say why, instead of discarding the error.
    print(
        "Google Drive mount failed "
        f"({type(mount_error).__name__}: {mount_error}); continuing with IN_COLAB=False"
    )

from pathlib import Path
import os, sys

# ---- Set your dataset path here ----
# Either edit DEFAULT_DATA_FILE below, or leave the notebook untouched and set
# the STUDENTS_DATA_FILE environment variable before starting the kernel.
# Typical locations:
#   Colab, file in MyDrive:   /content/drive/MyDrive/StudentsPerformance.csv
#   Colab working dir:        /content/StudentsPerformance.csv
#   Running locally:          C:/Users/<you>/Downloads/StudentsPerformance.csv
DEFAULT_DATA_FILE = "/content/drive/MyDrive/StudentsPerformance.csv"  # <-- change if needed

# `or` (rather than a .get() default) so that an empty STUDENTS_DATA_FILE falls
# back to the default instead of resolving to the current directory.
DATA_FILE = Path(os.environ.get("STUDENTS_DATA_FILE") or DEFAULT_DATA_FILE)

# Fail fast, before any loading code runs; is_file() also rejects a directory.
assert DATA_FILE.is_file(), f"File not found: {DATA_FILE}"
print("Using data:", DATA_FILE)


Mounted at /content/drive
Using data: /content/drive/MyDrive/StudentsPerformance.csv


In [2]:
import pandas as pd
import numpy as np

# Load the raw table and take a first look at its size, first rows and dtypes.
df = pd.read_csv(DATA_FILE)
print(df.shape)
display(df.head())
display(df.dtypes)

# Create a composite score and a binary target.
# Both knobs are named so they can be tuned in one place.
SCORE_COLS = ["math score", "reading score", "writing score"]
HIGH_PERFORMER_THRESHOLD = 70  # avg_score >= this counts as "High performer" (tweak to taste)

# Row-wise mean of the subject scores.
df["avg_score"] = df[SCORE_COLS].mean(axis=1)
# 1 = high performer, 0 = otherwise.
df["High_Performer"] = (df["avg_score"] >= HIGH_PERFORMER_THRESHOLD).astype(int)
# Share of each class, to spot imbalance before modelling.
print(df["High_Performer"].value_counts(normalize=True).rename("class balance"))


(1000, 8)


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


Unnamed: 0,0
gender,object
race/ethnicity,object
parental level of education,object
lunch,object
test preparation course,object
math score,int64
reading score,int64
writing score,int64


High_Performer
0    0.541
1    0.459
Name: class balance, dtype: float64


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

TARGET_COL = "High_Performer"

# WARNING (target leakage): High_Performer is computed in this notebook as
# avg_score >= threshold, and avg_score is the mean of the three subject
# scores. Keeping those scores as features lets any model reconstruct the
# label exactly, so evaluation metrics will look near-perfect and say nothing
# about the other columns. Set INCLUDE_SCORE_FEATURES = False to predict from
# the remaining columns only. The default keeps the notebook's original
# behaviour.
SCORE_COLS = ["math score", "reading score", "writing score"]
INCLUDE_SCORE_FEATURES = True

y = df[TARGET_COL]

# Always drop the label itself and the composite it was thresholded from.
derived_cols = [TARGET_COL, "avg_score"]
if not INCLUDE_SCORE_FEATURES:
    derived_cols += SCORE_COLS
X = df.drop(columns=derived_cols)

# Light numeric coercion (e.g., if any numeric columns are typed as object)
def coerce_numeric(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``frame`` with numeric-looking text columns converted.

    Every column stored as ``object`` (or a dedicated string dtype) is passed
    to ``pd.to_numeric``. If the whole column parses, it is replaced by the
    numeric result; otherwise it is left exactly as it was. Columns of any
    other dtype are not touched, and the input frame is never modified.

    Parameters
    ----------
    frame : pd.DataFrame
        Table whose text columns should be probed.

    Returns
    -------
    pd.DataFrame
        A new frame with the same columns in the same order.
    """
    out = frame.copy()
    for c in out.columns:
        col = out[c]
        is_text = pd.api.types.is_object_dtype(col) or pd.api.types.is_string_dtype(col)
        if not is_text:
            continue
        # errors="ignore" is deprecated in pandas, so the same "convert or
        # leave unchanged" behaviour is spelled out explicitly here.
        try:
            out[c] = pd.to_numeric(col)
        except (ValueError, TypeError):
            # At least one value is not numeric: keep the original column.
            pass
    return out

# Run the light dtype coercion on the feature table before splitting.
X = coerce_numeric(X)

# Stratified 80/20 hold-out split; the fixed random_state keeps the partition
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.20,
    random_state=42,
)

# Partition the training columns by dtype: the numeric ones go through the
# numeric branch, every remaining column through the categorical branch.
num_cols = X_train.select_dtypes(include=["number"]).columns.tolist()
cat_cols = [name for name in X_train.columns if name not in num_cols]

# Numeric branch: fill gaps with the median, then standardise.
num_pipe = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode (handle_unknown="ignore" is set for categories not seen during fit).
cat_pipe = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its branch.
preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipe, num_cols),
    ("cat", cat_pipe, cat_cols),
])

# Sanity check: (train rows, test rows, numeric columns, categorical columns).
tuple(len(part) for part in (X_train, X_test, num_cols, cat_cols))


  out[c] = pd.to_numeric(out[c], errors="ignore")


(800, 200, 3, 5)