[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/USER/REPO/blob/main/semma_project/notebooks/SEMMA_Student_Performance.ipynb)

# SEMMA: Student Performance

> Sample → Explore → Modify → Model → Assess

**Dataset**: Kaggle — Student Performance (`bhuvaneshwarisa/student-performance-dataset`).

**Goal**: Predict GPA / pass‑fail and explain key drivers.

In [None]:
!pip -q install pandas numpy scikit-learn matplotlib seaborn xgboost lightgbm imbalanced-learn joblib gradio fastapi uvicorn kaggle

In [None]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score, roc_auc_score, classification_report, confusion_matrix
import joblib
# Create the working folders used by the rest of the notebook; existing ones are left alone.
for _folder in ('data', 'models', 'reports'):
    Path(_folder).mkdir(exist_ok=True)

In [None]:
# --- Kaggle download (make sure you've set up your Kaggle token) ---
    # In Colab:
    # from google.colab import files
    # files.upload()  # upload kaggle.json
    # !mkdir -p ~/.kaggle
    # !cp kaggle.json ~/.kaggle/
    # !chmod 600 ~/.kaggle/kaggle.json
    !kaggle datasets download -d bhuvaneshwarisa/student-performance-dataset -p data
    !ls -lh data
    # If a zip is downloaded, unzip it:
    !python - << 'PY'
import zipfile, glob, os
zips = glob.glob('data/*.zip')
for z in zips:
    with zipfile.ZipFile(z) as f:
        f.extractall('data')
        print('unzipped', z)
PY

## 1. Sample
- Define population & sampling strategy
- Train/validation/test split; stratify as needed

In [None]:
# Load & sample
import pandas as pd
from sklearn.model_selection import train_test_split

# Sort the candidates: glob order depends on the filesystem, so without sorting
# the CSV that gets loaded could differ between runs when several names match.
files = sorted(Path('data').glob('*.csv'))
print('Data files:', files)

# Load the first CSV whose file name mentions "student" (case-insensitive).
# df stays None when nothing matches; the cells below check for that.
df = None
for f in files:
    if 'student' in f.name.lower():
        df = pd.read_csv(f)
        break
print(df.shape if df is not None else 'Not found')

if df is not None:
    # 80/20 holdout with a fixed seed, written to disk for reuse.
    train, test = train_test_split(df, test_size=0.2, random_state=42)
    train.to_csv('data/train.csv', index=False)
    test.to_csv('data/test.csv', index=False)

## 2. Explore
- Descriptives, correlations, distributions
- Target leakage screening

In [None]:
if df is not None:
    # Summary statistics for every column (numeric and non-numeric), one row per column.
    display(df.describe(include='all').T)
    # A bare expression inside an `if` block is never auto-rendered by the notebook,
    # so the styled correlation matrix must be passed to display() explicitly.
    display(df.corr(numeric_only=True).style.background_gradient())

## 3. Modify
- Imputation, encoding, transformations
- Feature selection/creation

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

if df is not None:
    # Predict 'GPA' when that column exists; otherwise fall back to the last column.
    target = 'GPA' if 'GPA' in df.columns else df.columns[-1]
    y = df[target]
    X = df.drop(columns=[target])

    # Split the features by dtype: numeric columns vs. everything else.
    num_cols = X.select_dtypes(include=np.number).columns.tolist()
    cat_cols = X.select_dtypes(exclude=np.number).columns.tolist()

    # Numeric branch: fill gaps with the column mean (SimpleImputer's default), then standardize.
    numeric_pipe = Pipeline([
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', StandardScaler()),
    ])
    # Categorical branch: fill gaps with the mode, then one-hot encode;
    # categories unseen during fit are ignored instead of raising.
    categorical_pipe = Pipeline([
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    pre = ColumnTransformer([
        ('num', numeric_pipe, num_cols),
        ('cat', categorical_pipe, cat_cols),
    ])

## 4. Model
- Compare algorithms; document tuning
- Explainability

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error

# Example: regression
if df is not None:
    # Chain preprocessing and the linear model so the fitted (and saved) object
    # takes the raw feature columns directly.
    reg = Pipeline([('pre', pre), ('model', LinearRegression())])
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)
    reg.fit(X_tr, y_tr)
    preds = reg.predict(X_te)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE gives the same RMSE on every version.
    print('RMSE:', np.sqrt(mean_squared_error(y_te, preds)))
    # Persist the whole fitted pipeline (preprocessor + model).
    joblib.dump(reg, 'models/model.joblib')

## 5. Assess
- Holdout/bootstraps, error analysis
- Business implications & next steps