In [2]:
!pip install pandas numpy scikit-learn

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler


Collecting pandas
  Downloading pandas-2.3.1-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting numpy
  Using cached numpy-2.3.1-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp313-cp313-win_amd64.whl.metadata (14 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.0-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading pandas-2.3.1-cp313-cp313-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   --------------------------------------

In [3]:
# Read the student records from the CSV file in the working directory.
dataset_path = "sample_student_dataset.csv"
df = pd.read_csv(dataset_path)

# Show the top five rows as a quick sanity check of the columns.
df.head(5)


Unnamed: 0,StudentID,Attendance,InternalMarks,StudyHoursPerWeek,Participation,Extracurricular,FinalResult
0,S001,90,69,9,Yes,Yes,Fail
1,S002,57,64,13,No,No,Pass
2,S003,51,57,3,No,No,Fail
3,S004,97,99,9,Yes,Yes,Pass
4,S005,67,80,15,Yes,No,Pass


In [4]:
# Count the missing entries in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)

# Nothing is dropped or filled here. If missing values ever show up, either
# remove the affected rows with `df.dropna()` or forward-fill them with
# `df.ffill()` (`fillna(method=...)` is deprecated in recent pandas).


StudentID            0
Attendance           0
InternalMarks        0
StudyHoursPerWeek    0
Participation        0
Extracurricular      0
FinalResult          0
dtype: int64


In [5]:
# Encode Participation and Extracurricular columns (Yes/No → 1/0).
# Series.map turns every value that is missing from the mapping into NaN
# without warning, so:
#   * already-encoded 1/0 values are accepted, which keeps this cell safe to
#     re-run on the mutated frame;
#   * any other unexpected label raises instead of silently becoming NaN.
yes_no_codes = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
for column in ('Participation', 'Extracurricular'):
    encoded = df[column].map(yes_no_codes)
    # Values that were present in the input but have no code in the mapping.
    unmapped = encoded.isna() & df[column].notna()
    if unmapped.any():
        unexpected = df.loc[unmapped, column].unique().tolist()
        raise ValueError(
            f"Unexpected values in {column!r}: {unexpected}; expected 'Yes' or 'No'"
        )
    df[column] = encoded

# Encode FinalResult (Pass/Fail) as target.
# LabelEncoder numbers the classes in sorted order, so Fail = 0 and Pass = 1.
label_encoder = LabelEncoder()
df['FinalResult'] = label_encoder.fit_transform(df['FinalResult'])


In [6]:
# Target vector: the encoded pass/fail label.
target_column = "FinalResult"
y = df[target_column]

# Feature matrix: every column except the identifier and the target.
X = df.drop(columns=["StudentID", target_column])


In [7]:
# Standardise every feature column; the scaler is fitted on the full matrix X.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Wrap the scaled array in a DataFrame with the original column names
# (optional, useful for analysis) and preview it.
X_scaled_df = pd.DataFrame(data=X_scaled, columns=X.columns)
X_scaled_df.head()


Unnamed: 0,Attendance,InternalMarks,StudyHoursPerWeek,Participation,Extracurricular
0,1.088821,-0.102682,0.241481,0.886405,0.960769
1,-1.108799,-0.392745,1.119595,-1.128152,-1.040833
2,-1.508367,-0.798832,-1.07569,-1.128152,-1.040833
3,1.554983,1.637693,0.241481,0.886405,0.960769
4,-0.442854,0.535455,1.558652,0.886405,-1.040833


In [8]:
from sklearn.model_selection import train_test_split

# Split the raw features first, then fit the scaler on the training rows only.
# Scaling statistics computed from the whole dataset would include the test
# rows (data leakage) and make the evaluation optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Rebind `scaler` so the object persisted later holds training-only statistics,
# then apply that same transformation to both partitions.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit a 100-tree random forest with a fixed seed on the training partition.
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

# Compute the evaluation metrics first, then report them.
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("Classification Report:\n", report_text)
print("Confusion Matrix:\n", confusion)


Accuracy: 0.5
Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.50      0.55        12
           1       0.40      0.50      0.44         8

    accuracy                           0.50        20
   macro avg       0.50      0.50      0.49        20
weighted avg       0.52      0.50      0.51        20

Confusion Matrix:
 [[6 6]
 [4 4]]


In [10]:
import joblib

# Persist the fitted classifier and the fitted scaler as separate joblib files.
model_path = "student_model.pkl"
scaler_path = "scaler.pkl"

joblib.dump(model, model_path)
# Kept as the last expression so the cell still displays the written file list.
joblib.dump(scaler, scaler_path)


['scaler.pkl']