In [59]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import joblib

# Load dataset
df = pd.read_csv("Students_Grading_Dataset.csv")

# Check column names
print(df.columns)

# Select correct column names from the dataset
features = ["Attendance (%)", "Midterm_Score", "Final_Score", "Assignments_Avg", 
            "Study_Hours_per_Week", "Stress_Level (1-10)", "Sleep_Hours_per_Night"]
target = "Grade"  # Assuming the grade column exists

# Check if columns exist in dataset
for col in features + [target]:
    if col not in df.columns:
        print(f"❌ ERROR: Column '{col}' not found in dataset!")

# Drop missing values (if any)
df = df.dropna()

# Convert grades (A, B, C, etc.) to numeric labels
label_encoder = LabelEncoder()
df[target] = label_encoder.fit_transform(df[target])

# Save the label encoder for later use in Flask API
joblib.dump(label_encoder, "label_encoder.pkl")

# Extract input (X) and output (y)
X = df[features]
y = df[target]

# Normalize input features for better learning
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Save the scaler for later use
joblib.dump(scaler, "scaler.pkl")

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


Index(['Student_ID', 'First_Name', 'Last_Name', 'Email', 'Gender', 'Age',
       'Department', 'Attendance (%)', 'Midterm_Score', 'Final_Score',
       'Assignments_Avg', 'Quizzes_Avg', 'Participation_Score',
       'Projects_Score', 'Total_Score', 'Grade', 'Study_Hours_per_Week',
       'Extracurricular_Activities', 'Internet_Access_at_Home',
       'Parent_Education_Level', 'Family_Income_Level', 'Stress_Level (1-10)',
       'Sleep_Hours_per_Night'],
      dtype='object')


In [61]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Try both models and compare
models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42)
}

# Define hyperparameters for tuning
param_grid = {
    "RandomForest": {"n_estimators": [100, 200], "max_depth": [10, 20, None]},
    "GradientBoosting": {"n_estimators": [100, 200], "learning_rate": [0.05, 0.1, 0.2]}
}

best_model = None
best_accuracy = 0

# Perform Grid Search to find the best model
for model_name, model in models.items():
    print(f"🔍 Tuning {model_name}...")
    grid = GridSearchCV(model, param_grid[model_name], cv=5, scoring='accuracy')
    grid.fit(X_train, y_train)

    print(f"🔹 Best Params for {model_name}: {grid.best_params_}")
    acc = grid.best_score_

    if acc > best_accuracy:
        best_accuracy = acc
        best_model = grid.best_estimator_

# Save the best model
joblib.dump(best_model, "student_grade_model.pkl")

# Evaluate final model on test data
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"🔥 Final Model Accuracy: {accuracy * 100:.2f}%")
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))


🔍 Tuning RandomForest...
🔹 Best Params for RandomForest: {'max_depth': 10, 'n_estimators': 100}
🔍 Tuning GradientBoosting...
🔹 Best Params for GradientBoosting: {'learning_rate': 0.05, 'n_estimators': 100}
🔥 Final Model Accuracy: 41.01%

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.73      0.68       171
           1       0.07      0.03      0.04        93
           2       0.26      0.26      0.26        81
           3       0.36      0.68      0.47        81
           4       0.20      0.10      0.13        91

    accuracy                           0.41       517
   macro avg       0.30      0.36      0.32       517
weighted avg       0.35      0.41      0.37       517

