In [62]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

In [63]:

# Load the two regional school tables.
baramati = pd.read_csv('School Data - Baramati.csv')
phaltan = pd.read_csv('School Data - Phaltan.csv')

# Strip surrounding whitespace from the headers of EACH frame before stacking.
# Doing this only after the concat would let headers that differ by a stray
# space end up as separate columns that then collapse into duplicate names.
for region_df in (baramati, phaltan):
    region_df.columns = region_df.columns.str.strip()

# Stack both regions into one frame with a fresh 0..n-1 index.
df_combined = pd.concat([baramati, phaltan], ignore_index=True)

# Discard rows that have no 'Cluster' value. The explicit .copy() makes
# df_cleaned an independent frame, so the column assignments below do not
# write into an object derived from df_combined (the pattern pandas reports as
# SettingWithCopyWarning, which the notebook's global warnings filter hides).
df_cleaned = df_combined.dropna(subset=['Cluster']).copy()

# Missing categorical values get the explicit placeholder 'Unknown'.
# Columns that are absent from the data are skipped.
for col in ['Region', 'Classes', 'medium of instructions', 'Management', 'Pre-primary section (Y/N)']:
    if col in df_cleaned.columns:
        df_cleaned[col] = df_cleaned[col].fillna('Unknown')

# Missing 'Establishment' values are replaced by the column median.
# NOTE(review): this assumes 'Establishment' is numeric (presumably a year) — confirm.
if 'Establishment' in df_cleaned.columns:
    establishment_median = df_cleaned['Establishment'].median()
    df_cleaned['Establishment'] = df_cleaned['Establishment'].fillna(establishment_median)

In [64]:

# Integer-encode every object-dtype column of df_cleaned in place, keeping the
# fitted encoder of each column in `label_encoders` (keyed by column name).
# Values are cast to str first, so any remaining NaN becomes the category 'nan'.
# NOTE: this mutates df_cleaned. Re-running this cell on its own finds no
# object columns left and resets `label_encoders` to an empty dict; re-run the
# loading cell first.
label_encoders = {}
object_columns = df_cleaned.select_dtypes(include=['object']).columns
for col in object_columns:
    le = LabelEncoder()
    column_as_text = df_cleaned[col].astype(str)
    df_cleaned[col] = le.fit_transform(column_as_text)
    label_encoders[col] = le

# Target is 'Classes'; every other column is used as a feature.
y = df_cleaned['Classes']
X = df_cleaned.drop(columns=['Classes'])

# Keep only target classes that occur more than once: a stratified split
# cannot place a single-member class in both partitions.
class_counts = y.value_counts()
valid_classes = class_counts.index[class_counts > 1]
keep_rows = y.isin(valid_classes)
X = X.loc[keep_rows]
y = y.loc[keep_rows]

# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Filtering may have left gaps in the integer codes, so re-encode the target
# to consecutive integers 0..k-1. The encoder is fitted on the labels of both
# partitions so train and test share one mapping.
y_all = pd.concat([y_train, y_test])
le_y = LabelEncoder().fit(y_all)

y_train, y_test = le_y.transform(y_train), le_y.transform(y_test)

In [65]:


# ---------------------------------------------------------------------------
# Baseline: random forest with 200 trees and a fixed seed.
# ---------------------------------------------------------------------------
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f" Random Forest Model Accuracy: {accuracy * 100:.2f}%\n")
print("Classification Report:\n")
# Some classes receive no predictions, which leaves their precision undefined.
# zero_division=0 reports those cells as 0.0 explicitly (the same numbers the
# default produces) instead of relying on the global warnings filter to hide
# the resulting warnings.
print(classification_report(y_test, y_pred, zero_division=0))

# ---------------------------------------------------------------------------
# Tuned model: XGBoost with an exhaustive grid search.
# ---------------------------------------------------------------------------
# `use_label_encoder` is deprecated in recent XGBoost releases and is no longer
# passed; y_train is already encoded as consecutive integers by `le_y`.
xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)

# 3 x 3 x 3 x 3 = 81 candidates; with cv=3 that is 243 fits plus one refit.
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [5, 10, 15],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 1]
}

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='accuracy'
)

grid_search.fit(X_train, y_train)

# A fit that fails inside cross-validation is scored as NaN and only reported
# through a warning, and warnings are globally suppressed in this notebook.
# Check explicitly so a candidate ranking built on failed fits is not mistaken
# for a real tuning result.
# NOTE(review): very rare classes can be absent from a fold's training part,
# which XGBoost may reject — confirm whether that happens on this data.
mean_cv_scores = grid_search.cv_results_['mean_test_score']
n_unscored = int(np.isnan(mean_cv_scores).sum())
if n_unscored:
    print(f"WARNING: {n_unscored} of {len(mean_cv_scores)} parameter combinations "
          "have no valid cross-validation score (at least one fold failed).\n")

# Best candidate, refitted by GridSearchCV on the full training set.
best_model = grid_search.best_estimator_

y_pred_best = best_model.predict(X_test)

accuracy_best = accuracy_score(y_test, y_pred_best)
print(f" Final (XGBoost) Model Accuracy after tuning: {accuracy_best * 100:.2f}%\n")
print(" Classification Report (XGBoost):\n")
print(classification_report(y_test, y_pred_best, zero_division=0))


 Random Forest Model Accuracy: 60.62%

Classification Report:

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         1
           2       0.62      0.83      0.71        36
           3       0.20      0.09      0.12        11
           4       0.00      0.00      0.00         1
           5       0.36      0.25      0.30        16
           6       0.00      0.00      0.00         4
           7       0.00      0.00      0.00         0
           8       0.50      1.00      0.67         1
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          11       0.83      0.92      0.87        49
          13       0.00      0.00      0.00         2
          15       1.00      0.11      0.20         9
          16       0.00      0.00      0.00         1
          17       0.00      0.00      0.00         1
          18      