In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import pickle
import numpy as np
import warnings

warnings.filterwarnings("ignore")

In [14]:
# Load and preprocess the dataset
df = pd.read_csv("student-scores.csv")

In [15]:

# Drop unnecessary columns
df.drop(columns=['id', 'first_name', 'last_name', 'email'], axis=1, inplace=True)

In [16]:

# Calculate total and average scores
df["total_score"] = (
    df["math_score"] + df["history_score"] + df["physics_score"] +
    df["chemistry_score"] + df["biology_score"] + df["english_score"] + df["geography_score"]
)
df["average_score"] = df["total_score"] / 7

In [17]:
# Map categorical variables
gender_map = {'male': 0, 'female': 1}
part_time_job_map = {False: 0, True: 1}
extracurricular_activities_map = {False: 0, True: 1}
career_aspiration_map = {
    'Lawyer': 0, 'Doctor': 1, 'Government Officer': 2, 'Artist': 3, 'Unknown': 4,
    'Software Engineer': 5, 'Teacher': 6, 'Business Owner': 7, 'Scientist': 8,
    'Banker': 9, 'Writer': 10, 'Accountant': 11, 'Designer': 12,
    'Construction Engineer': 13, 'Game Developer': 14, 'Stock Investor': 15,
    'Real Estate Developer': 16
}

df['gender'] = df['gender'].map(gender_map)
df['part_time_job'] = df['part_time_job'].map(part_time_job_map)
df['extracurricular_activities'] = df['extracurricular_activities'].map(extracurricular_activities_map)
df['career_aspiration'] = df['career_aspiration'].map(career_aspiration_map)

In [18]:
# Handle imbalanced data using SMOTE
X = df.drop('career_aspiration', axis=1)
y = df['career_aspiration']
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [19]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [20]:
# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [21]:
# Define classifiers
models = {
    "Logistic Regression": LogisticRegression(),
    "Support Vector Classifier": SVC(),
    "Random Forest Classifier": RandomForestClassifier(),
    "K Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Gaussian Naive Bayes": GaussianNB(),
    "AdaBoost Classifier": AdaBoostClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
    "XGBoost Classifier": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
}

# Train and evaluate models
for name, model in models.items():
    print("=" * 50)
    print("Model:", name)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Model: Logistic Regression
Accuracy: 0.47432306255835666
Classification Report:
               precision    recall  f1-score   support

           0       0.44      0.51      0.48        68
           1       0.49      0.67      0.57        72
           2       0.38      0.42      0.40        57
           3       0.53      0.53      0.53        58
           4       0.35      0.17      0.23        66
           5       0.33      0.32      0.32        76
           6       0.54      0.89      0.67        71
           7       0.79      0.79      0.79        61
           8       0.45      0.47      0.46        53
           9       0.22      0.07      0.10        61
          10       0.58      0.75      0.65        63
          11       0.44      0.47      0.45        53
          12       0.28      0.13      0.18        68
          13       0.35      0.47      0.40        55
          14       0.60      0.89      0.72        57
          15       0.34      0.22      0.27        63


In [22]:
# Random Forest Model - detailed evaluation
print("\n=== Random Forest Classifier Evaluation ===")
rf_model = RandomForestClassifier()
rf_model.fit(X_train_scaled, y_train)
y_pred_rf = rf_model.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


=== Random Forest Classifier Evaluation ===
Accuracy: 0.8338001867413632
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.85      0.79        68
           1       0.77      1.00      0.87        72
           2       0.80      0.96      0.87        57
           3       0.85      0.97      0.90        58
           4       0.85      0.44      0.58        66
           5       0.62      0.43      0.51        76
           6       0.92      1.00      0.96        71
           7       0.96      0.89      0.92        61
           8       0.83      0.98      0.90        53
           9       0.72      0.69      0.71        61
          10       0.91      0.97      0.94        63
          11       0.87      0.75      0.81        53
          12       0.91      0.87      0.89        68
          13       0.77      0.96      0.85        55
          14       0.93      0.98      0.96        57
          15       0.90      0.70    

In [23]:
# Save the model and scaler
pickle.dump(scaler, open("scaler.pkl", 'wb'))
pickle.dump(rf_model, open("model.pkl", 'wb'))

In [24]:
# Recommendation system
class_names = [
    'Lawyer', 'Doctor', 'Government Officer', 'Artist', 'Unknown', 'Software Engineer',
    'Teacher', 'Business Owner', 'Scientist', 'Banker', 'Writer', 'Accountant', 'Designer',
    'Construction Engineer', 'Game Developer', 'Stock Investor', 'Real Estate Developer'
]

def Recommendations(gender, part_time_job, absence_days, extracurricular_activities,
                    weekly_self_study_hours, math_score, history_score, physics_score,
                    chemistry_score, biology_score, english_score, geography_score,
                    total_score, average_score):
    # Encode input
    gender_encoded = 1 if gender.lower() == 'female' else 0
    part_time_job_encoded = 1 if part_time_job else 0
    extracurricular_activities_encoded = 1 if extracurricular_activities else 0

    # Feature array
    feature_array = np.array([[gender_encoded, part_time_job_encoded, absence_days,
                               extracurricular_activities_encoded, weekly_self_study_hours,
                               math_score, history_score, physics_score, chemistry_score,
                               biology_score, english_score, geography_score, total_score, average_score]])

    # Scale and predict
    scaled_features = scaler.transform(feature_array)
    probabilities = rf_model.predict_proba(scaled_features)

    # Get top predictions
    top_classes_idx = np.argsort(-probabilities[0])[:5]
    return [(class_names[idx], probabilities[0][idx]) for idx in top_classes_idx]
