# Step 1: Import libraries


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Step 2: Load dataset

In [4]:
df = pd.read_csv("quiz/career_quiz_dataset_1200.csv")


# Peek at the data

In [5]:
print(df.head())


  StudentID     Q1_Favorite_Subjects         Q2_Enjoyed_Activities  \
0    S00001   Economics, Accountancy             Debating, Reading   
1    S00002  Computer Science, Maths         Experiments, Research   
2    S00003  Maths, Computer Science       Public Speaking, Coding   
3    S00004       Chemistry, Physics               Drawing, Sports   
4    S00005       Physics, Chemistry  Experiments, Solving Puzzles   

             Q3_Strongest_Skills Q4_Work_Style Q5_Workplace_Preference  \
0      Communication, Creativity          Both                 Startup   
1       Design Thinking, Writing     Practical            Research Lab   
2  Problem Solving, Presentation   Theoretical                 Startup   
3            Leadership, Writing     Practical                Outdoors   
4             Research, Teamwork   Theoretical                Outdoors   

  Q6_Exam_Readiness Q7_Location_Preference      Q8_Career_Values  \
0             Maybe                 Abroad          Job Security  

# Step 3: Split features/labels

In [3]:
import pandas as pd

df = pd.read_csv("quiz/career_quiz_dataset_1200.csv")

print(df.columns)   # show all column names
print(df.head())    # preview first rows


Index(['StudentID', 'Q1_Favorite_Subjects', 'Q2_Enjoyed_Activities',
       'Q3_Strongest_Skills', 'Q4_Work_Style', 'Q5_Workplace_Preference',
       'Q6_Exam_Readiness', 'Q7_Location_Preference', 'Q8_Career_Values',
       'Q9_LongTerm_Goal', 'Q10_Academic_Background', 'Recommended_Course',
       'Recommended_Career', 'Recommended_College_Type',
       'Recommendation_Score'],
      dtype='object')
  StudentID     Q1_Favorite_Subjects         Q2_Enjoyed_Activities  \
0    S00001   Economics, Accountancy             Debating, Reading   
1    S00002  Computer Science, Maths         Experiments, Research   
2    S00003  Maths, Computer Science       Public Speaking, Coding   
3    S00004       Chemistry, Physics               Drawing, Sports   
4    S00005       Physics, Chemistry  Experiments, Solving Puzzles   

             Q3_Strongest_Skills Q4_Work_Style Q5_Workplace_Preference  \
0      Communication, Creativity          Both                 Startup   
1       Design Thinking, Wr

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Load data
df = pd.read_csv("quiz/career_quiz_dataset_1200.csv")

# Combine all quiz answers into a single text feature
feature_cols = [
    'Q1_Favorite_Subjects', 'Q2_Enjoyed_Activities', 'Q3_Strongest_Skills',
    'Q4_Work_Style', 'Q5_Workplace_Preference', 'Q6_Exam_Readiness',
    'Q7_Location_Preference', 'Q8_Career_Values', 'Q9_LongTerm_Goal',
    'Q10_Academic_Background'
]

df["combined_features"] = df[feature_cols].astype(str).agg(" ".join, axis=1)

# Features and target
X = df["combined_features"]
y = df["Recommended_Career"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Build pipeline: TF-IDF + Logistic Regression
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=5000, stop_words="english")),
    ("clf", LogisticRegression(max_iter=2000))
])

# Train
pipeline.fit(X_train, y_train)

# Evaluate
y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.3125
                           precision    recall  f1-score   support

               Accountant       0.25      0.12      0.17         8
          Artist/Designer       0.09      0.07      0.08        14
          Ayurveda Doctor       0.00      0.00      0.00         1
         Business Analyst       0.00      0.00      0.00         6
           Civil Engineer       0.00      0.00      0.00        12
             Counselor/HR       0.25      0.09      0.13        11
                  Dentist       0.00      0.00      0.00         1
                   Doctor       0.00      0.00      0.00         1
        Economist/Analyst       0.25      0.10      0.14        10
Embedded Systems Engineer       0.14      0.07      0.10        14
        Financial Analyst       0.00      0.00      0.00         6
        Homeopathy Doctor       0.00      0.00      0.00         1
              IT Engineer       0.00      0.00      0.00        11
    IT Support/Technician       0.17      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Logistic Regression

In [5]:
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=5000, stop_words="english")),
    ("clf", LogisticRegression(max_iter=2000, class_weight="balanced"))
])


Random Forest with Label Encoding

In [6]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# Encode target
le = LabelEncoder()
y = le.fit_transform(df["Recommended_Career"])

# Encode categorical text features as strings
X = df[feature_cols].astype(str)

# Simple bag-of-words encoding per column (concat all text)
X_combined = X.agg(" ".join, axis=1)

# Vectorize
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
X_vec = vectorizer.fit_transform(X_combined)

# Train Random Forest
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_vec, y)

# Evaluate
y_pred = model.predict(X_vec)
print("Train accuracy:", accuracy_score(y, y_pred))


Train accuracy: 1.0


BERT (Best Long-Term Approach)

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Load data
df["combined_features"] = df[feature_cols].astype(str).agg(" ".join, axis=1)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["combined_features"], df["Recommended_Career"], test_size=0.2, stratify=df["Recommended_Career"]
)

# Hugging Face dataset
dataset = Dataset.from_dict({"text": train_texts, "label": train_labels})

# Tokenizer & model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# (need to encode labels to integers here before training)



In [8]:
pip install datasets

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Train Test Split


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42, stratify=y)

model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Test accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=le.classes_))


Test accuracy: 0.3541666666666667
                           precision    recall  f1-score   support

               Accountant       0.60      0.38      0.46         8
          Artist/Designer       0.25      0.21      0.23        14
          Ayurveda Doctor       0.00      0.00      0.00         1
         Business Analyst       0.25      0.17      0.20         6
           Civil Engineer       0.33      0.17      0.22        12
             Counselor/HR       0.67      0.36      0.47        11
                  Dentist       0.00      0.00      0.00         1
                   Doctor       0.00      0.00      0.00         1
        Economist/Analyst       0.20      0.10      0.13        10
Embedded Systems Engineer       0.18      0.29      0.22        14
        Financial Analyst       0.33      0.17      0.22         6
        Homeopathy Doctor       0.00      0.00      0.00         1
              IT Engineer       0.00      0.00      0.00        11
    IT Support/Technician  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


2. Cross-Validation (More Reliable)

In [10]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X_vec, y, cv=5)
print("Cross-validation accuracy:", scores.mean())




Cross-validation accuracy: 0.30833333333333335


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer

STEP 1: Load your dataset

In [15]:
df = pd.read_csv("quiz/career_quiz_dataset_1200.csv")

print("Columns:", df.columns.tolist())
print("Number of samples:", len(df))

Columns: ['StudentID', 'Q1_Favorite_Subjects', 'Q2_Enjoyed_Activities', 'Q3_Strongest_Skills', 'Q4_Work_Style', 'Q5_Workplace_Preference', 'Q6_Exam_Readiness', 'Q7_Location_Preference', 'Q8_Career_Values', 'Q9_LongTerm_Goal', 'Q10_Academic_Background', 'Recommended_Course', 'Recommended_Career', 'Recommended_College_Type', 'Recommendation_Score']
Number of samples: 1200


In [17]:
career_clusters = {
    "Doctor": ["Doctor", "Dentist", "Ayurveda Doctor", "Homeopathy Doctor"],
    "Engineer": ["Software Engineer", "Civil Engineer", "Mechanical Engineer",
                 "IT Engineer", "Embedded Systems Engineer"],
    "Technician": ["Technician - Electrical", "Technician - Mechanical", "IT Support/Technician"],
    "Analyst": ["Business Analyst", "Economist/Analyst", "Investment Analyst", 
                "Financial Analyst", "Public Policy Analyst"],
    "Designer": ["Artist/Designer", "Junior Designer"],
    "Research": ["Researcher/Archivist"],
    "Account/Finance": ["Accountant"],
    "Counseling": ["Counselor/HR"],
}

# Reverse mapping: career → cluster
career_to_cluster = {}
for cluster, careers in career_clusters.items():
    for c in careers:
        career_to_cluster[c] = cluster

# Map to clusters
df["CareerCluster"] = df["Recommended_Career"].map(career_to_cluster)

# Drop rows with unmapped careers
df = df.dropna(subset=["CareerCluster"])

print("Unique career clusters:", df["CareerCluster"].unique())

Unique career clusters: ['Account/Finance' 'Engineer' 'Doctor' 'Analyst' 'Technician' 'Designer'
 'Counseling' 'Research']


In [18]:
# ==============================
# STEP 3: Encode target labels
# ==============================
le = LabelEncoder()
y = le.fit_transform(df["CareerCluster"])


In [21]:
print("Final dataframe columns:", df.columns.tolist())


Final dataframe columns: ['StudentID', 'Q1_Favorite_Subjects', 'Q2_Enjoyed_Activities', 'Q3_Strongest_Skills', 'Q4_Work_Style', 'Q5_Workplace_Preference', 'Q6_Exam_Readiness', 'Q7_Location_Preference', 'Q8_Career_Values', 'Q9_LongTerm_Goal', 'Q10_Academic_Background', 'Recommended_Course', 'Recommended_Career', 'Recommended_College_Type', 'Recommendation_Score', 'CareerCluster']


In [24]:
print(df.columns)


Index(['StudentID', 'Q1_Favorite_Subjects', 'Q2_Enjoyed_Activities',
       'Q3_Strongest_Skills', 'Q4_Work_Style', 'Q5_Workplace_Preference',
       'Q6_Exam_Readiness', 'Q7_Location_Preference', 'Q8_Career_Values',
       'Q9_LongTerm_Goal', 'Q10_Academic_Background', 'Recommended_Course',
       'Recommended_Career', 'Recommended_College_Type',
       'Recommendation_Score', 'CareerCluster'],
      dtype='object')


In [27]:
# ==============================
# STEP 4: Create text features
# ==============================

feature_cols = [
    "Q1_Favorite_Subjects",
    "Q2_Enjoyed_Activities",
    "Q3_Strongest_Skills",
    "Q4_Work_Style",
    "Q5_Workplace_Preference",
    "Q6_Exam_Readiness",
    "Q7_Location_Preference",
    "Q8_Career_Values",
    "Q9_LongTerm_Goal",
    "Q10_Academic_Background"
]

# Combine all text columns into one string per student
df_features = df[feature_cols].astype(str)
X_text = df_features.agg(" ".join, axis=1)

# Generate embeddings using sentence-transformers
from sentence_transformers import SentenceTransformer

model_emb = SentenceTransformer("all-MiniLM-L6-v2")
X_vec = model_emb.encode(X_text.tolist(), show_progress_bar=True)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Batches: 100%|██████████| 38/38 [00:02<00:00, 15.58it/s]


In [28]:
# ==============================
# STEP 4: Create text features
# ==============================
feature_cols = [
    "Q1_Favorite_Subjects",
    "Q2_Enjoyed_Activities",
    "Q3_Strongest_Skills",
    "Q4_Work_Style",
    "Q5_Workplace_Preference",
    "Q6_Exam_Readiness",
    "Q7_Location_Preference",
    "Q8_Career_Values",
    "Q9_LongTerm_Goal",
    "Q10_Academic_Background"
]

# Combine all text columns into a single string per student
df_features = df[feature_cols].astype(str)
X_text = df_features.agg(" ".join, axis=1)

# Generate embeddings
from sentence_transformers import SentenceTransformer
model_emb = SentenceTransformer("all-MiniLM-L6-v2")
X_vec = model_emb.encode(X_text.tolist(), show_progress_bar=True)

# ==============================
# STEP 5: Train-Test Split
# ==============================
from sklearn.model_selection import train_test_split

y = df["CareerCluster"].values  # target variable
X_train, X_test, y_train, y_test = train_test_split(
    X_vec, y, test_size=0.2, random_state=42, stratify=y
)

# ==============================
# STEP 6: Train RandomForest
# ==============================
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(
    n_estimators=300,
    class_weight="balanced",  # handle imbalanced clusters
    random_state=42
)
clf.fit(X_train, y_train)

# ==============================
# STEP 7: Evaluate
# ==============================
from sklearn.metrics import accuracy_score, classification_report

y_pred = clf.predict(X_test)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Batches: 100%|██████████| 38/38 [00:02<00:00, 15.90it/s]



Test Accuracy: 0.5

Classification Report:
                  precision    recall  f1-score   support

Account/Finance       0.00      0.00      0.00         8
        Analyst       0.44      0.34      0.38        41
     Counseling       0.00      0.00      0.00        11
       Designer       0.00      0.00      0.00        26
         Doctor       0.00      0.00      0.00         3
       Engineer       0.51      1.00      0.68       105
       Research       0.00      0.00      0.00        13
     Technician       1.00      0.03      0.06        33

       accuracy                           0.50       240
      macro avg       0.24      0.17      0.14       240
   weighted avg       0.44      0.50      0.37       240



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [29]:
# ==============================
# STEP 1: Imports
# ==============================
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sentence_transformers import SentenceTransformer

# ==============================
# STEP 2: Load dataset
# ==============================
df = pd.read_csv("quiz/career_quiz_dataset_1200.csv")  # replace with your CSV path

# ==============================
# STEP 3: Define career clusters
# ==============================
career_clusters = {
    'Engineering': ['Engineer', 'Technician', 'Embedded Systems Engineer', 'IT Engineer', 'IT Support/Technician', 'Mechanical Engineer', 'Civil Engineer'],
    'Business & Finance': ['Account/Finance', 'Analyst', 'Financial Analyst', 'Investment Analyst', 'Business Analyst'],
    'Design & Creative': ['Designer', 'Artist/Designer', 'UX Designer', 'Graphic Designer', 'Junior Designer'],
    'Healthcare': ['Doctor', 'Counseling', 'Ayurveda Doctor', 'Homeopathy Doctor', 'Dentist'],
    'Research & Academics': ['Researcher', 'Public Policy Analyst', 'Economist/Analyst', 'Researcher/Archivist']
}

# Map individual career to cluster
def map_to_cluster(career):
    for cluster, careers in career_clusters.items():
        for c in careers:
            if c.lower() in str(career).lower():
                return cluster
    return None

df['CareerCluster'] = df['Recommended_Career'].apply(map_to_cluster)
df = df.dropna(subset=['CareerCluster'])  # drop rows with unmapped careers

# ==============================
# STEP 4: Text features
# ==============================
feature_cols = [
    "Q1_Favorite_Subjects",
    "Q2_Enjoyed_Activities",
    "Q3_Strongest_Skills",
    "Q4_Work_Style",
    "Q5_Workplace_Preference",
    "Q6_Exam_Readiness",
    "Q7_Location_Preference",
    "Q8_Career_Values",
    "Q9_LongTerm_Goal",
    "Q10_Academic_Background"
]

df_text = df[feature_cols].astype(str)
X_text = df_text.agg(" ".join, axis=1)

# Sentence embeddings
model_emb = SentenceTransformer("all-MiniLM-L6-v2")
X_vec = model_emb.encode(X_text.tolist(), show_progress_bar=True)

# Encode target labels
le = LabelEncoder()
y = le.fit_transform(df['CareerCluster'])

# ==============================
# STEP 5: Train-Test Split & SMOTE
# ==============================
X_train, X_test, y_train, y_test = train_test_split(
    X_vec, y, test_size=0.2, random_state=42, stratify=y
)

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# ==============================
# STEP 6: Train Ensemble Model
# ==============================
xgb_clf = XGBClassifier(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.1,
    objective='multi:softmax',
    random_state=42,
    eval_metric='mlogloss'
)

rf_clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    class_weight='balanced',
    random_state=42
)

lr_clf = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)

voting_clf = VotingClassifier(
    estimators=[('xgb', xgb_clf), ('rf', rf_clf), ('lr', lr_clf)],
    voting='hard'
)

voting_clf.fit(X_train_res, y_train_res)

# ==============================
# STEP 7: Evaluation
# ==============================
y_pred = voting_clf.predict(X_test)
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))


Batches: 100%|██████████| 35/35 [00:02<00:00, 15.94it/s]



Test Accuracy: 0.7207207207207207

Classification Report:
                       precision    recall  f1-score   support

  Business & Finance       0.62      0.63      0.63        41
   Design & Creative       0.09      0.04      0.05        26
         Engineering       0.83      0.94      0.88       138
          Healthcare       0.00      0.00      0.00         3
Research & Academics       0.25      0.21      0.23        14

            accuracy                           0.72       222
           macro avg       0.36      0.37      0.36       222
        weighted avg       0.66      0.72      0.68       222



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [30]:
# ==============================
# STEP 1: Imports
# ==============================
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# ==============================
# STEP 2: Load dataset
# ==============================
df = pd.read_csv("quiz/career_quiz_dataset_1200.csv")  # replace with your CSV path

# ==============================
# STEP 3: Define career clusters
# ==============================
career_clusters = {
    'Engineering': ['Engineer', 'Technician', 'Embedded Systems Engineer', 'IT Engineer', 'IT Support/Technician', 'Mechanical Engineer', 'Civil Engineer'],
    'Business & Finance': ['Account/Finance', 'Analyst', 'Financial Analyst', 'Investment Analyst', 'Business Analyst'],
    'Design & Creative': ['Designer', 'Artist/Designer', 'UX Designer', 'Graphic Designer', 'Junior Designer'],
    'Healthcare': ['Doctor', 'Counseling', 'Ayurveda Doctor', 'Homeopathy Doctor', 'Dentist'],
    'Research & Academics': ['Researcher', 'Public Policy Analyst', 'Economist/Analyst', 'Researcher/Archivist']
}

def map_to_cluster(career):
    for cluster, careers in career_clusters.items():
        for c in careers:
            if c.lower() in str(career).lower():
                return cluster
    return None

df['CareerCluster'] = df['Recommended_Career'].apply(map_to_cluster)
df = df.dropna(subset=['CareerCluster'])

# ==============================
# STEP 4: Combine text columns
# ==============================
feature_cols = [
    "Q1_Favorite_Subjects",
    "Q2_Enjoyed_Activities",
    "Q3_Strongest_Skills",
    "Q4_Work_Style",
    "Q5_Workplace_Preference",
    "Q6_Exam_Readiness",
    "Q7_Location_Preference",
    "Q8_Career_Values",
    "Q9_LongTerm_Goal",
    "Q10_Academic_Background"
]

df_text = df[feature_cols].astype(str)
X_text = df_text.agg(" ".join, axis=1)

# ==============================
# STEP 5: Create embeddings and TF-IDF features
# ==============================
# Sentence embeddings
model_emb = SentenceTransformer("all-MiniLM-L6-v2")
X_embeddings = model_emb.encode(X_text.tolist(), show_progress_bar=True)

# TF-IDF features
vectorizer = TfidfVectorizer(max_features=3000, stop_words="english")
X_tfidf = vectorizer.fit_transform(X_text)

# Combine embeddings + TF-IDF
X_combined = np.hstack([X_embeddings, X_tfidf.toarray()])

# Encode target labels
le = LabelEncoder()
y = le.fit_transform(df['CareerCluster'])

# ==============================
# STEP 6: Train-Test Split & SMOTE
# ==============================
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y, test_size=0.2, random_state=42, stratify=y
)

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# ==============================
# STEP 7: Train Soft Voting Ensemble
# ==============================
xgb_clf = XGBClassifier(
    n_estimators=250,
    max_depth=8,
    learning_rate=0.1,
    objective='multi:softprob',  # softprob for probability outputs
    eval_metric='mlogloss',
    random_state=42
)

rf_clf = RandomForestClassifier(
    n_estimators=250,
    max_depth=None,
    class_weight='balanced',
    random_state=42
)

lr_clf = LogisticRegression(
    max_iter=2000,
    class_weight='balanced',
    random_state=42
)

voting_clf = VotingClassifier(
    estimators=[('xgb', xgb_clf), ('rf', rf_clf), ('lr', lr_clf)],
    voting='soft'  # use probabilities to reduce bias toward large classes
)

voting_clf.fit(X_train_res, y_train_res)

# ==============================
# STEP 8: Evaluate Model
# ==============================
y_pred = voting_clf.predict(X_test)

print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))


Batches: 100%|██████████| 35/35 [00:02<00:00, 16.50it/s]



Test Accuracy: 0.7432432432432432

Classification Report:
                       precision    recall  f1-score   support

  Business & Finance       0.62      0.56      0.59        41
   Design & Creative       0.22      0.08      0.11        26
         Engineering       0.86      0.99      0.92       138
          Healthcare       0.00      0.00      0.00         3
Research & Academics       0.24      0.29      0.26        14

            accuracy                           0.74       222
           macro avg       0.39      0.38      0.38       222
        weighted avg       0.69      0.74      0.71       222

