# Step 1: Import libraries


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Step 2: Load dataset

In [4]:
df = pd.read_csv("quiz/career_quiz_dataset_1200.csv")


# Peek at the data

In [5]:
print(df.head())


  StudentID     Q1_Favorite_Subjects         Q2_Enjoyed_Activities  \
0    S00001   Economics, Accountancy             Debating, Reading   
1    S00002  Computer Science, Maths         Experiments, Research   
2    S00003  Maths, Computer Science       Public Speaking, Coding   
3    S00004       Chemistry, Physics               Drawing, Sports   
4    S00005       Physics, Chemistry  Experiments, Solving Puzzles   

             Q3_Strongest_Skills Q4_Work_Style Q5_Workplace_Preference  \
0      Communication, Creativity          Both                 Startup   
1       Design Thinking, Writing     Practical            Research Lab   
2  Problem Solving, Presentation   Theoretical                 Startup   
3            Leadership, Writing     Practical                Outdoors   
4             Research, Teamwork   Theoretical                Outdoors   

  Q6_Exam_Readiness Q7_Location_Preference      Q8_Career_Values  \
0             Maybe                 Abroad          Job Security  

# Step 3: Split features/labels

In [8]:
import pandas as pd

df = pd.read_csv("quiz/career_quiz_dataset_1200.csv")

print(df.columns)   # show all column names
print(df.head())    # preview first rows


Index(['StudentID', 'Q1_Favorite_Subjects', 'Q2_Enjoyed_Activities',
       'Q3_Strongest_Skills', 'Q4_Work_Style', 'Q5_Workplace_Preference',
       'Q6_Exam_Readiness', 'Q7_Location_Preference', 'Q8_Career_Values',
       'Q9_LongTerm_Goal', 'Q10_Academic_Background', 'Recommended_Course',
       'Recommended_Career', 'Recommended_College_Type',
       'Recommendation_Score'],
      dtype='object')
  StudentID     Q1_Favorite_Subjects         Q2_Enjoyed_Activities  \
0    S00001   Economics, Accountancy             Debating, Reading   
1    S00002  Computer Science, Maths         Experiments, Research   
2    S00003  Maths, Computer Science       Public Speaking, Coding   
3    S00004       Chemistry, Physics               Drawing, Sports   
4    S00005       Physics, Chemistry  Experiments, Solving Puzzles   

             Q3_Strongest_Skills Q4_Work_Style Q5_Workplace_Preference  \
0      Communication, Creativity          Both                 Startup   
1       Design Thinking, Wr

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Load data
df = pd.read_csv("quiz/career_quiz_dataset_1200.csv")

# Combine all quiz answers into a single text feature
feature_cols = [
    'Q1_Favorite_Subjects', 'Q2_Enjoyed_Activities', 'Q3_Strongest_Skills',
    'Q4_Work_Style', 'Q5_Workplace_Preference', 'Q6_Exam_Readiness',
    'Q7_Location_Preference', 'Q8_Career_Values', 'Q9_LongTerm_Goal',
    'Q10_Academic_Background'
]

df["combined_features"] = df[feature_cols].astype(str).agg(" ".join, axis=1)

# Features and target
X = df["combined_features"]
y = df["Recommended_Career"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Build pipeline: TF-IDF + Logistic Regression
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=5000, stop_words="english")),
    ("clf", LogisticRegression(max_iter=2000))
])

# Train
pipeline.fit(X_train, y_train)

# Evaluate
y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.3125
                           precision    recall  f1-score   support

               Accountant       0.25      0.12      0.17         8
          Artist/Designer       0.09      0.07      0.08        14
          Ayurveda Doctor       0.00      0.00      0.00         1
         Business Analyst       0.00      0.00      0.00         6
           Civil Engineer       0.00      0.00      0.00        12
             Counselor/HR       0.25      0.09      0.13        11
                  Dentist       0.00      0.00      0.00         1
                   Doctor       0.00      0.00      0.00         1
        Economist/Analyst       0.25      0.10      0.14        10
Embedded Systems Engineer       0.14      0.07      0.10        14
        Financial Analyst       0.00      0.00      0.00         6
        Homeopathy Doctor       0.00      0.00      0.00         1
              IT Engineer       0.00      0.00      0.00        11
    IT Support/Technician       0.17      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Logistic Regression

In [12]:
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=5000, stop_words="english")),
    ("clf", LogisticRegression(max_iter=2000, class_weight="balanced"))
])


Random Forest with Label Encoding

In [13]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# Encode target
le = LabelEncoder()
y = le.fit_transform(df["Recommended_Career"])

# Encode categorical text features as strings
X = df[feature_cols].astype(str)

# Simple bag-of-words encoding per column (concat all text)
X_combined = X.agg(" ".join, axis=1)

# Vectorize
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
X_vec = vectorizer.fit_transform(X_combined)

# Train Random Forest
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_vec, y)

# Evaluate
y_pred = model.predict(X_vec)
print("Train accuracy:", accuracy_score(y, y_pred))


Train accuracy: 1.0


BERT (Best Long-Term Approach)

In [20]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Load data
df["combined_features"] = df[feature_cols].astype(str).agg(" ".join, axis=1)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["combined_features"], df["Recommended_Career"], test_size=0.2, stratify=df["Recommended_Career"]
)

# Hugging Face dataset
dataset = Dataset.from_dict({"text": train_texts, "label": train_labels})

# Tokenizer & model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# (need to encode labels to integers here before training)



To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [19]:
pip install datasets

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-21.0.0-cp310-cp310-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading aiohttp-3.12.15-cp310-cp310-win_amd64.whl.metadata (7.9 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading aiohappyeyeballs-2.6.1-py3-


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip
