## Import Libraries

In [1]:
!pip3 install pandas scikit-learn --break-system-packages

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB, ComplementNB

Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Downloading pandas-2.3.3-cp39-cp39-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp39-cp39-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting numpy>=1.22.4 (from pandas)
  Downloading numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.13.1-cp39-cp39-macosx_12_0_arm64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading pandas-2.3.3-cp39-cp39-macosx_11_0_a

## Load Dataset

In [2]:
df = pd.read_csv("../datasets/ResumeDataSet.csv")
df.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


## Split into Train & Test

In [3]:
x, y = df["Resume"], df["Category"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.35, random_state=42)

## Create NLP Pipeline Function

In [4]:
def make_model(model):
    return Pipeline([
        ('tfidf', TfidfVectorizer(encoding='utf-8', decode_error='ignore',lowercase=True,
                                  stop_words='english', token_pattern=r'(?u)\b[a-zA-Z]{2,}\b',)),
        ('scale', MaxAbsScaler()),
        ('select', SelectKBest(k=5000)),
        ('model', model)
    ])

## Define Models to Compare

In [5]:
models = {
    "Multinomial NB": MultinomialNB(),
    "Complement NB": ComplementNB(),
}

## Train & Evaluate Models

In [6]:
for name, model in models.items():
    pipe = make_model(model)
    pipe.fit(x_train, y_train)
    preds = pipe.predict(x_test)
    print(f"----- {name} -----")
    print("Accuracy:", accuracy_score(y_test, preds))
    print(classification_report(y_test, preds))

----- Multinomial NB -----
Accuracy: 0.9584569732937686
                           precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00         7
                     Arts       1.00      1.00      1.00        12
       Automation Testing       1.00      1.00      1.00         7
               Blockchain       1.00      1.00      1.00        13
         Business Analyst       1.00      1.00      1.00         9
           Civil Engineer       1.00      0.53      0.70        15
             Data Science       1.00      1.00      1.00        13
                 Database       1.00      1.00      1.00        10
          DevOps Engineer       1.00      0.90      0.95        21
         DotNet Developer       1.00      0.55      0.71        11
            ETL Developer       1.00      1.00      1.00        10
   Electrical Engineering       1.00      1.00      1.00        11
                       HR       1.00      1.00      1.00        19
     