In [1]:
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score


In [2]:
df = pd.read_csv("data/resume_cleaned_dataset.csv")

X = df["clean_text"]
y = df["category"]



In [3]:
tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1,2))
X_tfidf = tfidf.fit_transform(X)


In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [5]:
def evaluate(name, y_true, y_pred):
    return {
        "Model": name,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Macro F1": f1_score(y_true, y_pred, average="macro"),
        "Weighted F1": f1_score(y_true, y_pred, average="weighted")
    }


In [6]:
results = []

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
results.append(evaluate("Logistic Regression", y_test, lr.predict(X_test)))

svm = LinearSVC()
svm.fit(X_train, y_train)
results.append(evaluate("Linear SVM", y_test, svm.predict(X_test)))

rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)
results.append(evaluate("Random Forest", y_test, rf.predict(X_test)))

nb = MultinomialNB()
nb.fit(X_train, y_train)
results.append(evaluate("Naive Bayes", y_test, nb.predict(X_test)))

comparison_df = pd.DataFrame(results)
comparison_df.sort_values("Macro F1", ascending=False)


Unnamed: 0,Model,Accuracy,Macro F1,Weighted F1
0,Logistic Regression,0.9375,0.777778,0.909722
1,Linear SVM,0.9375,0.777778,0.909722
2,Random Forest,0.9375,0.777778,0.909722
3,Naive Bayes,0.875,0.731429,0.851786


In [7]:
joblib.dump(svm, "resume_classifier_svm.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

print("Model and vectorizer saved")


Model and vectorizer saved
