In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import nltk
import joblib

nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Load data
df = pd.read_csv('../dataset/Resume.csv')

# Advanced text preprocessing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = text.split()
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

df['resume_processed'] = df['Resume_str'].apply(preprocess_text)

# Feature extraction
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X = vectorizer.fit_transform(df['resume_processed'])

# Define target variable and encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Category'])

# # Address class imbalance
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Train the XGBoost model
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train, y_train)

# Evaluate the model
y_pred = xgb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"XGBoost Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))




XGBoost Accuracy: 0.8229

Classification Report:
                        precision    recall  f1-score   support

            ACCOUNTANT       0.88      0.83      0.86        18
              ADVOCATE       0.81      0.81      0.81        32
           AGRICULTURE       0.79      0.71      0.75        21
               APPAREL       0.71      0.71      0.71        14
                  ARTS       0.85      0.68      0.76        25
            AUTOMOBILE       0.78      0.91      0.84        23
              AVIATION       0.78      0.84      0.81        25
               BANKING       0.64      0.70      0.67        20
                   BPO       0.92      1.00      0.96        22
  BUSINESS-DEVELOPMENT       0.84      0.89      0.86        18
                  CHEF       0.97      0.91      0.94        35
          CONSTRUCTION       0.85      0.96      0.90        23
            CONSULTANT       0.68      0.72      0.70        18
              DESIGNER       0.86      0.92      0.89 

In [2]:
joblib.dump(xgb, '../pkl/xgb_model.pkl')
joblib.dump(vectorizer, '../pkl/vectorizer.pkl')
joblib.dump(label_encoder, '../pkl/label_encoder.pkl')

['../pkl/label_encoder.pkl']

In [7]:
# from xgboost import DMatrix, train

# # Fine-tuning with Grid Search
# param_grid = {
#     'n_estimators': [200, 300],
#     'learning_rate': [0.1, 0.2],
#     'max_depth': [3, 4],
#     'subsample': [0.8, 1.0]
# }

# grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=3, n_jobs=-1, scoring='accuracy')
# grid_search.fit(X_train, y_train)

# # Evaluate the fine-tuned model
# best_xgb = grid_search.best_estimator_
# dtrain = DMatrix(X_train, label=y_train)
# dtest = DMatrix(X_test, label=y_test)

# params = best_xgb.get_params()
# params['num_class'] = len(np.unique(y_train))  # Add num_class parameter

# # Train the model with early stopping
# best_xgb_model = train(params, dtrain, num_boost_round=params['n_estimators'], early_stopping_rounds=10, evals=[(dtest, 'eval')], verbose_eval=False)

# # Predictions with the fine-tuned model
# y_pred = best_xgb_model.predict(dtest)
# y_pred = np.argmax(y_pred, axis=1)
# accuracy = accuracy_score(y_test, y_pred)
# print(f"\nBest Model (XGBoost) Accuracy: {accuracy:.4f}")
# print("\nClassification Report:")
# print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

from xgboost import DMatrix, train
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Define the best parameters (manually set or after GridSearch-like process)
best_params = {
    'eta': 0.1,  # learning_rate
    'max_depth': 3,
    'subsample': 0.8,
    'objective': 'multi:softprob',
    'num_class': len(np.unique(y_train))
}

# Convert data to DMatrix format
dtrain = DMatrix(X_train, label=y_train)
dtest = DMatrix(X_test, label=y_test)

# Train the model with early stopping
evals = [(dtrain, 'train'), (dtest, 'eval')]
booster = train(best_params, dtrain, num_boost_round=300, early_stopping_rounds=10, evals=evals, verbose_eval=False)

# Make predictions
y_pred_prob = booster.predict(dtest)
y_pred = np.argmax(y_pred_prob, axis=1)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"\nBest Model (XGBoost) Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))



Best Model (XGBoost) Accuracy: 0.8403

Classification Report:
                        precision    recall  f1-score   support

            ACCOUNTANT       0.85      0.94      0.89        18
              ADVOCATE       0.81      0.78      0.79        32
           AGRICULTURE       0.85      0.81      0.83        21
               APPAREL       0.62      0.57      0.59        14
                  ARTS       0.89      0.68      0.77        25
            AUTOMOBILE       0.83      0.87      0.85        23
              AVIATION       0.81      0.88      0.85        25
               BANKING       0.68      0.65      0.67        20
                   BPO       1.00      1.00      1.00        22
  BUSINESS-DEVELOPMENT       0.84      0.89      0.86        18
                  CHEF       0.97      0.91      0.94        35
          CONSTRUCTION       0.84      0.91      0.88        23
            CONSULTANT       0.73      0.89      0.80        18
              DESIGNER       0.89      0

In [8]:
joblib.dump(xgb, '../pkl/best_xgb_model.pkl')

['../pkl/best_xgb_model.pkl']