In [10]:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"]='2'
import shutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix,ConfusionMatrixDisplay

from transformers import BertTokenizer
from datasets import Dataset

import torch
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import TrainerCallback
import warnings
warnings.filterwarnings('ignore', message="A parameter name that contains `beta` will be renamed internally to `bias`")

In [11]:
data=pd.read_csv("/kaggle/input/resume-dataset/Resume/Resume.csv")

In [12]:
df=data.copy()
df.drop(columns=["Resume_html"],inplace=True)
# Encode the labels
df['Category'] = df['Category'].astype('category')
df['label'] = df['Category'].cat.codes

In [13]:
# Split dataset
train, temp = train_test_split(df, test_size=0.3, stratify=df['label'])
val, test = train_test_split(temp, test_size=0.3, stratify=temp['label'])

In [14]:
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [15]:
# Define a function to tokenize the text
def tokenize_function(examples):
    return tokenizer(examples['Resume_str'], padding='max_length', truncation=True)

In [16]:
# Convert DataFrames to Hugging Face Datasets
train_df = Dataset.from_pandas(train)
val_df = Dataset.from_pandas(val)
test_df = Dataset.from_pandas(test)

In [17]:
# Tokenize datasets
train_df = train_df.map(tokenize_function, batched=True)
val_df= val_df.map(tokenize_function, batched=True)
test_df = test_df.map(tokenize_function, batched=True)

Map:   0%|          | 0/1738 [00:00<?, ? examples/s]

Map:   0%|          | 0/522 [00:00<?, ? examples/s]

Map:   0%|          | 0/224 [00:00<?, ? examples/s]

In [18]:
# Set the input columns 
train_df.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_df.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_df.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

In [19]:
# Load BERT
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df['label'].unique()))

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
#call back to save full model
class CustomCallback(TrainerCallback):
    def on_train_end(self, args, state, control, model=None, tokenizer=None, **kwargs):
        # Save the entire model as a single file
        torch.save(model,'/kaggle/working/model.pt')


# Define the metrics for evaluation
def compute_metrics(p):
    pred, labels = p
    if isinstance(pred, np.ndarray):
        pred = torch.tensor(pred)
    pred = torch.argmax(pred, axis=1)
    acc = accuracy_score(labels, pred)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, pred, average='weighted')
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_steps=500,                  
    save_steps=1000, 
    save_total_limit=2
)

# Initialize the Hugging Face Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df,
    eval_dataset=val_df,
    compute_metrics=compute_metrics,
    callbacks=[CustomCallback()]
#     callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)



In [21]:
# Train 
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112607299997359, max=1.0‚Ä¶

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,2.6772,2.452451,0.524904,0.472408,0.495932,0.524904
2,1.5971,1.471964,0.754789,0.71708,0.739034,0.754789
3,1.1835,1.09313,0.814176,0.793328,0.801651,0.814176
4,0.8718,0.93065,0.833333,0.81594,0.811248,0.833333
5,0.818,0.890712,0.835249,0.81847,0.812328,0.835249


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=545, training_loss=1.5692202795536145, metrics={'train_runtime': 969.8346, 'train_samples_per_second': 8.96, 'train_steps_per_second': 0.562, 'total_flos': 2286886708592640.0, 'train_loss': 1.5692202795536145, 'epoch': 5.0})

In [22]:
# evaluate on test set
trainer.evaluate(test_df)

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.9517273902893066,
 'eval_accuracy': 0.8214285714285714,
 'eval_f1': 0.8048913298850502,
 'eval_precision': 0.8052511724386724,
 'eval_recall': 0.8214285714285714,
 'eval_runtime': 3.4391,
 'eval_samples_per_second': 65.133,
 'eval_steps_per_second': 4.071,
 'epoch': 5.0}

In [23]:
# predictions 
predictions = trainer.predict(test_df)
# Extract labels
preds = np.argmax(predictions.predictions, axis=1)
labels = predictions.label_ids

# confusion matrix
cm = confusion_matrix(labels, preds)

  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
!pip install PyPDF2

  pid, fd = os.forkpty()


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m232.6/232.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [26]:
# Load the trained model and tokenizer
model_path = '/kaggle/working/results/checkpoint-545'  
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(model_path)
model.eval()

# Function to predict resume category
def predict_resume_category(resume_text):
    inputs = tokenizer(resume_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
    predicted_class_id = torch.argmax(logits, dim=1).item()
    return predicted_class_id

# Define category labels
category_labels = df['Category'].cat.categories.tolist()

# Script to categorize resumes
def categorize_resumes(directory):
    resume_files = [f for f in os.listdir(directory) if f.endswith('.pdf')]  
    categorized_data = []
    
    for resume in resume_files:
        resume_path = os.path.join(directory, resume)
        resume_text = extract_text_from_pdf(resume_path)  
        
        predicted_category = predict_resume_category(resume_text)
        category_name = category_labels[predicted_category]
        
        # writable directory for categorized resumes
        writable_dir = os.path.join('/kaggle/working/categorized_resumes', category_name)
        os.makedirs(writable_dir, exist_ok=True)
        
        # resume to the predicted category folder 
        shutil.copy(resume_path, os.path.join(writable_dir, resume))
        
        # Append to categorized data for CSV
        categorized_data.append({"filename": resume, "category": category_name})
    
    # Save resulT
    categorized_resumes_df = pd.DataFrame(categorized_data)
    categorized_resumes_df.to_csv('/kaggle/working/categorized_resumes.csv', index=False)

# PDF text extraction 
def extract_text_from_pdf(pdf_path):
    from PyPDF2 import PdfReader
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text()
    return text

In [27]:
# Usage: Pass the directory containing resumes
categorize_resumes('/kaggle/input/resume-dataset/data/data/ACCOUNTANT')

In [32]:
output_csv=pd.read_csv("/kaggle/working/categorized_resumes.csv")
output_csv.head()

Unnamed: 0,filename,category
0,27558837.pdf,ACCOUNTANT
1,25547145.pdf,ACCOUNTANT
2,28298773.pdf,ACCOUNTANT
3,22465498.pdf,ACCOUNTANT
4,15363277.pdf,ACCOUNTANT
