        Loading Dataset

In [23]:
import pandas as pd
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv('/Users/zainnofal/Desktop/Resume Screening2/Resume/Resume.csv')

# Preview the data
df.head()


Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [24]:
df.drop(columns = ['ID', 'Resume_html'], inplace = True)

In [25]:
df.head()

Unnamed: 0,Resume_str,Category
0,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,HR
1,"HR SPECIALIST, US HR OPERATIONS ...",HR
2,HR DIRECTOR Summary Over 2...,HR
3,HR SPECIALIST Summary Dedica...,HR
4,HR MANAGER Skill Highlights ...,HR


In [26]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode labels
df['Category'] = label_encoder.fit_transform(df['Category'])


In [28]:
# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Split the dataset into train and test sets
dataset = dataset.train_test_split(test_size=0.2)

        Preprocessing and Tokenization

In [29]:
# Load DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')




In [30]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['Resume_str'], padding="max_length", truncation=True)


In [31]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 1987/1987 [00:15<00:00, 127.04 examples/s]
Map: 100%|██████████| 497/497 [00:03<00:00, 133.51 examples/s]


In [32]:
# Rename columns for the model
tokenized_datasets = tokenized_datasets.rename_column('Category', 'labels')

        FineTuning BERT MODEL

In [33]:
# Define the model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(df['Category'].unique()))


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Smaller batch size
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
)


In [35]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
)

# Train the model
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  0%|          | 0/747 [04:42<?, ?it/s]
  1%|▏         | 10/747 [00:26<34:51,  2.84s/it]

{'loss': 3.1809, 'grad_norm': 2.9100518226623535, 'learning_rate': 1.9732262382864794e-05, 'epoch': 0.04}


  3%|▎         | 20/747 [00:57<36:54,  3.05s/it]

{'loss': 3.1491, 'grad_norm': 2.923863649368286, 'learning_rate': 1.9464524765729587e-05, 'epoch': 0.08}


  4%|▍         | 30/747 [01:28<25:40,  2.15s/it]

{'loss': 3.1587, 'grad_norm': 4.233206748962402, 'learning_rate': 1.9196787148594377e-05, 'epoch': 0.12}


  5%|▌         | 40/747 [02:05<34:10,  2.90s/it]

{'loss': 3.1081, 'grad_norm': 3.5770742893218994, 'learning_rate': 1.8929049531459173e-05, 'epoch': 0.16}


  7%|▋         | 50/747 [02:35<33:27,  2.88s/it]

{'loss': 3.1415, 'grad_norm': 4.214943885803223, 'learning_rate': 1.8661311914323962e-05, 'epoch': 0.2}


  8%|▊         | 60/747 [03:11<43:24,  3.79s/it]

{'loss': 3.102, 'grad_norm': 4.16871452331543, 'learning_rate': 1.8393574297188755e-05, 'epoch': 0.24}


  9%|▉         | 70/747 [03:44<38:56,  3.45s/it]

{'loss': 3.0986, 'grad_norm': 4.134307384490967, 'learning_rate': 1.8125836680053548e-05, 'epoch': 0.28}


 11%|█         | 80/747 [04:18<35:24,  3.19s/it]

{'loss': 3.0514, 'grad_norm': 4.6423468589782715, 'learning_rate': 1.785809906291834e-05, 'epoch': 0.32}


 12%|█▏        | 90/747 [05:13<1:22:44,  7.56s/it]

{'loss': 2.9064, 'grad_norm': 4.755126476287842, 'learning_rate': 1.7590361445783134e-05, 'epoch': 0.36}


 13%|█▎        | 100/747 [05:41<32:18,  3.00s/it] 

{'loss': 2.9116, 'grad_norm': 5.1680731773376465, 'learning_rate': 1.7322623828647926e-05, 'epoch': 0.4}


 15%|█▍        | 110/747 [06:00<17:53,  1.69s/it]

{'loss': 2.8142, 'grad_norm': 5.550776958465576, 'learning_rate': 1.705488621151272e-05, 'epoch': 0.44}


 16%|█▌        | 120/747 [06:41<47:05,  4.51s/it]  

{'loss': 2.654, 'grad_norm': 5.2187347412109375, 'learning_rate': 1.6787148594377512e-05, 'epoch': 0.48}


 17%|█▋        | 130/747 [07:18<35:53,  3.49s/it]

{'loss': 2.6305, 'grad_norm': 4.750263690948486, 'learning_rate': 1.65194109772423e-05, 'epoch': 0.52}


 19%|█▊        | 140/747 [07:47<28:51,  2.85s/it]

{'loss': 2.5601, 'grad_norm': 5.743870258331299, 'learning_rate': 1.6251673360107098e-05, 'epoch': 0.56}


 20%|██        | 150/747 [08:11<22:07,  2.22s/it]

{'loss': 2.4359, 'grad_norm': 5.371191024780273, 'learning_rate': 1.5983935742971887e-05, 'epoch': 0.6}


 21%|██▏       | 160/747 [08:37<22:42,  2.32s/it]

{'loss': 2.4488, 'grad_norm': 6.8319597244262695, 'learning_rate': 1.571619812583668e-05, 'epoch': 0.64}


 23%|██▎       | 170/747 [08:59<20:14,  2.10s/it]

{'loss': 2.2346, 'grad_norm': 5.112199783325195, 'learning_rate': 1.5448460508701473e-05, 'epoch': 0.68}


 24%|██▍       | 180/747 [09:21<21:45,  2.30s/it]

{'loss': 2.1327, 'grad_norm': 8.144155502319336, 'learning_rate': 1.5180722891566266e-05, 'epoch': 0.72}


 25%|██▌       | 190/747 [09:43<19:32,  2.10s/it]

{'loss': 2.0778, 'grad_norm': 5.815507411956787, 'learning_rate': 1.4912985274431058e-05, 'epoch': 0.76}


 27%|██▋       | 200/747 [10:05<17:53,  1.96s/it]

{'loss': 1.9581, 'grad_norm': 7.544180870056152, 'learning_rate': 1.4645247657295851e-05, 'epoch': 0.8}


 28%|██▊       | 210/747 [10:26<17:32,  1.96s/it]

{'loss': 1.9224, 'grad_norm': 5.469730377197266, 'learning_rate': 1.4377510040160642e-05, 'epoch': 0.84}


 29%|██▉       | 220/747 [10:49<18:47,  2.14s/it]

{'loss': 1.7534, 'grad_norm': 6.203818321228027, 'learning_rate': 1.4109772423025437e-05, 'epoch': 0.88}


 31%|███       | 230/747 [11:14<22:45,  2.64s/it]

{'loss': 1.849, 'grad_norm': 8.506776809692383, 'learning_rate': 1.3842034805890228e-05, 'epoch': 0.92}


 32%|███▏      | 240/747 [11:40<23:44,  2.81s/it]

{'loss': 1.8912, 'grad_norm': 4.88901424407959, 'learning_rate': 1.357429718875502e-05, 'epoch': 0.96}


                                                 
 33%|███▎      | 249/747 [12:49<24:24,  2.94s/it]

{'eval_loss': 1.5600042343139648, 'eval_runtime': 42.6709, 'eval_samples_per_second': 11.647, 'eval_steps_per_second': 1.476, 'epoch': 1.0}


 33%|███▎      | 250/747 [12:57<2:24:56, 17.50s/it]

{'loss': 1.8898, 'grad_norm': 9.945436477661133, 'learning_rate': 1.3306559571619812e-05, 'epoch': 1.0}


 35%|███▍      | 260/747 [13:23<24:02,  2.96s/it]  

{'loss': 1.5612, 'grad_norm': 4.561019420623779, 'learning_rate': 1.3038821954484606e-05, 'epoch': 1.04}


 36%|███▌      | 270/747 [13:49<21:56,  2.76s/it]

{'loss': 1.4639, 'grad_norm': 12.285011291503906, 'learning_rate': 1.2771084337349398e-05, 'epoch': 1.08}


 37%|███▋      | 280/747 [14:13<16:59,  2.18s/it]

{'loss': 1.4311, 'grad_norm': 4.894325256347656, 'learning_rate': 1.250334672021419e-05, 'epoch': 1.12}


 39%|███▉      | 290/747 [14:37<17:30,  2.30s/it]

{'loss': 1.7725, 'grad_norm': 8.483829498291016, 'learning_rate': 1.2235609103078983e-05, 'epoch': 1.16}


 40%|████      | 300/747 [15:02<18:26,  2.48s/it]

{'loss': 1.6229, 'grad_norm': 11.671814918518066, 'learning_rate': 1.1967871485943776e-05, 'epoch': 1.2}


 41%|████▏     | 310/747 [15:28<17:56,  2.46s/it]

{'loss': 1.4384, 'grad_norm': 5.945573806762695, 'learning_rate': 1.1700133868808567e-05, 'epoch': 1.24}


 43%|████▎     | 320/747 [16:06<22:34,  3.17s/it]

{'loss': 1.2737, 'grad_norm': 4.494194984436035, 'learning_rate': 1.1432396251673362e-05, 'epoch': 1.29}


 44%|████▍     | 330/747 [16:30<15:50,  2.28s/it]

{'loss': 1.3065, 'grad_norm': 3.8785953521728516, 'learning_rate': 1.1164658634538153e-05, 'epoch': 1.33}


 46%|████▌     | 340/747 [16:53<14:40,  2.16s/it]

{'loss': 1.3543, 'grad_norm': 6.019114971160889, 'learning_rate': 1.0896921017402946e-05, 'epoch': 1.37}


 47%|████▋     | 350/747 [17:19<16:54,  2.56s/it]

{'loss': 1.4024, 'grad_norm': 8.464887619018555, 'learning_rate': 1.0629183400267737e-05, 'epoch': 1.41}


 48%|████▊     | 360/747 [17:44<14:54,  2.31s/it]

{'loss': 1.1916, 'grad_norm': 4.981655597686768, 'learning_rate': 1.0361445783132531e-05, 'epoch': 1.45}


 50%|████▉     | 370/747 [18:29<30:16,  4.82s/it]

{'loss': 1.4297, 'grad_norm': 5.337099075317383, 'learning_rate': 1.0093708165997322e-05, 'epoch': 1.49}


 51%|█████     | 380/747 [19:16<35:28,  5.80s/it]

{'loss': 1.1654, 'grad_norm': 3.564270257949829, 'learning_rate': 9.825970548862117e-06, 'epoch': 1.53}


 52%|█████▏    | 390/747 [19:40<13:04,  2.20s/it]

{'loss': 1.4127, 'grad_norm': 11.199549674987793, 'learning_rate': 9.558232931726908e-06, 'epoch': 1.57}


 54%|█████▎    | 400/747 [20:05<14:05,  2.44s/it]

{'loss': 1.1419, 'grad_norm': 6.073830604553223, 'learning_rate': 9.2904953145917e-06, 'epoch': 1.61}


 55%|█████▍    | 410/747 [20:30<12:37,  2.25s/it]

{'loss': 1.1297, 'grad_norm': 4.942845821380615, 'learning_rate': 9.022757697456494e-06, 'epoch': 1.65}


 56%|█████▌    | 420/747 [20:54<12:18,  2.26s/it]

{'loss': 1.1869, 'grad_norm': 5.375309467315674, 'learning_rate': 8.755020080321286e-06, 'epoch': 1.69}


 58%|█████▊    | 430/747 [21:17<12:18,  2.33s/it]

{'loss': 1.1358, 'grad_norm': 6.109898567199707, 'learning_rate': 8.48728246318608e-06, 'epoch': 1.73}


 59%|█████▉    | 440/747 [21:39<10:49,  2.12s/it]

{'loss': 1.0033, 'grad_norm': 5.252419948577881, 'learning_rate': 8.21954484605087e-06, 'epoch': 1.77}


 60%|██████    | 450/747 [22:16<14:55,  3.01s/it]

{'loss': 0.8987, 'grad_norm': 15.547806739807129, 'learning_rate': 7.951807228915663e-06, 'epoch': 1.81}


 62%|██████▏   | 460/747 [22:39<09:58,  2.08s/it]

{'loss': 1.1064, 'grad_norm': 4.529850482940674, 'learning_rate': 7.684069611780456e-06, 'epoch': 1.85}


 63%|██████▎   | 470/747 [23:02<09:51,  2.14s/it]

{'loss': 1.0841, 'grad_norm': 6.718620777130127, 'learning_rate': 7.416331994645248e-06, 'epoch': 1.89}


 64%|██████▍   | 480/747 [23:23<09:29,  2.13s/it]

{'loss': 1.121, 'grad_norm': 10.236660957336426, 'learning_rate': 7.148594377510041e-06, 'epoch': 1.93}


 66%|██████▌   | 490/747 [23:47<10:12,  2.38s/it]

{'loss': 1.178, 'grad_norm': 7.194690227508545, 'learning_rate': 6.880856760374834e-06, 'epoch': 1.97}


                                                 
 67%|██████▋   | 498/747 [24:44<20:04,  4.84s/it]

{'eval_loss': 0.9441428780555725, 'eval_runtime': 29.6372, 'eval_samples_per_second': 16.769, 'eval_steps_per_second': 2.126, 'epoch': 2.0}


 67%|██████▋   | 500/747 [24:54<45:14, 10.99s/it]

{'loss': 1.0395, 'grad_norm': 4.111725807189941, 'learning_rate': 6.6131191432396255e-06, 'epoch': 2.01}


 68%|██████▊   | 510/747 [25:28<18:46,  4.75s/it]

{'loss': 1.0298, 'grad_norm': 4.505220890045166, 'learning_rate': 6.345381526104418e-06, 'epoch': 2.05}


 70%|██████▉   | 520/747 [25:55<08:54,  2.36s/it]

{'loss': 1.0806, 'grad_norm': 4.324895858764648, 'learning_rate': 6.07764390896921e-06, 'epoch': 2.09}


 71%|███████   | 530/747 [26:22<10:42,  2.96s/it]

{'loss': 1.0216, 'grad_norm': 7.142545223236084, 'learning_rate': 5.809906291834003e-06, 'epoch': 2.13}


 72%|███████▏  | 540/747 [26:47<07:30,  2.18s/it]

{'loss': 1.0722, 'grad_norm': 7.4947099685668945, 'learning_rate': 5.542168674698796e-06, 'epoch': 2.17}


 74%|███████▎  | 550/747 [27:07<06:10,  1.88s/it]

{'loss': 0.9325, 'grad_norm': 9.543574333190918, 'learning_rate': 5.274431057563588e-06, 'epoch': 2.21}


 75%|███████▍  | 560/747 [27:29<06:59,  2.24s/it]

{'loss': 1.1236, 'grad_norm': 4.833688259124756, 'learning_rate': 5.006693440428381e-06, 'epoch': 2.25}


 76%|███████▋  | 570/747 [28:04<09:37,  3.26s/it]

{'loss': 0.9468, 'grad_norm': 5.595343112945557, 'learning_rate': 4.7389558232931736e-06, 'epoch': 2.29}


 78%|███████▊  | 580/747 [28:39<07:31,  2.70s/it]

{'loss': 0.8838, 'grad_norm': 7.898983955383301, 'learning_rate': 4.4712182061579655e-06, 'epoch': 2.33}


 79%|███████▉  | 590/747 [29:37<13:52,  5.30s/it]

{'loss': 0.7982, 'grad_norm': 5.26446008682251, 'learning_rate': 4.203480589022758e-06, 'epoch': 2.37}


 80%|████████  | 600/747 [30:02<05:53,  2.41s/it]

{'loss': 0.9908, 'grad_norm': 12.495499610900879, 'learning_rate': 3.93574297188755e-06, 'epoch': 2.41}


 82%|████████▏ | 610/747 [30:29<05:26,  2.39s/it]

{'loss': 0.8156, 'grad_norm': 10.259331703186035, 'learning_rate': 3.668005354752343e-06, 'epoch': 2.45}


 83%|████████▎ | 620/747 [30:50<04:32,  2.14s/it]

{'loss': 0.8867, 'grad_norm': 4.740422248840332, 'learning_rate': 3.4002677376171355e-06, 'epoch': 2.49}


 84%|████████▍ | 630/747 [31:17<04:53,  2.50s/it]

{'loss': 0.8602, 'grad_norm': 3.0805633068084717, 'learning_rate': 3.132530120481928e-06, 'epoch': 2.53}


 86%|████████▌ | 640/747 [31:42<04:33,  2.56s/it]

{'loss': 0.8778, 'grad_norm': 8.917158126831055, 'learning_rate': 2.8647925033467208e-06, 'epoch': 2.57}


 87%|████████▋ | 650/747 [32:17<05:18,  3.29s/it]

{'loss': 0.8983, 'grad_norm': 14.524096488952637, 'learning_rate': 2.597054886211513e-06, 'epoch': 2.61}


 88%|████████▊ | 660/747 [33:29<09:29,  6.54s/it]

{'loss': 0.8877, 'grad_norm': 2.9894561767578125, 'learning_rate': 2.3293172690763055e-06, 'epoch': 2.65}


 90%|████████▉ | 670/747 [33:55<03:05,  2.41s/it]

{'loss': 0.8219, 'grad_norm': 22.57475471496582, 'learning_rate': 2.061579651941098e-06, 'epoch': 2.69}


 91%|█████████ | 680/747 [34:24<02:58,  2.67s/it]

{'loss': 0.8339, 'grad_norm': 8.913843154907227, 'learning_rate': 1.7938420348058905e-06, 'epoch': 2.73}


 92%|█████████▏| 690/747 [34:50<02:25,  2.56s/it]

{'loss': 0.7257, 'grad_norm': 8.283496856689453, 'learning_rate': 1.526104417670683e-06, 'epoch': 2.77}


 94%|█████████▎| 700/747 [35:15<02:05,  2.66s/it]

{'loss': 0.9058, 'grad_norm': 5.596254825592041, 'learning_rate': 1.2583668005354755e-06, 'epoch': 2.81}


 95%|█████████▌| 710/747 [35:39<01:31,  2.48s/it]

{'loss': 0.6644, 'grad_norm': 3.5727717876434326, 'learning_rate': 9.906291834002677e-07, 'epoch': 2.85}


 96%|█████████▋| 720/747 [36:18<02:07,  4.73s/it]

{'loss': 0.8931, 'grad_norm': 7.4082231521606445, 'learning_rate': 7.228915662650602e-07, 'epoch': 2.89}


 98%|█████████▊| 730/747 [36:43<00:42,  2.48s/it]

{'loss': 0.8498, 'grad_norm': 8.580465316772461, 'learning_rate': 4.5515394912985277e-07, 'epoch': 2.93}


 99%|█████████▉| 740/747 [37:08<00:15,  2.28s/it]

{'loss': 0.7324, 'grad_norm': 4.256271839141846, 'learning_rate': 1.8741633199464527e-07, 'epoch': 2.97}


                                                 
100%|██████████| 747/747 [37:52<00:00,  2.18s/it]

{'eval_loss': 0.8317880630493164, 'eval_runtime': 28.9059, 'eval_samples_per_second': 17.194, 'eval_steps_per_second': 2.179, 'epoch': 3.0}


100%|██████████| 747/747 [37:54<00:00,  3.05s/it]

{'train_runtime': 2275.0019, 'train_samples_per_second': 2.62, 'train_steps_per_second': 0.328, 'train_loss': 1.5812881768467915, 'epoch': 3.0}





TrainOutput(global_step=747, training_loss=1.5812881768467915, metrics={'train_runtime': 2275.0019, 'train_samples_per_second': 2.62, 'train_steps_per_second': 0.328, 'train_loss': 1.5812881768467915, 'epoch': 3.0})

In [46]:
# Save the model
trainer.save_model('./resume_category_model')

In [41]:
from datasets import load_metric

# Load accuracy metric
metric = load_metric("accuracy")

# Define the compute_metrics function
def compute_metrics(p):
    predictions = p.predictions.argmax(axis=-1)  # Get predicted class indices
    return metric.compute(predictions=predictions, references=p.label_ids)

# Initialize Trainer with the compute_metrics function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics
)

# Evaluate the model
eval_results = trainer.evaluate()

# Print evaluation results
print(f"Evaluation results: {eval_results}")

# Access the accuracy
accuracy = eval_results['eval_accuracy']
print(f"Accuracy: {accuracy:.4f}")


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
100%|██████████| 63/63 [00:32<00:00,  1.96it/s]

Evaluation results: {'eval_loss': 0.8317880630493164, 'eval_accuracy': 0.8591549295774648, 'eval_runtime': 40.7013, 'eval_samples_per_second': 12.211, 'eval_steps_per_second': 1.548}
Accuracy: 0.8592





        Testing Example

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the fine-tuned model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained('./resume_category_model')
tokenizer = AutoTokenizer.from_pretrained('./resume_category_model')

# Define the category labels (update these based on your actual labels)
categories = [
    "Accountant", "Advocate", "Agriculture", "Apparell", "Arts", 
    "Automobile", "Aviation", "Banking", "BPO", "Business Development",
    "Chef", "Construction", "Consultant", "Designer", "Digital Media",
    "Engineering", "Finance", "Fitness", "Healthcare", "HR",
    "Information Technology", "Public Relations", "Sales", "Teacher"
]

def preprocess_text(text):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    return inputs

def predict(text):
    inputs = preprocess_text(text)

    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    return predictions.item()

def get_category_name(label_id):
    return categories[label_id]

# Example resume text
example_text = "Auditing and getting finances "
predicted_label_id = predict(example_text)
predicted_category = get_category_name(predicted_label_id)
print(f"The predicted category is: {predicted_category}")


The predicted category is: Banking
