In [1]:
!pip install transformers datasets accelerate evaluate

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch
import string
import evaluate
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from datasets import Dataset, load_dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer,DataCollatorWithPadding,AutoModelForSequenceClassification, TrainingArguments, Trainer,pipeline

In [3]:
df=pd.read_csv('/content/drive/MyDrive/Resume Data/Resume.csv')
df.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [4]:
def preprocess(example):
  example=' '.join(example.lower().split())
  translation_table = str.maketrans('', '', string.punctuation)
  cleaned_string = example.translate(translation_table)
  example=''.join(cleaned_string.split("－"))
  return example

In [5]:
df['Resume_str']=df['Resume_str'].map(preprocess)

In [6]:
df.drop(['ID','Resume_html'],axis=1,inplace=True)
df.columns=['text','label']
df.head()

Unnamed: 0,text,label
0,hr administratormarketing associate hr adminis...,HR
1,hr specialist us hr operations summary versati...,HR
2,hr director summary over 20 years experience i...,HR
3,hr specialist summary dedicated driven and dyn...,HR
4,hr manager skill highlights hr skills hr depar...,HR


In [7]:
class2id = {class_:id for id, class_ in enumerate(df['label'].unique())}
id2class = {id:class_ for class_, id in class2id.items()}

In [8]:
df['label']=df['label'].map(class2id)

In [9]:
df.head()

Unnamed: 0,text,label
0,hr administratormarketing associate hr adminis...,0
1,hr specialist us hr operations summary versati...,0
2,hr director summary over 20 years experience i...,0
3,hr specialist summary dedicated driven and dyn...,0
4,hr manager skill highlights hr skills hr depar...,0


In [10]:
train_df, test_df = train_test_split(df, test_size=0.4, random_state=42)
test_df, validation_df = train_test_split(test_df, test_size=0.5, random_state=42)

In [11]:
train = train_df.to_dict(orient='list')
test = test_df.to_dict(orient='list')
validation = validation_df.to_dict(orient='list')

In [12]:
custom_dataset_train= Dataset.from_dict(train)
custom_dataset_test= Dataset.from_dict(test)
custom_dataset_validation= Dataset.from_dict(validation)

In [13]:
dataset_dict={'train':custom_dataset_train,'test':custom_dataset_test,'validation':custom_dataset_validation}

In [14]:
custom_dataset= DatasetDict(dataset_dict)

In [15]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [16]:
def tokenization(examples):
    return tokenizer(examples["text"], max_length=512, truncation=True)

In [17]:
tokenized_custom_dataset = custom_dataset.map(tokenization, batched=True)

Map:   0%|          | 0/1490 [00:00<?, ? examples/s]

Map:   0%|          | 0/497 [00:00<?, ? examples/s]

Map:   0%|          | 0/497 [00:00<?, ? examples/s]

In [18]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", num_labels=24, id2label=id2class, label2id=class2id)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
training_args = TrainingArguments(
    output_dir="job_classification",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=20,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='epoch',
)

In [20]:
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [21]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_custom_dataset["train"],
    eval_dataset=tokenized_custom_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [22]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,3.0208,2.655298,0.561368
2,2.2337,1.732931,0.780684
3,1.4745,1.149205,0.841046
4,1.0285,0.854499,0.851107
5,0.7548,0.714011,0.857143
6,0.5907,0.627397,0.861167
7,0.4588,0.611805,0.859155
8,0.3837,0.592365,0.861167
9,0.3141,0.604601,0.853119
10,0.271,0.610016,0.849095


TrainOutput(global_step=3740, training_loss=0.6022422178543825, metrics={'train_runtime': 1862.3543, 'train_samples_per_second': 16.001, 'train_steps_per_second': 2.008, 'total_flos': 3949077248409600.0, 'train_loss': 0.6022422178543825, 'epoch': 20.0})

In [23]:
model.save_pretrained("bert-resume-classification")
tokenizer.save_pretrained("tokenizer")
tokenizer = AutoTokenizer.from_pretrained("/content/tokenizer")
model = AutoModelForSequenceClassification.from_pretrained("bert-resume-classification")

In [27]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]

In [28]:
def calculate_metric_on_test_ds(dataset, model, tokenizer,batch_size=16, column_text="text",column_summary="label"):
    predictions=[]
    labels=[]
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(zip(article_batches, target_batches), total=len(article_batches)):
        inputs = tokenizer(article_batch, max_length=512, padding=True,truncation=True, return_tensors="pt")
        with torch.no_grad():
            logits = model(**inputs).logits
        for tensor in logits:
          predictions.append(tensor.argmax().item())
        labels.extend(target_batch)
    return predictions,labels

In [29]:
predictions,labels=calculate_metric_on_test_ds(custom_dataset['test'],model,tokenizer)

100%|██████████| 32/32 [08:34<00:00, 16.09s/it]


In [30]:
accuracy_score(labels, predictions)

0.8329979879275654

In [31]:
predictions

[6,
 18,
 3,
 16,
 1,
 7,
 12,
 6,
 10,
 14,
 19,
 2,
 17,
 1,
 21,
 21,
 5,
 17,
 18,
 1,
 6,
 15,
 3,
 17,
 15,
 3,
 21,
 19,
 7,
 0,
 1,
 3,
 17,
 22,
 14,
 19,
 5,
 7,
 3,
 6,
 1,
 5,
 23,
 10,
 19,
 2,
 7,
 5,
 22,
 18,
 12,
 19,
 18,
 13,
 18,
 18,
 19,
 19,
 6,
 11,
 7,
 7,
 20,
 7,
 16,
 21,
 18,
 17,
 16,
 2,
 6,
 16,
 4,
 23,
 12,
 21,
 19,
 5,
 19,
 6,
 10,
 0,
 5,
 18,
 12,
 22,
 12,
 7,
 23,
 20,
 13,
 18,
 4,
 19,
 16,
 4,
 9,
 15,
 6,
 4,
 4,
 1,
 15,
 12,
 17,
 3,
 4,
 21,
 23,
 4,
 15,
 4,
 20,
 14,
 8,
 14,
 14,
 2,
 18,
 12,
 20,
 16,
 11,
 22,
 8,
 14,
 21,
 19,
 11,
 11,
 3,
 2,
 2,
 10,
 2,
 16,
 7,
 17,
 22,
 0,
 15,
 14,
 0,
 15,
 23,
 16,
 19,
 11,
 8,
 18,
 19,
 21,
 14,
 20,
 4,
 1,
 12,
 19,
 0,
 6,
 22,
 12,
 8,
 3,
 16,
 1,
 12,
 22,
 4,
 21,
 21,
 19,
 19,
 4,
 11,
 21,
 19,
 2,
 11,
 7,
 17,
 3,
 6,
 6,
 10,
 22,
 22,
 5,
 19,
 4,
 15,
 7,
 10,
 17,
 16,
 2,
 15,
 21,
 6,
 7,
 16,
 12,
 19,
 3,
 22,
 20,
 2,
 18,
 12,
 5,
 10,
 2,
 17,
 1,
 17,
 2,
 2,
 