In [26]:
import os
import json

# Đường dẫn tới thư mục chứa các file JSON
folder_path = "/kaggle/input/data001/ResumesJsonAnnotated"
data_ls = []
# Duyệt qua toàn bộ file trong thư mục
for filename in os.listdir(folder_path):
    if filename.endswith(".json"):
        file_path = os.path.join(folder_path, filename)
        
        # Đọc nội dung file JSON
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
            data_ls.append(data)

In [30]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_and_align_labels(text, annotations):
    # tạo mảng nhãn với độ dài bằng số ký tự
    labels = ["O"] * len(text)
    # gán nhãn cho từng ký tự
    for start, end, label in annotations:
        tag = label.split(":")[0]  
        labels[start] = f"B-{tag}" # kí tự bắt đầu là B-SKILL
        for i in range(start + 1, end):
            labels[i] = f"I-{tag}" # kí tự thuộc skill nhưng không phải kí tự bắt đầu sẽ là I-SKILL
    
    # tokenization
    tokenized_inputs = tokenizer(
        text,
        truncation=True,
        return_offsets_mapping=True, # Trả về offset mapping (index cho kí tự bắt đầu và kết thúc của 1 token)
        padding="max_length",
        max_length=512
    )

    # ánh xạ từ char-level sang token-level
    offset_mapping = tokenized_inputs.pop("offset_mapping")
    label_ids = []
    for offsets in offset_mapping:
        if offsets[0] == offsets[1]:
            label_ids.append(-100) # các token đặc biệt như [CLS] hay [SEP] sẽ đc gán là -100 
        else:
            label_ids.append(
                0 if all(l == "O" for l in labels[offsets[0]:offsets[1]]) # nếu ko có SKILL → 0
                else 1  # nếu có SKILL → 1
            )

    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [32]:
import re

def clean_text(text):
    if not isinstance(text, str):
        return ""
    # Loại bỏ surrogate characters (những ký tự Unicode lỗi)
    return re.sub(r'[\ud800-\udfff]', '', text)


In [33]:
ner_inputs = []
for example in data_ls:
    text = clean_text(example["text"])
    ner_inputs.append(tokenize_and_align_labels(text,example['annotations']))

In [40]:
from datasets import Dataset

# chuyển sang format dataset của huggingface
dataset = Dataset.from_list(ner_inputs)

In [42]:
import os

# vô hiệu hóa WANDB để trainer hoạt động được trên kaggle
os.environ["WANDB_DISABLED"] = "true"

In [43]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# dict ánh xạ qua lại giữa nhãn và chỉ số của nó
label2id = {"O": 0, "B-SKILL": 1, "I-SKILL": 2}
id2label = {v: k for k, v in label2id.items()}

# khởi tạo model với tác vụ TokenClassification (NER), sử dụng BERT
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

args = TrainingArguments(
    output_dir="./bert-ner-skill", # đường dẫn lưu checkpoint
    eval_strategy="no", # không evaluate model lúc train 
    learning_rate=1e-5,
    per_device_train_batch_size=8, # mỗi GPU train 8 mẫu trong 1 bước huấn luyện
    num_train_epochs=10,
    weight_decay=0.01, # giảm các trọng số có giá trị quá lớn, giúp model bớt overfitting
    logging_steps=20, # in loss mỗi 20 step
    save_strategy="epoch" # lưu model mỗi epoch
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset,
    tokenizer=tokenizer
)

# huấn luyện model
trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss
20,0.591
40,0.3191
60,0.2706
80,0.2542
100,0.2319
120,0.2089
140,0.1888
160,0.1902
180,0.1701
200,0.168




TrainOutput(global_step=3150, training_loss=0.07892444163087814, metrics={'train_runtime': 3065.5181, 'train_samples_per_second': 16.405, 'train_steps_per_second': 1.028, 'total_flos': 1.314073269974016e+16, 'train_loss': 0.07892444163087814, 'epoch': 10.0})

In [44]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

model_dir = "/kaggle/working/bert-ner-skill/checkpoint-3150"  # đường dẫn đã lưu model

# load model đã train
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForTokenClassification.from_pretrained(model_dir)
# xây dựng pipeline, không cần preprocess text để đưa vào model
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

Device set to use cuda:0


In [46]:
text = """One97 Communications Limited Data Scientist Jan 2019 to Till Date Detect important information from images and redact required fields. YOLO CNN Object-detection, OCR Insights, find anomaly or performance drop in all possible sub-space. Predict the Insurance claim probability. Estimate the premium amount to be charged B.Tech(Computer Science) from SGBAU university in 2017. M.Tech (Computer Science Engineering) from Indian Institute of Technology (IIT), Kanpur in 2019WORK EXPERIENCE EDUCATIONMACY WILLIAMS DATA SCIENTIST Data Scientist working on problems related to market research and customer analysis. I want to expand my arsenal of application building and work on different kinds of problems. Looking for a role where I can work with a coordinative team and exchange knowledge during the process. Java, C++, Python, Machine Learning, Algorithms, Natural Language Processing, Deep Learning, Computer Vision, Pattern Recognition, Data Science, Data Analysis, Software Engineer, Data Analyst, C, PySpark, Kubeflow.ABOUT SKILLS Customer browsing patterns. Predict potential RTO(Return To Origin) orders for e- commerce. Object Detection.PROJECTS ACTIVITES"""
results = ner_pipeline(text)

for r in results:
    print(f"{r['word']} → {r['entity_group']} (score={r['score']:.2f})")

communications → SKILL (score=1.00)
data → SKILL (score=1.00)
scientist → SKILL (score=1.00)
information → SKILL (score=1.00)
images → SKILL (score=1.00)
fields → SKILL (score=1.00)
cnn → SKILL (score=0.99)
detection → SKILL (score=1.00)
o → SKILL (score=1.00)
##cr → SKILL (score=1.00)
insights → SKILL (score=1.00)
performance → SKILL (score=1.00)
space → SKILL (score=1.00)
insurance → SKILL (score=1.00)
probability → SKILL (score=0.95)
premium → SKILL (score=1.00)
tech → SKILL (score=1.00)
computer → SKILL (score=0.89)
science → SKILL (score=0.90)
m → SKILL (score=0.97)
science → SKILL (score=0.58)
engineering → SKILL (score=1.00)
technology → SKILL (score=1.00)
problems → SKILL (score=1.00)
market → SKILL (score=0.98)
research → SKILL (score=0.99)
customer → SKILL (score=1.00)
analysis → SKILL (score=1.00)
application → SKILL (score=1.00)
building → SKILL (score=1.00)
work → SKILL (score=1.00)
can → SKILL (score=1.00)
team → SKILL (score=1.00)
exchange → SKILL (score=1.00)
knowledge 

In [51]:
# hàm gộp các skill (A, ##B -> AB) và trả về danh sánh skill
def get_output(text):
    results = ner_pipeline(text)
    
    skill_ls = []
    leng = 0
    prev_skill = ""
    for r in results:
        cur_skill = r['word']
        if cur_skill[0] == '#':
            cur_skill = cur_skill[2:]
            skill = prev_skill + cur_skill
            skill_ls[len(skill_ls)-1]=skill
        else:
            skill_ls.append(cur_skill)
        prev_skill = cur_skill
    return skill_ls

In [53]:
text = """One97 Communications Limited Data Scientist Jan 2019 to Till Date Detect important information from images and redact required fields. YOLO CNN Object-detection, OCR Insights, find anomaly or performance drop in all possible sub-space. Predict the Insurance claim probability. Estimate the premium amount to be charged B.Tech(Computer Science) from SGBAU university in 2017. M.Tech (Computer Science Engineering) from Indian Institute of Technology (IIT), Kanpur in 2019WORK EXPERIENCE EDUCATIONMACY WILLIAMS DATA SCIENTIST Data Scientist working on problems related to market research and customer analysis. I want to expand my arsenal of application building and work on different kinds of problems. Looking for a role where I can work with a coordinative team and exchange knowledge during the process. Java, C++, Python, Machine Learning, Algorithms, Natural Language Processing, Deep Learning, Computer Vision, Pattern Recognition, Data Science, Data Analysis, Software Engineer, Data Analyst, C, PySpark, Kubeflow.ABOUT SKILLS Customer browsing patterns. Predict potential RTO(Return To Origin) orders for e- commerce. Object Detection.PROJECTS ACTIVITES"""
skill_ls = get_output(text)

In [54]:
# hàm so sánh giữa candidate và company
def comparision(cd,cp):
    cd_ls = get_output(cd)
    cp_ls = get_output(cp)

    match = []
    not_match = []

    for rq in cp_ls:
        matched = False
        for skill in cd_ls:
            if(rq == skill):
                match.append(rq)
                matched = True
                break
        if(matched == False):
            not_match.append(rq)
    return match, not_match

In [55]:
cp = """User of Bitwarden
Startup experience
Open source experience
Experience in SQL Server, Azure, Node.js, Electron, RabbitMQ, Angular, .NET Core, web browser extensions"""

cd = """ Motivated to build a career in development.
Proactive attitude with curiosity to experiment.
Eagerness to learn modern frameworks such as React, Angular, or Django.
Effective communication skills in English.
Familiarity with GitHub or version control is a plus.
Comfortable with self-paced online learning"""

match, not_match = comparision(cd,cp)

In [56]:
print(match)

['angular']


In [57]:
print(not_match)

['startup', 'experience', 'open', 'source', 'experience', 'sql', 'server', 'azure', 'node', '.', 'js', 'electron', 'mq', '.', 'net', 'core', 'web', 'browser', 'extensions']


In [58]:
import shutil

shutil.make_archive("bert-base-uncased-10-epochs", "zip", "/kaggle/working/bert-ner-skill/checkpoint-3150")

'/kaggle/working/bert-base-uncased-10-epochs.zip'