# Testing the model

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("algiraldohe/lm-ner-linkedin-skills-recognition")
model = AutoModelForTokenClassification.from_pretrained("algiraldohe/lm-ner-linkedin-skills-recognition")

# Set model to evaluation mode
model.eval()

# Input text
text = """
We are looking for a software engineer with experience in Python, machine learning, and cloud computing.
Familiarity with Docker, Kubernetes, and RESTful APIs is a plus.
"""

# Tokenize input text
inputs = tokenizer(text, return_tensors="pt", truncation=True)

# Run inference
with torch.no_grad():
    outputs = model(**inputs)

# Get the predicted token labels
logits = outputs.logits
predictions = torch.argmax(logits, dim=2)

# Convert input IDs to tokens
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
predicted_labels = predictions[0].numpy()

# Get label names from model's config
id2label = model.config.id2label

# Extract skills
extracted_skills = []
current_skill = ""

for token, label_id in zip(tokens, predicted_labels):
    label = id2label[label_id]
    if label != "O":  # 'O' means no entity
        if token.startswith("##"):  # Handle subword tokenization
            current_skill += token[2:]  # Remove the "##" prefix
        else:
            if current_skill:
                extracted_skills.append(current_skill)
            current_skill = token

# Append the last skill
if current_skill:
    extracted_skills.append(current_skill)

# Display the results
print("Extracted skills:")
for skill in extracted_skills:
    print(skill)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/266M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

Extracted skills:
software
python
machine
learning
cloud
computing
kubernetes


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("algiraldohe/lm-ner-linkedin-skills-recognition")
model = AutoModelForTokenClassification.from_pretrained("algiraldohe/lm-ner-linkedin-skills-recognition")

# Set model to evaluation mode
model.eval()

# Input text
text = """
We are looking for a software engineer with experience in Python, machine learning, and cloud computing.
Familiarity with Docker, Kubernetes, and RESTful APIs is a plus.
"""

# Tokenize input text
inputs = tokenizer(text, return_tensors="pt", truncation=True)

# Run inference
with torch.no_grad():
    outputs = model(**inputs)

# Get the predicted token labels
logits = outputs.logits
predictions = torch.argmax(logits, dim=2)

# Convert input IDs to tokens
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
predicted_labels = predictions[0].numpy()

# Get label names from model's config
id2label = model.config.id2label

# Create a list of (token, label) pairs
extracted_skills = []
for token, label_id in zip(tokens, predicted_labels):
    label = id2label[label_id]
    if label != "O":  # 'O' means no entity
        extracted_skills.append((token, label))

# Now group tokens with the same label (for skills)
grouped_skills = []
current_skill = ""

for token, label in extracted_skills:
    if label == "B-SKILL":  # Start of a skill
        if current_skill:
            grouped_skills.append(current_skill)
        current_skill = token
    elif label == "I-SKILL":  # Continuation of a skill
        current_skill += " " + token

# Append the last skill
if current_skill:
    grouped_skills.append(current_skill)

# Display grouped skills
print("Grouped Extracted Skills:")
for skill in grouped_skills:
    print(skill)


Grouped Extracted Skills:


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("algiraldohe/lm-ner-linkedin-skills-recognition")
model = AutoModelForTokenClassification.from_pretrained("algiraldohe/lm-ner-linkedin-skills-recognition")

# Set model to evaluation mode
model.eval()

# Input text
text = """
We are looking for a software engineer with experience in Python, machine learning, and cloud computing.
Familiarity with Docker, Kubernetes, and RESTful APIs is a plus.
"""

# Tokenize input text
inputs = tokenizer(text, return_tensors="pt", truncation=True)

# Run inference
with torch.no_grad():
    outputs = model(**inputs)

# Get the predicted token labels
logits = outputs.logits
predictions = torch.argmax(logits, dim=2)

# Convert input IDs to tokens
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
predicted_labels = predictions[0].numpy()

# Get label names from model's config
id2label = model.config.id2label

# Create a list of (token, label) pairs
extracted_skills = []
for token, label_id in zip(tokens, predicted_labels):
    label = id2label[label_id]
    if label != "O":  # 'O' means no entity
        # Fix the issue with subword tokens starting with '##'
        if token.startswith("##"):
            token = token[2:]  # Remove the '##' prefix to merge subwords
        extracted_skills.append((token, label))

# Now group tokens with the same label (for skills)
grouped_skills = []
current_skill = ""

for token, label in extracted_skills:
    # Handling TECHNICAL and TECHNOLOGY skills
    if label == "B-TECHNICAL" or label == "B-TECHNOLOGY":  # Start of a skill
        if current_skill:
            grouped_skills.append(current_skill)
        current_skill = token
    elif label == "I-TECHNICAL" or label == "I-TECHNOLOGY":  # Continuation of a skill
        current_skill += " " + token

# Append the last skill
if current_skill:
    grouped_skills.append(current_skill)

# Display grouped skills
print("Grouped Extracted Skills:")
for skill in grouped_skills:
    print(skill)


Grouped Extracted Skills:
software
python
machine learning
cloud computing
ku ber net es


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("GalalEwida/lm-ner-skills-extractor_BERT")
model = AutoModelForTokenClassification.from_pretrained("GalalEwida/lm-ner-skills-extractor_BERT")

# Set model to evaluation mode
model.eval()

# Input text
text = """
We are looking for a software engineer with experience in Python, machine learning, and cloud computing.
Familiarity with Docker, Kubernetes, and RESTful APIs is a plus.
"""

# Tokenize input text
inputs = tokenizer(text, return_tensors="pt", truncation=True)

# Run inference
with torch.no_grad():
    outputs = model(**inputs)

# Get the predicted token labels
logits = outputs.logits
predictions = torch.argmax(logits, dim=2)

# Convert input IDs to tokens
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
predicted_labels = predictions[0].numpy()

# Get label names from model's config
id2label = model.config.id2label

# Create a list of (token, label) pairs
extracted_skills = []
for token, label_id in zip(tokens, predicted_labels):
    label = id2label[label_id]
    if label != "O":  # 'O' means no entity
        # Fix the issue with subword tokens starting with '##'
        if token.startswith("##"):
            token = token[2:]  # Remove the '##' prefix to merge subwords
        extracted_skills.append((token, label))

# # Now group tokens with the same label (for skills)
# grouped_skills = []
# current_skill = ""

# for token, label in extracted_skills:
#     if label == "B-SKILL":  # Start of a skill
#         if current_skill:
#             grouped_skills.append(current_skill)
#         current_skill = token
#     elif label == "I-SKILL":  # Continuation of a skill
#         current_skill += " " + token

# # Append the last skill
# if current_skill:
#     grouped_skills.append(current_skill)

# # Display grouped skills
# print("Grouped Extracted Skills:")
# for skill in grouped_skills:
#     print(skill)
for skill in extracted_skills:
    print(skill)


('software', 'B-TECHNICAL')
('Python', 'B-TECHNICAL')
('machine', 'B-TECHNICAL')
('learning', 'I-TECHNICAL')
('cloud', 'B-TECHNICAL')
('computing', 'I-TECHNICAL')


# Fine tuning the model

## Step 1 : Preprocess Initial json, add labels to skills

In [None]:
from transformers import BertTokenizerFast
import json
import re

with open("job_descriptions_202.json", "r") as f:
    data = json.load(f)

def label_tokens_lower(text, skills, tokenizer):
    text = text.lower()
    skills = [s.lower() for s in skills]

    # Find character-level spans of each skill in the lowercased text
    spans = []
    for skill in skills:
        for match in re.finditer(re.escape(skill), text):
            spans.append((match.start(), match.end()))

    # Tokenize the lowercased text
    tokenized = tokenizer(text, return_offsets_mapping=True, add_special_tokens=False)
    tokens = tokenized.tokens()
    offsets = tokenized["offset_mapping"]

    labels = []
    for (start, end) in offsets:
        label = "O"
        for s_start, s_end in spans:
            if start >= s_start and end <= s_end:
                label = "B-TECHNICAL" if start == s_start else "I-TECHNICAL"
                break
        labels.append(label)

    return {"tokens": tokens, "labels": labels}

# Process all job descriptions
processed = [label_tokens_lower(ad["job_description"], ad["skills"], tokenizer) for ad in data]

# Save as .json
with open("processed_skills_datasets.json", "w", encoding="utf-8") as f:
    json.dump(processed, f, indent=2, ensure_ascii=False)

print("✅ Dataset saved to 'processed_skills_dataset.json' with lowercase tokenization and proper labeling.")

✅ Dataset saved to 'processed_skills_dataset.json' with lowercase tokenization and proper labeling.


## Step 2 : Load the preprocessed json and convert it to hugging face datasets format

In [None]:
from datasets import load_dataset

# Load the dataset from the converted JSON file
dataset = load_dataset("json", data_files="/kaggle/input/job-desc/processed_skills_datasets.json")


Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
dataset['train']

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 202
})

## Step 3: Prepare Labels & Tokenizer

In [None]:
from transformers import AutoTokenizer

# Load model tokenizer
tokenizer = AutoTokenizer.from_pretrained("GalalEwida/lm-ner-skills-extractor_BERT")

# Create a label list from the dataset
label_list = sorted({label for row in dataset["train"] for label in row["labels"]})
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}


tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
label_list

['B-TECHNICAL', 'I-TECHNICAL', 'O']

## Step 4: Tokenization & Label Alignment

In [None]:
def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    word_ids = tokenized_inputs.word_ids()

    labels = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            labels.append(label2id[example["labels"][word_idx]])
        else:
            labels.append(label2id[example["labels"][word_idx]])
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply to dataset
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=False)


Map:   0%|          | 0/202 [00:00<?, ? examples/s]

In [None]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 202
    })
})

In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=7b31fd67fb4aa0b592b25e9627c7cf36aee355ced087a2bd216741dea98a6aae
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


## Step 5 : Setup model training with Trainer, initialize the model, the data collator, evaluation metrics, training arguments

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from transformers import EarlyStoppingCallback
import numpy as np
import evaluate
model = AutoModelForTokenClassification.from_pretrained(
    "GalalEwida/lm-ner-skills-extractor_BERT",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

data_collator = DataCollatorForTokenClassification(tokenizer)
metric = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
training_args = TrainingArguments(
    output_dir="/kaggle/working/skill-ner-model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=2,
    report_to="none",
)


config.json:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at GalalEwida/lm-ner-skills-extractor_BERT and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]



In [None]:
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

In [None]:
model.config.id2label


{0: 'B-TECHNICAL', 1: 'I-TECHNICAL', 2: 'O'}

## Step 6 : Split dataset

In [None]:
tokenized_dataset = tokenized_dataset["train"].train_test_split(test_size=0.2)


## Step 7 : Initialize Trainer

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)


  trainer = Trainer(


## Step 8 : Train the model

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3706,0.369704,0.361111,0.490566,0.416,0.830415
2,0.2041,0.242304,0.5,0.632075,0.558333,0.902951
3,0.1663,0.183411,0.627907,0.764151,0.689362,0.936468
4,0.1199,0.174524,0.664062,0.801887,0.726496,0.931466
5,0.0668,0.168868,0.716667,0.811321,0.761062,0.941971
6,0.0663,0.138006,0.726562,0.877358,0.794872,0.95948
7,0.0442,0.114481,0.805085,0.896226,0.848214,0.967984
8,0.0241,0.132306,0.780488,0.90566,0.838428,0.96098
9,0.0315,0.124025,0.798319,0.896226,0.844444,0.967484
10,0.0312,0.130429,0.793388,0.90566,0.845815,0.963982


TrainOutput(global_step=210, training_loss=0.11937544771603176, metrics={'train_runtime': 50.7623, 'train_samples_per_second': 31.716, 'train_steps_per_second': 4.137, 'total_flos': 66032635006854.0, 'train_loss': 0.11937544771603176, 'epoch': 10.0})

In [None]:
trainer.save_model("./kaggle/working/final_skill_model")
tokenizer.save_pretrained("./kaggle/working/final_skill_model")

('./kaggle/working/final_skill_model/tokenizer_config.json',
 './kaggle/working/final_skill_model/special_tokens_map.json',
 './kaggle/working/final_skill_model/vocab.txt',
 './kaggle/working/final_skill_model/added_tokens.json',
 './kaggle/working/final_skill_model/tokenizer.json')

## Step 9 : Test the model through inference

In [None]:
def tokenize_input(text):
    return tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        is_split_into_words=False
    )


In [None]:
import torch
def extract_skills(text):
    # Move the model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    trainer.model.to(device)  # Ensure the model is on the correct device

    # Tokenize input
    inputs = tokenize_input(text)

    # Move input tensors to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Run the model on the input
    outputs = trainer.model(**inputs)

    # Get predictions
    predictions = torch.argmax(outputs.logits, dim=2)

    # Convert tokens to words and predictions to labels
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    prediction_labels = [label_list[pred.item()] for pred in predictions[0]]

    # Extract skills based on the B- and I- labels
    skills = []
    current_skill = []

    for token, label in zip(tokens, prediction_labels):
        if label.startswith("B-"):
            if current_skill:
                skills.append(" ".join(current_skill))
                current_skill = []
            current_skill.append(token)
        elif label.startswith("I-") and current_skill:
            current_skill.append(token)
        else:
            if current_skill:
                skills.append(" ".join(current_skill))
                current_skill = []

    if current_skill:
        skills.append(" ".join(current_skill))

    # Clean up tokens (remove ## and fix spacing)
    clean_skills = [tokenizer.convert_tokens_to_string(skill.split()) for skill in skills]
    return clean_skills


In [None]:
text = "Experienced in Python, React Native, and machine learning using PyTorch and TensorFlow."
print(extract_skills(text.lower()))


['python', 'react native', 'machine learning', 'pytorch', 'tensorflow']


In [None]:
text1="We are looking for an experienced Full Stack Developer to join our team. The ideal candidate should have expertise in frontend technologies such as React and Angular, as well as backend development with Node.js and Python. Knowledge of database management systems like MySQL and MongoDB is essential. Additionally, experience in cloud services (e.g., AWS, Azure) and containerization (e.g., Docker, Kubernetes) is highly desirable. The candidate should also be familiar with RESTful APIs, GraphQL, and have hands-on experience in DevOps practices. A solid understanding of agile methodologies, version control systems (e.g., Git), and unit testing frameworks (e.g., Jest, Mocha) will be a plus.We expect the candidate to be comfortable working in CI/CD pipelines and to have experience in monitoring and debugging applications in production environments."

In [None]:
print(extract_skills(text1.lower()))

['react', 'angular', 'node. js', 'python', 'database management', 'mysql', 'mongodb', 'cloud services', 'aws', 'containerization', 'docker', 'kubernetes', 'graphql', 'devops', 'version control', 'git', 'unit testing', 'jest', 'mocha', 'ci', 'cd pipelines', 'debugging']


In [None]:
print(extract_skills(text1.lower()))


['angular', 'python', 'database', 'mysql', 'mongodb', 'cloud services', 'aws', 'containerization', 'kubernetes', 'graphql', 'devops', 'version control', 'git', 'unit testing', 'ci', 'cd pipelines', 'debugging']


In [None]:
text2="Looking for a talented full-stack developer with experience in React, Node.js, Express, and MongoDB to build and maintain web applications. You should be proficient in JavaScript, HTML5, CSS3, and RESTful APIs. Experience with AWS, Docker, and Kubernetes is a plus. Familiarity with version control systems like Git is required. Knowledge of Agile methodologies, CI/CD pipelines, and testing frameworks like Jest or Mocha is essential."

In [None]:
print(extract_skills(text2.lower()))


['react', 'node. js', 'mongodb', 'javascript', 'html5', 'aws', 'kubernetes', 'version control', 'git', 'agile methodologies', 'ci', 'cd pipelines', 'jest', 'mocha']


In [None]:
print(extract_skills(text2.lower()))


['react', 'mongodb', 'javascript', 'html5', 'aws', 'kubernetes', 'version control', 'git', 'agile methodologies', 'ci', 'cd pipelines']


In [None]:
job_description = """
We are looking for a Senior Full Stack Developer with extensive experience in web and mobile application development. The ideal candidate will have strong proficiency in JavaScript frameworks, including React and Angular, and will be well-versed in building scalable web applications. You will be working on cutting-edge technologies and must have experience with cloud services like AWS, Azure, and Google Cloud Platform. In addition, familiarity with DevOps practices, containerization, and microservices is a must.

You should also be comfortable working with databases such as MySQL, PostgreSQL, and MongoDB, and be proficient in both SQL and NoSQL databases. A strong understanding of RESTful APIs, GraphQL, and serverless architectures is essential. Familiarity with agile development practices, continuous integration, and automated testing frameworks like Jest and Mocha is required. Experience with CI/CD pipelines is a must.

In addition to front-end and back-end development, knowledge of infrastructure as code (IAC) tools such as Terraform and CloudFormation, and experience in deployment using Kubernetes and Docker is essential. You should also be experienced in building and deploying applications in a microservices architecture.

Experience with machine learning frameworks like TensorFlow or PyTorch will be a plus, as the company plans to integrate AI-driven features into its products. Additionally, knowledge of version control tools, specifically Git, is required.

Skills:
- Languages: JavaScript, TypeScript, Python, Java
- Frontend: React, Angular, Vue.js, HTML5, CSS3, Tailwind CSS
- Backend: Node.js, Express, Django, Flask, Spring Boot
- Databases: MySQL, PostgreSQL, MongoDB, Redis, Cassandra
- Cloud: AWS, Azure, Google Cloud, Heroku
- Containerization: Docker, Kubernetes
- CI/CD: Jenkins, GitLab CI, CircleCI, Travis CI
- Version Control: Git, GitHub, Bitbucket
- DevOps: Jenkins, Ansible, Puppet, Terraform
- APIs: RESTful APIs, GraphQL, WebSockets
- Testing: Jest, Mocha, Cypress, Selenium
- Machine Learning: TensorFlow, PyTorch, Scikit-learn
- Others: Agile, Scrum, TDD, BDD, Microservices, Serverless, IaC, Nginx, Apache
"""


In [None]:
print(extract_skills(job_description.lower()))


['javascript frameworks', 'react', 'angular', 'cloud services', 'aws', 'google', 'devops', 'containerization', 'microservices', 'mysql', 'postgresql', 'mongodb', 'sql', 'nosql', 'graphql', 'agile development', 'continuous integration', 'automated testing', 'jest', 'mocha', 'ci', 'cd pipelines', 'infrastructure as code', 'iac', 'terraform', 'kubernetes', 'machine learning', 'tensorflow', 'pytorch', 'version control', 'git', 'javascript', 'typescript', 'python', 'react', 'angular', 'html5', 'node. js', 'django']


In [None]:
data_job_description = """
We are looking for a Data Scientist to join our growing team and help us extract valuable insights from large datasets. The ideal candidate will have strong experience with statistical analysis, machine learning algorithms, and data visualization techniques. You should be proficient in Python, R, and SQL, with a deep understanding of data manipulation, data wrangling, and exploratory data analysis (EDA).

You will be responsible for developing predictive models using supervised and unsupervised machine learning techniques, including regression, classification, clustering, and anomaly detection. A solid understanding of advanced algorithms such as decision trees, random forests, gradient boosting, and neural networks is required. Familiarity with deep learning frameworks like TensorFlow, Keras, and PyTorch is a plus.

The role also requires experience in time series analysis and forecasting, as well as working with big data platforms such as Hadoop and Spark. You should be comfortable working with cloud technologies, particularly AWS, Google Cloud, or Azure, and using platforms such as SageMaker or Databricks for model training and deployment.

Experience with version control tools like Git, and the ability to collaborate with cross-functional teams in an agile environment is essential. You will also need to work with business stakeholders to define key metrics, develop dashboards, and communicate findings clearly.

Skills:
- Programming: Python, R, SQL, Bash
- Machine Learning: Scikit-learn, XGBoost, LightGBM, CatBoost, TensorFlow, Keras, PyTorch
- Data Manipulation: Pandas, NumPy, Dask
- Data Visualization: Matplotlib, Seaborn, Plotly, Tableau, PowerBI
- Big Data: Hadoop, Spark, Hive, Pig
- Cloud: AWS, Google Cloud, Azure, Databricks
- Version Control: Git, GitHub, GitLab
- Time Series: ARIMA, Prophet, LSTM, SARIMA
- Statistical Analysis: Hypothesis testing, A/B testing, Bayesian statistics
- Deployment: Flask, FastAPI, Docker, Kubernetes
- Databases: MySQL, PostgreSQL, MongoDB, Cassandra, Redshift
- Other: Agile, Scrum, CI/CD, Data Warehousing, ETL, NLP
"""


In [None]:
print(extract_skills(data_job_description.lower()))


['statistical analysis', 'machine learning', 'data visualization', 'python', 'r', 'sql', 'data manipulation', 'data wrangling', 'exploratory data analysis', 'eda', 'machine learning', 'anomaly detection', 'decision trees', 'random forests', 'gradient boosting', 'deep learning', 'tensorflow', 'keras', 'pytorch', 'time series analysis', 'forecasting', 'big data', 'hadoop', 'aws', 'google cloud', 'databricks', 'model training', 'version control', 'git', 'dashboards', 'python', 'r', 'sql', 'bash', 'lightgbm', 'catboost', 'tensorflow', 'keras', 'pandas', 'numpy', 'matplotlib', 'seaborn', 'plotly', 'tableau', 'hadoop']


In [None]:
import shutil

# Define the model directory path
model_dir = '/kaggle/working/kaggle/working/final_skill_model'

# Specify the destination zip file path
zip_file = '/kaggle/working/final_skill_model.zip'

# Create a zip file of the model directory
shutil.make_archive(zip_file.replace('.zip', ''), 'zip', model_dir)


'/kaggle/working/final_skill_model.zip'