<a href="https://colab.research.google.com/github/t84210016-oss/77/blob/main/job_intent_training_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q transformers datasets accelerate scikit-learn pandas torch

In [2]:

import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.metrics import accuracy_score, f1_score

In [3]:

URL = "https://raw.githubusercontent.com/t84210016-oss/77/refs/heads/main/posts_1000_diverse.csv"
df = pd.read_csv(URL)

label_map = {"job_offer": 1, "job_seeker": 0}
df["label"] = df["target"].map(label_map)

df = df[["post", "label"]]
df.head()

Unnamed: 0,post,label
0,DataForge recrute un(e) stagiaire PFE Data Sci...,1
1,IBM is seeking a software engineer to join our...,1
2,Je m'appelle Amine √É¬©tudiant en 3√É¬®me ann√É¬©e ...,0
3,Oracle is looking for a cloud engineer to join...,1
4,Hello my name is Yassine and I'm a web develo...,0


In [4]:

dataset = Dataset.from_pandas(df)

dataset = dataset.train_test_split(
    test_size=0.2,
    seed=42
)

dataset

DatasetDict({
    train: Dataset({
        features: ['post', 'label'],
        num_rows: 722
    })
    test: Dataset({
        features: ['post', 'label'],
        num_rows: 181
    })
})

In [5]:

MODEL_NAME = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [6]:

def tokenize(batch):
    return tokenizer(
        batch["post"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

In [7]:

dataset = dataset.map(tokenize, batched=True)
dataset = dataset.remove_columns(["post"])
dataset.set_format("torch")

dataset

Map:   0%|          | 0/722 [00:00<?, ? examples/s]

Map:   0%|          | 0/181 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 722
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 181
    })
})

In [8]:

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds)
    }

In [10]:

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,
    report_to="none"
)

In [11]:

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [12]:

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.00207,1.0,1.0
2,0.188400,0.000376,1.0,1.0
3,0.001300,0.000228,1.0,1.0
4,0.000500,0.00014,1.0,1.0
5,0.000400,0.000124,1.0,1.0
6,0.000300,0.000109,1.0,1.0
7,0.000200,0.000112,1.0,1.0
8,0.000200,9.3e-05,1.0,1.0
9,0.000200,9.1e-05,1.0,1.0
10,0.000200,8.7e-05,1.0,1.0


TrainOutput(global_step=460, training_loss=0.020843357444265048, metrics={'train_runtime': 916.3691, 'train_samples_per_second': 7.879, 'train_steps_per_second': 0.502, 'total_flos': 474915454924800.0, 'train_loss': 0.020843357444265048, 'epoch': 10.0})

In [17]:

trainer.evaluate()

{'eval_loss': 8.731401612749323e-05,
 'eval_accuracy': 1.0,
 'eval_f1': 1.0,
 'eval_runtime': 1.2115,
 'eval_samples_per_second': 149.405,
 'eval_steps_per_second': 9.905,
 'epoch': 10.0}

In [14]:

def predict(text):
    # Detect device of the model (CPU or GPU)
    device = next(model.parameters()).device

    # Tokenize and move inputs to the same device as the model
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

    # Forward pass
    with torch.no_grad():
        outputs = model(**inputs)

    # Get probabilities and predicted label
    probs = torch.softmax(outputs.logits, dim=1)
    label = torch.argmax(probs).item()

    return "hiring" if label == 1 else "job_seeker"

# Test predictions
print(predict("Recherche d√©veloppeur React pour startup"))
print(predict("Disponible pour un poste de data analyst"))
print(predict("Hiring backend engineer ASAP"))

hiring
hiring
hiring


In [15]:

trainer.save_model("job_intent_model")
tokenizer.save_pretrained("job_intent_model")

('job_intent_model/tokenizer_config.json',
 'job_intent_model/special_tokens_map.json',
 'job_intent_model/sentencepiece.bpe.model',
 'job_intent_model/added_tokens.json',
 'job_intent_model/tokenizer.json')

In [16]:
# -----------------------------
# 1Ô∏è‚É£ Install packages
# -----------------------------
!pip install -q transformers torch pandas requests

# -----------------------------
# 2Ô∏è‚É£ Imports
# -----------------------------
import pandas as pd
import requests
import torch
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from google.colab import files  # for download

# -----------------------------
# 3Ô∏è‚É£ Read JSON from GitHub
# -----------------------------
url = "https://raw.githubusercontent.com/anan181991ba-glitch/000/refs/heads/main/linkedin_emails.json"
data = requests.get(url).json()

# Convert JSON to DataFrame
records = [{"email": email, **info} for email, info in data.items()]
df = pd.DataFrame(records)

# -----------------------------
# 4Ô∏è‚É£ Load your saved model
# -----------------------------
model_path = "job_intent_model"  # replace if different

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# -----------------------------
# 5Ô∏è‚É£ Define prediction function
# -----------------------------
def predict_intent(texts, batch_size=16):
    all_labels = []
    label_map = {0: "job_seeker", 1: "job_offer"}  # adjust if your training labels differ

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs)
            preds = torch.argmax(outputs.logits, dim=-1)
            all_labels.extend([label_map[p.item()] for p in preds])
    return all_labels

# -----------------------------
# 6Ô∏è‚É£ Predict
# -----------------------------
posts = df["post_text"].tolist()
df["intent"] = predict_intent(posts)

print(df.head())

# -----------------------------
# 7Ô∏è‚É£ Save to CSV
# -----------------------------
csv_file = "linkedin_email_intent.csv"
df.to_csv(csv_file, index=False)
print(f"‚úÖ Saved CSV: {csv_file}")

# -----------------------------
# 8Ô∏è‚É£ Merge predictions into original JSON
# -----------------------------
# This keeps the original structure and adds "intent"
for _, row in df.iterrows():
    email = row["email"]
    data[email]["intent"] = row["intent"]

# Save updated JSON
json_file = "linkedin_email_intent_full.json"
with open(json_file, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print(f"‚úÖ Saved JSON with full structure: {json_file}")

# Optional: show first 5 items
for k, v in list(data.items())[:5]:
    print(k, "->", v)

# -----------------------------
# 9Ô∏è‚É£ Download JSON automatically in Colab
# -----------------------------
files.download(json_file)
print("‚úÖ JSON download should start automatically")

The tokenizer you are loading from 'job_intent_model' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


                            email        keyword  \
0      abdelilah.benhrl@gmail.com  Stage PFE2026   
1     charifachaqri2002@gmail.com  Stage PFE2026   
2             m.habbi@outlook.com  Stage PFE2026   
3  ressources-humaines@casanet.ma  Stage PFE2026   
4           rh@bagile-systems.com  Stage PFE2026   

                                           post_text      intent  
0  üéì Recherche d‚Äôun stage de Projet de Fin d‚Äô√âtud...  job_seeker  
1  √Ä la recherche d‚Äôun stage PFE en √©nergie & √©ne...  job_seeker  
2  √Ä la recherche d‚Äôun Stage de Fin d‚Äô√âtudes (PFE...  job_seeker  
3  üéì Opportunit√© Stage PFE ‚Äì Marketing | F√©vrier ...   job_offer  
4  Salam cher r√©seau, \n\nOn cherche un(e) stagia...   job_offer  
‚úÖ Saved CSV: linkedin_email_intent.csv
‚úÖ Saved JSON with full structure: linkedin_email_intent_full.json
abdelilah.benhrl@gmail.com -> {'keyword': 'Stage PFE2026', 'post_text': 'üéì Recherche d‚Äôun stage de Projet de Fin d‚Äô√âtudes (PFE)\n\nActuelleme

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

‚úÖ JSON download should start automatically
