In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import DistilBertTokenizer, DistilBertModel
import torch

# Charger le tokenizer BERT
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Charger le modèle BERT pré-entraîné
model = DistilBertModel.from_pretrained("distilbert-base-uncased")

# Charger les données
job_descriptions = pd.read_csv("jobs_descriptions.csv")
resume = pd.read_csv("resume.csv")

# Nettoyer le texte
def preprocess_text(text):
    # Convertir en chaînes de caractères
    text = text.astype(str)
    # Convertir en minuscules
    text = text.str.lower()
    # Supprimer les caractères spéciaux
    text = text.str.replace(r'[@#&$%]', '', regex=True)
    text = text.str.replace(r'[,.;]', '', regex=True)
    text = text.apply(lambda x: [word for word in x if word.lower()])
    # Lemmatisation
    lemmatizer = WordNetLemmatizer()
    text = text.apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
    # Rejoindre les mots en une seule chaîne
    text = text.apply(lambda x: ' '.join(x))
    return text

# Prétraiter les descriptions d'emploi et les CV
job_descriptions['clean_text'] = preprocess_text(job_descriptions.iloc[:, 2])
resume['clean_text'] = preprocess_text(resume['Resume_str'])

# Fonction pour obtenir les embeddings des phrases avec BERT
def get_embeddings(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0, :]
    return embeddings

# Limiter le nombre de CV à 20
num_resumes = 20
# Batch size for processing resumes
batch_size = 10
# Maximum sequence length
max_seq_length = 64
# Définir un seuil de similarité
threshold = 0.7
# Calculate job embeddings once outside the loop
job_embeddings = get_embeddings(job_descriptions['clean_text'].tolist())

# Attribuer les correspondances en fonction de la similarité
matches = []
for i, job_desc in enumerate(job_descriptions['clean_text']):
    for j in range(0, num_resumes, batch_size):
        # Process resumes in batches
        batch_resume_texts = resume['clean_text'].iloc[j:j+batch_size].tolist()
        batch_resume_embeddings = get_embeddings(batch_resume_texts)

        # Calculate similarity for the current batch
        similarity_scores = cosine_similarity(job_embeddings[i:i+1], batch_resume_embeddings)
        
        # Identify matches in the current batch
        for k in range(len(batch_resume_texts)):
            similarity_score = similarity_scores[0, k]
            if similarity_score > threshold:
                matches.append((i, j+k, similarity_score))

        # Clear variables from memory
        del batch_resume_texts, batch_resume_embeddings, similarity_scores

# Afficher les correspondances (limit to a certain number of matches)
num_matches_to_display = min(10, len(matches))
for match in matches[:num_matches_to_display]:
    print("Job Description ID:", match[0], "| Resume ID:", match[1], "| Similarity:", match[2])


Job Description ID: 0 | Resume ID: 0 | Similarity: 0.86533636
Job Description ID: 0 | Resume ID: 1 | Similarity: 0.9673733
Job Description ID: 0 | Resume ID: 2 | Similarity: 0.96490103
Job Description ID: 0 | Resume ID: 3 | Similarity: 0.84148645
Job Description ID: 0 | Resume ID: 4 | Similarity: 0.93803895
Job Description ID: 0 | Resume ID: 5 | Similarity: 0.93672115
Job Description ID: 0 | Resume ID: 6 | Similarity: 0.8829583
Job Description ID: 0 | Resume ID: 7 | Similarity: 0.84338295
Job Description ID: 0 | Resume ID: 8 | Similarity: 0.9190512
Job Description ID: 0 | Resume ID: 9 | Similarity: 0.74939895
