In [1]:
# === Step 1: Import libraries === 
from sentence_transformers import SentenceTransformer, util 
import numpy as np
import pandas as pd
import os
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
import string
import torch

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# === Step 2: Define competency framework (blocks) ===
block_path = os.path.join("..", "Data", "Competency_block.csv")
block_df = pd.read_csv(block_path)

block_df.head()

Unnamed: 0,Job,Competences
0,Data Analyst,"data analyst, data, analytics, data cleaning, ..."
1,Data Scientist,"data scientist, data science, data, machine le..."
2,Machine Learning Engineer,"machine learning engineer, ML engineer, machin..."
3,NLP Engineer,"nlp engineer, natural language processing, NLP..."
4,Computer Vision Engineer,"computer vision engineer, CV engineer, compute..."


In [3]:
# Transform in dictionnary
block_dict = block_df.set_index('Job').to_dict('index')

# Séparer les compétences en liste
block_dict = {job: val['Competences'].split(', ') for job, val in block_dict.items()}

In [4]:
print(block_dict)

{'Data Analyst': ['data analyst', 'data', 'analytics', 'data cleaning', 'data visualization', 'statistics', 'SQL', 'Power BI', 'Excel', 'Python', 'dashboards', 'attention to detail', 'communication', 'problem solving', 'critical thinking', 'teamwork'], 'Data Scientist': ['data scientist', 'data science', 'data', 'machine learning', 'model evaluation', 'feature engineering', 'Python', 'statistics', 'data wrangling', 'deep learning', 'analytical thinking', 'curiosity', 'experimentation', 'collaboration'], 'Machine Learning Engineer': ['machine learning engineer', 'ML engineer', 'machine learning', 'model deployment', 'MLOps', 'TensorFlow', 'PyTorch', 'model optimization', 'cloud ML', 'data pipelines', 'problem solving', 'innovation', 'adaptability', 'teamwork'], 'NLP Engineer': ['nlp engineer', 'natural language processing', 'NLP', 'transformers', 'BERT', 'text classification', 'tokenization', 'semantic similarity', 'language models', 'creativity', 'linguistic intuition', 'attention to d

In [5]:
# === Step 3: load the user input ===
id_path = os.path.join("..", "Data", "id.txt")
with open(id_path, "r") as f:
    last_id = int(f.read().strip())

user_input_path = os.path.join("..", "Data", "User_input", f"{last_id}_profile.csv")
user_input_df = pd.read_csv(user_input_path)

user_input_df.head()

Unnamed: 0,experiences,interests,qualities,python_level,sql_level,html_level,css_level,hadoop_level,cloud_level
0,I have designed and led software architecture ...,I’m interested in optimizing software scalabil...,"Strategic thinking, leadership, problem-solvin...",1,1,5,5,5,5


In [6]:
# === Step 4: Cleaning the user input ===
column_cleaning = user_input_df.columns

#List of stop words
stop_words = set(stopwords.words('english'))

for col in column_cleaning:
    # To lower case
    user_input_df[col] = user_input_df[col].astype(str).str.lower()
    
    #Delete punctuation
    user_input_df[col] = user_input_df[col].apply(
        lambda x: re.sub(f"[{string.punctuation}]", " ", x)
    )
    
    #Delete stopwords
    user_input_df[col] = user_input_df[col].apply(
        lambda x: " ".join([word for word in x.split() if word not in stop_words])
    )
    
user_input_df.head()

Unnamed: 0,experiences,interests,qualities,python_level,sql_level,html_level,css_level,hadoop_level,cloud_level
0,designed led software architecture scalable sy...,i’m interested optimizing software scalability...,strategic thinking leadership problem solving ...,1,1,5,5,5,5


In [7]:
# Transform user input in list
user_input = []

for col in column_cleaning:
    user_input.append(user_input_df[col].iloc[0])

print(user_input)

['designed led software architecture scalable systems using microservices api driven designs collaborate closely development teams ensure reliability security performance across cloud based infrastructures', 'i’m interested optimizing software scalability cloud architecture improving development workflows mentoring teams best design practices', 'strategic thinking leadership problem solving communication foresight technical depth', '1', '1', '5', '5', '5', '5']


In [None]:
# === Step 4: Load SBERT model for embeddings === 
# tester plusieurs models
model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")  # Model optimised for semantic analysis

# Helper : encode proprement un texte en 2D tensor normalisé
def encode_text(text):
    emb = model.encode(text, convert_to_tensor=True)
    emb = emb.unsqueeze(0) if emb.dim() == 1 else emb
    return emb

# Encode parties textuelles principales
xp_embeddings = encode_text(user_input[0]).mean(dim=0)          # expérience
interet_embeddings = encode_text(user_input[1]).mean(dim=0)     # intérêts
qual_embeddings = encode_text(user_input[2]).mean(dim=0)        # qualités

# Pondération des parties principales (texte)
xp_w, interet_w, qual_w = 0.1, 0.2, 0.025
user_embeddings = xp_w * xp_embeddings + interet_w * interet_embeddings + qual_w * qual_embeddings


# Encode chaque skill individuel
skills = ["python", "sql", "html", "css", "hadoop", "cloud"]
selected_skills = []

for skill in skills:
    try:
        level = int(user_input_df[f"{skill}_level"].iloc[0])  # conversion en int
    except (ValueError, TypeError):
        level = 0
    if level > 0:
        selected_skills.extend([skill] * level)  # répéter le mot selon le niveau

if selected_skills:
    skills_text = " ".join(selected_skills)
    skill_embeddings_user = encode_text(skills_text).mean(dim=0)
    user_embeddings = user_embeddings + skill_embeddings_user

# Normalisation finale
user_embeddings = torch.nn.functional.normalize(user_embeddings, p=2, dim=0)


tensor([ 3.3242e-03, -2.7405e-02, -8.1324e-02, -2.2240e-02,  2.2048e-02,
        -1.0248e-02,  2.3623e-02,  4.1710e-02,  1.6754e-02,  3.3481e-02,
        -8.6894e-04,  1.5232e-02,  1.4873e-02,  6.1355e-02,  5.7656e-02,
        -5.2206e-02,  2.8484e-02,  2.0679e-02, -4.4368e-02, -1.2524e-02,
        -2.1733e-02,  1.9150e-02, -1.0194e-02, -1.2623e-02, -3.7936e-03,
         3.3594e-03, -3.4567e-02,  6.2889e-02, -4.2424e-02, -1.2956e-02,
        -1.9595e-02,  1.6870e-02, -4.7724e-02,  4.8805e-02, -1.9027e-05,
        -3.0423e-02, -2.9495e-02, -6.0731e-03, -5.4961e-02,  6.2270e-03,
        -3.4258e-03,  1.4595e-02, -4.2750e-03, -4.3347e-02, -1.8973e-02,
        -1.8267e-02,  2.0362e-04,  3.4545e-03,  7.3118e-02,  2.9631e-02,
         6.4787e-02,  2.7186e-03, -5.0140e-02,  2.5911e-03, -4.7005e-02,
         6.2094e-03, -2.0321e-02,  5.0375e-02,  8.2401e-02, -1.4009e-02,
        -1.4501e-03,  6.5035e-02, -5.3187e-02, -4.7464e-03, -2.1544e-02,
        -8.5656e-03,  9.3390e-03, -2.0636e-02,  3.7

In [None]:
# === Step 5: Calculate semantic similarity for each block === 
block_scores = {} 

for block, competencies in block_dict.items(): 
    # Encode competency block phrases 
    block_embeddings = model.encode(competencies, convert_to_tensor=True) 
     
    # Compare each user input to competencies using cosine similarity 
    similarities = util.cos_sim(user_embeddings, block_embeddings) 

    # Take mean similarity per user input and average across inputs 
    mean_similarities = [float(sim.mean()) for sim in similarities]   
    # block_score = np.mean(max_similarities)    # block_score = np.mean(max_similarities)
    
    block_scores[block] = mean_similarities


# Obtain top 3 job similarity
top_3_blocks = sorted(block_scores.items(), key=lambda x: x[1], reverse=True)

print(top_3_blocks)

[('Data Analyst', [0.4503689706325531]), ('Machine Learning Engineer', [0.44059568643569946]), ('Backend Developer', [0.43886247277259827]), ('Data Engineer', [0.43550118803977966]), ('Data Scientist', [0.43245577812194824]), ('Frontend Developer', [0.4292033612728119]), ('Database Administrator', [0.42364129424095154]), ('Software Architect', [0.4234912693500519]), ('Business Intelligence Developer', [0.41630879044532776]), ('DevOps Engineer', [0.4145994186401367]), ('Cloud Engineer', [0.41342851519584656]), ('Web Designer', [0.408401221036911]), ('UX/UI Designer', [0.39221593737602234]), ('Full Stack Engineer', [0.3856545388698578]), ('NLP Engineer', [0.37821438908576965]), ('Mobile Developer', [0.37151166796684265]), ('AI Researcher', [0.36937376856803894]), ('Computer Vision Engineer', [0.36875468492507935]), ('Cybersecurity Specialist', [0.35414400696754456]), ('Product Manager', [0.3531849980354309])]
