In [109]:
# === Step 1: Import libraries === 
from sentence_transformers import SentenceTransformer, util 
import numpy as np
import pandas as pd
import os
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
import string
import torch

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [110]:
# === Step 2: Define competency framework (blocks) ===
block_path = os.path.join("..", "Data", "Competency_block.csv")
block_df = pd.read_csv(block_path)

block_df.head()

Unnamed: 0,Job,Competences
0,Data Analyst,"data analyst, data, analytics, data cleaning, ..."
1,Data Scientist,"data scientist, data science, data, machine le..."
2,Machine Learning Engineer,"machine learning engineer, ML engineer, machin..."
3,NLP Engineer,"nlp engineer, natural language processing, NLP..."
4,Computer Vision Engineer,"computer vision engineer, CV engineer, compute..."


In [111]:
# Transform in dictionnary
block_dict = block_df.set_index('Job').to_dict('index')

# Séparer les compétences en liste
block_dict = {job: val['Competences'].split(', ') for job, val in block_dict.items()}

In [112]:
print(block_dict)

{'Data Analyst': ['data analyst', 'data', 'analytics', 'data cleaning', 'data visualization', 'statistics', 'SQL', 'Power BI', 'Excel', 'Python', 'dashboards', 'attention to detail', 'communication', 'problem solving', 'critical thinking', 'teamwork'], 'Data Scientist': ['data scientist', 'data science', 'data', 'machine learning', 'model evaluation', 'feature engineering', 'Python', 'statistics', 'data wrangling', 'deep learning', 'analytical thinking', 'curiosity', 'experimentation', 'collaboration'], 'Machine Learning Engineer': ['machine learning engineer', 'ML engineer', 'machine learning', 'model deployment', 'MLOps', 'TensorFlow', 'PyTorch', 'model optimization', 'cloud ML', 'data pipelines', 'problem solving', 'innovation', 'adaptability', 'teamwork'], 'NLP Engineer': ['nlp engineer', 'natural language processing', 'NLP', 'transformers', 'BERT', 'text classification', 'tokenization', 'semantic similarity', 'language models', 'creativity', 'linguistic intuition', 'attention to d

In [113]:
# === Step 3: load the user input ===
id_path = os.path.join("..", "Data", "id.txt")
with open(id_path, "r") as f:
    last_id = int(f.read().strip())

user_input_path = os.path.join("..", "Data", "User_input", f"{last_id}_profile.csv")
user_input_df = pd.read_csv(user_input_path)

user_input_df.head()

Unnamed: 0,experiences,interests,qualities,python_level,sql_level,html_level,css_level,hadoop_level,cloud_level
0,"I have worked as a data scientist, performing ...","I am interested in UX/UI design, creating intu...","Creativity, empathy, attention to detail, prob...",4,3,5,5,2,2


In [114]:
# === Step 4: Cleaning the user input ===
column_cleaning = user_input_df.columns

#List of stop words
stop_words = set(stopwords.words('english'))

for col in column_cleaning:
    # To lower case
    user_input_df[col] = user_input_df[col].astype(str).str.lower()
    
    #Delete punctuation
    user_input_df[col] = user_input_df[col].apply(
        lambda x: re.sub(f"[{string.punctuation}]", " ", x)
    )
    
    #Delete stopwords
    user_input_df[col] = user_input_df[col].apply(
        lambda x: " ".join([word for word in x.split() if word not in stop_words])
    )
    
user_input_df.head()

Unnamed: 0,experiences,interests,qualities,python_level,sql_level,html_level,css_level,hadoop_level,cloud_level
0,worked data scientist performing data analysis...,interested ux ui design creating intuitive aes...,creativity empathy attention detail problem so...,4,3,5,5,2,2


In [115]:
# Transform user input in list
user_input = []

for col in column_cleaning:
    user_input.append(user_input_df[col].iloc[0])

print(user_input)

['worked data scientist performing data analysis machine learning statistical modeling python sql dashboards data visualization', 'interested ux ui design creating intuitive aesthetic interfaces prototyping wireframing usability testing figma adobe xd responsive design interaction design', 'creativity empathy attention detail problem solving analytical thinking collaboration adaptability curiosity', '4', '3', '5', '5', '2', '2']


In [116]:
# === Step 4: Load SBERT model for embeddings === 
# tester plusieurs models
model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")  # Model optimised for semantic analysis

# Helper : encode proprement un texte en 2D tensor normalisé
def encode_text(text):
    emb = model.encode(text, convert_to_tensor=True)
    emb = emb.unsqueeze(0) if emb.dim() == 1 else emb
    return emb

# Encode parties textuelles principales
xp_embeddings = encode_text(user_input[0]).mean(dim=0)          # expérience
interet_embeddings = encode_text(user_input[1]).mean(dim=0)     # intérêts
qual_embeddings = encode_text(user_input[2]).mean(dim=0)        # qualités

# Encode chaque skill individuel
skills = ["python", "sql", "html", "css", "hadoop", "cloud"]
skill_embeddings = {s: encode_text(s) for s in skills}

# Pondération des parties principales (texte)
xp_w, interet_w, qual_w = 0.2, 0.4, 0.05

# Pondération des skills techniques selon le niveau utilisateur
def skill_weight(level):
    # Niveau d'importance selon la maîtrise
    mapping = {1: 0, 2: 0, 3: 0, 4: 0.9, 5: 1.0}
    return mapping.get(level, 0.1) 


user_embeddings = xp_w * xp_embeddings + interet_w * interet_embeddings + qual_w * qual_embeddings
user_embeddings = user_embeddings + skill_weight(user_input_df["python_level"].iloc[0]) * skill_embeddings["python"]
user_embeddings = user_embeddings + skill_weight(user_input_df["sql_level"].iloc[0]) * skill_embeddings["sql"]
user_embeddings = user_embeddings + skill_weight(user_input_df["html_level"].iloc[0]) * skill_embeddings["html"]
user_embeddings = user_embeddings + skill_weight(user_input_df["css_level"].iloc[0]) * skill_embeddings["css"]
user_embeddings = user_embeddings + skill_weight(user_input_df["hadoop_level"].iloc[0]) * skill_embeddings["hadoop"]
user_embeddings = user_embeddings + skill_weight(user_input_df["cloud_level"].iloc[0]) * skill_embeddings["cloud"]

user_embeddings = torch.nn.functional.normalize(user_embeddings, p=2, dim=0)

In [117]:
# === Step 5: Calculate semantic similarity for each block === 
block_scores = {} 

for block, competencies in block_dict.items(): 
    # Encode competency block phrases 
    block_embeddings = model.encode(competencies, convert_to_tensor=True) 
     
    # Compare each user input to competencies using cosine similarity 
    similarities = util.cos_sim(user_embeddings, block_embeddings) 

     # Take max similarity per user input and average across inputs 
    max_similarities = [float(sim.max()) for sim in similarities]   
    block_score = np.mean(max_similarities)
    
    block_scores[block] = block_score


# Obtain top 3 job similarity
top_3_blocks = sorted(block_scores.items(), key=lambda x: x[1], reverse=True)

print(top_3_blocks)

[('Data Engineer', np.float64(0.5459343194961548)), ('Web Designer', np.float64(0.540837287902832)), ('Data Scientist', np.float64(0.5384271740913391)), ('UX/UI Designer', np.float64(0.5370036959648132)), ('Frontend Developer', np.float64(0.535009503364563)), ('Data Analyst', np.float64(0.5281891226768494)), ('Machine Learning Engineer', np.float64(0.5163092613220215)), ('Business Intelligence Developer', np.float64(0.5015260577201843)), ('AI Researcher', np.float64(0.4896921217441559)), ('Software Architect', np.float64(0.4813024401664734)), ('Backend Developer', np.float64(0.4750705659389496)), ('Mobile Developer', np.float64(0.4649183750152588)), ('NLP Engineer', np.float64(0.4621198773384094)), ('Computer Vision Engineer', np.float64(0.4619518518447876)), ('Cloud Engineer', np.float64(0.4614993929862976)), ('Full Stack Engineer', np.float64(0.4503398537635803)), ('Product Manager', np.float64(0.4461837410926819)), ('Database Administrator', np.float64(0.4461837410926819)), ('Cybers