In [76]:
# === Step 1: Import libraries === 
from sentence_transformers import SentenceTransformer, util 
import numpy as np
import pandas as pd
import os
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
import string

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# === Step 2: Define competency framework (blocks) ===
# The competency table is read from Data/Competency_block.csv, resolved
# relative to the parent of the current working directory.
block_path = os.path.join("..", "Data", "Competency_block.csv")
block_df = pd.read_csv(filepath_or_buffer=block_path)

# Preview the first rows to check that the file was parsed as expected.
block_df.head(n=5)

Unnamed: 0,job,competency
0,Data Analyst,data cleaning; data visualization; business re...
1,Data Scientist,predictive modeling; ab testing; feature engin...
2,Machine Learning Engineer,classification; regression; neural networks; m...
3,Data Engineer,etl processes; data pipelines; data warehouse ...
4,NLP Engineer,tokenization; word embeddings; transformers; s...


In [78]:
# Transform the competency table into a dictionary keyed by job title.
# A separate name is used for this intermediate dict-of-records so that
# `block_dict` only ever refers to the final job -> list-of-competencies mapping.
block_records = block_df.set_index('job').to_dict('index')

# Split each ';'-separated competency string into a list of competencies.
# Splitting on ';' and stripping every item (instead of splitting on the exact
# '; ' sequence) keeps the parsing correct when the separator is written
# without a trailing space or with extra whitespace. Empty fragments, such as
# the one left by a trailing ';', are dropped.
block_dict = {
    job: [skill.strip() for skill in record['competency'].split(';') if skill.strip()]
    for job, record in block_records.items()
}

In [79]:
# Show the job -> competency-list mapping to check the parsing result.
print(block_dict)

{'Data Analyst': ['data cleaning', 'data visualization', 'business reporting', 'dashboards', 'excel', 'sql'], 'Data Scientist': ['predictive modeling', 'ab testing', 'feature engineering', 'hypothesis testing', 'statistical modeling'], 'Machine Learning Engineer': ['classification', 'regression', 'neural networks', 'model evaluation', 'deployment pipelines'], 'Data Engineer': ['etl processes', 'data pipelines', 'data warehouse design', 'big data processing', 'distributed systems'], 'NLP Engineer': ['tokenization', 'word embeddings', 'transformers', 'semantic analysis', 'information retrieval'], 'Prompt Engineer': ['prompt design', 'few shot prompting', 'zero shot prompting', 'context optimization', 'instruction tuning']}


In [80]:
# === Step 3: load the user input ===
# The content of id.txt is parsed as an integer id, which selects the
# profile file to load.
data_dir = os.path.join("..", "Data")
id_path = os.path.join(data_dir, "id.txt")
with open(id_path, mode="r") as id_file:
    raw_id = id_file.read()
last_id = int(raw_id.strip())

# Profile files are named "<id>_profile.csv" and live in the User sub-folder.
user_input_path = os.path.join(data_dir, "User", f"{last_id}_profile.csv")
user_input_df = pd.read_csv(user_input_path)

user_input_df.head()

Unnamed: 0,experiences,interests,python_level,sql_level
0,Etudiant ECE majeur Data & IA. Cours de machin...,"J'aimerais travailler dans les LLM, IA Agentic.",4,4


In [81]:
# === Step 4: Cleaning the user input ===
column_cleaning = user_input_df.columns

# List of stop words
stop_words = set(stopwords.words('french'))

# Character class matching any single ASCII punctuation mark.
# `re.escape` is required because string.punctuation contains characters that
# are special inside a character class (']', '\', '^', '-'). The pattern is
# compiled once here instead of being rebuilt for every cell value.
punctuation_pattern = re.compile(f"[{re.escape(string.punctuation)}]")


def clean_text(value):
    """Normalise one cell of the user profile before embedding.

    Steps, in order: convert to a lower-case string, replace every
    punctuation mark with a space, then drop French stop words and collapse
    runs of whitespace.

    Args:
        value: Raw cell value (text or number). Missing values (NaN / None)
            are accepted.

    Returns:
        The cleaned text. Missing values yield an empty string rather than
        the literal text "nan".
    """
    if pd.isna(value):
        return ""
    text = punctuation_pattern.sub(" ", str(value).lower())
    return " ".join(word for word in text.split() if word not in stop_words)


# NOTE(review): every column is cleaned, so the numeric columns
# (python_level, sql_level) are turned into text here as well.
for col in column_cleaning:
    user_input_df[col] = user_input_df[col].apply(clean_text)

user_input_df.head()

Unnamed: 0,experiences,interests,python_level,sql_level
0,etudiant ece majeur data ia cours machine lear...,aimerais travailler llm ia agentic,4,4


In [82]:
# Flatten the profile into a list: one cleaned value per column, in column
# order. Only the first row of the frame is read.
user_input = [user_input_df[col].iloc[0] for col in column_cleaning]

print(user_input)

['etudiant ece majeur data ia cours machine learning projet dataset kaggle projet data visualisation creation insight python matplotlib stage technique sujet machine learning segmentation client sujet banquaire', 'aimerais travailler llm ia agentic', '4', '4']


In [None]:
# === Step 4b: Load SBERT model for embeddings ===
# TODO: try several models
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

# TODO: add the weights and the python / sql levels
# Encode the user inputs; convert_to_tensor=True asks for a tensor result.
user_embeddings = model.encode(user_input, convert_to_tensor=True)

In [85]:
# === Step 5: Calculate semantic similarity for each block ===
def score_block(competencies):
    """Return the mean cosine similarity between the user inputs and one block.

    Args:
        competencies: List of competency phrases describing a single job.

    Returns:
        A Python float: the mean of all cosine similarities computed between
        `user_embeddings` and the embeddings of `competencies`.
    """
    # Encode competency block phrases
    block_embeddings = model.encode(competencies, convert_to_tensor=True)
    # Compare each user input to the competencies using cosine similarity
    pairwise = util.cos_sim(user_embeddings, block_embeddings)
    # Collapse all user-input / competency pairs into one score for the job
    return pairwise.mean().item()


block_scores = {job: score_block(skills) for job, skills in block_dict.items()}

# Rank jobs by decreasing score and keep the three best (job, score) pairs
ranked_jobs = sorted(block_scores, key=block_scores.get, reverse=True)
top_3_blocks = [(job, block_scores[job]) for job in ranked_jobs[:3]]

print(top_3_blocks)

[('Machine Learning Engineer', 0.19374284148216248), ('Data Analyst', 0.16704775393009186), ('NLP Engineer', 0.16627229750156403)]
