## Import libraries

In [1]:
import os
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from natsort import natsorted

from sdv.metadata import Metadata
from sdv.single_table import CTGANSynthesizer, GaussianCopulaSynthesizer, CopulaGANSynthesizer
from sdmetrics.visualization import get_column_plot
from sdv.sampling import Condition
from sdv.evaluation.single_table import run_diagnostic, evaluate_quality
from sdv.evaluation.single_table import get_column_plot

import utils

pd.set_option('display.max_columns', None)

## Data Loading

In [2]:
# File path to the Excel file
file_path = 'Dataset_2.0_Akkodis.xlsx'

# Import the dataset into a pandas DataFrame
original_data = pd.read_excel(file_path)

In [3]:
original_data.head()

Unnamed: 0,ID,Candidate State,Age Range,Residence,Sex,Protected category,TAG,Study area,Study Title,Years Experience,Sector,Last Role,Year of insertion,Year of Recruitment,Recruitment Request,Assumption Headquarters,Job Family Hiring,Job Title Hiring,event_type__val,event_feedback,linked_search__key,Overall,Job Description,Candidate Profile,Years Experience.1,Minimum Ral,Ral Maximum,Study Level,Study Area.1,Akkodis headquarters,Current Ral,Expected Ral,Technical Skills,Standing/Position,Comunication,Maturity,Dynamism,Mobility,English
0,71470,Hired,31 - 35 years,TURIN » Turin ~ Piedmont,Male,,"AUTOSAR, CAN, C, C++, MATLAB/SIMULINK, VECTOR/...",Automation/Mechatronics Engineering,Five-year degree,[1-3],Automotive,Diagnostic/Test engineer,[2018],[2021],E/E Diagnostic Integration Engineer - Automotive,Milan,Engineering,Consultant,Candidate notification,,,,"The candidate, inserted within a multidiscipli...",The ideal candidate has a degree in Electronic...,[1-3],26-28K,30-32K,Five-year degree,electronic Engineering,Modena,22-24 K,24-26 K,,,,,,,
1,71470,Hired,31 - 35 years,TURIN » Turin ~ Piedmont,Male,,"AUTOSAR, CAN, C, C++, MATLAB/SIMULINK, VECTOR/...",Automation/Mechatronics Engineering,Five-year degree,[1-3],Automotive,Diagnostic/Test engineer,[2018],[2021],E/E Diagnostic Integration Engineer - Automotive,Milan,Engineering,Consultant,BM interview,,RS18.0145,,"The candidate, inserted within a multidiscipli...",The ideal candidate has a degree in Electronic...,[1-3],26-28K,30-32K,Five-year degree,electronic Engineering,Modena,22-24 K,24-26 K,,,,,,,
2,71470,Hired,31 - 35 years,TURIN » Turin ~ Piedmont,Male,,"AUTOSAR, CAN, C, C++, MATLAB/SIMULINK, VECTOR/...",Automation/Mechatronics Engineering,Five-year degree,[1-3],Automotive,Diagnostic/Test engineer,[2018],[2021],E/E Diagnostic Integration Engineer - Automotive,Milan,Engineering,Consultant,Contact note,,,,"The candidate, inserted within a multidiscipli...",The ideal candidate has a degree in Electronic...,[1-3],26-28K,30-32K,Five-year degree,electronic Engineering,Modena,22-24 K,24-26 K,,,,,,,
3,71470,Hired,31 - 35 years,TURIN » Turin ~ Piedmont,Male,,"AUTOSAR, CAN, C, C++, MATLAB/SIMULINK, VECTOR/...",Automation/Mechatronics Engineering,Five-year degree,[1-3],Automotive,Diagnostic/Test engineer,[2018],[2021],E/E Diagnostic Integration Engineer - Automotive,Milan,Engineering,Consultant,BM interview,OK,RS18.0114,~ 2 - Medium,"The candidate, inserted within a multidiscipli...",The ideal candidate has a degree in Electronic...,[1-3],26-28K,30-32K,Five-year degree,electronic Engineering,Modena,22-24 K,24-26 K,2.0,2.0,1.0,2.0,2.0,3.0,3.0
4,71470,Hired,31 - 35 years,TURIN » Turin ~ Piedmont,Male,,"AUTOSAR, CAN, C, C++, MATLAB/SIMULINK, VECTOR/...",Automation/Mechatronics Engineering,Five-year degree,[1-3],Automotive,Diagnostic/Test engineer,[2018],[2021],E/E Diagnostic Integration Engineer - Automotive,Milan,Engineering,Consultant,Commercial note,,,,"The candidate, inserted within a multidiscipli...",The ideal candidate has a degree in Electronic...,[1-3],26-28K,30-32K,Five-year degree,electronic Engineering,Modena,22-24 K,24-26 K,,,,,,,


In [4]:
original_data.shape

(21377, 39)

In [5]:
original_data.head()

Unnamed: 0,ID,Candidate State,Age Range,Residence,Sex,Protected category,TAG,Study area,Study Title,Years Experience,Sector,Last Role,Year of insertion,Year of Recruitment,Recruitment Request,Assumption Headquarters,Job Family Hiring,Job Title Hiring,event_type__val,event_feedback,linked_search__key,Overall,Job Description,Candidate Profile,Years Experience.1,Minimum Ral,Ral Maximum,Study Level,Study Area.1,Akkodis headquarters,Current Ral,Expected Ral,Technical Skills,Standing/Position,Comunication,Maturity,Dynamism,Mobility,English
0,71470,Hired,31 - 35 years,TURIN » Turin ~ Piedmont,Male,,"AUTOSAR, CAN, C, C++, MATLAB/SIMULINK, VECTOR/...",Automation/Mechatronics Engineering,Five-year degree,[1-3],Automotive,Diagnostic/Test engineer,[2018],[2021],E/E Diagnostic Integration Engineer - Automotive,Milan,Engineering,Consultant,Candidate notification,,,,"The candidate, inserted within a multidiscipli...",The ideal candidate has a degree in Electronic...,[1-3],26-28K,30-32K,Five-year degree,electronic Engineering,Modena,22-24 K,24-26 K,,,,,,,
1,71470,Hired,31 - 35 years,TURIN » Turin ~ Piedmont,Male,,"AUTOSAR, CAN, C, C++, MATLAB/SIMULINK, VECTOR/...",Automation/Mechatronics Engineering,Five-year degree,[1-3],Automotive,Diagnostic/Test engineer,[2018],[2021],E/E Diagnostic Integration Engineer - Automotive,Milan,Engineering,Consultant,BM interview,,RS18.0145,,"The candidate, inserted within a multidiscipli...",The ideal candidate has a degree in Electronic...,[1-3],26-28K,30-32K,Five-year degree,electronic Engineering,Modena,22-24 K,24-26 K,,,,,,,
2,71470,Hired,31 - 35 years,TURIN » Turin ~ Piedmont,Male,,"AUTOSAR, CAN, C, C++, MATLAB/SIMULINK, VECTOR/...",Automation/Mechatronics Engineering,Five-year degree,[1-3],Automotive,Diagnostic/Test engineer,[2018],[2021],E/E Diagnostic Integration Engineer - Automotive,Milan,Engineering,Consultant,Contact note,,,,"The candidate, inserted within a multidiscipli...",The ideal candidate has a degree in Electronic...,[1-3],26-28K,30-32K,Five-year degree,electronic Engineering,Modena,22-24 K,24-26 K,,,,,,,
3,71470,Hired,31 - 35 years,TURIN » Turin ~ Piedmont,Male,,"AUTOSAR, CAN, C, C++, MATLAB/SIMULINK, VECTOR/...",Automation/Mechatronics Engineering,Five-year degree,[1-3],Automotive,Diagnostic/Test engineer,[2018],[2021],E/E Diagnostic Integration Engineer - Automotive,Milan,Engineering,Consultant,BM interview,OK,RS18.0114,~ 2 - Medium,"The candidate, inserted within a multidiscipli...",The ideal candidate has a degree in Electronic...,[1-3],26-28K,30-32K,Five-year degree,electronic Engineering,Modena,22-24 K,24-26 K,2.0,2.0,1.0,2.0,2.0,3.0,3.0
4,71470,Hired,31 - 35 years,TURIN » Turin ~ Piedmont,Male,,"AUTOSAR, CAN, C, C++, MATLAB/SIMULINK, VECTOR/...",Automation/Mechatronics Engineering,Five-year degree,[1-3],Automotive,Diagnostic/Test engineer,[2018],[2021],E/E Diagnostic Integration Engineer - Automotive,Milan,Engineering,Consultant,Commercial note,,,,"The candidate, inserted within a multidiscipli...",The ideal candidate has a degree in Electronic...,[1-3],26-28K,30-32K,Five-year degree,electronic Engineering,Modena,22-24 K,24-26 K,,,,,,,


In [6]:
original_data.describe(include='all')

Unnamed: 0,ID,Candidate State,Age Range,Residence,Sex,Protected category,TAG,Study area,Study Title,Years Experience,Sector,Last Role,Year of insertion,Year of Recruitment,Recruitment Request,Assumption Headquarters,Job Family Hiring,Job Title Hiring,event_type__val,event_feedback,linked_search__key,Overall,Job Description,Candidate Profile,Years Experience.1,Minimum Ral,Ral Maximum,Study Level,Study Area.1,Akkodis headquarters,Current Ral,Expected Ral,Technical Skills,Standing/Position,Comunication,Maturity,Dynamism,Mobility,English
count,21377.0,21377,21377,21374,21377,85,10647,21332,21377,21377,12214,12214,21377,2389,2095,2382,2382,2382,19787,5846,6325,5984,2118,2091,2120,1169,1528,2120,2120,2120,4156,4119,5955.0,5974.0,5968.0,5964.0,5965.0,5974.0,5944.0
unique,,7,7,2507,2,2,2245,48,7,7,14,2698,6,7,359,18,7,19,13,15,1746,8,350,350,7,15,18,7,26,18,18,18,,,,,,,
top,,Imported,26 - 30 years,TURIN » Turin ~ Piedmont,Male,Article 1,.,Mechanical engineering,Five-year degree,[0],Others,Fresh graduate,[2022],[2022],Powertrain Calibration Engineer,Turin,Engineering,Consultant,CV request,OK,RS19.0367,~ 2 - Medium,"The resource, included in a team dedicated to ...",.,[1-3],24-26K,30-32K,Five-year degree,electronic Engineering,Turin,Not available,Not available,,,,,,,
freq,,7515,10080,3491,16722,76,633,3866,12522,12376,5357,736,8295,699,47,607,1498,860,8706,3597,72,3879,34,44,652,220,250,1454,337,557,1401,1650,,,,,,,
mean,40519.752023,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.177666,2.265149,2.32004,2.285714,2.29606,2.225979,2.764973
std,23615.657709,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.620178,0.573715,0.59193,0.585352,0.588163,0.810831,0.540574
min,15.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,20228.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,2.0,2.0,2.0,2.0,3.0
50%,40413.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,2.0,2.0,2.0,2.0,3.0
75%,61043.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,3.0,3.0,3.0,3.0,3.0,3.0


In [7]:
original_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21377 entries, 0 to 21376
Data columns (total 39 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        21377 non-null  int64  
 1    Candidate State          21377 non-null  object 
 2    Age Range                21377 non-null  object 
 3    Residence                21374 non-null  object 
 4    Sex                      21377 non-null  object 
 5    Protected category       85 non-null     object 
 6    TAG                      10647 non-null  object 
 7    Study area               21332 non-null  object 
 8    Study Title              21377 non-null  object 
 9    Years Experience         21377 non-null  object 
 10   Sector                   12214 non-null  object 
 11   Last Role                12214 non-null  object 
 12   Year of insertion        21377 non-null  object 
 13   Year of Recruitment      2389 non-null   object 
 14   Recru

In [8]:
original_data.nunique()

ID                          12263
 Candidate State                7
 Age Range                      7
 Residence                   2507
 Sex                            2
 Protected category             2
 TAG                         2245
 Study area                    48
 Study Title                    7
 Years Experience               7
 Sector                        14
 Last Role                   2698
 Year of insertion              6
 Year of Recruitment            7
 Recruitment Request          359
 Assumption Headquarters       18
 Job Family Hiring              7
 Job Title Hiring              19
 event_type__val               13
 event_feedback                15
 linked_search__key          1746
 Overall                        8
 Job Description              350
 Candidate Profile            350
 Years Experience.1             7
 Minimum Ral                   15
 Ral Maximum                   18
 Study Level                    7
 Study Area.1                  26
 Akkodis headq

## Data cleaning

In [9]:
original_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21377 entries, 0 to 21376
Data columns (total 39 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        21377 non-null  int64  
 1    Candidate State          21377 non-null  object 
 2    Age Range                21377 non-null  object 
 3    Residence                21374 non-null  object 
 4    Sex                      21377 non-null  object 
 5    Protected category       85 non-null     object 
 6    TAG                      10647 non-null  object 
 7    Study area               21332 non-null  object 
 8    Study Title              21377 non-null  object 
 9    Years Experience         21377 non-null  object 
 10   Sector                   12214 non-null  object 
 11   Last Role                12214 non-null  object 
 12   Year of insertion        21377 non-null  object 
 13   Year of Recruitment      2389 non-null   object 
 14   Recru

In [10]:
for i in sorted(original_data[' Last Role'].unique(), key=str):
    print(i)

 Corse Powertrain Division
 Drone Contest
 Engineer
 Financial Advisor
 Internship and Thesis at University
 OPERATIONS ENGINEER
 Project Manager
 Researcher
 Simulation Manager for Galileo satellites & ground Operations
 System Engineer
 researchers & Data Scientists
-
.
.NET Developer
.NET Full Stack Developer
.NET Programmer Analyst
.NET developer
.NET programmer
/
3D Mechanical Designer
3D PIPING ENGINEER OIL, GAS & REFINING SECTOR
3rd level metalworker employee 
????
ADAS Algorithm Engineer
ADAS Developer
ADAS Engineer
ADAS Passive Safety
ADAS System Resident Engineer
ADAS Validation Engineer
ADAS development internal
ADMINISTRATIVE EMPLOYEE
ADMINISTRATIVE EMPLOYEE - PURCHASING OFFICE
ADMINISTRATIVE EMPLOYEE / GRAPHIC DESIGNER
AERODYNAMICS ENGINEER
AEROSPACE FUNCTIONAL TESTER
AI Software Engineer
AI mobile software engineer
AI&C System Completion Coordinator
AIRWORTHINESS ENGINEER
AIT Consultant
AIT/AIV Engineer
AO NPO Lavockina-Secretariat and Report
ASSOCIATE PROJECT MANAGER
AUT

In [11]:
data = original_data.copy()

def organize_data(data):
    data.columns = data.columns.str.strip()
    data = data.map(lambda x: x.strip() if isinstance(x, str) else x)

    # Drop duplicate rows
    data = data.drop_duplicates()

    # Drop the tilde in the 'Overall' column
    data['Overall'] = data['Overall'].str.lstrip('~ ')

    # Convert the columns 'Year of insertion' and 'Year of Recruitment' to integers
    data['Year of insertion'] = pd.to_numeric(data['Year of insertion'].str.strip('[]'), errors='coerce').astype('Int64')
    data['Year of Recruitment'] = pd.to_numeric(data['Year of Recruitment'].str.strip('[]'), errors='coerce').astype('Int64')
    
    undesired_values = ['????', '-', '.', '/']
    data.loc[data['Last Role'].isin(undesired_values), 'Last Role'] = np.nan

    # Group the same IDs in a unique row
    def group_ids(df):
        # Count non-NaN values in each row
        df['non_nan_count'] = df.notna().sum(axis=1)
        # Keep the row with the highest non-NaN count per ID
        df = df.loc[df.groupby('ID')['non_nan_count'].idxmax()] 
        # Drop helper column
        df = df.drop(columns=['non_nan_count'])
        return df
    
    data = group_ids(data)

    # Transmform the 'Minimum Ral' and 'Maximum Ral' columns to numeric
    def convert_salary(value):
        if pd.isna(value) or value == "Not Avail.":
            return np.nan
        elif value == '- 20K' or value == '- 20 K':
            return int(20000)
        elif value == '+50K' or value == '+ 50K':
            return int(50000)
        elif '-' in value and 'K' in value:
            low, high = map(lambda x: int(x) * 1000, value.replace('K', '').split('-'))
            return int((low + high) / 2)  # Midpoint of range
        else:
            return np.nan 
    
    data['Minimum Ral'] = data['Minimum Ral'].apply(convert_salary)
    data['Ral Maximum'] = data['Ral Maximum'].apply(convert_salary)
    data['Current Ral'] = data['Current Ral'].apply(convert_salary)
    data['Expected Ral'] = data['Expected Ral'].apply(convert_salary)

    # Drop useless columns
    data = data.drop(columns=['linked_search__key'])

    return data

data = organize_data(data)

In [12]:
data.head()

Unnamed: 0,ID,Candidate State,Age Range,Residence,Sex,Protected category,TAG,Study area,Study Title,Years Experience,Sector,Last Role,Year of insertion,Year of Recruitment,Recruitment Request,Assumption Headquarters,Job Family Hiring,Job Title Hiring,event_type__val,event_feedback,Overall,Job Description,Candidate Profile,Years Experience.1,Minimum Ral,Ral Maximum,Study Level,Study Area.1,Akkodis headquarters,Current Ral,Expected Ral,Technical Skills,Standing/Position,Comunication,Maturity,Dynamism,Mobility,English
18731,15,Imported,26 - 30 years,SQUINZANO » Lecce ~ Puglia,Male,,,computer engineering,Three-year degree,[0],,,2023,,,,,,CV request,,,,,,,,,,,,,,,,,,,
18733,36,Imported,20 - 25 years,ALESSANDRIA » Alessandria ~ Piedmont,Male,,,Aeronautical/Aerospace/Astronautics Engineering,Three-year degree,[0],,,2023,,,,,,CV request,,,,,,,,,,,,,,,,,,,
17359,39,Imported,26 - 30 years,BARI » Bari ~ Puglia,Male,,,Automation/Mechatronics Engineering,Five-year degree,[0],,,2022,,,,,,CV request,,,,,,,,,,,,,,,,,,,
6642,41,First contact,31 - 35 years,PERCH DOSIMO » Cremona ~ Lombardy,Male,,,industrial engineering,Five-year degree,[1-3],Others,Business Analyst Consultant,2021,,,,,,,,,,,,,,,,,,,,,,,,,
15905,47,First contact,36 - 40 years,GERMANY » (STATE) ~ (OVERSEAS),Male,,,Electrical Engineering,Five-year degree,[+10],Energy,ET Solutions AG,2022,,,,,,,,,,,,,,,,,,,,,,,,,


In [13]:
data.shape

(12263, 38)

In [14]:
print(len(data['Recruitment Request'].unique()))
print(data['Recruitment Request'].isna().sum())
#print((data['Recruitment Request'].unique()))

354
11841


Colonne TAG e Last Role: riduce i possibili valori delle due colonne

In [15]:
print(len(data['Last Role'].unique()))
print(len(data['TAG'].unique()))

2572
2201


In [16]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
from collections import Counter
 
def preprocess_text(text):
    """Rimuove caratteri speciali e converte il testo in minuscolo."""
    if not isinstance(text, str):
        return ""
    text = re.sub(r'[^a-zA-Z ]', '', text)  # Rimuove caratteri speciali
    return text.lower()
 
# Supponiamo che unique_last_roles e unique_tag siano liste estratte dai dati
unique_last_roles = data['Last Role'].dropna().unique().tolist()
unique_tag = data['TAG'].dropna().unique().tolist()
 
def cluster_and_map_roles(unique_values):
    processed_values = [preprocess_text(value) for value in unique_values]
 
    # Creazione della matrice TF-IDF
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(processed_values)
 
    # Clustering gerarchico
    clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5, linkage='ward')
    clusters = clustering.fit_predict(X.toarray())
 
    # Creazione del dizionario con i cluster
    value_clusters = {}
    for value, cluster_id in zip(unique_values, clusters):
        if cluster_id not in value_clusters:
            value_clusters[cluster_id] = []
        value_clusters[cluster_id].append(value)
 
    # Assegna un nome a ogni cluster basato sulle parole più comuni
    cluster_names = {}
    for cluster_id, values in value_clusters.items():
        words = []
        for value in values:
            words.extend(preprocess_text(value).split())
        common_words = [word for word, count in Counter(words).most_common(2)]
        cluster_names[cluster_id] = "-".join(common_words) if common_words else "Unknown"
 
    return clusters, cluster_names
 
# Clusterizzazione e assegnazione dei nomi per Last Role
clusters_last_roles, cluster_names_last_roles = cluster_and_map_roles(unique_last_roles)
clusters_tags, cluster_names_tags = cluster_and_map_roles(unique_tag)
 
def map_to_cluster_name(value, unique_values, clusters, cluster_names):
    if value in unique_values:
        cluster_id = clusters[unique_values.index(value)]
        return cluster_names.get(cluster_id, value)
    return value

In [17]:
data['Last Role'] = data['Last Role'].apply(lambda role: map_to_cluster_name(role, unique_last_roles, clusters_last_roles, cluster_names_last_roles))
data['TAG'] = data['TAG'].apply(lambda tag: map_to_cluster_name(tag, unique_tag, clusters_tags, cluster_names_tags))

In [18]:
print(len(data['Last Role'].unique()))
print(len(data['TAG'].unique()))

524
400


In [19]:
print(data.shape)

# Clean data from inconsistencies
invalid_mask = (
    ((data['Age Range'] == '< 20 years') & 
     (data['Years Experience'].isin(['[+10]', '[7-10]', '[5-7]']))) |
    ((data['Age Range'] == '20 - 25 years') & 
     (data['Years Experience'] == '[+10]'))
)

# Remove invalid rows
data = data[~invalid_mask].copy()

print(data.shape)

(12263, 38)
(11949, 38)


In [20]:
# Calculate percentage of missing values for each column
missing_percentages = (data.isna().sum() / len(data)) * 100

# Sort percentages in descending order and format output
missing_percentages_sorted = missing_percentages.sort_values(ascending=False)

# Print results
for column, percentage in missing_percentages_sorted.items():
    print(f"{column:<25} {percentage:>6.2f}%")

Protected category         99.65%
Minimum Ral                98.10%
Ral Maximum                97.52%
Candidate Profile          96.57%
Recruitment Request        96.56%
Job Description            96.54%
Akkodis headquarters       96.53%
Study Area.1               96.53%
Study Level                96.53%
Years Experience.1         96.53%
Assumption Headquarters    95.96%
Job Family Hiring          95.96%
Job Title Hiring           95.96%
Year of Recruitment        95.94%
Expected Ral               94.67%
Current Ral                93.75%
event_feedback             78.24%
English                    76.18%
Technical Skills           76.17%
Dynamism                   76.17%
Comunication               76.15%
Maturity                   76.14%
Standing/Position          76.14%
Mobility                   76.13%
Overall                    76.11%
TAG                        73.74%
Last Role                  60.00%
Sector                     59.54%
event_type__val            11.43%
Study area    

## Metadata

In [21]:
metadata = Metadata.detect_from_dataframe(data=data)

# Set column Candidate State to categorical
metadata.update_column(
    column_name='Candidate State',
    sdtype='categorical')

# Set column Last Role to categorical
metadata.update_column(
    column_name='Last Role',
    sdtype='categorical')

metadata.update_column(
    column_name='Residence',
    sdtype='categorical')

'''metadata.update_column( 
    column_name='Year of insertion',
    sdtype='numerical',
    computer_representation='Int64'
    )

metadata.update_column( 
    column_name='Year of Recruitment',
    sdtype='numerical',
    computer_representation='Int64'
    )'''

"metadata.update_column( \n    column_name='Year of insertion',\n    sdtype='numerical',\n    computer_representation='Int64'\n    )\n\nmetadata.update_column( \n    column_name='Year of Recruitment',\n    sdtype='numerical',\n    computer_representation='Int64'\n    )"

In [22]:
metadata

{
    "tables": {
        "table": {
            "columns": {
                "ID": {
                    "sdtype": "id"
                },
                "Candidate State": {
                    "sdtype": "categorical"
                },
                "Age Range": {
                    "sdtype": "categorical"
                },
                "Residence": {
                    "sdtype": "categorical"
                },
                "Sex": {
                    "sdtype": "categorical"
                },
                "Protected category": {
                    "sdtype": "categorical"
                },
                "TAG": {
                    "sdtype": "categorical"
                },
                "Study area": {
                    "sdtype": "categorical"
                },
                "Study Title": {
                    "sdtype": "categorical"
                },
                "Years Experience": {
                    "sdtype": "categorical"
                },
 

In [23]:
metadata.validate()

## Synthesizer

- generare le righe in un colpo: vae/gan
- autoregressione: generazione di colonne una alla volta bsandosi sulle precedenti

In [24]:
synthesizer = GaussianCopulaSynthesizer(
    metadata,
    locales='it_IT',
    default_distribution='beta',
#    verbose=True
)

synthesizer.auto_assign_transformers(data)



## Add Constraints

In [25]:
# load the constraint from the file
synthesizer.load_custom_constraint_classes(
    filepath='custom_constraint_years.py',
    class_names=['CustomYearsHired'] #, 'CustomAgeExperience']
)

YearsHired_constraint = {
    'constraint_class': 'CustomYearsHired',
    'constraint_parameters': {
        'column_names': ['Candidate State', 'Year of insertion', 'Year of Recruitment']
    }
}

'''AgeExperience_constraint = {
    'constraint_class': 'CustomAgeExperience',
    'constraint_parameters': {
        'column_names': ['Age Range', 'Years Experience']
    }
}'''

"AgeExperience_constraint = {\n    'constraint_class': 'CustomAgeExperience',\n    'constraint_parameters': {\n        'column_names': ['Age Range', 'Years Experience']\n    }\n}"

In [26]:
experience_age_constraint = {
    'constraint_class': 'FixedCombinations',
    'constraint_parameters': {
        'column_names': ['Age Range', 'Years Experience']
    }
}

recruitment_constraint = {
    'constraint_class': 'FixedCombinations',
    'constraint_parameters': {
        'column_names': ['Candidate State', 'Year of Recruitment']
    }
}

MinMaxRal_constraint = {
    'constraint_class': 'Inequality',
    'constraint_parameters': {
        'low_column_name': 'Minimum Ral',
        'high_column_name': 'Ral Maximum',
        'strict_boundaries': False
    }
}

synthesizer.add_constraints(constraints=[
    experience_age_constraint,
    recruitment_constraint,
    YearsHired_constraint,
    #AgeExperience_constraint,
    MinMaxRal_constraint,
])

## Fit synthesither and Sample data

In [27]:
synthesizer.fit(data)

synthetic_data = synthesizer.sample(num_rows=1000)

Sampling rows: 100%|██████████| 1000/1000 [00:00<00:00, 4110.59it/s]


In [28]:
'''synthesizer = CTGANSynthesizer(
    metadata,
    locales='it_IT',
    epochs=5
#    verbose=True
)

synthesizer.auto_assign_transformers(data)

synthesizer.fit(data)

synthetic_data = synthesizer.sample(num_rows=1000)'''

"synthesizer = CTGANSynthesizer(\n    metadata,\n    locales='it_IT',\n    epochs=5\n#    verbose=True\n)\n\nsynthesizer.auto_assign_transformers(data)\n\nsynthesizer.fit(data)\n\nsynthetic_data = synthesizer.sample(num_rows=1000)"

In [29]:
synthesizer.get_transformers()


{'ID': AnonymizedFaker(function_name='random_int', function_kwargs={'min': 0, 'max': 16777216}, cardinality_rule='unique'),
 'Residence': UniformEncoder(),
 'Sex': UniformEncoder(),
 'Protected category': UniformEncoder(),
 'TAG': UniformEncoder(),
 'Study area': UniformEncoder(),
 'Study Title': UniformEncoder(),
 'Sector': UniformEncoder(),
 'Last Role': UniformEncoder(),
 'Year of insertion': UniformEncoder(),
 'Recruitment Request': UniformEncoder(),
 'Assumption Headquarters': UniformEncoder(),
 'Job Family Hiring': UniformEncoder(),
 'Job Title Hiring': UniformEncoder(),
 'event_type__val': UniformEncoder(),
 'event_feedback': UniformEncoder(),
 'Overall': UniformEncoder(),
 'Job Description': UniformEncoder(),
 'Candidate Profile': UniformEncoder(),
 'Years Experience.1': UniformEncoder(),
 'Minimum Ral': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'Study Level': UniformEncoder(),
 'Study Area.1': UniformEncoder(),
 'Akkodis headquarters': UniformEn

## Evaluate generation

In [30]:
# 1. perform basic validity checks
diagnostic_report = run_diagnostic(data, synthetic_data, metadata)



data_val = diagnostic_report.get_properties()
data_dict = data_val.set_index('Property')['Score'].to_dict()
print(data_val)
print(data_dict)

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 38/38 [00:00<00:00, 1655.06it/s]|
Data Validity Score: 97.44%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 769.17it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 98.72%

         Property     Score
0   Data Validity  0.974421
1  Data Structure  1.000000
{'Data Validity': 0.974421052631579, 'Data Structure': 1.0}


In [31]:
# 2. measure the statistical similarity
quality_report = evaluate_quality(data, synthetic_data, metadata)


data_qual = quality_report.get_properties()

print(data_qual)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 38/38 [00:00<00:00, 726.11it/s]|
Column Shapes Score: 78.16%

(2/2) Evaluating Column Pair Trends: |██████████| 703/703 [00:01<00:00, 356.17it/s]|
Column Pair Trends Score: 88.23%

Overall Score (Average): 83.19%

             Property     Score
0       Column Shapes  0.781555
1  Column Pair Trends  0.882286


In [32]:
quality_report.get_details(property_name='Column Pair Trends').head(20)

Unnamed: 0,Column 1,Column 2,Metric,Score,Real Correlation,Synthetic Correlation,Error
0,Candidate State,Age Range,ContingencySimilarity,0.826534,,,
1,Candidate State,Residence,ContingencySimilarity,0.398834,,,
2,Candidate State,Sex,ContingencySimilarity,0.937391,,,
3,Candidate State,Protected category,ContingencySimilarity,0.953243,,,
4,Candidate State,TAG,ContingencySimilarity,0.746991,,,
5,Candidate State,Study area,ContingencySimilarity,0.785742,,,
6,Candidate State,Study Title,ContingencySimilarity,0.874811,,,
7,Candidate State,Years Experience,ContingencySimilarity,0.835682,,,
8,Candidate State,Sector,ContingencySimilarity,0.749588,,,
9,Candidate State,Last Role,ContingencySimilarity,0.626214,,,


## Result visualization

In [33]:
from sdv.evaluation.single_table import get_column_pair_plot

fig = get_column_pair_plot(
    real_data=data,
    synthetic_data=synthetic_data,
    metadata=metadata,
    column_names=['Year of insertion', 'Year of Recruitment'],
    #column_names=['Candidate State', 'Year of Recruitment'],
    plot_type='box'
    )
    
fig.show()

fig = get_column_pair_plot(
    real_data=data,
    synthetic_data=synthetic_data,
    metadata=metadata,
    column_names=['Age Range', 'Years Experience'],
    plot_type='box'
    )
    
fig.show()

In [34]:
# 3. plot the data
columns_to_plot = ['Minimum Ral', 'Ral Maximum', 'Current Ral', 'Expected Ral', 'Year of insertion', 'Year of Recruitment', 
                   'Sex', 'Candidate State', 'Age Range', 'Years Experience', 'Technical Skills', 'Standing/Position', 
                   'Comunication', 'Maturity', 'Dynamism', 'Mobility', 'English']
for col in columns_to_plot:
    fig = get_column_plot(
        real_data=data,
        synthetic_data=synthetic_data,
        metadata=metadata,
        column_name=col,
        plot_type='bar'
    )
        
    fig.show()

In [35]:
'''def plot_current_ral_distribution(data, sex):
    """
    Filters the DataFrame for rows where Sex matches the input sex and plots the distribution
    of the 'Current Ral' column, excluding 'Not available' values.
    
    Parameters:
    data (pd.DataFrame): The input DataFrame containing at least the columns 'Sex' and 'Current Ral'.
    sex (str): The sex category to filter for ('Male' or 'Female')
    """
    

    ral_col = 'Expected Ral'

    
    # Print the number of instances with given sex
    print(f"Number of instances with Sex = {sex}: {len(data[data['Sex'] == sex])}")

    # Filter the DataFrame for specified sex and exclude 'Not available'
    filtered_data = data[(data['Sex'] == sex) & (data[ral_col] != 'Not available')]

    # Count and print 'Not available' instances
    not_available_count = len(data[(data['Sex'] == sex) & (data[ral_col] == 'Not available')])
    print(f"Number of 'Not available' instances for {sex}: {not_available_count} ({not_available_count / len(filtered_data) * 100:.2f}%)")

    # Convert range strings like '32-34k' into a numeric lower bound and sort the filtered data
    filtered_data = filtered_data.copy()
    filtered_data['Current_Ral_numeric'] = filtered_data[ral_col].str.extract(r'(\d+)-\d+k', expand=False).astype(float)
    filtered_data = filtered_data.sort_values('Current_Ral_numeric')

    def custom_sort_key(s):
        if isinstance(s, str):
            if s.strip()[0] == '-':
                return (-1, s)
        return (1, s)

    sorted_categories = natsorted(filtered_data[ral_col].dropna().unique(), key=custom_sort_key)
    plt.figure(figsize=(15, 6))
    sns.countplot(data=filtered_data, x=ral_col, order=sorted_categories, palette='pastel', hue=ral_col, legend=False)
    
    plt.xlabel("Current Ral Categories")

    plt.title(f"Distribution of {ral_col} for {sex}s")
    plt.ylabel("Frequency")
    plt.xticks(rotation=45)
    plt.show()

# Example usage:
# Assuming 'synthetic_data' is your DataFrame:
plot_current_ral_distribution(data, 'Male')
plot_current_ral_distribution(data, 'Female')
plot_current_ral_distribution(synthetic_data, 'Male')
plot_current_ral_distribution(synthetic_data, 'Female')'''

'def plot_current_ral_distribution(data, sex):\n    """\n    Filters the DataFrame for rows where Sex matches the input sex and plots the distribution\n    of the \'Current Ral\' column, excluding \'Not available\' values.\n    \n    Parameters:\n    data (pd.DataFrame): The input DataFrame containing at least the columns \'Sex\' and \'Current Ral\'.\n    sex (str): The sex category to filter for (\'Male\' or \'Female\')\n    """\n    \n\n    ral_col = \'Expected Ral\'\n\n    \n    # Print the number of instances with given sex\n    print(f"Number of instances with Sex = {sex}: {len(data[data[\'Sex\'] == sex])}")\n\n    # Filter the DataFrame for specified sex and exclude \'Not available\'\n    filtered_data = data[(data[\'Sex\'] == sex) & (data[ral_col] != \'Not available\')]\n\n    # Count and print \'Not available\' instances\n    not_available_count = len(data[(data[\'Sex\'] == sex) & (data[ral_col] == \'Not available\')])\n    print(f"Number of \'Not available\' instances for {sex

## Polarization

In [36]:
polarization_list = [
        [{"Field": "Sex", "Value": "Female", "Percentage": 25}],
        [{"Field": "Candidate State", "Value": "Hired", "Percentage": 25}],
    ]

final_data, polarized_synthetic_data, remaining_synthetic_data = utils.generate_polarized_data(synthesizer, polarization_list, 1000)

Sampling rows: 100%|██████████| 100000/100000 [00:05<00:00, 18814.92it/s]

Dimensione iniziale del dataset sintetico: 100000
Selezioniamo 250 righe per le condizioni: {'Sex': 'Female'}
Righe disponibili dopo il filtraggio: 22740
Selezioniamo 250 righe per le condizioni: {'Candidate State': 'Hired'}
Righe disponibili dopo il filtraggio: 1873
Dimensione del dataset rimanente dopo tutte le esclusioni: 74867





In [37]:
print(final_data.shape)
percentage_female = (final_data['Sex'] == 'Female').mean() * 100
print(f"Percentage of rows with 'Sex' == 'Female': {percentage_female:.2f}%")
percentage_hired = (final_data['Candidate State'] == 'Hired').mean() * 100
print(f"Percentage of rows with 'Candidate State' == 'Hired': {percentage_hired:.2f}%")
percentage_hired_female = ((final_data['Candidate State'] == 'Hired') & (final_data['Sex'] == 'Female')).mean() * 100
print(f"Percentage of rows with both 'Candidate State' == 'Hired' and 'Sex' == 'Female': {percentage_hired_female:.2f}%")

(1000, 38)
Percentage of rows with 'Sex' == 'Female': 25.00%
Percentage of rows with 'Candidate State' == 'Hired': 25.00%
Percentage of rows with both 'Candidate State' == 'Hired' and 'Sex' == 'Female': 0.00%


In [38]:
polarization_list = [
        [{"Field": "Sex", "Value": "Female", "Percentage": 25},
         {"Field": "Candidate State", "Value": "Hired", "Percentage": 25}],
        [{"Field": "Study Title", "Value": "Five-year degree", "Percentage": 10},
         {"Field": "Assumption Headquarters", "Value": "Milan", "Percentage": 10}]
    ]

final_data, polarized_synthetic_data, remaining_synthetic_data = utils.generate_polarized_data2(synthesizer, polarization_list, 1000)

Tentativo 1/3: Generazione di 100000 righe...


Sampling rows: 100%|██████████| 100000/100000 [00:05<00:00, 16956.77it/s]


Dimensione dataset generato: 100000 righe
Condizioni: {'Sex': 'Female', 'Candidate State': 'Hired'} - Righe richieste: 250 - Disponibili: 192

[33mErrore: Dati insufficienti per {'Sex': 'Female', 'Candidate State': 'Hired'}. Disponibili: 192, richiesti: 250.
Nuovo tentativo (2/3) con più dati...[0m
Tentativo 2/3: Generazione di 200000 righe...


Sampling rows: 100%|██████████| 200000/200000 [00:11<00:00, 17379.15it/s]


Dimensione dataset generato: 200000 righe
Condizioni: {'Sex': 'Female', 'Candidate State': 'Hired'} - Righe richieste: 250 - Disponibili: 413
Condizioni: {'Study Title': 'Five-year degree', 'Assumption Headquarters': 'Milan'} - Righe richieste: 100 - Disponibili: 851
Dataset rimanente: 69345 righe
Dataset finale generato: 1000 righe (attese: 1000)



In [40]:
print(final_data.shape)
percentage_female = (final_data['Sex'] == 'Female').mean() * 100
print(f"Percentage of rows with 'Sex' == 'Female': {percentage_female:.2f}%")
percentage_hired = (final_data['Candidate State'] == 'Hired').mean() * 100
print(f"Percentage of rows with 'Candidate State' == 'Hired': {percentage_hired:.2f}%")
percentage_hired_female = ((final_data['Candidate State'] == 'Hired') & (final_data['Sex'] == 'Female')).mean() * 100
print(f"Percentage of rows with both 'Candidate State' == 'Hired' and 'Sex' == 'Female': {percentage_hired_female:.2f}%")

(1000, 38)
Percentage of rows with 'Sex' == 'Female': 25.00%
Percentage of rows with 'Candidate State' == 'Hired': 25.00%
Percentage of rows with both 'Candidate State' == 'Hired' and 'Sex' == 'Female': 25.00%
