In [1]:
import sys
import os
from warnings import warn

# Make the directory above the current working directory importable so that
# the `scripts` package used by this notebook can be resolved.
# NOTE(review): assumes the kernel's working directory is the notebooks folder
# and that its parent is the project root — confirm for other launch setups.
notebooks_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebooks_dir, ".."))
# Guarded so that re-running this cell does not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

from scripts.lang_embeddings import language_tokenizer

  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]


In [2]:
import pickle
import pandas as pd
import numpy as np

# Raw translator records; the next cell groups them by TRANSLATOR.
TRANSLATORS_CSV = "../data/translators.csv"
df = pd.read_csv(TRANSLATORS_CSV)

In [3]:
def _dominant_language(values):
    """Pass the most frequent value of ``values`` through ``language_tokenizer``."""
    return language_tokenizer(values.mode()[0])


# Collapse the records to one row per translator: the dominant (tokenized)
# source/target language and the average hourly rate.
per_translator = df.groupby("TRANSLATOR")
collapsed_df = per_translator.agg(
    SOURCE_LANG=("SOURCE_LANG", _dominant_language),
    TARGET_LANG=("TARGET_LANG", _dominant_language),
    HOURLY_RATE=("HOURLY_RATE", "mean"),
).reset_index()

# Show the collapsed frame as the cell output
collapsed_df

Unnamed: 0,TRANSLATOR,SOURCE_LANG,TARGET_LANG,HOURLY_RATE
0,Aaron,English,Basque,20.700000
1,Abdon,English,Spanish_LA,18.500000
2,Abdon Isaias,English,Spanish_Iberian,16.375000
3,Abdon Luis,English,Spanish_Iberian,16.000000
4,Abel Irene,English,Spanish_Iberian,17.000000
...,...,...,...,...
978,Zacarias Casio,English,Spanish_Argentina,18.181818
979,Zacarias Marcelino,English,Spanish_Global,20.600000
980,Zachary,Catalan,English,22.000000
981,Zlatan,English,Swedish,40.000000


In [4]:
def get_embedding(lang):
    """Return the embedding stored for ``lang``, or a zero vector if it is unknown.

    Reads ``lang_to_index`` (mapping from language to row index) and
    ``embeddings`` (indexed by that row index) from the notebook namespace.
    NOTE(review): neither name is defined in the visible cells — confirm where
    they are expected to come from before calling this helper.

    The fallback vector has length ``embeddings.shape[1]``.
    """
    row = lang_to_index.get(lang)
    if row is None:
        # Unknown language: fall back to an all-zero vector of matching width.
        return np.zeros(embeddings.shape[1])
    return embeddings[row]

In [5]:
def add_most_frequent_category(df: pd.DataFrame, target_df: pd.DataFrame, group_col: str, category_cols: list) -> pd.DataFrame:
    """Attach each group's most frequent category value to a copy of ``target_df``.

    For every column in ``category_cols`` that exists in ``df``, the rows of
    ``df`` are counted per (``group_col``, column) pair, the pair with the
    highest count is kept for each group (``idxmax`` picks the first row on
    ties), and the result is left-merged onto the running copy of
    ``target_df`` using ``group_col`` as the key.

    Args:
        df: Frame holding the observations to count.
        target_df: Frame to enrich; it is copied, not modified.
        group_col: Key column present in both frames.
        category_cols: Names of the categorical columns to summarise.

    Returns:
        A new DataFrame with one added column per processed category column.
        Groups absent from ``df`` receive missing values (left merge).

    Columns missing from ``df`` are skipped with a warning; a progress line is
    printed for each merged column.
    """
    result = target_df.copy()

    for col in category_cols:
        if col in df.columns:
            # Occurrences of every (group, category) combination.
            pair_counts = (
                df.groupby([group_col, col])
                .size()
                .reset_index(name="count")
            )
            # Row label of the largest count within each group.
            top_rows = pair_counts.groupby(group_col)["count"].idxmax()
            modes = pair_counts.loc[top_rows, [group_col, col]]

            # Left merge keeps every row of the target frame.
            result = result.merge(modes, on=group_col, how="left")
            print(f"Merged most frequent '{col}' into target DataFrame.")
        else:
            warn(f"Warning: Column '{col}' not found in the input DataFrame. Skipping.")

    return result


def add_mean_numerical_value(df: pd.DataFrame, target_df: pd.DataFrame, group_col: str, numerical_cols: list) -> pd.DataFrame:
    """Attach per-group means of numeric columns to a copy of ``target_df``.

    For every column in ``numerical_cols`` that exists in ``df`` and has a
    numeric dtype, the mean per ``group_col`` value is computed, stored under
    the name ``"<column>_mean"`` and left-merged onto the running copy of
    ``target_df`` using ``group_col`` as the key.

    Args:
        df: Frame holding the observations to average.
        target_df: Frame to enrich; it is copied, not modified.
        group_col: Key column present in both frames.
        numerical_cols: Names of the numeric columns to average.

    Returns:
        A new DataFrame with one ``"<column>_mean"`` column per processed
        column. Groups absent from ``df`` receive missing values (left merge).

    Missing or non-numeric columns are skipped with a warning; a progress line
    is printed for each merged column.
    """
    result = target_df.copy()

    for col in numerical_cols:
        if col not in df.columns:
            warn(f"Warning: Column '{col}' not found in the input DataFrame. Skipping.")
        elif not pd.api.types.is_numeric_dtype(df[col]):
            warn(f"Warning: Column '{col}' is not of numeric dtype. Skipping mean calculation.")
        else:
            new_col_name = f'{col}_mean'
            # Mean per group, renamed so it cannot clash with a same-named
            # column already present in the target frame.
            group_means = (
                df.groupby(group_col)[col]
                .mean()
                .reset_index()
                .rename(columns={col: new_col_name})
            )

            result = result.merge(group_means, on=group_col, how="left")
            print(f"Merged mean of '{col}' into target DataFrame as '{new_col_name}'.")

    return result

In [6]:
# Task-level records; the aggregation helpers summarise them per TRANSLATOR.
# NOTE(review): the saved output shows a warning pointing at this read_csv
# call — presumably a dtype-inference warning; confirm and consider passing an
# explicit `dtype=` for the affected columns.
TASKS_CSV = "../data/final_data_enhanced.csv"
df_task = pd.read_csv(TASKS_CSV)
df_task

  df_task = pd.read_csv("../data/final_data_enhanced.csv")


Unnamed: 0.1,Unnamed: 0,PROJECT_ID,PM,TASK_ID,START,END,TASK_TYPE,SOURCE_LANG,TARGET_LANG,TRANSLATOR,...,QUALITY_EVALUATION,MANUFACTURER,MANUFACTURER_SECTOR,MANUFACTURER_INDUSTRY_GROUP,MANUFACTURER_INDUSTRY,MANUFACTURER_SUBINDUSTRY,_work_ready,_time_taken,_time_reception,_time_to_close
0,0,212331,PMT,10048285,2013-11-01 13:13:00,2013-10-31 19:00:00,Miscellaneous,Catalan,Catalan,Victor,...,5,SunTech,Information Technology,Technology Hardware & Equipment,"Technology Hardware, Storage & Peripherals","Technology Hardware, Storage & Peripherals",0.000556,0.000556,0.000556,0.000833
1,1,211096,PMT,10048285,2012-10-26 17:24:00,2012-11-30 19:00:00,Miscellaneous,English,Galician,Severino,...,8,NexisOne,Technology Hardware,Technology,"Technology Hardware, Storage & Peripherals","Technology Hardware, Storage & Peripherals",92.501667,743.030556,0.676389,0.000000
2,2,214198,KMT,10048285,2015-09-09 17:29:00,2015-09-10 11:00:00,Engineering,English,Spanish (Iberian),Jeronimo,...,9,HealthyLife,Health Care,Health Care Providers,Health Care Facilities,Long-Term Care Facilities,0.014444,23.109167,0.845556,0.000556
3,3,213494,KMT,10048285,2014-11-26 10:36:00,2014-11-26 15:30:00,Engineering,English,Portuguese (Brazil),Estela,...,7,Coastal Cottage,Consumer Discretionary,Consumer Services,"Hotels, Restaurants & Leisure","Hotels, Resorts & Cruise Lines",0.000556,1.261111,0.027500,0.000556
4,4,212331,PMT,10048286,2013-11-01 13:13:00,2013-10-31 19:00:00,Management,Catalan,Spanish (Global),Maria Alexandra,...,7,SunTech,Information Technology,Technology Hardware & Equipment,"Technology Hardware, Storage & Peripherals","Technology Hardware, Storage & Peripherals",0.000556,0.001389,0.000833,0.000556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
554024,554024,220752,KMT,11240388,2022-11-10 14:48:00,2022-11-10 13:46:00,ProofReading,English,Spanish (Global),Ascension,...,6,Polytech,Materials,Chemicals,Specialty Chemicals,Specialty Chemicals,0.166389,0.164722,0.162500,0.075000
554025,554025,220752,KMT,11240389,2022-11-10 13:44:00,2022-11-10 13:41:00,ProofReading,English,Spanish (Global),Ascension,...,6,Polytech,Materials,Chemicals,Specialty Chemicals,Specialty Chemicals,0.148611,0.146944,0.011389,0.073611
554026,554026,220752,KMT,11240391,2022-11-16 16:55:00,2022-11-16 15:52:00,ProofReading,English,Spanish (Global),Ascension,...,9,Polytech,Materials,Chemicals,Specialty Chemicals,Specialty Chemicals,0.184722,0.200000,0.381111,0.093333
554027,554027,220752,KMT,11240392,2022-11-22 10:35:00,2022-11-22 10:32:00,ProofReading,English,Spanish (Global),Ascension,...,8,Polytech,Materials,Chemicals,Specialty Chemicals,Specialty Chemicals,0.096111,0.111111,1.742222,0.054722


In [7]:
# Task-level columns to summarise per translator.
category_columns = ["MANUFACTURER_INDUSTRY", "TASK_TYPE", "PM"]
numerical_columns = ["HOURS", "HOURLY_RATE", "QUALITY_EVALUATION"]

# Step 1: most frequent value of each categorical column, merged onto the
# collapsed one-row-per-translator frame.
translators_df = add_most_frequent_category(
    df_task, collapsed_df, "TRANSLATOR", category_columns
)

# Step 2: per-translator mean of each numeric column, added as "<column>_mean".
translators_df = add_mean_numerical_value(
    df_task, translators_df, "TRANSLATOR", numerical_columns
)

Merged most frequent 'MANUFACTURER_INDUSTRY' into target DataFrame.
Merged most frequent 'TASK_TYPE' into target DataFrame.
Merged most frequent 'PM' into target DataFrame.
Merged mean of 'HOURS' into target DataFrame as 'HOURS_mean'.
Merged mean of 'HOURLY_RATE' into target DataFrame as 'HOURLY_RATE_mean'.
Merged mean of 'QUALITY_EVALUATION' into target DataFrame as 'QUALITY_EVALUATION_mean'.


In [8]:
# Keep only translators for which a most frequent MANUFACTURER_INDUSTRY was
# found (the left merges above leave missing values where there is no match).
# NOTE(review): the previous comment also mentioned TASK_TYPE, but only
# MANUFACTURER_INDUSTRY is filtered here — confirm whether TASK_TYPE should be
# checked as well.
# The explicit .copy() yields an independent frame, so the rename below and
# the column assignments made on translators_df later in the notebook never
# operate on a filtered slice (which can raise SettingWithCopyWarning on
# pandas without copy-on-write once rows are actually dropped).
translators_df = translators_df[
    translators_df["MANUFACTURER_INDUSTRY"].notna()
].copy()

# Mapping for merge-suffixed hourly-rate columns. With the current helpers the
# task-level mean is stored as 'HOURLY_RATE_mean', so neither key exists and
# the rename leaves the frame unchanged (unknown keys are ignored by default).
renamed = {
    'HOURLY_RATE_x': 'HOURLY_RATE',
    'HOURLY_RATE_y': 'HOURLY_RATE_AVG_TASK',
}

# Non-inplace rename: rebinding keeps the cell safe to re-run.
translators_df = translators_df.rename(columns=renamed)

translators_df

Unnamed: 0,TRANSLATOR,SOURCE_LANG,TARGET_LANG,HOURLY_RATE,MANUFACTURER_INDUSTRY,TASK_TYPE,PM,HOURS_mean,HOURLY_RATE_mean,QUALITY_EVALUATION_mean
0,Aaron,English,Basque,20.700000,Health Care Facilities,DTP,BMT,0.000000,15.000000,5.500000
1,Abdon,English,Spanish_LA,18.500000,Internet & Direct Marketing Retail,PostEditing,PMT,8.651852,14.407407,7.222222
2,Abdon Isaias,English,Spanish_Iberian,16.375000,Automobiles,ProofReading,PMT,3.557578,16.979836,7.007694
3,Abdon Luis,English,Spanish_Iberian,16.000000,IT Services,PostEditing,PMT,5.680000,17.000000,7.000000
4,Abel Irene,English,Spanish_Iberian,17.000000,Application Software,TEST,BMT,0.000000,12.000000,6.500000
...,...,...,...,...,...,...,...,...,...,...
978,Zacarias Casio,English,Spanish_Argentina,18.181818,Automobiles,Translation,PMT,13.514974,12.936508,7.132275
979,Zacarias Marcelino,English,Spanish_Global,20.600000,Health Care Facilities,Translation,PMT,13.898125,12.343750,6.781250
980,Zachary,Catalan,English,22.000000,Internet & Direct Marketing Retail,PostEditing,BMT,12.298667,27.533333,6.800000
981,Zlatan,English,Swedish,40.000000,Industrial Machinery,Translation,KMT,3.264545,44.000000,6.818182


Looking at the DataFrame, we can verify that none of its columns contain missing values.

### Embed Language

Using our pre-made language embeddings.

In [10]:
# The languages are already in our desired format (they were passed through
# language_tokenizer when the per-translator frame was built).

# NOTE(review): these names are imported again from scripts.industry_embeddings
# further down, so this cell must run before the language-embedding cell.
from scripts.lang_embeddings import EmbeddingLookup, load_embedding_data

# Load the stored language embedding data from the scripts folder.
embedding_data = load_embedding_data("../scripts/language_embeddings.pkl")

# Lookup object whose get_vector method is applied to the language columns below.
lookup = EmbeddingLookup(loaded_data=embedding_data)

Successfully loaded embedding data from '../scripts/language_embeddings.pkl'
EmbeddingLookup initialized with 79 languages.
  Original embedding dimension: 384
  Latent embedding dimension: 10


In [11]:
# Look up one embedding vector per row for the source and the target language.
source_lang_embed, target_lang_embed = (
    translators_df[language_col].apply(lookup.get_vector)
    for language_col in ("SOURCE_LANG", "TARGET_LANG")
)

# Store the vectors next to the language labels they were derived from.
translators_df["SOURCE_LANG_EMBED"] = source_lang_embed
translators_df["TARGET_LANG_EMBED"] = target_lang_embed
translators_df

Unnamed: 0,TRANSLATOR,SOURCE_LANG,TARGET_LANG,HOURLY_RATE,MANUFACTURER_INDUSTRY,TASK_TYPE,PM,HOURS_mean,HOURLY_RATE_mean,QUALITY_EVALUATION_mean,SOURCE_LANG_EMBED,TARGET_LANG_EMBED
0,Aaron,English,Basque,20.700000,Health Care Facilities,DTP,BMT,0.000000,15.000000,5.500000,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.033526402, -0.052197512, 0.015763432, -0.01..."
1,Abdon,English,Spanish_LA,18.500000,Internet & Direct Marketing Retail,PostEditing,PMT,8.651852,14.407407,7.222222,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.035857968, -0.07145234, 0.13904646, -0.0420..."
2,Abdon Isaias,English,Spanish_Iberian,16.375000,Automobiles,ProofReading,PMT,3.557578,16.979836,7.007694,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.07354073, -0.0826873, 0.13859886, -0.060899..."
3,Abdon Luis,English,Spanish_Iberian,16.000000,IT Services,PostEditing,PMT,5.680000,17.000000,7.000000,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.07354073, -0.0826873, 0.13859886, -0.060899..."
4,Abel Irene,English,Spanish_Iberian,17.000000,Application Software,TEST,BMT,0.000000,12.000000,6.500000,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.07354073, -0.0826873, 0.13859886, -0.060899..."
...,...,...,...,...,...,...,...,...,...,...,...,...
978,Zacarias Casio,English,Spanish_Argentina,18.181818,Automobiles,Translation,PMT,13.514974,12.936508,7.132275,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.086066775, -0.054202735, 0.1476611, -0.0226..."
979,Zacarias Marcelino,English,Spanish_Global,20.600000,Health Care Facilities,Translation,PMT,13.898125,12.343750,6.781250,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.05821279, -0.058210686, 0.13881628, -0.0406..."
980,Zachary,Catalan,English,22.000000,Internet & Direct Marketing Retail,PostEditing,BMT,12.298667,27.533333,6.800000,"[0.049495008, -0.07142061, -0.005828036, -0.01...","[-0.032043114, -0.05257272, 0.04719335, -0.044..."
981,Zlatan,English,Swedish,40.000000,Industrial Machinery,Translation,KMT,3.264545,44.000000,6.818182,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[-0.05466275, 0.006728865, 0.02273543, -0.0235..."


### Embed Industry

Using our pre-made industry embeddings.

In [12]:
# NOTE(review): this import rebinds EmbeddingLookup and load_embedding_data
# (previously imported from scripts.lang_embeddings), and the lines below
# rebind embedding_data and lookup. Re-running the language-embedding cell
# after this one would therefore call the industry lookup instead.
from scripts.industry_embeddings import EmbeddingLookup, load_embedding_data, industry_tokenizer
# Load the stored industry embedding data from the scripts folder.
embedding_data = load_embedding_data("../scripts/industry_embeddings.pkl")

# Lookup object whose get_vector method is applied to the tokenized industries below.
lookup = EmbeddingLookup(loaded_data=embedding_data)

Successfully loaded embedding data from '../scripts/industry_embeddings.pkl'
EmbeddingLookup initialized with 85 languages.
  Original embedding dimension: 384
  Latent embedding dimension: 10


In [13]:
# Tokenize the industry labels first, then look up one embedding vector per row.
industry_tokens = translators_df["MANUFACTURER_INDUSTRY"].apply(industry_tokenizer)
industry_embed = industry_tokens.apply(lookup.get_vector)

translators_df["INDUSTRY_EMBED"] = industry_embed
translators_df



Unnamed: 0,TRANSLATOR,SOURCE_LANG,TARGET_LANG,HOURLY_RATE,MANUFACTURER_INDUSTRY,TASK_TYPE,PM,HOURS_mean,HOURLY_RATE_mean,QUALITY_EVALUATION_mean,SOURCE_LANG_EMBED,TARGET_LANG_EMBED,INDUSTRY_EMBED
0,Aaron,English,Basque,20.700000,Health Care Facilities,DTP,BMT,0.000000,15.000000,5.500000,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.033526402, -0.052197512, 0.015763432, -0.01...","[-0.060847882, -0.16297893, 0.06373858, -0.032..."
1,Abdon,English,Spanish_LA,18.500000,Internet & Direct Marketing Retail,PostEditing,PMT,8.651852,14.407407,7.222222,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.035857968, -0.07145234, 0.13904646, -0.0420...","[-0.089615226, 0.12559475, -0.039703473, -0.06..."
2,Abdon Isaias,English,Spanish_Iberian,16.375000,Automobiles,ProofReading,PMT,3.557578,16.979836,7.007694,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.07354073, -0.0826873, 0.13859886, -0.060899...","[0.080458455, -0.015819076, -0.0074550044, -0...."
3,Abdon Luis,English,Spanish_Iberian,16.000000,IT Services,PostEditing,PMT,5.680000,17.000000,7.000000,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.07354073, -0.0826873, 0.13859886, -0.060899...","[-0.09284334, -0.024717085, 0.09458317, -0.079..."
4,Abel Irene,English,Spanish_Iberian,17.000000,Application Software,TEST,BMT,0.000000,12.000000,6.500000,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.07354073, -0.0826873, 0.13859886, -0.060899...","[-0.051018845, 0.013300572, 0.061036862, -0.02..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
978,Zacarias Casio,English,Spanish_Argentina,18.181818,Automobiles,Translation,PMT,13.514974,12.936508,7.132275,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.086066775, -0.054202735, 0.1476611, -0.0226...","[0.080458455, -0.015819076, -0.0074550044, -0...."
979,Zacarias Marcelino,English,Spanish_Global,20.600000,Health Care Facilities,Translation,PMT,13.898125,12.343750,6.781250,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.05821279, -0.058210686, 0.13881628, -0.0406...","[-0.060847882, -0.16297893, 0.06373858, -0.032..."
980,Zachary,Catalan,English,22.000000,Internet & Direct Marketing Retail,PostEditing,BMT,12.298667,27.533333,6.800000,"[0.049495008, -0.07142061, -0.005828036, -0.01...","[-0.032043114, -0.05257272, 0.04719335, -0.044...","[-0.089615226, 0.12559475, -0.039703473, -0.06..."
981,Zlatan,English,Swedish,40.000000,Industrial Machinery,Translation,KMT,3.264545,44.000000,6.818182,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[-0.05466275, 0.006728865, 0.02273543, -0.0235...","[0.022946361, -0.040781002, -0.0532133, 0.0600..."


### Save the dataframe

In [14]:
# Persist the enriched per-translator frame; pickle keeps the array-valued
# embedding columns as Python objects.
OUTPUT_PICKLE = "../data/final_translators_enhanced.pkl"
translators_df.to_pickle(OUTPUT_PICKLE)