In [50]:
import torch
import pandas as pd

In [51]:
# Read the task table and the translator table from the shared data folder.
# The generator is consumed left to right, so the files are read in the
# same order as the names they are unpacked into.
df_tasks, df_translators = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/data_enhanced.csv',
        '../data/translators_enhanced.csv',
    )
)

In [52]:
# Summarise the raw task table: column names, non-null counts and dtypes.
df_tasks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31017 entries, 0 to 31016
Data columns (total 29 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   31017 non-null  int64  
 1   PROJECT_ID                   31017 non-null  object 
 2   PM                           31017 non-null  object 
 3   TASK_ID                      31017 non-null  int64  
 4   START                        31017 non-null  object 
 5   END                          31017 non-null  object 
 6   TASK_TYPE                    31017 non-null  object 
 7   SOURCE_LANG                  31017 non-null  object 
 8   TARGET_LANG                  31017 non-null  object 
 9   TRANSLATOR                   31017 non-null  object 
 10  ASSIGNED                     31017 non-null  object 
 11  READY                        31017 non-null  object 
 12  WORKING                      31016 non-null  object 
 13  DELIVERED       

In [53]:
# Columns removed from the task table before it is merged with the
# translator table.
columns_drop_tasks = [
    'PROJECT_ID',
    'START',
    'END',
    'ASSIGNED',
    'READY',
    'WORKING',
    'DELIVERED',
    'RECEIVED',
    'CLOSE',
    'COST',
    'MANUFACTURER',
    'MANUFACTURER_SECTOR',
    'MANUFACTURER_INDUSTRY_GROUP',
    'MANUFACTURER_SUBINDUSTRY',
    '_work_ready',
    '_time_taken',
    '_time_reception',
    '_time_to_close',
]
# Reassign instead of dropping in place, and tolerate columns that are already
# gone, so that re-running this cell does not raise a KeyError.
df_tasks = df_tasks.drop(columns=columns_drop_tasks, errors='ignore')
# Show the remaining columns to confirm the result of the drop.
df_tasks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31017 entries, 0 to 31016
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             31017 non-null  int64  
 1   PM                     31017 non-null  object 
 2   TASK_ID                31017 non-null  int64  
 3   TASK_TYPE              31017 non-null  object 
 4   SOURCE_LANG            31017 non-null  object 
 5   TARGET_LANG            31017 non-null  object 
 6   TRANSLATOR             31017 non-null  object 
 7   FORECAST               31017 non-null  float64
 8   HOURLY_RATE            31017 non-null  int64  
 9   QUALITY_EVALUATION     31017 non-null  int64  
 10  MANUFACTURER_INDUSTRY  31017 non-null  object 
dtypes: float64(1), int64(4), object(6)
memory usage: 2.6+ MB


In [54]:
# Summarise the translator table: column names, non-null counts and dtypes.
df_translators.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228 entries, 0 to 227
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   TRANSLATOR               228 non-null    object 
 1   SOURCE_LANG              228 non-null    object 
 2   TARGET_LANG              228 non-null    object 
 3   HOURLY_RATE              228 non-null    float64
 4   MANUFACTURER_INDUSTRY    228 non-null    object 
 5   TASK_TYPE                228 non-null    object 
 6   PM                       228 non-null    object 
 7   FORECAST_mean            228 non-null    float64
 8   HOURLY_RATE_mean         228 non-null    float64
 9   QUALITY_EVALUATION_mean  228 non-null    float64
 10  SOURCE_LANG_EMBED        228 non-null    object 
 11  TARGET_LANG_EMBED        228 non-null    object 
 12  INDUSTRY_EMBED           228 non-null    object 
dtypes: float64(4), object(9)
memory usage: 23.3+ KB


In [55]:
# Attach the translator table to every task row, matching on the translator
# name. Columns present in both tables receive a `_task` / `_translator`
# suffix; columns unique to one table keep their original name.
positives = pd.merge(
    df_tasks,
    df_translators,
    on='TRANSLATOR',
    how='left',
    suffixes=('_task', '_translator'),
)
# A left join should keep exactly one row per task. A larger result would mean
# TRANSLATOR is duplicated in df_translators and task rows were fanned out.
assert len(positives) == len(df_tasks), "merge changed the number of task rows"

In [56]:
# Inspect the merged frame: row count, suffixed column names and dtypes.
positives.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31017 entries, 0 to 31016
Data columns (total 23 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Unnamed: 0                        31017 non-null  int64  
 1   PM_task                           31017 non-null  object 
 2   TASK_ID                           31017 non-null  int64  
 3   TASK_TYPE_task                    31017 non-null  object 
 4   SOURCE_LANG_task                  31017 non-null  object 
 5   TARGET_LANG_task                  31017 non-null  object 
 6   TRANSLATOR                        31017 non-null  object 
 7   FORECAST                          31017 non-null  float64
 8   HOURLY_RATE_task                  31017 non-null  int64  
 9   QUALITY_EVALUATION                31017 non-null  int64  
 10  MANUFACTURER_INDUSTRY_task        31017 non-null  object 
 11  SOURCE_LANG_translator            31017 non-null  object 
 12  TARG

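Rename columns so that task-side and translator-side features carry an explicit `_task` / `_translator` suffix (and `TRANSLATOR` becomes `TRANSLATOR_NAME`).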

In [57]:
# Old name -> new name. Columns that did not collide during the merge kept
# their bare names, so they are given an explicit side suffix here.
column_renames = {
    'TRANSLATOR': 'TRANSLATOR_NAME',
    'FORECAST': 'FORECAST_task',
    'MANUFACTURER_INDUSTRY_task': 'INDUSTRY_task',
    'SOURCE_LANG_EMBED': 'SOURCE_LANG_EMBED_translator',
    'TARGET_LANG_EMBED': 'TARGET_LANG_EMBED_translator',
    'INDUSTRY_EMBED': 'INDUSTRY_EMBED_translator',
}
positives = positives.rename(columns=column_renames)

### Language and Industry Embeddings

In [58]:
import sys
import os
from warnings import warn

# Make the parent of the current working directory importable so that the
# `scripts` package can be imported below.
# NOTE(review): assumes the kernel was started inside the notebooks folder - confirm.
notebooks_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebooks_dir, ".."))
# Guard the append so re-running this cell does not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

from scripts.lang_embeddings import language_tokenizer
from scripts.lang_embeddings import EmbeddingLookup as LookupLang
from scripts.lang_embeddings import load_embedding_data as load_lang

from scripts.industry_embeddings import industry_tokenizer
from scripts.industry_embeddings import EmbeddingLookup as LookupIndustry
from scripts.industry_embeddings import load_embedding_data as load_industry

In [59]:
# Load the stored embedding data for languages and industries, then wrap each
# one in its lookup helper. Calls run in the same order as before (both loads
# first, then both lookups), so the printed status messages keep their order.
# NOTE(review): the .pkl extension suggests pickle files - only load trusted ones.
data_embed_lang, data_embed_industry = (
    load_lang("../scripts/language_embeddings.pkl"),
    load_industry("../scripts/industry_embeddings.pkl"),
)
lang_lookup, industry_lookup = (
    LookupLang(data_embed_lang),
    LookupIndustry(data_embed_industry),
)

Successfully loaded embedding data from '../scripts/language_embeddings.pkl'
Successfully loaded embedding data from '../scripts/industry_embeddings.pkl'
EmbeddingLookup initialized with 79 languages.
  Original embedding dimension: 384
  Latent embedding dimension: 10
EmbeddingLookup initialized with 85 languages.
  Original embedding dimension: 384
  Latent embedding dimension: 10


In [60]:
# Replace the raw task-side language and industry labels with embedding
# vectors: each value is tokenised first, then looked up via `get_vector`.
# NOTE(review): the columns are overwritten, so this cell presumably cannot be
# re-run without rebuilding `positives` first.
# NOTE(review): the *_EMBED_translator columns were read from CSV and display as
# space-separated text; they may still be strings that need parsing - confirm.
for column, tokenizer, lookup in (
    ('SOURCE_LANG_task', language_tokenizer, lang_lookup),
    ('TARGET_LANG_task', language_tokenizer, lang_lookup),
    ('INDUSTRY_task', industry_tokenizer, industry_lookup),
):
    positives[column] = positives[column].apply(tokenizer).apply(lookup.get_vector)

In [61]:
# Identifier, quality score and raw translator-side label columns removed from
# the merged frame.
columns_to_drop = [
    'TASK_ID',
    'QUALITY_EVALUATION',
    'SOURCE_LANG_translator',
    'TARGET_LANG_translator',
    'MANUFACTURER_INDUSTRY_translator',
]

# Reassign instead of dropping in place, and tolerate columns that are already
# gone, so that re-running this cell does not raise a KeyError.
positives = positives.drop(columns=columns_to_drop, errors='ignore')

In [62]:
# Display the merged frame after the embedding lookup and column drops.
positives

Unnamed: 0.1,Unnamed: 0,PM_task,TASK_TYPE_task,SOURCE_LANG_task,TARGET_LANG_task,TRANSLATOR_NAME,FORECAST_task,HOURLY_RATE_task,INDUSTRY_task,HOURLY_RATE_translator,TASK_TYPE_translator,PM_translator,FORECAST_mean,HOURLY_RATE_mean,QUALITY_EVALUATION_mean,SOURCE_LANG_EMBED_translator,TARGET_LANG_EMBED_translator,INDUSTRY_EMBED_translator
0,0,KMT,Engineering,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.061353415, -0.025645662, 0.11521641, -0.058...",Estela,0.25,24,"[0.008321971, -0.01131371, -0.012965765, -0.08...",25.745098,Engineering,BMT,0.485597,19.326389,6.950000,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.07354073 -0.0826873 0.13859886 -0.060899...,[-0.05101885 0.01330057 0.06103686 -0.022784...
1,1,KMT,Engineering,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.07354073, -0.0826873, 0.13859886, -0.060899...",Jeronimo,1.50,20,"[-0.060847882, -0.16297893, 0.06373858, -0.032...",18.000000,Engineering,KMT,1.500000,20.000000,7.000000,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.07354073 -0.0826873 0.13859886 -0.060899...,[-0.06084788 -0.16297893 0.06373858 -0.032359...
2,2,PMT,Engineering,"[0.049495008, -0.07142061, -0.005828036, -0.01...","[0.049495008, -0.07142061, -0.005828036, -0.01...",Octavi,0.33,15,"[-0.051018845, 0.013300572, 0.061036862, -0.02...",24.729167,ProofReading,BMT,1.112889,21.599816,7.010120,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.07354073 -0.0826873 0.13859886 -0.060899...,[-0.05101885 0.01330057 0.06103686 -0.022784...
3,3,KMT,Management,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.07354073, -0.0826873, 0.13859886, -0.060899...",Ramiro Josafat,0.50,20,"[0.038529716, -0.017995264, -0.050278313, 0.03...",20.545455,ProofReading,BMT,3.870313,20.190521,6.854028,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.07354073 -0.0826873 0.13859886 -0.060899...,[ 0.00565127 0.05642691 -0.01672912 -0.065863...
4,4,PMT,Miscellaneous,"[0.049495008, -0.07142061, -0.005828036, -0.01...","[0.049495008, -0.07142061, -0.005828036, -0.01...",Victor,0.00,11,"[0.005651269, 0.056426913, -0.016729122, -0.06...",18.000000,Engineering,BMT,0.776507,13.541485,6.973799,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.07354073 -0.0826873 0.13859886 -0.060899...,[-0.05101885 0.01330057 0.06103686 -0.022784...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31012,31012,BMT,ProofReading,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.07354073, -0.0826873, 0.13859886, -0.060899...",Salma,3.13,19,"[0.005651269, 0.056426913, -0.016729122, -0.06...",21.285714,ProofReading,BMT,3.157438,18.785791,6.897740,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.07354073 -0.0826873 0.13859886 -0.060899...,[ 0.00565127 0.05642691 -0.01672912 -0.065863...
31013,31013,PMT,Translation,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.07354073, -0.0826873, 0.13859886, -0.060899...",Fiamma Baldomero,0.59,18,"[-0.07727842, 0.002091013, 0.06725861, -0.0837...",18.600000,Translation,PMT,1.067879,17.595960,7.343434,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.07354073 -0.0826873 0.13859886 -0.060899...,[-0.01989399 0.08402714 -0.0744723 -0.027236...
31014,31014,PMT,ProofReading,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.07354073, -0.0826873, 0.13859886, -0.060899...",Gisela Ildefonso,0.54,15,"[-0.07727842, 0.002091013, 0.06725861, -0.0837...",13.538462,Translation,PMT,3.099965,14.045061,6.963605,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.04949501 -0.07142061 -0.00582804 -0.010367...,[-0.00962669 0.01952609 -0.02710957 -0.015226...
31015,31015,BMT,Translation,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.07354073, -0.0826873, 0.13859886, -0.060899...",Ramiro Josafat,0.71,20,"[0.005651269, 0.056426913, -0.016729122, -0.06...",20.545455,ProofReading,BMT,3.870313,20.190521,6.854028,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.07354073 -0.0826873 0.13859886 -0.060899...,[ 0.00565127 0.05642691 -0.01672912 -0.065863...


In [63]:
# OneHotEncoder for some columns

import pickle
from sklearn.preprocessing import OneHotEncoder

In [64]:
# Cast the string-valued label columns to pandas' categorical dtype.
# TRANSLATOR_NAME is cast here as well, although it is not part of the
# `categorical_columns` list that is one-hot encoded further down.
for _label_column in (
    'PM_task',
    'PM_translator',
    'TRANSLATOR_NAME',
    'TASK_TYPE_task',
    'TASK_TYPE_translator',
):
    positives[_label_column] = positives[_label_column].astype('category')

In [67]:
# One-hot encode the PM and task-type columns from both sides of the merge.
categorical_columns = [
    'PM_task',
    'PM_translator',
    'TASK_TYPE_task',
    'TASK_TYPE_translator',
]

# handle_unknown='ignore': a category not seen during fitting is encoded as
# all zeros instead of raising. sparse_output=False returns a dense array.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Learn the categories and encode the same rows in a single step.
encoded_columns_arrays = encoder.fit_transform(positives[categorical_columns])
encoded_feature_names = encoder.get_feature_names_out()

# Keep the original index so the encoded frame lines up row by row with `positives`.
encoded_df = pd.DataFrame(
    data=encoded_columns_arrays,
    index=positives.index,
    columns=encoded_feature_names,
)

encoded_df

Unnamed: 0,PM_task_BMT,PM_task_KMT,PM_task_PMT,PM_task_RMT,PM_translator_BMT,PM_translator_KMT,PM_translator_PMT,PM_translator_RMT,TASK_TYPE_task_DTP,TASK_TYPE_task_Engineering,TASK_TYPE_task_Management,TASK_TYPE_task_Miscellaneous,TASK_TYPE_task_ProofReading,TASK_TYPE_task_Translation,TASK_TYPE_translator_Engineering,TASK_TYPE_translator_Miscellaneous,TASK_TYPE_translator_ProofReading,TASK_TYPE_translator_Translation
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31012,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
31013,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
31014,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
31015,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
