In [1]:
import torch
import pandas as pd

In [2]:
df_tasks = pd.read_csv('../data/data_enhanced.csv')
df_translators = pd.read_csv('../data/translators_enhanced.csv')

In [3]:
df_tasks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31017 entries, 0 to 31016
Data columns (total 29 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   31017 non-null  int64  
 1   PROJECT_ID                   31017 non-null  object 
 2   PM                           31017 non-null  object 
 3   TASK_ID                      31017 non-null  int64  
 4   START                        31017 non-null  object 
 5   END                          31017 non-null  object 
 6   TASK_TYPE                    31017 non-null  object 
 7   SOURCE_LANG                  31017 non-null  object 
 8   TARGET_LANG                  31017 non-null  object 
 9   TRANSLATOR                   31017 non-null  object 
 10  ASSIGNED                     31017 non-null  object 
 11  READY                        31017 non-null  object 
 12  WORKING                      31016 non-null  object 
 13  DELIVERED       

In [4]:
columns_drop_tasks = [
    'PROJECT_ID',
    'START',
    'END',
    'ASSIGNED',
    'READY',
    'WORKING',
    'DELIVERED',
    'RECEIVED',
    'CLOSE',
    'COST',
    'MANUFACTURER',
    'MANUFACTURER_SECTOR',
    'MANUFACTURER_INDUSTRY_GROUP',
    'MANUFACTURER_SUBINDUSTRY',
    '_work_ready',
    '_time_taken',
    '_time_reception',
    '_time_to_close',
]
df_tasks.drop(columns=columns_drop_tasks, inplace=True)
df_tasks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31017 entries, 0 to 31016
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             31017 non-null  int64  
 1   PM                     31017 non-null  object 
 2   TASK_ID                31017 non-null  int64  
 3   TASK_TYPE              31017 non-null  object 
 4   SOURCE_LANG            31017 non-null  object 
 5   TARGET_LANG            31017 non-null  object 
 6   TRANSLATOR             31017 non-null  object 
 7   FORECAST               31017 non-null  float64
 8   HOURLY_RATE            31017 non-null  int64  
 9   QUALITY_EVALUATION     31017 non-null  int64  
 10  MANUFACTURER_INDUSTRY  31017 non-null  object 
dtypes: float64(1), int64(4), object(6)
memory usage: 2.6+ MB


In [5]:
df_translators.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228 entries, 0 to 227
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   TRANSLATOR               228 non-null    object 
 1   SOURCE_LANG              228 non-null    object 
 2   TARGET_LANG              228 non-null    object 
 3   HOURLY_RATE              228 non-null    float64
 4   MANUFACTURER_INDUSTRY    228 non-null    object 
 5   TASK_TYPE                228 non-null    object 
 6   PM                       228 non-null    object 
 7   FORECAST_mean            228 non-null    float64
 8   HOURLY_RATE_mean         228 non-null    float64
 9   QUALITY_EVALUATION_mean  228 non-null    float64
 10  SOURCE_LANG_EMBED        228 non-null    object 
 11  TARGET_LANG_EMBED        228 non-null    object 
 12  INDUSTRY_EMBED           228 non-null    object 
dtypes: float64(4), object(9)
memory usage: 23.3+ KB


In [6]:
positives = pd.merge(
    df_tasks,
    df_translators,
    on='TRANSLATOR',
    how='left',
    suffixes=('_task', '_translator'),
)

In [7]:
positives.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31017 entries, 0 to 31016
Data columns (total 23 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Unnamed: 0                        31017 non-null  int64  
 1   PM_task                           31017 non-null  object 
 2   TASK_ID                           31017 non-null  int64  
 3   TASK_TYPE_task                    31017 non-null  object 
 4   SOURCE_LANG_task                  31017 non-null  object 
 5   TARGET_LANG_task                  31017 non-null  object 
 6   TRANSLATOR                        31017 non-null  object 
 7   FORECAST                          31017 non-null  float64
 8   HOURLY_RATE_task                  31017 non-null  int64  
 9   QUALITY_EVALUATION                31017 non-null  int64  
 10  MANUFACTURER_INDUSTRY_task        31017 non-null  object 
 11  SOURCE_LANG_translator            31017 non-null  object 
 12  TARG

Some renaming of the columns

In [8]:
positives.rename(
    columns={
        'TRANSLATOR': 'TRANSLATOR_NAME',
        'FORECAST': 'FORECAST_task',
        'MANUFACTURER_INDUSTRY_task': 'INDUSTRY_task',
        'SOURCE_LANG_EMBED': 'SOURCE_LANG_EMBED_translator',
        'TARGET_LANG_EMBED': 'TARGET_LANG_EMBED_translator',
        'INDUSTRY_EMBED': 'INDUSTRY_EMBED_translator',

    },
    inplace=True,
)

### Language and Industry Embeddings

In [9]:
import sys
import os
from warnings import warn

notebooks_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebooks_dir, ".."))
sys.path.append(project_root)

from scripts.lang_embeddings import language_tokenizer
from scripts.lang_embeddings import EmbeddingLookup as LookupLang
from scripts.lang_embeddings import load_embedding_data as load_lang

from scripts.industry_embeddings import industry_tokenizer
from scripts.industry_embeddings import EmbeddingLookup as LookupIndustry
from scripts.industry_embeddings import load_embedding_data as load_industry

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
data_embed_lang = load_lang("../scripts/language_embeddings.pkl")
data_embed_industry = load_industry("../scripts/industry_embeddings.pkl")

lang_lookup = LookupLang(data_embed_lang)
industry_lookup = LookupIndustry(data_embed_industry)

Successfully loaded embedding data from '../scripts/language_embeddings.pkl'
Successfully loaded embedding data from '../scripts/industry_embeddings.pkl'
EmbeddingLookup initialized with 79 languages.
  Original embedding dimension: 384
  Latent embedding dimension: 10
EmbeddingLookup initialized with 85 languages.
  Original embedding dimension: 384
  Latent embedding dimension: 10


In [11]:
positives['SOURCE_LANG_task'] = positives['SOURCE_LANG_task'].apply(language_tokenizer).apply(lang_lookup.get_vector)
positives['TARGET_LANG_task'] = positives['TARGET_LANG_task'].apply(language_tokenizer).apply(lang_lookup.get_vector)

positives['INDUSTRY_task'] = positives['INDUSTRY_task'].apply(industry_tokenizer).apply(industry_lookup.get_vector)

In [12]:
columns_to_drop = [
    'TASK_ID',
    'QUALITY_EVALUATION',
    'SOURCE_LANG_translator',
    'TARGET_LANG_translator',
    'MANUFACTURER_INDUSTRY_translator',
]

positives.drop(columns=columns_to_drop, inplace=True)

In [13]:
positives

Unnamed: 0.1,Unnamed: 0,PM_task,TASK_TYPE_task,SOURCE_LANG_task,TARGET_LANG_task,TRANSLATOR_NAME,FORECAST_task,HOURLY_RATE_task,INDUSTRY_task,HOURLY_RATE_translator,TASK_TYPE_translator,PM_translator,FORECAST_mean,HOURLY_RATE_mean,QUALITY_EVALUATION_mean,SOURCE_LANG_EMBED_translator,TARGET_LANG_EMBED_translator,INDUSTRY_EMBED_translator
0,0,KMT,Engineering,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.061353415, -0.025645662, 0.11521641, -0.058...",Estela,0.25,24,"[0.008321971, -0.01131371, -0.012965765, -0.08...",25.745098,Engineering,BMT,0.485597,19.326389,6.950000,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.07354073 -0.0826873 0.13859886 -0.060899...,[-0.05101885 0.01330057 0.06103686 -0.022784...
1,1,KMT,Engineering,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.07354073, -0.0826873, 0.13859886, -0.060899...",Jeronimo,1.50,20,"[-0.060847882, -0.16297893, 0.06373858, -0.032...",18.000000,Engineering,KMT,1.500000,20.000000,7.000000,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.07354073 -0.0826873 0.13859886 -0.060899...,[-0.06084788 -0.16297893 0.06373858 -0.032359...
2,2,PMT,Engineering,"[0.049495008, -0.07142061, -0.005828036, -0.01...","[0.049495008, -0.07142061, -0.005828036, -0.01...",Octavi,0.33,15,"[-0.051018845, 0.013300572, 0.061036862, -0.02...",24.729167,ProofReading,BMT,1.112889,21.599816,7.010120,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.07354073 -0.0826873 0.13859886 -0.060899...,[-0.05101885 0.01330057 0.06103686 -0.022784...
3,3,KMT,Management,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.07354073, -0.0826873, 0.13859886, -0.060899...",Ramiro Josafat,0.50,20,"[0.038529716, -0.017995264, -0.050278313, 0.03...",20.545455,ProofReading,BMT,3.870313,20.190521,6.854028,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.07354073 -0.0826873 0.13859886 -0.060899...,[ 0.00565127 0.05642691 -0.01672912 -0.065863...
4,4,PMT,Miscellaneous,"[0.049495008, -0.07142061, -0.005828036, -0.01...","[0.049495008, -0.07142061, -0.005828036, -0.01...",Victor,0.00,11,"[0.005651269, 0.056426913, -0.016729122, -0.06...",18.000000,Engineering,BMT,0.776507,13.541485,6.973799,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.07354073 -0.0826873 0.13859886 -0.060899...,[-0.05101885 0.01330057 0.06103686 -0.022784...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31012,31012,BMT,ProofReading,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.07354073, -0.0826873, 0.13859886, -0.060899...",Salma,3.13,19,"[0.005651269, 0.056426913, -0.016729122, -0.06...",21.285714,ProofReading,BMT,3.157438,18.785791,6.897740,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.07354073 -0.0826873 0.13859886 -0.060899...,[ 0.00565127 0.05642691 -0.01672912 -0.065863...
31013,31013,PMT,Translation,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.07354073, -0.0826873, 0.13859886, -0.060899...",Fiamma Baldomero,0.59,18,"[-0.07727842, 0.002091013, 0.06725861, -0.0837...",18.600000,Translation,PMT,1.067879,17.595960,7.343434,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.07354073 -0.0826873 0.13859886 -0.060899...,[-0.01989399 0.08402714 -0.0744723 -0.027236...
31014,31014,PMT,ProofReading,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.07354073, -0.0826873, 0.13859886, -0.060899...",Gisela Ildefonso,0.54,15,"[-0.07727842, 0.002091013, 0.06725861, -0.0837...",13.538462,Translation,PMT,3.099965,14.045061,6.963605,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.04949501 -0.07142061 -0.00582804 -0.010367...,[-0.00962669 0.01952609 -0.02710957 -0.015226...
31015,31015,BMT,Translation,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.07354073, -0.0826873, 0.13859886, -0.060899...",Ramiro Josafat,0.71,20,"[0.005651269, 0.056426913, -0.016729122, -0.06...",20.545455,ProofReading,BMT,3.870313,20.190521,6.854028,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.07354073 -0.0826873 0.13859886 -0.060899...,[ 0.00565127 0.05642691 -0.01672912 -0.065863...


In [14]:
import pickle
from sklearn.preprocessing import OneHotEncoder

In [15]:
positives['PM_task'] = positives['PM_task'].astype('category')
positives['PM_translator'] = positives['PM_translator'].astype('category')
positives['TRANSLATOR_NAME'] = positives['TRANSLATOR_NAME'].astype('category')
positives['TASK_TYPE_task'] = positives['TASK_TYPE_task'].astype('category')
positives['TASK_TYPE_translator'] = positives['TASK_TYPE_translator'].astype('category')

In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder # Ensure other imports like StandardScaler are present if needed later

# Assume 'positives' is your initial DataFrame after the merge
# and before any encoding or scaling.

task_categorical = [
    'PM_task',
    'TASK_TYPE_task',
]

translator_categorical = [
    'PM_translator',
    'TASK_TYPE_translator',
]

task_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
translator_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

task_encoder.fit(positives[task_categorical].copy())
translator_encoder.fit(positives[translator_categorical].copy())

task_encoded_array = task_encoder.transform(positives[task_categorical].copy())
translator_encoded_array = translator_encoder.transform(positives[translator_categorical].copy())

task_categorical_vector_name = 'task_categorical_vector'
positives[task_categorical_vector_name] = pd.Series(list(task_encoded_array), index=positives.index).apply(np.array)

translator_categorical_vector_name = 'translator_categorical_vector'
positives[translator_categorical_vector_name] = pd.Series(list(translator_encoded_array), index=positives.index).apply(np.array)

columns_to_drop = task_categorical + translator_categorical + ['Unnamed: 0', 'TRANSLATOR_NAME']
positives.drop(columns=columns_to_drop, inplace=True, errors='ignore')

In [17]:
positives.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31017 entries, 0 to 31016
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   SOURCE_LANG_task               31017 non-null  object 
 1   TARGET_LANG_task               31017 non-null  object 
 2   FORECAST_task                  31017 non-null  float64
 3   HOURLY_RATE_task               31017 non-null  int64  
 4   INDUSTRY_task                  31017 non-null  object 
 5   HOURLY_RATE_translator         31017 non-null  float64
 6   FORECAST_mean                  31017 non-null  float64
 7   HOURLY_RATE_mean               31017 non-null  float64
 8   QUALITY_EVALUATION_mean        31017 non-null  float64
 9   SOURCE_LANG_EMBED_translator   31017 non-null  object 
 10  TARGET_LANG_EMBED_translator   31017 non-null  object 
 11  INDUSTRY_EMBED_translator      31017 non-null  object 
 12  task_categorical_vector        31017 non-null 

In [18]:
from sklearn.preprocessing import StandardScaler

scalar_columns = [
    'FORECAST_task',
    'FORECAST_mean',
    'HOURLY_RATE_task',
    'HOURLY_RATE_translator',
    'HOURLY_RATE_mean',
    'QUALITY_EVALUATION_mean'
]

scalar = StandardScaler()

scalar_data = positives[scalar_columns].values
scalar.fit(scalar_data)
scalar_data = scalar.transform(scalar_data)

positives[scalar_columns] = scalar_data

In [19]:
positives

Unnamed: 0,SOURCE_LANG_task,TARGET_LANG_task,FORECAST_task,HOURLY_RATE_task,INDUSTRY_task,HOURLY_RATE_translator,FORECAST_mean,HOURLY_RATE_mean,QUALITY_EVALUATION_mean,SOURCE_LANG_EMBED_translator,TARGET_LANG_EMBED_translator,INDUSTRY_EMBED_translator,task_categorical_vector,translator_categorical_vector
0,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.061353415, -0.025645662, 0.11521641, -0.058...",-0.342461,1.412672,"[0.008321971, -0.01131371, -0.012965765, -0.08...",1.862779,-0.737378,0.099859,-0.228085,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.07354073 -0.0826873 0.13859886 -0.060899...,[-0.05101885 0.01330057 0.06103686 -0.022784...,"[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]"
1,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.07354073, -0.0826873, 0.13859886, -0.060899...",-0.197876,0.274143,"[-0.060847882, -0.16297893, 0.06373858, -0.032...",-0.358910,-0.462895,0.332182,0.106930,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.07354073 -0.0826873 0.13859886 -0.060899...,[-0.06084788 -0.16297893 0.06373858 -0.032359...,"[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]"
2,"[0.049495008, -0.07142061, -0.005828036, -0.01...","[0.049495008, -0.07142061, -0.005828036, -0.01...",-0.333208,-1.149017,"[-0.051018845, 0.013300572, 0.061036862, -0.02...",1.571358,-0.567642,0.883945,0.174735,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.07354073 -0.0826873 0.13859886 -0.060899...,[-0.05101885 0.01330057 0.06103686 -0.022784...,"[0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
3,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.07354073, -0.0826873, 0.13859886, -0.060899...",-0.313544,0.274143,"[0.038529716, -0.017995264, -0.050278313, 0.03...",0.371256,0.178476,0.397891,-0.871124,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.07354073 -0.0826873 0.13859886 -0.060899...,[ 0.00565127 0.05642691 -0.01672912 -0.065863...,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
4,"[0.049495008, -0.07142061, -0.005828036, -0.01...","[0.049495008, -0.07142061, -0.005828036, -0.01...",-0.371378,-2.287546,"[0.005651269, 0.056426913, -0.016729122, -0.06...",-0.358910,-0.658662,-1.895306,-0.068624,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.07354073 -0.0826873 0.13859886 -0.060899...,[-0.05101885 0.01330057 0.06103686 -0.022784...,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31012,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.07354073, -0.0826873, 0.13859886, -0.060899...",-0.009337,-0.010489,"[0.005651269, 0.056426913, -0.016729122, -0.06...",0.583600,-0.014418,-0.086588,-0.578246,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.07354073 -0.0826873 0.13859886 -0.060899...,[ 0.00565127 0.05642691 -0.01672912 -0.065863...,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
31013,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.07354073, -0.0826873, 0.13859886, -0.060899...",-0.303134,-0.295121,"[-0.07727842, 0.002091013, 0.06725861, -0.0837...",-0.186799,-0.579821,-0.496951,2.408045,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.07354073 -0.0826873 0.13859886 -0.060899...,[-0.01989399 0.08402714 -0.0744723 -0.027236...,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0]"
31014,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.07354073, -0.0826873, 0.13859886, -0.060899...",-0.308917,-1.149017,"[-0.07727842, 0.002091013, 0.06725861, -0.0837...",-1.638707,-0.029969,-1.721627,-0.136929,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.04949501 -0.07142061 -0.00582804 -0.010367...,[-0.00962669 0.01952609 -0.02710957 -0.015226...,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0]"
31015,"[-0.032043114, -0.05257272, 0.04719335, -0.044...","[0.07354073, -0.0826873, 0.13859886, -0.060899...",-0.289254,0.274143,"[0.005651269, 0.056426913, -0.016729122, -0.06...",0.371256,0.178476,0.397891,-0.871124,[-0.03204311 -0.05257272 0.04719335 -0.044367...,[ 0.07354073 -0.0826873 0.13859886 -0.060899...,[ 0.00565127 0.05642691 -0.01672912 -0.065863...,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"


### Ensure proper dataypes

In [20]:
vector_string_cols = [
    'SOURCE_LANG_task',
    'TARGET_LANG_task',
    'INDUSTRY_task',
    'SOURCE_LANG_EMBED_translator',
    'TARGET_LANG_EMBED_translator',
    'INDUSTRY_EMBED_translator',
    'task_categorical_vector',      # The new condensed columns
    'translator_categorical_vector', # The new condensed columns
]

# Iterate through the identified columns and apply the conversion
for col in vector_string_cols:
    if col in positives.columns: # Check if the column exists in the dataframe
        print(f"Attempting to convert column '{col}' from string to list/array...")
        # Check if the column is of object dtype, which is typical for strings or mixed types
        if positives[col].dtype == 'object':
            try:
                # Use a lambda function with ast.literal_eval to convert each string cell
                # Add checks within the lambda:
                # 1. Check if the value is a string and not NaN (using pd.notna)
                # 2. Use ast.literal_eval to parse the string
                # 3. Convert the result to a NumPy array with float32 dtype
                # Handle cases where the value might already be a list/array (if conversion was partially done)
                positives[col] = positives[col].apply(
                    lambda x: np.array(ast.literal_eval(x), dtype=np.float32)
                    if isinstance(x, str) and pd.notna(x)
                    else (np.array(x, dtype=np.float32) if isinstance(x, (list, np.ndarray)) else x) # Keep existing arrays/lists, pass others through
                )
                print(f"Conversion successful for column '{col}'.")
            except Exception as e:
                # Catch potential errors during parsing (e.g., malformed string)
                warn(f"Warning: Could not convert column '{col}' from string to list/array. "
                     f"There might be problematic values. Error: {e}")
                # You might want to inspect the problematic rows or column content here
        else:
            print(f"Column '{col}' is not of object dtype ({positives[col].dtype}). Skipping string conversion.")
            # If it's already a numeric type, it's not a string vector.

    else:
        warn(f"Warning: Column '{col}' not found in the DataFrame. Skipping.")

Attempting to convert column 'SOURCE_LANG_task' from string to list/array...
Conversion successful for column 'SOURCE_LANG_task'.
Attempting to convert column 'TARGET_LANG_task' from string to list/array...
Conversion successful for column 'TARGET_LANG_task'.
Attempting to convert column 'INDUSTRY_task' from string to list/array...
Conversion successful for column 'INDUSTRY_task'.
Attempting to convert column 'SOURCE_LANG_EMBED_translator' from string to list/array...
Attempting to convert column 'TARGET_LANG_EMBED_translator' from string to list/array...
Attempting to convert column 'INDUSTRY_EMBED_translator' from string to list/array...
Attempting to convert column 'task_categorical_vector' from string to list/array...
Conversion successful for column 'task_categorical_vector'.
Attempting to convert column 'translator_categorical_vector' from string to list/array...
Conversion successful for column 'translator_categorical_vector'.




In [21]:
positives.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31017 entries, 0 to 31016
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   SOURCE_LANG_task               31017 non-null  object 
 1   TARGET_LANG_task               31017 non-null  object 
 2   FORECAST_task                  31017 non-null  float64
 3   HOURLY_RATE_task               31017 non-null  float64
 4   INDUSTRY_task                  31017 non-null  object 
 5   HOURLY_RATE_translator         31017 non-null  float64
 6   FORECAST_mean                  31017 non-null  float64
 7   HOURLY_RATE_mean               31017 non-null  float64
 8   QUALITY_EVALUATION_mean        31017 non-null  float64
 9   SOURCE_LANG_EMBED_translator   31017 non-null  object 
 10  TARGET_LANG_EMBED_translator   31017 non-null  object 
 11  INDUSTRY_EMBED_translator      31017 non-null  object 
 12  task_categorical_vector        31017 non-null 

### To Tensors

In [22]:
from torch.utils.data import Dataset, DataLoader

class PositivesDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

        self.pair_labels = self.dataframe.index.tolist()
        self.samples = []

        for i in range(len(dataframe)):
            self.samples.append((i, self.pair_labels[i], 0))
            self.samples.append((i, self.pair_labels[i], 1))

        self.task_features = [
            'SOURCE_LANG_task',
            'TARGET_LANG_task',
            'INDUSTRY_task',
            'FORECAST_task',
            'HOURLY_RATE_task',
            'task_categorical_vector',
        ]

        self.translator_features = [
            'SOURCE_LANG_EMBED_translator',
            'TARGET_LANG_EMBED_translator',
            'INDUSTRY_EMBED_translator',
            'FORECAST_mean',
            'HOURLY_RATE_mean',
            'HOURLY_RATE_translator',
            'QUALITY_EVALUATION_mean',
            'translator_categorical_vector',
        ]

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        original_row_idx, label, entity_type = self.samples[idx]
        row_data = self.dataframe.iloc[original_row_idx]

        if entity_type == 0:
            feature_list = []
            for col in self.task_features:
                col_data = row_data[col]
                if isinstance(col_data, (list, np.ndarray)):
                    feature_list.extend(col_data)
                elif isinstance(col_data, (int, float, np.number)):
                    feature_list.append(col_data)

            features = np.array(feature_list, dtype=np.float32)

        elif entity_type == 1:
            feature_list = []
            for col in self.translator_features:
                col_data = row_data[col]
                if isinstance(col_data, (list, np.ndarray)):
                    feature_list.extend(col_data)
                elif isinstance(col_data, (int, float, np.number)):
                    feature_list.append(col_data)
            features = np.array(feature_list, dtype=np.float32)

        else:
            raise ValueError("entity_type must be 0 or 1")
        
        features_tensor = torch.from_numpy(features)
        label_tensor = torch.tensor(label, dtype=torch.float32)
        entity_type_tensor = torch.tensor(entity_type, dtype=torch.float32)

        return features_tensor, label_tensor, entity_type_tensor

def collate_fn(batch):
    return tuple(zip(*batch))

In [23]:
dataset = PositivesDataset(positives)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

dataloader_iter = iter(dataloader)
batch = next(dataloader_iter)
features, labels, entity_types = batch
print(features[0].shape)
print(features[1].shape)

torch.Size([42])
torch.Size([12])


### Save DataFrame

In [24]:
positives.to_pickle('../data/positives.pkl')