In [1]:
import pickle
import pandas as pd
import numpy as np
import	os

# Load the pre-computed language-embedding bundle.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this at a .pkl file from a trusted source.
with open('language_embeddings.pkl', 'rb') as pkl_file:
    data = pickle.load(pkl_file)

# `lang_to_index` maps a language label to a row of `embeddings`
# (see how get_embedding indexes them).
lang_to_index, embeddings = data['lang_to_index'], data['original_embeddings']

# Translator records; aggregated per TRANSLATOR in the next cell.
df = pd.read_csv("../data/translators.csv")



In [2]:
def _most_frequent(values):
    """Return the most frequent non-null value of a Series, or NaN if there is none.

    ``Series.mode()`` ignores NaN, so a group whose values are all missing
    yields an empty result; indexing it with ``[0]`` would raise ``KeyError``
    and abort the whole aggregation. Ties resolve to the first mode returned
    by pandas (modes come back in sorted order).
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# One row per translator: dominant source/target language and mean hourly rate.
collapsed_df = df.groupby('TRANSLATOR').agg(
    most_used_source_lang=('SOURCE_LANG', _most_frequent),
    most_used_target_lang=('TARGET_LANG', _most_frequent),
    avg_hourly_rate=('HOURLY_RATE', 'mean')
).reset_index()

# Display the resulting DataFrame
print(collapsed_df)

             TRANSLATOR most_used_source_lang most_used_target_lang  \
0                 Aaron               English                Basque   
1                 Abdon               English          Spanish (LA)   
2          Abdon Isaias               English     Spanish (Iberian)   
3            Abdon Luis               English     Spanish (Iberian)   
4            Abel Irene               English     Spanish (Iberian)   
..                  ...                   ...                   ...   
978      Zacarias Casio               English   Spanish (Argentina)   
979  Zacarias Marcelino               English      Spanish (Global)   
980             Zachary               Catalan               English   
981              Zlatan               English               Swedish   
982              Zlatko               English                 Croat   

     avg_hourly_rate  
0          20.700000  
1          18.500000  
2          16.375000  
3          16.000000  
4          17.000000  
..       

In [3]:
def get_embedding(lang):
    """Look up the embedding vector for a language label.

    Parameters
    ----------
    lang : hashable
        Language label used as a key into the module-level ``lang_to_index``.

    Returns
    -------
    numpy.ndarray
        ``embeddings[idx]`` when the label is present in ``lang_to_index``;
        otherwise an all-zero vector with ``embeddings.shape[1]`` entries and
        the same dtype as ``embeddings``.
    """
    idx = lang_to_index.get(lang)
    if idx is None:
        # Unknown labels are not an error: they silently become zero vectors.
        # Match the dtype of `embeddings` so the resulting column does not mix
        # float64 placeholders with the stored embedding dtype.
        return np.zeros(embeddings.shape[1], dtype=embeddings.dtype)
    return embeddings[idx]

In [4]:
# Replace each dominant-language label with its embedding vector:
# new embedding column -> language-label column it is derived from.
embedding_columns = {
    'MOST_SOURCE_Embedding': "most_used_source_lang",
    'MOST_TARGET_Embedding': 'most_used_target_lang',
}
for embedding_col, label_col in embedding_columns.items():
    collapsed_df[embedding_col] = collapsed_df[label_col].apply(get_embedding)

# The raw labels are no longer needed once they have been embedded.
collapsed_df.drop(list(embedding_columns.values()), axis=1, inplace=True)

collapsed_df

Unnamed: 0,TRANSLATOR,avg_hourly_rate,MOST_SOURCE_Embedding,MOST_TARGET_Embedding
0,Aaron,20.700000,"[0.043511786, -0.026500577, 0.0035570136, 0.00...","[0.025308292, 0.0024069303, -0.024052639, 0.08..."
1,Abdon,18.500000,"[0.043511786, -0.026500577, 0.0035570136, 0.00...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,Abdon Isaias,16.375000,"[0.043511786, -0.026500577, 0.0035570136, 0.00...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,Abdon Luis,16.000000,"[0.043511786, -0.026500577, 0.0035570136, 0.00...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,Abel Irene,17.000000,"[0.043511786, -0.026500577, 0.0035570136, 0.00...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...
978,Zacarias Casio,18.181818,"[0.043511786, -0.026500577, 0.0035570136, 0.00...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
979,Zacarias Marcelino,20.600000,"[0.043511786, -0.026500577, 0.0035570136, 0.00...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
980,Zachary,22.000000,"[0.011650571, -0.035290457, 0.009649627, 0.061...","[0.043511786, -0.026500577, 0.0035570136, 0.00..."
981,Zlatan,40.000000,"[0.043511786, -0.026500577, 0.0035570136, 0.00...","[0.0060621207, 0.019862145, -0.02871672, 0.001..."


In [None]:
df_task = pd.read_csv("../data/data_enhanced.csv")


def _top_category_per_translator(tasks, column):
    """Count `column` values per TRANSLATOR and pick the most frequent one.

    Returns a ``(counts, top)`` pair: ``counts`` has one row per
    (TRANSLATOR, value) with a ``count`` column; ``top`` keeps, for each
    translator, the row with the highest count (first one on ties).
    Missing values in `column` are not counted, so a translator whose values
    are all missing does not appear in either frame.
    """
    counts = (
        tasks.groupby("TRANSLATOR")[column]
        .value_counts()
        .reset_index(name="count")
    )
    top = counts.loc[counts.groupby("TRANSLATOR")["count"].idxmax()]
    return counts, top


indus_count, most_indus = _top_category_per_translator(df_task, "MANUFACTURER_INDUSTRY")
subindus_count, most_subindus = _top_category_per_translator(df_task, "MANUFACTURER_SUBINDUSTRY")

# Make the cell safe to re-run: without this, merging onto an already-merged
# frame would create MANUFACTURER_INDUSTRY_x / _y duplicate columns.
collapsed_df = collapsed_df.drop(
    columns=["MANUFACTURER_INDUSTRY", "MANUFACTURER_SUBINDUSTRY"], errors="ignore"
)

# Left joins keep every translator; those without task data get NaN.
# validate= asserts TRANSLATOR is unique on both sides so rows cannot multiply.
collapsed_df = pd.merge(
    collapsed_df,
    most_indus[['TRANSLATOR', 'MANUFACTURER_INDUSTRY']],
    on="TRANSLATOR",
    how="left",
    validate="one_to_one",
)
collapsed_df = pd.merge(
    collapsed_df,
    most_subindus[['TRANSLATOR', 'MANUFACTURER_SUBINDUSTRY']],
    on="TRANSLATOR",
    how="left",
    validate="one_to_one",
)

collapsed_df


Unnamed: 0,TRANSLATOR,avg_hourly_rate,MOST_SOURCE_Embedding,MOST_TARGET_Embedding,MANUFACTURER_INDUSTRY,MANUFACTURER_SUBINDUSTRY
0,Aaron,20.700000,"[0.043511786, -0.026500577, 0.0035570136, 0.00...","[0.025308292, 0.0024069303, -0.024052639, 0.08...",Internet Services & Infrastructure,Internet Services & Infrastructure
1,Abdon,18.500000,"[0.043511786, -0.026500577, 0.0035570136, 0.00...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,
2,Abdon Isaias,16.375000,"[0.043511786, -0.026500577, 0.0035570136, 0.00...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","Technology Hardware, Storage & Peripherals","Technology Hardware, Storage & Peripherals"
3,Abdon Luis,16.000000,"[0.043511786, -0.026500577, 0.0035570136, 0.00...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,
4,Abel Irene,17.000000,"[0.043511786, -0.026500577, 0.0035570136, 0.00...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,
...,...,...,...,...,...,...
978,Zacarias Casio,18.181818,"[0.043511786, -0.026500577, 0.0035570136, 0.00...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,
979,Zacarias Marcelino,20.600000,"[0.043511786, -0.026500577, 0.0035570136, 0.00...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Health Care Facilities,Long-Term Care Facilities
980,Zachary,22.000000,"[0.011650571, -0.035290457, 0.009649627, 0.061...","[0.043511786, -0.026500577, 0.0035570136, 0.00...",,
981,Zlatan,40.000000,"[0.043511786, -0.026500577, 0.0035570136, 0.00...","[0.0060621207, 0.019862145, -0.02871672, 0.001...",,


In [27]:
# Translators from translators.csv that have no dominant industry /
# sub-industry row derived from the task CSV. The variable names are kept
# as they were; they do not refer to source/target languages.
missing_in_source = collapsed_df[~collapsed_df['TRANSLATOR'].isin(most_indus['TRANSLATOR'])]
missing_in_target = collapsed_df[~collapsed_df['TRANSLATOR'].isin(most_subindus['TRANSLATOR'])]

print("Translators without industry data:", missing_in_source['TRANSLATOR'].nunique())
print("Translators without sub-industry data:", missing_in_target['TRANSLATOR'].nunique())
print("\nAll translators in translators.csv:", len(collapsed_df))
print("Number of unique translators in the task CSV:", df_task['TRANSLATOR'].nunique())

# Share of translators.csv translators that actually occur in the task CSV.
# Membership is checked explicitly instead of dividing two independent counts,
# which is only correct if every task translator also exists in translators.csv.
in_task_csv = collapsed_df['TRANSLATOR'].isin(df_task['TRANSLATOR'])
percentage = in_task_csv.mean() * 100
print(f"Percentage of translators in translators.csv that also appear in the task CSV: {percentage:.1f}%")

Translators missing in source data: 755
Translators missing in target data: 755

All translators in translators.csv: 983
Number of unique translators in the task CSV: 228
Percentage of translators in the task CSV that are also in translators CSV: 23.2%
