In [1]:
# --- Imports ---
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# --- Load data ---
# Read the three input tables in a fixed order and bind them to the names
# used throughout the rest of the notebook.
eurostat, gpts_df, nace = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/eurostat.csv',
        'data/gpts_are_gpts.csv',
        'data/economic_activity_sector.csv',
    )
)

In [3]:
# --- Prepare economic activity categories ---
# Distinct values of the "nace_r2" column, used as the candidate labels for matching.
# Missing values are dropped first: Series.unique() keeps NaN as a value, and a
# float NaN in the label list would later be passed to the text encoder.
economic_activities = eurostat["nace_r2"].dropna().unique()

# --- Define sentence embedding models to compare ---
# Model identifiers passed to SentenceTransformer(...); each name also becomes
# the column that stores that model's prediction.
models = [
    "all-MiniLM-L6-v2",
    "all-mpnet-base-v2",
    "paraphrase-multilingual-MiniLM-L12-v2",
    "sentence-transformers/bert-base-nli-mean-tokens",
]

NameError: name 'eurostat' is not defined

In [2]:
# --- Perform semantic matching for each model ---
# For every embedding model, encode the task titles and the economic-activity
# labels, then assign each task the activity with the highest cosine similarity.

# Loop-invariant inputs: convert once instead of once per model.
task_titles = gpts_df['Title'].tolist()
activity_labels = economic_activities.tolist()
# Read human_beta positionally so the pairing with titles does not depend on
# gpts_df having a default 0..n-1 index.
human_betas = gpts_df['human_beta'].tolist()

records = []
for model_name in models:
    print(f"Processing {model_name}")
    model = SentenceTransformer(model_name)

    task_emb = model.encode(task_titles, convert_to_tensor=True)
    econ_emb = model.encode(activity_labels, convert_to_tensor=True)

    # Rows are tasks, columns are activities; argmax over columns picks the
    # best-matching activity for each task.
    sims = cosine_similarity(task_emb.cpu().numpy(), econ_emb.cpu().numpy())
    best_idx = sims.argmax(axis=1)

    for task, beta, activity_pos in zip(task_titles, human_betas, best_idx):
        records.append({
            'Title': task,
            # The column is named after the model, so each record fills exactly
            # one model column.
            model_name: economic_activities[activity_pos],
            'human_beta': beta,
        })

# Long-format table with one row per (task, model) pair. Model columns that a
# record does not set come out as NaN. Downstream cells read this as `df_long`.
df_long = pd.DataFrame(records)

NameError: name 'models' is not defined

In [4]:
# --- Extract and merge predictions from all models ---
# Each slice keeps the key columns plus one model column and drops every row
# that has a missing value in any of those three columns.
key_cols = ['Title', 'human_beta']
all_MiniLM_L6_v2 = df_long[key_cols + ['all-MiniLM-L6-v2']].dropna()
all_mpnet_base_v2 = df_long[key_cols + ['all-mpnet-base-v2']].dropna()
MiniLM_L12_v2 = df_long[key_cols + ['paraphrase-multilingual-MiniLM-L12-v2']].dropna()
bert = df_long[key_cols + ['sentence-transformers/bert-base-nli-mean-tokens']].dropna()

# Join the per-model slices on the shared keys (default inner join) so each
# row carries all four predictions side by side.
df_full = all_MiniLM_L6_v2
for per_model in (all_mpnet_base_v2, MiniLM_L12_v2, bert):
    df_full = df_full.merge(per_model, on=key_cols)

NameError: name 'df_long' is not defined

In [None]:
# --- Count model agreement per task ---
# Names of the columns that each hold one model's predicted label.
model_cols = [
    'all-MiniLM-L6-v2',
    'all-mpnet-base-v2',
    'paraphrase-multilingual-MiniLM-L12-v2',
    'sentence-transformers/bert-base-nli-mean-tokens'
]


def count_model_agreement(row):
    """Return how many models share the most frequent label in ``row``.

    Parameters
    ----------
    row : pandas.Series
        A row that contains every column listed in ``model_cols``.

    Returns
    -------
    The largest count among the distinct labels found in the model columns
    (e.g. 4 when all models agree, 1 when every model predicts something else).
    """
    label_frequencies = row[model_cols].value_counts()
    return label_frequencies.max()


# Compute the agreement count row by row, then store it as a new column on
# df_full (the frame is modified in place).
agreement_counts = df_full.apply(count_model_agreement, axis=1)
df_full['num_models_agree'] = agreement_counts

In [None]:
# --- Load ChatGPT fallback predictions for disagreement cases ---
chatgpt = pd.read_excel("data/disagremment_chatgpt_gpts.xlsx")

# --- Merge model predictions with ChatGPT 4o output ---
# Outer join on 'Title': titles that appear in only one of the two frames are
# kept, with the other frame's columns left as NaN.
df = pd.merge(df_full, chatgpt, how='outer', on='Title')


# --- Decide final label for each task ---
def choose_final_prediction(row):
    """Pick the final label for one row.

    When ``num_models_agree`` is anything other than 1, the most frequent
    non-null label across ``model_cols`` is returned. The ``chat_gpt_4o``
    value is returned instead when ``num_models_agree`` equals 1 (no two
    models agree), or when every model column is null.
    """
    if row['num_models_agree'] != 1:
        label_counts = row[model_cols].dropna().value_counts()
        if not label_counts.empty:
            return label_counts.idxmax()
    # Reached when no two models agree, or when all model columns are NaN.
    return row['chat_gpt_4o']


# Resolve one final label per row and store it as a new column.
final_labels = df.apply(choose_final_prediction, axis=1)
df['economic_activity_eurostat'] = final_labels

# --- Filter valid records with non-null human_beta ---
df = df[df['human_beta'].notna()]

In [None]:
# --- Merge with NACE sector info and calculate mean AI potential ---
# Left join keeps every task row; the sector columns come from `nace`.
tasks_with_sector = df.merge(
    nace,
    how='left',
    left_on='economic_activity_eurostat',
    right_on='Economic Activity',
)
sector_means = tasks_with_sector.groupby("sector")["human_beta"].mean()
ai_potential = (sector_means * 100).reset_index()  # Convert to percentage
ai_potential.columns = ["sector", "ai_potential"]

# --- Export result to CSV ---
ai_potential.to_csv("data/ai_potential_sector.csv", index=False)