In [1]:
# --- Imports ---
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# --- Read the three input tables from disk ---
# Eurostat extract; its `nace_r2` column supplies the economic activity labels.
eurostat = pd.read_csv("original_data/eurostat.csv")
# Task table; the `Title` and `human_beta` columns are used further down.
gpts_df = pd.read_csv("original_data/gpts_are_gpts.csv")
# Lookup table joined later on `Economic Activity` to obtain a `sector`.
nace = pd.read_csv("transformed_data/economic_activity_sector.csv")

In [4]:
# --- Sentence-embedding checkpoints whose matches will be compared ---
models = [
    "all-MiniLM-L6-v2",
    "all-mpnet-base-v2",
    "paraphrase-multilingual-MiniLM-L12-v2",
    "sentence-transformers/bert-base-nli-mean-tokens",
]

# --- Distinct economic activity labels found in the Eurostat table ---
# These are the candidate categories every task title is matched against.
economic_activities = eurostat.loc[:, "nace_r2"].unique()

In [None]:
# --- Collect one record per (model, task) holding the best-matching activity ---
# Each record has three keys: 'Title', 'human_beta', and a key named after the
# model whose value is that model's best-matching economic activity.
records = []

# Inputs that do not depend on the model are materialised once, outside the loop.
task_titles = gpts_df['Title'].tolist()
# Taken positionally alongside the titles, so the pairing does not rely on
# gpts_df having a default 0..n-1 index (as a label lookup with `.loc[i]` would).
human_betas = gpts_df['human_beta'].tolist()
activity_labels = economic_activities.tolist()

# --- Loop through each embedding model ---
for model_name in models:
    print(f"Processing model: {model_name}")

    # Load SentenceTransformer model
    model = SentenceTransformer(model_name)

    # Encode task names and economic activity labels
    task_emb = model.encode(task_titles, convert_to_tensor=True)
    econ_emb = model.encode(activity_labels, convert_to_tensor=True)

    # Cosine similarity matrix: one row per task, one column per activity
    sims = cosine_similarity(task_emb.cpu().numpy(), econ_emb.cpu().numpy())

    # Column index of the most similar economic activity for each task
    best_idx = sims.argmax(axis=1)

    # Store the best match per task together with the model name and human_beta
    for task, beta, activity_pos in zip(task_titles, human_betas, best_idx):
        records.append({
            'Title': task,
            model_name: activity_labels[activity_pos],
            'human_beta': beta,
        })

In [6]:
# --- Build the long-format table: one row per (model, task) record ---
df_long = pd.DataFrame(records)


def _predictions_for(model_col):
    """Return the Title, human_beta and `model_col` columns of `df_long`,
    restricted to the rows where `model_col` holds a value."""
    has_prediction = df_long[model_col].notnull()
    wanted_cols = ['Title', 'human_beta', model_col]
    return df_long[has_prediction][wanted_cols]


# --- Split the long table into one frame per model ---
# A record only carries the column of the model that produced it, so filtering
# on non-null values isolates that model's rows.
all_MiniLM_L6_v2 = _predictions_for('all-MiniLM-L6-v2')
all_mpnet_base_v2 = _predictions_for('all-mpnet-base-v2')
MiniLM_L12_v2 = _predictions_for('paraphrase-multilingual-MiniLM-L12-v2')
bert = _predictions_for('sentence-transformers/bert-base-nli-mean-tokens')

In [7]:
# --- Combine the per-model frames into one wide table ---
# Frames are joined one after another on the shared task keys (pandas' default
# inner join), so every model's prediction ends up in its own column.
join_keys = ['Title', 'human_beta']
df_full = all_MiniLM_L6_v2
for other_model_frame in (all_mpnet_base_v2, MiniLM_L12_v2, bert):
    df_full = df_full.merge(other_model_frame, on=join_keys)

# --- Model agreement: size of the largest group of models sharing a category ---
model_cols = [
    "all-MiniLM-L6-v2",
    "all-mpnet-base-v2",
    "paraphrase-multilingual-MiniLM-L12-v2",
    "sentence-transformers/bert-base-nli-mean-tokens",
]


def count_model_agreement(row):
    """Return how many models chose the most frequently predicted category.

    `row` is one row of the merged prediction table; only the columns listed
    in `model_cols` are inspected. A result of 4 means all models agree and
    1 means every model picked a different category.
    """
    category_counts = row[model_cols].value_counts()
    return category_counts.max()

# Evaluate the agreement count once per task (row-wise apply).
df_full['num_models_agree'] = df_full.apply(count_model_agreement, axis='columns')

In [8]:
# --- Read the ChatGPT labels used as a fallback when the models disagree ---
chatgpt = pd.read_excel("transformed_data/gen_ai_potential_model_disagremment_chatgpt.xlsx")
# Positional rename: this assignment requires the sheet to have exactly two columns.
chatgpt.columns = ["Title", "chat_gpt_4o"]

# --- Attach the ChatGPT label to each task's model predictions ---
# Outer join on 'Title': titles present in only one of the two tables are kept.
df = pd.merge(df_full, chatgpt, on='Title', how='outer')

# --- Rule that picks the final economic activity for one task ---
def choose_final_prediction(row):
    """Return the final category for a single row of the merged table.

    * If `num_models_agree` equals 1 (no two models share a category), the
      ChatGPT label in `chat_gpt_4o` is returned.
    * Otherwise the category with the most votes among the `model_cols`
      columns is returned.
    * If none of the model columns holds a value, the ChatGPT label is
      returned as well.
    """
    models_disagree = row['num_models_agree'] == 1
    if not models_disagree:
        tally = row[model_cols].dropna().value_counts()
        if len(tally) > 0:
            return tally.idxmax()  # Most frequent category wins
    # Either no agreement at all or no model prediction available
    return row['chat_gpt_4o']

# Apply the rule to every row to obtain the final Eurostat activity label.
df['economic_activity_eurostat'] = df.apply(choose_final_prediction, axis='columns')

# --- Keep only rows that have a 'human_beta' value ---
df = df[df['human_beta'].notna()]

In [9]:
# --- Attach the NACE sector of each task's final economic activity ---
# Left join: tasks whose activity has no entry in `nace` are kept, without a sector.
tasks_with_sector = pd.merge(
    df,
    nace,
    how='left',
    left_on='economic_activity_eurostat',
    right_on='Economic Activity',
)

# --- Mean human_beta per sector, scaled by 100 and labelled 'ai_potential' ---
ai_potential = (
    tasks_with_sector
    .groupby("sector")["human_beta"]
    .mean()
    .mul(100)  # Convert to percentage
    .rename("ai_potential")
    .reset_index()
)

# --- Write the per-sector result to CSV ---
ai_potential.to_csv("transformed_data/ai_potential_sector.csv", index=False)