In [2]:
# --- Imports ---
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
import os

# Set the working directory.
# Every data path in this notebook is relative, so it only runs when the
# process sits in the project root. The root used to be a hard-coded absolute
# path on one machine; it can now be overridden by exporting
# AI_AGING_PROJECT_DIR before starting Jupyter. Without the variable the
# original location is used, so existing runs behave exactly as before.
PROJECT_DIR = os.environ.get(
    "AI_AGING_PROJECT_DIR",
    '/Users/weronikadorociak/Documents/LSE/MY498 Capstone Project/ai_aging',
)
os.chdir(PROJECT_DIR)


In [4]:
# --- Load data ---
# Three CSV inputs, read in a fixed order relative to the working directory:
#   eurostat      - table whose "nace_r2" column supplies the economic activities
#   anthropic_df  - tasks ("task_name") with their usage share ("pct")
#   nace          - lookup used later to map an economic activity to a sector
eurostat, anthropic_df, nace = (
    pd.read_csv("original_data/eurostat.csv"),
    pd.read_csv("original_data/anthropic.csv"),
    pd.read_csv("transformed_data/economic_activity_sector.csv"),
)

In [5]:
# --- Extract unique economic activities (NACE codes) ---
# Distinct values of the "nace_r2" column, in order of first appearance.
# These are the candidate categories every task is matched against.
nace_r2_column = eurostat.loc[:, "nace_r2"]
economic_activities = nace_r2_column.unique()

In [6]:
# --- Define sentence embedding models to compare ---
# Each entry is a model identifier that is passed verbatim to
# SentenceTransformer(...) in the matching loop. The same string is also used
# as the column name holding that model's predicted economic activity, so
# renaming an entry here changes downstream column names.
models = [
    "all-MiniLM-L6-v2",
    "all-mpnet-base-v2",
    "paraphrase-multilingual-MiniLM-L12-v2",
    "sentence-transformers/bert-base-nli-mean-tokens",
]

In [7]:
# --- Initialize list to store similarity matching results ---
# One record per (model, task). Each record only carries the key of the model
# that produced it, so the DataFrame built from `records` is "long": the other
# models' columns are NaN on that row.
records = []

# Loop-invariant inputs, built once instead of once per model.
# Tasks and their percentages are paired positionally further down, which
# keeps them aligned even if `anthropic_df` does not carry a default 0..n-1
# index (a label lookup with the enumerate counter would break in that case).
task_names = anthropic_df['task_name'].tolist()
task_pcts = anthropic_df['pct'].to_numpy()
activity_texts = economic_activities.tolist()

# --- Loop through each embedding model ---
for model_name in models:
    print(f"Processing model: {model_name}")

    # Load SentenceTransformer model
    model = SentenceTransformer(model_name)

    # Encode task names and economic activity labels
    task_emb = model.encode(task_names, convert_to_tensor=True)
    econ_emb = model.encode(activity_texts, convert_to_tensor=True)

    # Cosine similarity between all tasks (rows) and all economic activities
    # (columns); tensors are moved to host memory for scikit-learn.
    sims = cosine_similarity(task_emb.cpu().numpy(), econ_emb.cpu().numpy())

    # Column index of the most similar economic activity for each task
    best_idx = sims.argmax(axis=1)

    # Store the best match per task along with the model name and percentage
    for task, pct, match_pos in zip(task_names, task_pcts, best_idx):
        records.append({
            'task_name': task,
            model_name: economic_activities[match_pos],
            'pct': pct,
        })

Processing model: all-MiniLM-L6-v2
Processing model: all-mpnet-base-v2
Processing model: paraphrase-multilingual-MiniLM-L12-v2
Processing model: sentence-transformers/bert-base-nli-mean-tokens


In [8]:
# --- Convert raw records into a DataFrame ---
# Every record holds a single model's prediction, so each row of `df_long`
# has exactly one populated model column and NaN in the others.
df_long = pd.DataFrame(records)


def _predictions_for(model_column):
    """Return the rows of `df_long` produced by one model.

    Keeps only rows where `model_column` is populated and narrows the frame
    to 'task_name', 'pct' and that model's prediction column.
    """
    has_prediction = df_long[model_column].notna()
    return df_long.loc[has_prediction, ['task_name', 'pct', model_column]]


# --- Extract model-specific predictions (excluding nulls) ---
all_MiniLM_L6_v2 = _predictions_for('all-MiniLM-L6-v2')
all_mpnet_base_v2 = _predictions_for('all-mpnet-base-v2')
MiniLM_L12_v2 = _predictions_for('paraphrase-multilingual-MiniLM-L12-v2')
bert = _predictions_for('sentence-transformers/bert-base-nli-mean-tokens')

In [9]:
# --- Merge all predictions into one DataFrame ---
# Start from the first model's frame and join the remaining ones in turn
# (default inner join) on the task and its percentage, which gives one
# prediction column per model.
merge_keys = ['task_name', 'pct']
df_full = all_MiniLM_L6_v2
for model_frame in (all_mpnet_base_v2, MiniLM_L12_v2, bert):
    df_full = df_full.merge(model_frame, on=merge_keys)

# --- Count model agreement (how many models agree on the same category) ---
# Names of the per-model prediction columns, one per embedding model.
model_cols = [
    "all-MiniLM-L6-v2",
    "all-mpnet-base-v2",
    "paraphrase-multilingual-MiniLM-L12-v2",
    "sentence-transformers/bert-base-nli-mean-tokens",
]


def count_model_agreement(row):
    """Return the size of the largest group of models predicting the same category.

    Parameters
    ----------
    row : pandas.Series
        One row holding a prediction under each name in `model_cols`.

    Returns
    -------
    The highest frequency among the row's model predictions: with four
    models, 4 means unanimous and 1 means every model chose a different
    category.
    """
    predictions = row[model_cols]
    tallies = predictions.value_counts()
    return tallies.max()

# Score every task row with the size of its largest agreeing group of models.
agreement_counts = df_full.apply(count_model_agreement, axis=1)
df_full = df_full.assign(num_models_agree=agreement_counts)

In [11]:
# --- Load ChatGPT fallback predictions ---
# The workbook is expected to have exactly two columns, which are renamed to
# the task and the ChatGPT-assigned category; a different column count makes
# the rename fail.
fallback_path = "transformed_data/gen_ai_adoption_model_disagremment_chatgpt.xlsx"
chatgpt = pd.read_excel(fallback_path)
chatgpt.columns = ["task_name", "chat_gpt_4o"]

# --- Merge ChatGPT predictions with model predictions ---
# Outer join: tasks present on only one side are kept, with NaN in the
# columns that come from the other side.
df = df_full.merge(chatgpt, how='outer', on='task_name')

# --- Define final prediction rule ---
def choose_final_prediction(row):
    """Pick the final economic activity for one task row.

    Rule:
      * if the models are in complete disagreement (`num_models_agree` == 1),
        return the ChatGPT fallback in `chat_gpt_4o`;
      * otherwise return the most frequent non-null model prediction; when
        counts tie (e.g. a 2-2 split), the label that `value_counts()` lists
        first is returned;
      * if the row has no model predictions at all, return the ChatGPT
        fallback.

    The returned value is whatever the chosen column holds, so it can be NaN
    when the fallback is needed but missing for that task.
    """
    # Low agreement -> use GPT fallback
    if row['num_models_agree'] == 1:
        return row['chat_gpt_4o']

    tallies = row[model_cols].dropna().value_counts()

    # All null -> fallback again
    if tallies.empty:
        return row['chat_gpt_4o']

    # Majority vote
    return tallies.idxmax()

# Apply the rule to every row, then remove rows with missing 'pct'
# (no participation % data), e.g. tasks that only exist in the fallback file.
final_choice = df.apply(choose_final_prediction, axis=1)
df = df.assign(economic_activity_eurostat=final_choice).dropna(subset=['pct'])

In [12]:
# --- Merge final predictions with NACE data to get sector names ---
# Left join keeps every task; a task whose chosen activity has no match in
# `nace` gets a NaN sector and is therefore left out of the grouped totals
# below (groupby skips NaN keys by default).
tasks_with_sector = df.merge(
    nace,
    how='left',
    left_on='economic_activity_eurostat',
    right_on='Economic Activity',
)

# --- Aggregate AI adoption percentage by sector ---
sector_totals = tasks_with_sector.groupby("sector")["pct"].sum()
ai_adoption = sector_totals.rename("ai_adoption").reset_index()

# --- Save results to CSV ---
ai_adoption.to_csv("transformed_data/ai_adoption_sector.csv", index=False)