# Method 1: Agent-Client Matching via Clustering, Classification, and Ranking

In [None]:
#Import and load the data
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from pathlib import Path
import prince
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Resolve the directory that contains the `2025-NUS-datathon` folder.
# Jupyter kernels do not define `__file__`, so fall back to the working
# directory; this assumes the kernel runs from the folder holding this
# notebook (TODO confirm against how the notebook is launched).
try:
    base_dir = Path(__file__).parents[2]
except NameError:
    base_dir = Path.cwd().parents[1]
df = pd.read_csv(base_dir / './2025-NUS-datathon/data/merged_data.csv')

In [None]:
#Preprocessing
# 0:Preprocessing
########################################################################

# 0.1: Treat whitespace-only cells as missing, then keep only fully populated rows
df = df.replace(r'^\s*$', np.nan, regex=True).dropna(axis=0)

# 0.2: Expand the agent expertise text into one 0/1 indicator column per product
expertise_list = ['prod_0', 'prod_2', 'prod_4', 'prod_6', 'prod_7', 'prod_8', 'prod_9']
for expertise in expertise_list:
    df['agent_expertise_' + expertise] = df['agent_product_expertise'].map(
        lambda listed_products: 1 if expertise in listed_products else 0
    )
df.drop(columns=['agent_product_expertise'], inplace=True)


## 1: Separate dataframes
################################################################################################################################################

# Policy status / value columns (listed for reference; not read again in this cell)
assessment_columns = [
    'annual_premium', 'flg_main', 'flg_rider', 'flg_inforce',
    'flg_lapsed', 'flg_cancel', 'flg_expire', 'flg_converted',
]

# Keep only the rows that pass every filter, printing the shape before the
# first filter and again after each one so the row loss per step is visible.
row_filters = (
    lambda frame: frame['flg_expire'] != 1,     # not expired
    lambda frame: frame['flg_lapsed'] != 1,     # not lapsed
    lambda frame: frame['annual_premium'] > 0,  # positive premium
)
print(df.shape)
for keep_rows in row_filters:
    df = df[keep_rows(df)]
    print(df.shape)

# Columns selected into `d` for clustering (policy, agent profile, agent
# conversion mix, the existing 'cluster' column and the expertise indicators).
client_interest_columns = [
    'annual_premium', 'product', 'product_grp',
    'agent_age', 'agent_gender', 'agent_marital', 'agent_tenure',
    'cnt_converted', 'annual_premium_cnvrt',
    'pct_lapsed', 'pct_cancel', 'pct_inforce',
    # pct_prod_0_cnvrt ... pct_prod_9_cnvrt
    *[f'pct_prod_{product_no}_cnvrt' for product_no in range(10)],
    'pct_SX0_unknown', 'pct_SX1_male', 'pct_SX2_female',
    'pct_AG01_lt20', 'pct_AG02_20to24', 'pct_AG03_25to29', 'pct_AG04_30to34',
    'pct_AG05_35to39', 'pct_AG06_40to44', 'pct_AG07_45to49', 'pct_AG08_50to54',
    'pct_AG09_55to59', 'pct_AG10_60up',
    'cluster',
    # indicator columns created in step 0.2
    *['agent_expertise_' + product_code for product_code in expertise_list],
]

# Customer-side columns (defined here; not used further in this cell)
agent_interest_columns = [
    'cust_age_at_purchase_grp', 'cust_tenure_at_purchase_grp', 'cltsex',
    'marryd', 'race_desc_map', 'cltpcode', 'household_size',
    'economic_status', 'family_size', 'household_size_grp',
    'family_size_grp',
]


In [None]:
#Unsupervised Clustering with GMM

## 2: Cluster
################################################################################################################################################

# 2.1: Use PCA MCA before clustering
########################################################################
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import DataFrame
from sklearn import datasets
from sklearn.mixture import GaussianMixture
from prince import *
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA

# Encode client data
##########################################
# Take an explicit copy: a later cell adds the 'client_interest_grp' column to
# `d`, and writing into a slice of `df` triggers SettingWithCopyWarning and may
# not stick.
d = df[client_interest_columns].copy()

# Categorical (object-dtype) columns are one-hot encoded for MCA
clustering_categorical_columns = d.select_dtypes(include=['object']).columns

clustering_encoded_df = pd.get_dummies(d, columns=clustering_categorical_columns)
clustering_encoded_df = clustering_encoded_df.astype(float)

# Numeric columns are removed from the MCA input and re-attached afterwards
clustering_numerical_columns = ['annual_premium', 'agent_age', 'agent_tenure', 'cnt_converted',
       'annual_premium_cnvrt', 'pct_lapsed', 'pct_cancel', 'pct_inforce',
       'pct_prod_0_cnvrt', 'pct_prod_1_cnvrt', 'pct_prod_2_cnvrt',
       'pct_prod_3_cnvrt', 'pct_prod_4_cnvrt', 'pct_prod_5_cnvrt',
       'pct_prod_6_cnvrt', 'pct_prod_7_cnvrt', 'pct_prod_8_cnvrt',
       'pct_prod_9_cnvrt', 'pct_SX0_unknown', 'pct_SX1_male', 'pct_SX2_female',
       'pct_AG01_lt20', 'pct_AG02_20to24', 'pct_AG03_25to29',
       'pct_AG04_30to34', 'pct_AG05_35to39', 'pct_AG06_40to44',
       'pct_AG07_45to49', 'pct_AG08_50to54', 'pct_AG09_55to59',
       'pct_AG10_60up', 'cluster']

print('/////////////////////////////')
print(clustering_encoded_df.shape)
clustering_encoded_df = clustering_encoded_df.drop(columns=clustering_numerical_columns)
print(clustering_encoded_df.shape)

# The following dataframe is used for MCA
print(clustering_encoded_df.head())
print(clustering_encoded_df.columns)

print('checkpoint 1')

# Perform MCA on the encoded categorical data and keep 10 components
mca1 = prince.MCA(n_components=10, copy=True, check_input=True, engine='sklearn', random_state=42)
mca1 = mca1.fit(clustering_encoded_df)
clustering_df_mca = mca1.transform(clustering_encoded_df)
print(clustering_df_mca.head())

# Raw numeric columns, joined back to the MCA components in a later cell
clustering_df_numeric = d[clustering_numerical_columns]





In [None]:
#Missing code

print(clustering_df_numeric.head())

# Place the MCA components and the raw numeric columns side by side; the two
# frames are aligned on their row index.
gmm_df = pd.concat((clustering_df_mca, clustering_df_numeric), axis='columns')

# Give every column a string label (the MCA component labels are presumably
# not strings -- verify against the prince version in use).
gmm_df.columns = gmm_df.columns.map(str)

In [None]:
# # # 3: Which agents are in which clusters?
# # ################################################################################################################################################
# # find_agent_df=pd.concat([gmm_df['client_interest_grp'],df['agntnum']],axis=1)

# # print('The following displays agents and their new lable')
# # print(find_agent_df.head())

In [None]:
# # 4 Input into random forest
# ################################################################################################################################################

# Use the GMM cluster label 'client_interest_grp' as the Random Forest target
# and every other column of gmm_df as a feature.
# NOTE: gmm_df must already carry 'client_interest_grp', i.e. the GMM cell has
# to be run before this one.
print('This is the df used for random forest:')
pre_gmm_df = gmm_df.drop(columns=['client_interest_grp'])
print(pre_gmm_df.head())
print(pre_gmm_df.columns)

#############################
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

target = gmm_df['client_interest_grp']

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(pre_gmm_df, target, test_size=0.2, random_state=42)

# Train a shallow RandomForestClassifier
clf = RandomForestClassifier(random_state=42, max_depth=4,
                             n_estimators=100, min_samples_leaf=10)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print('////////////////////////////////////////////////////////////')
# Report the target's column name; interpolating the Series itself would dump
# every label into the message.
print(f'Accuracy in predicting {target.name}: {accuracy}')

# Add the clustering result to d (aligned on the row index). `assign` builds a
# new frame, so this is safe even when `d` is a slice of `df`.
d = d.assign(client_interest_grp=gmm_df['client_interest_grp'])

print('//////////////////////////////////////////////')
print(d.head())
print(d.columns)


In [None]:

## Uncomment module to use pca
################################################################################################

# # Handle missing values (fill with mean for numerical columns)
# clustering_df_numeric.fillna(clustering_df_numeric.mean(), inplace=True)

# # Standardize the numeric data
# scaler = StandardScaler()
# df_scaled = scaler.fit_transform(clustering_df_numeric)

# # Apply PCA
# pca = PCA(n_components=0.95)  # Retain 95% variance
# df_pca = pca.fit_transform(df_scaled)
# df_pca = pd.DataFrame(df_pca, columns=[f'PC{i+1}' for i in range(df_pca.shape[1])])

# # print(df_pca.head())

# clustering_df_numeric=df_pca

# clustering_df_mca = clustering_df_mca.reset_index(drop=True)
# clustering_df_numeric = clustering_df_numeric.reset_index(drop=True)

# print(clustering_df_mca.index.equals(clustering_df_numeric.index))
# ###############################################################################################





In [None]:
# # 4 Input into random forest prt 2
# ################################################################################################################################################

print(clustering_df_numeric.head())

# Rejoin the MCA components and the numeric columns (aligned on the row index)
gmm_df = pd.concat([clustering_df_mca, clustering_df_numeric], axis=1)

# Convert column names to string
gmm_df.columns = gmm_df.columns.astype(str)


## 2.2: Fit post mca data to GMM
########################################################################
print('This is the dataframe used for GMM clustering:')
print(gmm_df.head())  # call head(); printing the bound method shows no rows
print(gmm_df.columns)

# Fixed seed so the cluster labels (and everything trained on them downstream)
# are reproducible between runs.
gmm = GaussianMixture(n_components=10, random_state=42)

print('checkpoint 2')

# Fit the GMM model for the dataset
gmm.fit(gmm_df)

print('checkpoint 2')
# Assign a label to each sample
labels = gmm.predict(gmm_df)

gmm_df['client_interest_grp'] = labels

# Lower bound on the log-likelihood reached by the best EM fit
print(gmm.lower_bound_)

# Number of EM iterations used by the best fit
print(gmm.n_iter_)
print('checkpoint 3')

print('This is the dataframe with the new cluster lable:')
print(gmm_df.head())
print(gmm_df['client_interest_grp'].value_counts())

In [None]:

## 5: XGBoost Rank Model for Agent Recommendation Within Clusters
#################################################################################################################3

import xgboost as xgb
from sklearn.model_selection import train_test_split

# Attach each row's cluster label to df (joined on the row index)
df = df.merge(d[['client_interest_grp']], left_index=True, right_index=True, how='left')

# Relevance label for the ranker (modify as needed).
# NOTE(review): the label is built from 'cnt_converted' and 'pct_inforce',
# which are also ranking features below -- confirm this overlap is intended.
df['ranking_score'] = df['cnt_converted'] / (df['pct_inforce'] + 1e-5)

# Features for ranking (agent-specific & client-agent interaction features)
ranking_features = ['cnt_converted', 'annual_premium_cnvrt', 'pct_lapsed',
                    'pct_cancel', 'pct_inforce', 'pct_prod_0_cnvrt', 'pct_prod_1_cnvrt',
                    'pct_prod_2_cnvrt', 'pct_prod_3_cnvrt', 'pct_prod_4_cnvrt', 'pct_prod_5_cnvrt',
                    'pct_prod_6_cnvrt', 'pct_prod_7_cnvrt', 'pct_prod_8_cnvrt', 'pct_prod_9_cnvrt']

# Prepare ranking data
X = df[ranking_features]
y = df['ranking_score']
group = df['client_interest_grp'].values  # One value per row

# Train-test split
X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    X, y, group, test_size=0.2, random_state=42, stratify=None
)

# set_group() interprets the sizes as consecutive runs of rows, so each split
# must be ordered by query group before the sizes are counted. Without this,
# the shuffled split mixes several clusters inside every "group".
train_order = np.argsort(group_train, kind='stable')
X_train, y_train, group_train = X_train.iloc[train_order], y_train.iloc[train_order], group_train[train_order]
test_order = np.argsort(group_test, kind='stable')
X_test, y_test, group_test = X_test.iloc[test_order], y_test.iloc[test_order], group_test[test_order]

# np.unique returns counts in sorted label order, matching the row order above
group_train_sizes = np.unique(group_train, return_counts=True)[1]
group_test_sizes = np.unique(group_test, return_counts=True)[1]

# Convert to DMatrix for XGBoost Rank
train_data = xgb.DMatrix(X_train, label=y_train)
train_data.set_group(group_train_sizes)
test_data = xgb.DMatrix(X_test, label=y_test)
test_data.set_group(group_test_sizes)
print(f"Length of X_train: {len(X_train)}, Sum of group_train_sizes: {sum(group_train_sizes)}")
print(f"Length of X_test: {len(X_test)}, Sum of group_test_sizes: {sum(group_test_sizes)}")

# XGBoost Rank model parameters
params = {
    'objective': 'rank:pairwise',  # or 'rank:ndcg'
    'eta': 0.1,
    'max_depth': 6,
    'eval_metric': 'ndcg'
}

# Train XGBoost Rank model
rank_model = xgb.train(params, train_data, num_boost_round=100)

# Predict agent rankings for test data
y_pred = rank_model.predict(test_data)

# Predictions follow the (re-ordered) test rows, so X_test.index lines up with
# y_pred. Rows outside the test split keep NaN in 'rank_score'.
df.loc[X_test.index, 'rank_score'] = y_pred

# Rank agents within each client cluster. method='min' gives every agent tied
# for the best score rank 1; the default average rank would give e.g. 1.5 to a
# two-way tie and leave that cluster without any recommendation.
df['rank_within_cluster'] = df.groupby('client_interest_grp')['rank_score'].rank(
    ascending=False, method='min'
)

# Select the top agent(s) per cluster for recommendation
top_agents = df.loc[df['rank_within_cluster'] == 1, ['client_interest_grp', 'agntnum', 'rank_score']]

print(top_agents)
print("Top recommended agents have been identified!")


In [None]:
# Eval
###########################################################################################

# Import necessary libraries for evaluation
from sklearn.metrics import silhouette_score, davies_bouldin_score, accuracy_score, classification_report, ndcg_score
import numpy as np
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# Reload the merged dataset so the evaluation starts from the raw file
df = pd.read_csv("../data/merged_data.csv")

# Blank strings count as missing; rows with any missing value are discarded
df = df.replace(r'^\s*$', np.nan, regex=True).dropna(axis=0)

# One 0/1 flag per product code found in the agent's expertise text
expertise_list = ['prod_0', 'prod_2', 'prod_4', 'prod_6', 'prod_7', 'prod_8', 'prod_9']
for expertise in expertise_list:
    df[f'agent_expertise_{expertise}'] = df['agent_product_expertise'].map(
        lambda raw_value: 1 if expertise in str(raw_value) else 0
    )
df.drop(columns=['agent_product_expertise'], inplace=True)

# Keep policies that are neither expired nor lapsed and have a positive premium
is_relevant_row = (
    (df['flg_expire'] != 1)
    & (df['flg_lapsed'] != 1)
    & (df['annual_premium'] > 0)
)
df = df[is_relevant_row]

# Columns that describe the policy and the selling agent
client_interest_columns = [
    'annual_premium', 'product', 'product_grp',
    'agent_age', 'agent_gender', 'agent_marital', 'agent_tenure',
    'cnt_converted', 'annual_premium_cnvrt',
    'pct_lapsed', 'pct_cancel', 'pct_inforce',
    'pct_prod_0_cnvrt', 'pct_prod_1_cnvrt', 'pct_prod_2_cnvrt', 'pct_prod_3_cnvrt',
    'pct_prod_4_cnvrt', 'pct_prod_5_cnvrt', 'pct_prod_6_cnvrt', 'pct_prod_7_cnvrt',
    'pct_prod_8_cnvrt', 'pct_prod_9_cnvrt',
    'pct_SX0_unknown', 'pct_SX1_male', 'pct_SX2_female',
    'pct_AG01_lt20', 'pct_AG02_20to24', 'pct_AG03_25to29', 'pct_AG04_30to34',
    'pct_AG05_35to39', 'pct_AG06_40to44', 'pct_AG07_45to49', 'pct_AG08_50to54',
    'pct_AG09_55to59', 'pct_AG10_60up',
    'cluster',
]

# One-hot encode the categorical columns (first level dropped), all as float
d = pd.get_dummies(df[client_interest_columns].copy(), drop_first=True).astype(float)

# Reduce the encoded frame to 10 MCA components
import prince
mca = prince.MCA(n_components=10, random_state=42)
mca_transformed = mca.fit(d).transform(d)

# Cluster the MCA components with a 10-component Gaussian mixture
gmm = GaussianMixture(n_components=10, random_state=42)
gmm.fit(mca_transformed)

# Store the predicted cluster of every row next to its features
d['client_interest_grp'] = gmm.predict(mca_transformed)

# Clustering quality metrics, all computed in the MCA space
cluster_labels = d['client_interest_grp']
silhouette_avg = silhouette_score(mca_transformed, cluster_labels)
davies_bouldin = davies_bouldin_score(mca_transformed, cluster_labels)
log_likelihood = gmm.lower_bound_

# Classification task: predict the cluster label from the MCA components
pre_gmm_df = mca_transformed
target = d['client_interest_grp']

# 80/20 train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    pre_gmm_df, target, test_size=0.2, random_state=42
)

# Shallow random forest, same settings as in the pipeline cell above
clf = RandomForestClassifier(
    random_state=42, max_depth=4, n_estimators=100, min_samples_leaf=10
)
clf.fit(X_train, y_train)

# Score the held-out rows
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, output_dict=True)

# Ranking Model (XGBoost Ranker)
# Relevance label: converted count divided by the in-force percentage
# (the small constant avoids division by zero).
df['ranking_score'] = df['cnt_converted'] / (df['pct_inforce'] + 1e-5)

ranking_features = ['cnt_converted', 'annual_premium_cnvrt', 'pct_lapsed', 'pct_cancel', 'pct_inforce',
                    'pct_prod_0_cnvrt', 'pct_prod_1_cnvrt', 'pct_prod_2_cnvrt', 'pct_prod_3_cnvrt', 'pct_prod_4_cnvrt']

X = df[ranking_features]
y = df['ranking_score']
group = d['client_interest_grp'].values  # one cluster label per row

X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    X, y, group, test_size=0.2, random_state=42, stratify=None
)

# set_group() interprets the sizes as consecutive runs of rows, so each split
# must be ordered by query group before the sizes are counted. Without this,
# the shuffled split mixes several clusters inside every "group".
train_order = np.argsort(group_train, kind='stable')
X_train, y_train, group_train = X_train.iloc[train_order], y_train.iloc[train_order], group_train[train_order]
test_order = np.argsort(group_test, kind='stable')
X_test, y_test, group_test = X_test.iloc[test_order], y_test.iloc[test_order], group_test[test_order]

# np.unique returns counts in sorted label order, matching the row order above
group_train_sizes = np.unique(group_train, return_counts=True)[1]
group_test_sizes = np.unique(group_test, return_counts=True)[1]

# Convert to DMatrix
train_data = xgb.DMatrix(X_train, label=y_train)
train_data.set_group(group_train_sizes)
test_data = xgb.DMatrix(X_test, label=y_test)
test_data.set_group(group_test_sizes)

# Train XGBoost Ranker
params = {'objective': 'rank:pairwise', 'eta': 0.1, 'max_depth': 6, 'eval_metric': 'ndcg'}
rank_model = xgb.train(params, train_data, num_boost_round=100)

# Predict rankings (one score per test row, in the re-ordered test order)
y_pred_rank = rank_model.predict(test_data)

# Compute NDCG@5 over the pooled test set (all clusters treated as one query)
ndcg_score_5 = ndcg_score([y_test], [y_pred_rank], k=5)

# Compute MRR: for every cluster, the reciprocal of the position at which the
# truly best row appears in the predicted ordering, averaged over clusters.
# The earlier formula averaged 1/position over *all* rows, which is the same
# constant for any model.
y_test_values = y_test.to_numpy()
reciprocal_ranks = []
group_start = 0
for group_size in group_test_sizes:
    group_stop = group_start + group_size
    true_best = np.argmax(y_test_values[group_start:group_stop])
    predicted_order = np.argsort(-y_pred_rank[group_start:group_stop], kind='stable')
    position = int(np.flatnonzero(predicted_order == true_best)[0]) + 1
    reciprocal_ranks.append(1.0 / position)
    group_start = group_stop
mrr = np.mean(reciprocal_ranks)

# Store results
results = {
    "Clustering": {
        "Silhouette Score": silhouette_avg,
        "Davies-Bouldin Index": davies_bouldin,
        "Log-Likelihood": log_likelihood
    },
    "Classification": {
        "Accuracy": accuracy,
        "Precision": classification_rep["weighted avg"]["precision"],
        "Recall": classification_rep["weighted avg"]["recall"],
        "F1-score": classification_rep["weighted avg"]["f1-score"]
    },
    "Ranking": {
        "NDCG@5": ndcg_score_5,
        "MRR": mrr
    }
}

print(results)

# Method 2: Representation Learning-Based Agent-Client Matching

In [None]:
## cosine_similarity.py
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from pathlib import Path
import os


# Read the source tables; all paths are relative to the working directory.
# Only `df` (the encoded table) is used below -- `df_agent` is rebuilt from it
# further down, and `df_client` / `df_policy` are not read again in this cell.
df_agent = pd.read_csv("../data/agent_data.csv")
df_client = pd.read_csv("../data/client_data.csv")
df_policy = pd.read_csv("../data/policy_data.csv")
df = pd.read_csv("../data/encoded_data_with_id_names_1_tovector.csv")


# Agent side: identifier first, then the feature columns
agent_feature_cols = [
    'agntnum',
    'pct_SX1_male',
    'agent_marital_S', 'agent_marital_U', 'agent_marital_W',
    'agent_marital_M', 'agent_marital_P', 'agent_marital_D',
    'pct_AG01_lt20', 'pct_AG02_20to24', 'pct_AG03_25to29', 'pct_AG04_30to34',
    'pct_AG05_35to39', 'pct_AG06_40to44', 'pct_AG07_45to49',
    'pct_AG08_50to54', 'pct_AG09_55to59', 'pct_AG10_60up',
    'pct_prod_0_cnvrt', 'pct_prod_1_cnvrt', 'pct_prod_2_cnvrt', 'pct_prod_3_cnvrt',
    'pct_prod_4_cnvrt', 'pct_prod_5_cnvrt', 'pct_prod_6_cnvrt', 'pct_prod_7_cnvrt',
    'pct_prod_8_cnvrt', 'pct_prod_9_cnvrt',
    'agent_expertise_prod_0', 'agent_expertise_prod_2', 'agent_expertise_prod_4',
    'agent_expertise_prod_6', 'agent_expertise_prod_7', 'agent_expertise_prod_8',
    'agent_expertise_prod_9',
    'economic_status_avg', 'household_size_avg', 'family_size_avg',
    'net_indicator',
]

# Customer side: identifier first, then one column per agent feature in the
# same position. Seven product_prod_* columns are listed a second time where
# the agent list has its agent_expertise_* columns, which keeps both vectors
# the same length.
customer_feature_cols = [
    'secuityno',
    'cltsex_M',
    'marryd_S', 'marryd_U', 'marryd_W', 'marryd_M', 'marryd_P', 'marryd_D',
    'Cpct_AG01_lt20', 'Cpct_AG02_20to24', 'Cpct_AG03_25to29', 'Cpct_AG04_30to34',
    'Cpct_AG05_35to39', 'Cpct_AG06_40to44', 'Cpct_AG07_45to49',
    'Cpct_AG08_50to54', 'Cpct_AG09_55to59', 'Cpct_AG10_60up',
    'product_prod_0', 'product_prod_1', 'product_prod_2', 'product_prod_3',
    'product_prod_4', 'product_prod_5', 'product_prod_6', 'product_prod_7',
    'product_prod_8', 'product_prod_9',
    'product_prod_0', 'product_prod_2', 'product_prod_4', 'product_prod_6',
    'product_prod_7', 'product_prod_8', 'product_prod_9',
    'economic_status', 'household_size', 'family_size',
    'inforce',
]
# Columns currently left out of both lists (inert string literal, never read)
'''
'cnt_converted', 'annual_premium_cnvrt','annual_premium‘
'agent_tenure' 'cust_tenure_at_purchase_grp_encoded',
'''

# One row per distinct agent / customer feature combination, missing values as 0
df_agent = df[agent_feature_cols].fillna(0).drop_duplicates()
df_customer = df[customer_feature_cols].fillna(0).drop_duplicates()


def recommend_agents_unsupervised(secuityno, df_customer, df_agent, top_k=3):
    """Return the agents whose feature rows are most cosine-similar to a customer.

    Args:
        secuityno: Value looked up in ``df_customer['secuityno']``.
        df_customer: Customer frame with a 'secuityno' column; every other
            column is used as a feature.
        df_agent: Agent frame with an 'agntnum' column; every other column is
            used as a feature.
        top_k: Maximum number of agents to return.

    Returns:
        A one-column DataFrame ('agntnum'), most similar agent first.

    Raises:
        ValueError: If ``secuityno`` does not occur in ``df_customer``.
    """
    if secuityno not in df_customer['secuityno'].values:
        raise ValueError(f"Customer ID {secuityno} not found in dataset")

    # Feature matrices with the identifier columns removed
    customer_rows = df_customer[df_customer['secuityno'] == secuityno]
    customer_matrix = customer_rows.drop(columns=['secuityno']).values
    agent_matrix = df_agent.drop(columns=['agntnum']).values

    # Only the first matching customer row is scored against the agents
    scores = cosine_similarity(customer_matrix, agent_matrix)[0]

    # Positions of the agents in descending similarity, cut to top_k
    best_positions = np.argsort(scores)[::-1][:top_k]
    return df_agent.iloc[best_positions][['agntnum']]

# Demo: the three most similar agents for one customer id
recommended_unsupervised = recommend_agents_unsupervised(
    'CIN:2161', df_customer, df_agent, top_k=3
)
print("\nUnsupervised Recommended Agents:")
print(recommended_unsupervised)

In [None]:
## cosine_similarity_embedding.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from pathlib import Path
import os

# Read the encoded table (path is relative to the working directory)
df = pd.read_csv("../data/encoded_data_with_id_names_1_tovector.csv")

# Agent side: identifier first, then the feature columns
agent_feature_cols = [
    'agntnum',
    'pct_SX1_male',
    'agent_marital_S', 'agent_marital_U', 'agent_marital_W',
    'agent_marital_M', 'agent_marital_P', 'agent_marital_D',
    'pct_AG01_lt20', 'pct_AG02_20to24', 'pct_AG03_25to29', 'pct_AG04_30to34',
    'pct_AG05_35to39', 'pct_AG06_40to44', 'pct_AG07_45to49',
    'pct_AG08_50to54', 'pct_AG09_55to59', 'pct_AG10_60up',
    'economic_status_avg', 'household_size_avg', 'family_size_avg',
    'net_indicator',
]

# Customer side: identifier first, then the feature columns
customer_feature_cols = [
    'secuityno',
    'cltsex_M',
    'marryd_S', 'marryd_U', 'marryd_W', 'marryd_M', 'marryd_P', 'marryd_D',
    'Cpct_AG01_lt20', 'Cpct_AG02_20to24', 'Cpct_AG03_25to29', 'Cpct_AG04_30to34',
    'Cpct_AG05_35to39', 'Cpct_AG06_40to44', 'Cpct_AG07_45to49',
    'Cpct_AG08_50to54', 'Cpct_AG09_55to59', 'Cpct_AG10_60up',
    'economic_status', 'household_size', 'family_size',
    'inforce',
]

# One row per distinct agent / customer feature combination, missing values as 0
df_agent = df[agent_feature_cols].fillna(0).drop_duplicates()
df_customer = df[customer_feature_cols].fillna(0).drop_duplicates()

class SimpleTransformerEncoder(nn.Module):
    """Encode flat feature vectors into ``d_model``-dimensional embeddings.

    The pipeline is: linear projection to ``d_model``, a stack of
    ``num_layers`` transformer encoder layers (batch-first), then a final
    linear layer. Each input row is fed to the transformer as a sequence of
    length one.
    """

    def __init__(self, input_dim, d_model=64, nhead=8, num_layers=2):
        super().__init__()
        # input_dim -> d_model
        self.input_fc = nn.Linear(input_dim, d_model)
        # num_layers identical encoder layers operating on (batch, seq, d_model)
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True),
            num_layers=num_layers,
        )
        # d_model -> d_model output head
        self.output_fc = nn.Linear(d_model, d_model)

    def forward(self, x):
        """Map ``x`` of shape (batch_size, input_dim) to (batch_size, d_model)."""
        projected = self.input_fc(x)                    # (batch_size, d_model)
        as_sequence = projected.unsqueeze(1)            # (batch_size, 1, d_model)
        encoded = self.transformer_encoder(as_sequence) # (batch_size, 1, d_model)
        flattened = encoded.squeeze(1)                  # (batch_size, d_model)
        return self.output_fc(flattened)
        
# Input widths: every listed column except the leading identifier
agent_input_dim = len(agent_feature_cols) - 1  # exclude the identifier column (e.g. 'agntnum')
customer_input_dim = len(customer_feature_cols) - 1  # exclude the identifier column (e.g. 'secuityno')

# Instantiate models
# NOTE(review): both encoders are used with their random initial weights (no
# training happens in this cell), so the similarities change from run to run.
agent_model = SimpleTransformerEncoder(input_dim=agent_input_dim, d_model=64, nhead=8, num_layers=2)
customer_model = SimpleTransformerEncoder(input_dim=customer_input_dim, d_model=64, nhead=8, num_layers=2)

# Inference only: switch off dropout, which the transformer layers apply while
# the module is in training mode and which would add noise to every embedding.
agent_model.eval()
customer_model.eval()

# Drop the identifier columns and convert to float tensors
agent_features = torch.tensor(df_agent.drop(columns=['agntnum']).values, dtype=torch.float32)
customer_features = torch.tensor(df_customer.drop(columns=['secuityno']).values, dtype=torch.float32)

# Get embeddings for all agents and customers; no autograd graph is needed
with torch.no_grad():
    agent_embeddings = agent_model(agent_features)      # shape: (n_agents, 64)
    customer_embeddings = customer_model(customer_features)  # shape: (n_customers, 64)

# Make the customer row labels equal to their row positions in customer_embeddings
df_customer = df_customer.reset_index(drop=True)
# Define a function to recommend agents for a given customer based on cosine similarity in the learned embedding space.
def recommend_agents_transformer(customer_id, df_customer, df_agent, customer_embeddings, agent_embeddings, top_k=3):
    """Return the agents whose embeddings are most cosine-similar to a customer's.

    Args:
        customer_id: Value looked up in ``df_customer['secuityno']``.
        df_customer: Customer frame; row *i* corresponds to
            ``customer_embeddings[i]``.
        df_agent: Agent frame with an 'agntnum' column; row *i* corresponds to
            ``agent_embeddings[i]``.
        customer_embeddings: Tensor of shape (n_customers, dim).
        agent_embeddings: Tensor of shape (n_agents, dim).
        top_k: Maximum number of agents to return.

    Returns:
        A one-column DataFrame ('agntnum'), most similar agent first, with at
        most ``top_k`` rows.

    Raises:
        IndexError: If ``customer_id`` does not occur in ``df_customer``.
    """
    # Use the row *position* of the first match, so the lookup is correct even
    # when df_customer's index labels are not 0..n-1.
    match_positions = (df_customer['secuityno'] == customer_id).to_numpy().nonzero()[0]
    if len(match_positions) == 0:
        raise IndexError(f"Customer ID {customer_id} not found in dataset")
    cust_embedding = customer_embeddings[int(match_positions[0])].unsqueeze(0)  # shape: (1, dim)

    # Compute cosine similarity between this customer and all agents
    cos_sim = F.cosine_similarity(cust_embedding, agent_embeddings)  # shape: (n_agents,)

    # topk raises when asked for more entries than exist, so cap the request
    k = min(top_k, cos_sim.shape[0])
    top_indices = torch.topk(cos_sim, k).indices
    return df_agent.iloc[top_indices.cpu().numpy()][['agntnum']]

# Demo: the three most similar agents for one customer id
recommended_agents = recommend_agents_transformer(
    'CIN:2818', df_customer, df_agent, customer_embeddings, agent_embeddings, top_k=3
)
print("Transformer-based Recommended Agents:")
print(recommended_agents)

In [None]:
## cosine_similarity_embedding_1.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from datetime import datetime
from pathlib import Path
import os

# Read the client table and the encoded table (paths relative to the working
# directory); only `df` is used below.
df_client = pd.read_csv("../data/client_data.csv")
df = pd.read_csv("../data/encoded_data_with_id_names_1_tovector.csv")

# Agent side: identifier first, then the feature columns
agent_feature_cols = [
    'agntnum',
    'pct_SX1_male',
    'agent_marital_S', 'agent_marital_U', 'agent_marital_W',
    'agent_marital_M', 'agent_marital_P', 'agent_marital_D',
    'pct_AG01_lt20', 'pct_AG02_20to24', 'pct_AG03_25to29', 'pct_AG04_30to34',
    'pct_AG05_35to39', 'pct_AG06_40to44', 'pct_AG07_45to49',
    'pct_AG08_50to54', 'pct_AG09_55to59', 'pct_AG10_60up',
    'economic_status_avg', 'household_size_avg', 'family_size_avg',
    'net_indicator',
]

# Customer side: feature columns only -- there is no 'secuityno' identifier
# here because customer vectors are built from raw input further down.
customer_feature_cols = [
    'cltsex_M',
    'marryd_S', 'marryd_U', 'marryd_W', 'marryd_M', 'marryd_P', 'marryd_D',
    'Cpct_AG01_lt20', 'Cpct_AG02_20to24', 'Cpct_AG03_25to29', 'Cpct_AG04_30to34',
    'Cpct_AG05_35to39', 'Cpct_AG06_40to44', 'Cpct_AG07_45to49',
    'Cpct_AG08_50to54', 'Cpct_AG09_55to59', 'Cpct_AG10_60up',
    'economic_status', 'household_size', 'family_size',
    'inforce',
]

# One row per distinct agent feature combination, missing values as 0.
# No customer frame is extracted in this cell.
df_agent = df[agent_feature_cols].fillna(0).drop_duplicates()


class SimpleTransformerEncoder(nn.Module):
    """Project feature vectors to ``d_model`` and refine them with a transformer.

    Layers, in order: ``input_fc`` (input_dim -> d_model),
    ``transformer_encoder`` (``num_layers`` batch-first encoder layers with
    ``nhead`` heads) and ``output_fc`` (d_model -> d_model).
    """

    def __init__(self, input_dim, d_model=64, nhead=8, num_layers=2):
        super().__init__()
        self.input_fc = nn.Linear(input_dim, d_model)
        layer_template = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(layer_template, num_layers=num_layers)
        self.output_fc = nn.Linear(d_model, d_model)

    def forward(self, x):
        """Return embeddings of shape (batch_size, d_model) for (batch_size, input_dim) input."""
        # The transformer needs a sequence axis: add one of length 1, then drop it again
        tokens = self.input_fc(x).unsqueeze(1)
        return self.output_fc(self.transformer_encoder(tokens).squeeze(1))

agent_input_dim = len(agent_feature_cols) - 1  # exclude the identifier column (e.g. 'agntnum')
customer_input_dim = len(customer_feature_cols)  # this list has no identifier column


# Instantiate models
# NOTE(review): both encoders are used with their random initial weights (no
# training happens in this cell), so the similarities change from run to run.
agent_model = SimpleTransformerEncoder(input_dim=agent_input_dim, d_model=64, nhead=8, num_layers=2)
customer_model = SimpleTransformerEncoder(input_dim=customer_input_dim, d_model=64, nhead=8, num_layers=2)

# Inference only: switch off dropout, which the transformer layers apply while
# the module is in training mode and which would add noise to every embedding.
agent_model.eval()
customer_model.eval()

# Drop the identifier column and convert to a float tensor
agent_features = torch.tensor(df_agent.drop(columns=['agntnum']).values, dtype=torch.float32)

# Get embeddings for all agents (customer embeddings are generated on the fly);
# no autograd graph is needed.
with torch.no_grad():
    agent_embeddings = agent_model(agent_features)      # shape: (n_agents, 64)


def calculate_age(born, today=None):
    """Return the age in completed years for a birth date.

    Args:
        born: Birth date as a ``YYYY-MM-DD`` string.
        today: Reference ``datetime`` the age is measured at. Defaults to the
            fixed date 2025-02-06, so results do not depend on when the code
            is run.

    Returns:
        Whole years between ``born`` and ``today``.

    Raises:
        ValueError: If ``born`` does not match ``%Y-%m-%d``.
    """
    born = datetime.strptime(born, "%Y-%m-%d")
    if today is None:
        today = datetime(2025, 2, 6)  # fixed reference date
    # One year less when this year's birthday has not been reached yet
    birthday_pending = (today.month, today.day) < (born.month, born.day)
    return today.year - born.year - (1 if birthday_pending else 0)


def create_customer_features(customer_data):
    """Build a (1, n_features) float32 tensor from one customer's raw fields.

    Reads 'cltsex', 'marryd' and 'cltdob' (required keys) plus
    'economic_status', 'household_size', 'family_size' and 'inforce' (each
    defaulting to 0 when absent). The columns are emitted in this order:
    cltsex_M, the six marryd_* flags, the ten Cpct_AG* age-band flags, then the
    four remaining fields.
    """
    marital_codes = ('S', 'U', 'W', 'M', 'P', 'D')
    age_band_columns = (
        'Cpct_AG01_lt20', 'Cpct_AG02_20to24', 'Cpct_AG03_25to29', 'Cpct_AG04_30to34',
        'Cpct_AG05_35to39', 'Cpct_AG06_40to44', 'Cpct_AG07_45to49',
        'Cpct_AG08_50to54', 'Cpct_AG09_55to59', 'Cpct_AG10_60up',
    )
    # Lowest age of every band after the first; the first band is "below 20"
    age_band_lower_bounds = (20, 25, 30, 35, 40, 45, 50, 55, 60)

    # Sex flag
    features = {'cltsex_M': 1 if customer_data['cltsex'] == 'M' else 0}

    # Marital status flags
    features.update(
        (f'marryd_{status}', 1 if customer_data['marryd'] == status else 0)
        for status in marital_codes
    )

    # Age-band flags: the band index is the number of lower bounds reached
    age = calculate_age(customer_data['cltdob'])
    band_index = sum(age >= lower_bound for lower_bound in age_band_lower_bounds)
    features.update(
        (column, 1 if position == band_index else 0)
        for position, column in enumerate(age_band_columns)
    )

    # Optional fields, 0 when missing
    for field in ('economic_status', 'household_size', 'family_size', 'inforce'):
        features[field] = customer_data.get(field, 0)

    # Single-row frame -> float tensor
    single_row = pd.DataFrame([features])
    return torch.tensor(single_row.values, dtype=torch.float32)

def recommend_agents_transformer(customer_data, df_agent, agent_embeddings, customer_model, top_k=3):
    """Recommend agents for a customer described by raw field values.

    Args:
        customer_data: Mapping passed to ``create_customer_features``.
        df_agent: Agent frame with an 'agntnum' column; row *i* corresponds to
            ``agent_embeddings[i]``.
        agent_embeddings: Tensor of shape (n_agents, dim).
        customer_model: ``nn.Module`` that embeds the customer feature tensor.
        top_k: Maximum number of agents to return.

    Returns:
        A one-column DataFrame ('agntnum'), most similar agent first, with at
        most ``top_k`` rows.
    """
    customer_features = create_customer_features(customer_data)

    # Score in eval mode and without autograd: dropout would otherwise make
    # the ranking differ between identical calls. The model's previous mode is
    # restored afterwards so callers that train it are unaffected.
    was_training = customer_model.training
    customer_model.eval()
    try:
        with torch.no_grad():
            customer_embedding = customer_model(customer_features)  # (1, dim)
            cos_sim = F.cosine_similarity(customer_embedding, agent_embeddings)  # (n_agents,)
    finally:
        customer_model.train(was_training)

    # topk raises when asked for more entries than exist, so cap the request
    k = min(top_k, cos_sim.shape[0])
    top_indices = torch.topk(cos_sim, k).indices
    return df_agent.iloc[top_indices.cpu().numpy()][['agntnum']]



# Demo: describe a customer with raw field values and ask for three agents
customer_data = dict(
    cltsex='M',
    cltdob='1990-05-15',
    marryd='M',
    economic_status=5,
    household_size=3,
    family_size=2,
    inforce=1,  # placeholder value, adjust as needed
)

recommended_agents = recommend_agents_transformer(
    customer_data, df_agent, agent_embeddings, customer_model, top_k=3
)
print("Transformer-based Recommended Agents:")
print(recommended_agents)

In [None]:
## cosine_similarity_training.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# =============================================================================
# Data Loading and Preprocessing (your provided code)
# =============================================================================

from pathlib import Path
import os

# Load the encoded feature table and the table of positive (customer, agent) examples
df = pd.read_csv("../data/encoded_data_with_id_names_1_tovector.csv")
df_filter = pd.read_csv("../data/positive_examples.csv")

# Agent-side columns: identifier first, numeric features after it
agent_feature_cols = [
    'agntnum', 'pct_SX1_male', 'agent_marital_S', 'agent_marital_U', 'agent_marital_W',
    'agent_marital_M', 'agent_marital_P', 'agent_marital_D',
    'pct_AG01_lt20', 'pct_AG02_20to24', 'pct_AG03_25to29', 'pct_AG04_30to34',
    'pct_AG05_35to39', 'pct_AG06_40to44', 'pct_AG07_45to49',
    'pct_AG08_50to54', 'pct_AG09_55to59', 'pct_AG10_60up', 
    'economic_status_avg', 'household_size_avg', 'family_size_avg',
    'net_indicator'
]

# Customer-side columns: identifier first, numeric features after it
customer_feature_cols = [
    'secuityno', 'cltsex_M', 'marryd_S', 'marryd_U', 'marryd_W', 'marryd_M',
    'marryd_P', 'marryd_D',
    'Cpct_AG01_lt20', 'Cpct_AG02_20to24', 'Cpct_AG03_25to29', 'Cpct_AG04_30to34',
    'Cpct_AG05_35to39', 'Cpct_AG06_40to44', 'Cpct_AG07_45to49',
    'Cpct_AG08_50to54', 'Cpct_AG09_55to59', 'Cpct_AG10_60up', 
    'economic_status', 'household_size', 'family_size',
    'inforce'
]

# Fill missing values and remove duplicates.
# The index is reset so that row labels equal row positions: the embedding
# tensors are later built from `.values` (purely positional), while the
# recommendation function looks customers up through `df_customer.index`.
# Without the reset, labels left over from `df` would point at the wrong
# embedding row or past the end of the tensor.
df_agent = df[agent_feature_cols].fillna(0).drop_duplicates().reset_index(drop=True)
df_customer = df[customer_feature_cols].fillna(0).drop_duplicates().reset_index(drop=True)


# =============================================================================
# Transformer Model Definition (your provided code with batch_first fixed)
# =============================================================================
class SimpleTransformerEncoder(nn.Module):
    """Turn a flat feature row into a ``d_model``-sized embedding.

    The pipeline is: linear projection -> Transformer encoder stack applied
    to a sequence of length one -> linear output projection.
    """

    def __init__(self, input_dim, d_model=64, nhead=8, num_layers=2):
        super().__init__()
        # Lift the raw features into the encoder's working dimension.
        self.input_fc = nn.Linear(input_dim, d_model)
        # batch_first=True keeps tensors laid out as (batch, seq, feature).
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True),
            num_layers=num_layers,
        )
        # Final projection into the embedding space.
        self.output_fc = nn.Linear(d_model, d_model)

    def forward(self, x):
        """Map ``x`` of shape (batch_size, input_dim) to (batch_size, d_model)."""
        projected = self.input_fc(x)
        # The encoder expects a sequence axis; add one of length 1 ...
        as_sequence = projected.unsqueeze(1)
        encoded = self.transformer_encoder(as_sequence)
        # ... and drop it again before the output projection.
        return self.output_fc(encoded.squeeze(1))

# Input widths: every listed column except the single identifier
# ('agntnum' for agents, 'secuityno' for customers).
agent_input_dim, customer_input_dim = (
    len(columns) - 1 for columns in (agent_feature_cols, customer_feature_cols)
)

# One encoder per side, built in agent-then-customer order.
agent_model, customer_model = (
    SimpleTransformerEncoder(input_dim=width, d_model=64, nhead=8, num_layers=2)
    for width in (agent_input_dim, customer_input_dim)
)

# =============================================================================
# Preparing Positive Pairs for Training
# =============================================================================
# Every row of `df_filter` carries both the customer columns and the agent
# columns, so row i of the two tensors built below forms one positive
# (customer, agent) pair. Rows are ranked by their 'cosine_sim' column and
# only the highest-ranked share is used for training.

TRAIN_FRACTION = 0.3  # share of the ranked rows used for training

df_filter = df_filter.sort_values(by='cosine_sim', ascending=False).reset_index(drop=True)
cutoff_index = int(TRAIN_FRACTION * len(df_filter))
# NOTE: the name says "20pct" but the slice holds the top TRAIN_FRACTION (30%)
# of rows; the name is kept so existing references keep working.
df_top_20pct = df_filter.iloc[:cutoff_index].copy()

# Agent and customer columns of the training subset. Missing values are
# filled with 0 exactly as for `df_agent` / `df_customer`, so the encoders see
# the same convention during training and inference and a stray NaN cannot
# turn the loss into NaN.
train_agent_df = df_top_20pct[agent_feature_cols].fillna(0)
train_customer_df = df_top_20pct[customer_feature_cols].fillna(0)

# Convert the features (identifier columns dropped) to float tensors
train_customer_tensor = torch.tensor(train_customer_df.drop(columns=['secuityno']).values, dtype=torch.float32)
train_agent_tensor = torch.tensor(train_agent_df.drop(columns=['agntnum']).values, dtype=torch.float32)


# =============================================================================
# Contrastive Loss (InfoNCE) Definition
# =============================================================================
def contrastive_loss(customer_embeddings, agent_embeddings, temperature=0.07):
    """Compute the symmetric InfoNCE loss between customer and agent embeddings.

    The i-th customer row and the i-th agent row are treated as the positive
    pair; every other combination in the batch acts as a negative. Note that
    if the same agent (or customer) occurs in several rows, those rows are
    still scored against each other as negatives.

    Args:
        customer_embeddings: Tensor of shape (N, D).
        agent_embeddings: Tensor of shape (N, D).
        temperature: Divisor applied to the cosine similarities before the
            softmax; smaller values sharpen the distribution.

    Returns:
        Scalar tensor: the mean of the customer->agent and agent->customer
        cross-entropy losses.
    """
    # Unit-normalise each row, then one matrix product yields all pairwise
    # cosine similarities as an (N, N) matrix. This avoids the (N, N, D)
    # intermediate that a broadcasted F.cosine_similarity would allocate.
    # eps matches the default of F.cosine_similarity.
    customer_unit = F.normalize(customer_embeddings, p=2, dim=-1, eps=1e-8)
    agent_unit = F.normalize(agent_embeddings, p=2, dim=-1, eps=1e-8)
    similarity_matrix = (customer_unit @ agent_unit.T) / temperature

    # The positive for row i sits on the diagonal, i.e. at column i.
    targets = torch.arange(similarity_matrix.size(0), device=similarity_matrix.device)

    # Cross-entropy in both directions keeps the loss symmetric.
    loss_cust_to_agent = F.cross_entropy(similarity_matrix, targets)
    loss_agent_to_cust = F.cross_entropy(similarity_matrix.T, targets)

    return (loss_cust_to_agent + loss_agent_to_cust) / 2

# =============================================================================
# Training and Fine-Tuning Loop
# =============================================================================
# Settings for the contrastive fine-tuning run
num_epochs = 10
temperature = 0.07
# A single optimizer updates both encoders jointly.
optimizer = optim.Adam(
    [*agent_model.parameters(), *customer_model.parameters()],
    lr=1e-3,
)

for epoch in range(num_epochs):
    # Full-batch training: each epoch is one step over all training pairs.
    agent_model.train()
    customer_model.train()

    # Clear gradients left over from the previous step.
    optimizer.zero_grad()

    # Forward pass through both encoders (customer side first).
    customer_embeds = customer_model(train_customer_tensor)
    agent_embeds = agent_model(train_agent_tensor)

    # Row i of each tensor is the positive pair for the loss.
    loss = contrastive_loss(customer_embeds, agent_embeds, temperature)
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")

# Persist both encoders plus the optimizer state in a single file.
checkpoint_state = {
    'epoch': num_epochs,
    'agent_model_state_dict': agent_model.state_dict(),
    'customer_model_state_dict': customer_model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict()
}
torch.save(checkpoint_state, 'checkpoint.pth')

# =============================================================================
# After Training: Use the Fine-Tuned Models for Recommendation
# =============================================================================
# Get embeddings for all agents and customers using the fine-tuned models.
# (Convert the corresponding DataFrames to tensors as before.)

# Feature matrices for all agents / customers (identifier columns dropped)
agent_features = torch.tensor(df_agent.drop(columns=['agntnum']).values, dtype=torch.float32)
customer_features = torch.tensor(df_customer.drop(columns=['secuityno']).values, dtype=torch.float32)

# The models are still in training mode after the loop above, which keeps the
# dropout inside the Transformer layers active and makes the embeddings
# random from call to call. Switch to eval mode, and disable autograd so no
# graph is kept for these (potentially large) inference batches.
agent_model.eval()
customer_model.eval()
with torch.no_grad():
    agent_embeddings = agent_model(agent_features)      # shape: (n_agents, d_model)
    customer_embeddings = customer_model(customer_features)  # shape: (n_customers, d_model)

# Define the recommendation function using the fine-tuned embeddings.
def recommend_agents_transformer(customer_id, df_customer, df_agent, customer_embeddings, agent_embeddings, top_k=3):
    """Return the ``top_k`` agents most similar to a known customer.

    Args:
        customer_id: Value to look up in ``df_customer['secuityno']``.
        df_customer: Customer table, row-aligned with ``customer_embeddings``.
        df_agent: Agent table with an ``agntnum`` column, row-aligned with
            ``agent_embeddings``.
        customer_embeddings: Tensor with one embedding row per customer row.
        agent_embeddings: Tensor with one embedding row per agent row.
        top_k: Maximum number of agents to return.

    Returns:
        A DataFrame with the single column ``agntnum``, most similar agent
        first. It has fewer than ``top_k`` rows when fewer agents exist.

    Raises:
        IndexError: If ``customer_id`` does not occur in ``df_customer``, or
            if its row position lies beyond ``customer_embeddings``.
    """
    # Use the row POSITION, not the index label: the embeddings were built
    # positionally, and the label differs from the position whenever the
    # frame's index is not 0..n-1 (e.g. after drop_duplicates()).
    matches = (df_customer['secuityno'] == customer_id).to_numpy().nonzero()[0]
    if matches.size == 0:
        raise IndexError(f"customer {customer_id!r} not found in df_customer")
    cust_idx = int(matches[0])
    cust_embedding = customer_embeddings[cust_idx].unsqueeze(0)  # shape: (1, d_model)

    # Cosine similarity between this customer and every agent.
    cos_sim = F.cosine_similarity(cust_embedding, agent_embeddings)  # shape: (n_agents,)

    # torch.topk raises if k is larger than the number of candidates.
    k = min(top_k, cos_sim.shape[0])
    top_indices = torch.topk(cos_sim, k).indices
    recommended_agents = df_agent.iloc[top_indices.detach().cpu().numpy()][['agntnum']]
    return recommended_agents

# Demo: look up one known customer and list the closest agents.
recommended_agents = recommend_agents_transformer(
    'CIN:2161', df_customer, df_agent, customer_embeddings, agent_embeddings, top_k=3
)
# Header and result go on separate lines, exactly like two print calls.
print("Transformer-based Recommended Agents (Fine-Tuned):", recommended_agents, sep="\n")

In [None]:
#evaluation.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from pathlib import Path
import os

# 1. Load data

# Encoded table that holds both the agent and the customer feature columns
df = pd.read_csv("../data/encoded_data_with_id_names_1_tovector.csv")

# Agent side: identifier, gender share, marital flags, age-band shares, averages
agent_feature_cols = [
    'agntnum',
    'pct_SX1_male',
    'agent_marital_S', 'agent_marital_U', 'agent_marital_W',
    'agent_marital_M', 'agent_marital_P', 'agent_marital_D',
    'pct_AG01_lt20', 'pct_AG02_20to24', 'pct_AG03_25to29', 'pct_AG04_30to34',
    'pct_AG05_35to39', 'pct_AG06_40to44', 'pct_AG07_45to49',
    'pct_AG08_50to54', 'pct_AG09_55to59', 'pct_AG10_60up',
    'economic_status_avg', 'household_size_avg', 'family_size_avg',
    'net_indicator',
]

# Customer side: identifier, gender flag, marital flags, age-band flags, household data
customer_feature_cols = [
    'secuityno',
    'cltsex_M',
    'marryd_S', 'marryd_U', 'marryd_W',
    'marryd_M', 'marryd_P', 'marryd_D',
    'Cpct_AG01_lt20', 'Cpct_AG02_20to24', 'Cpct_AG03_25to29', 'Cpct_AG04_30to34',
    'Cpct_AG05_35to39', 'Cpct_AG06_40to44', 'Cpct_AG07_45to49',
    'Cpct_AG08_50to54', 'Cpct_AG09_55to59', 'Cpct_AG10_60up',
    'economic_status', 'household_size', 'family_size',
    'inforce',
]


def _distinct_rows(table, columns):
    """Select ``columns``, zero-fill gaps, drop repeated rows, renumber from 0."""
    selected = table[columns]
    # Renumbering matters: later code uses index labels as tensor positions.
    return selected.fillna(0).drop_duplicates().reset_index(drop=True)


df_agent = _distinct_rows(df, agent_feature_cols)
df_customer = _distinct_rows(df, customer_feature_cols)


# 2. Define model architecture (Same as before)
class SimpleTransformerEncoder(nn.Module):
    """Linear projection, Transformer encoder stack, linear output head.

    Attribute names must stay as they are: the checkpoint's state-dict keys
    are derived from them.
    """

    def __init__(self, input_dim, d_model=64, nhead=8, num_layers=2):
        super().__init__()
        self.input_fc = nn.Linear(input_dim, d_model)
        layer_template = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(layer_template, num_layers=num_layers)
        self.output_fc = nn.Linear(d_model, d_model)

    def forward(self, x):
        """Encode a (batch, input_dim) tensor into a (batch, d_model) tensor."""
        # Each row becomes a one-element sequence for the encoder stack.
        hidden = self.input_fc(x).unsqueeze(1)
        hidden = self.transformer_encoder(hidden)
        # Remove the length-1 sequence axis before the output head.
        hidden = hidden.squeeze(1)
        return self.output_fc(hidden)

# Feature widths: all listed columns minus the one identifier column per side
agent_input_dim = len(agent_feature_cols) - 1
customer_input_dim = len(customer_feature_cols) - 1

# Rebuild both encoders with the architecture arguments spelled out by name
agent_model = SimpleTransformerEncoder(
    input_dim=agent_input_dim, d_model=64, nhead=8, num_layers=2
)
customer_model = SimpleTransformerEncoder(
    input_dim=customer_input_dim, d_model=64, nhead=8, num_layers=2
)

# 3. Restore the trained weights from the checkpoint file
checkpoint = torch.load("../src/checkpoint.pth")
agent_model.load_state_dict(checkpoint['agent_model_state_dict'])
customer_model.load_state_dict(checkpoint['customer_model_state_dict'])
print(f"Loaded model from epoch {checkpoint['epoch']}")

# 4. Embed every agent and customer row.
# Eval mode plus no_grad: this is inference only.
agent_model.eval()
customer_model.eval()

agent_matrix = df_agent.drop(columns=['agntnum']).to_numpy()
customer_matrix = df_customer.drop(columns=['secuityno']).to_numpy()
agent_features = torch.tensor(agent_matrix, dtype=torch.float32)
customer_features = torch.tensor(customer_matrix, dtype=torch.float32)

with torch.no_grad():
    agent_embeddings = agent_model(agent_features)
    customer_embeddings = customer_model(customer_features)

# 5. Recommendation function (Same as before)
def recommend_agents_transformer(customer_id, df_customer, df_agent, customer_embeddings, agent_embeddings, top_k=3):
    """Return the ``top_k`` agents most similar to a known customer.

    Args:
        customer_id: Value to look up in ``df_customer['secuityno']``.
        df_customer: Customer table, row-aligned with ``customer_embeddings``.
        df_agent: Agent table with an ``agntnum`` column, row-aligned with
            ``agent_embeddings``.
        customer_embeddings: Tensor with one embedding row per customer row.
        agent_embeddings: Tensor with one embedding row per agent row.
        top_k: Maximum number of agents to return.

    Returns:
        A DataFrame with the single column ``agntnum`` (most similar first,
        at most ``top_k`` rows), or ``None`` after printing a message when
        the customer is unknown or has no embedding row.
    """
    # Use the row POSITION rather than the index label, so the lookup stays
    # correct even if the frame's index is not 0..n-1.
    matches = (df_customer['secuityno'] == customer_id).to_numpy().nonzero()[0]
    if matches.size == 0:
        print(f"Warning: Customer {customer_id} not found in df_customer.")
        return None
    cust_idx = int(matches[0])
    if cust_idx >= customer_embeddings.shape[0]:
        print(f"Error: Index {cust_idx} is out of bounds for customer_embeddings with size {customer_embeddings.shape[0]}")
        return None

    cust_embed = customer_embeddings[cust_idx].unsqueeze(0)
    cos_sim = F.cosine_similarity(cust_embed, agent_embeddings)

    # torch.topk raises if k is larger than the number of candidates.
    k = min(top_k, cos_sim.shape[0])
    top_idx = torch.topk(cos_sim, k).indices
    return df_agent.iloc[top_idx.detach().cpu().numpy()][['agntnum']]

# 6. Evaluate with a simple NDCG (Same as before)
def ndcg_at_k(recommended_agents, ground_truth_agents, k=3):
    """Compute binary-relevance NDCG@k for one ranked recommendation list.

    Args:
        recommended_agents: Ranked sequence of agent ids, best first.
        ground_truth_agents: Collection of relevant (hashable) agent ids.
        k: Number of leading recommendations to score.

    Returns:
        A float in [0, 1]. It is 0.0 when there are no relevant agents or
        ``k`` is not positive.
    """
    # Build the set once: O(1) membership tests, and duplicate ground-truth
    # ids cannot inflate the ideal DCG.
    relevant = set(ground_truth_agents)
    ideal_count = min(len(relevant), k)
    if ideal_count <= 0:
        return 0.0

    dcg = 0.0
    credited = set()
    for rank, agent in enumerate(recommended_agents[:k], start=1):
        # Credit each relevant agent only once; otherwise a list repeating
        # the same agent would score above the ideal ranking (NDCG > 1).
        if agent in relevant and agent not in credited:
            credited.add(agent)
            dcg += 1.0 / np.log2(rank + 1)

    idcg = sum(1.0 / np.log2(rank + 1) for rank in range(1, ideal_count + 1))
    return float(dcg / idcg)

# 7. compare_recommendations_with_ground_truth (Corrected Version)
def _agent_similarities(agent_ids, customer_embed, df_agent, agent_embeddings):
    """Return ``(agent_id, cosine_similarity)`` for each id found in ``df_agent``."""
    sims = []
    for agent_id in agent_ids:
        # Row position (not index label) of the first matching agent row.
        positions = (df_agent['agntnum'] == agent_id).to_numpy().nonzero()[0]
        if positions.size == 0:
            # Ids without a row in df_agent are skipped silently.
            continue
        agent_embed = agent_embeddings[int(positions[0])].unsqueeze(0)
        sims.append((agent_id, F.cosine_similarity(customer_embed, agent_embed).item()))
    return sims


def compare_recommendations_with_ground_truth(
    customer_id,
    recommended_agent_ids,
    ground_truth_agent_ids,
    df_customer,
    df_agent,
    customer_embeddings,
    agent_embeddings
):
    """Compare embedding similarity of recommended vs. ground-truth agents.

    Args:
        customer_id: Value to look up in ``df_customer['secuityno']``.
        recommended_agent_ids: Iterable of recommended ``agntnum`` values.
        ground_truth_agent_ids: Collection of ground-truth ``agntnum`` values.
        df_customer: Customer table, row-aligned with ``customer_embeddings``.
        df_agent: Agent table, row-aligned with ``agent_embeddings``.
        customer_embeddings: Tensor with one embedding row per customer row.
        agent_embeddings: Tensor with one embedding row per agent row.

    Returns:
        ``{"error": ...}`` when the customer is unknown; otherwise a dict with
        the per-agent similarity lists, the maximum similarity of each list
        (0.0 for an empty list), whether the best recommendation beats the
        best ground-truth agent, and whether any recommendation is itself a
        ground-truth agent.
    """
    # Positional lookup keeps tables and tensors aligned even if the frame's
    # index is not 0..n-1.
    customer_positions = (df_customer['secuityno'] == customer_id).to_numpy().nonzero()[0]
    if customer_positions.size == 0:
        return {"error": f"Customer {customer_id} not found in df_customer"}
    customer_embed = customer_embeddings[int(customer_positions[0])].unsqueeze(0)

    recommended_sims = _agent_similarities(
        recommended_agent_ids, customer_embed, df_agent, agent_embeddings
    )
    ground_truth_sims = _agent_similarities(
        ground_truth_agent_ids, customer_embed, df_agent, agent_embeddings
    )

    max_recommended_sim = max((s for _, s in recommended_sims), default=0.0)
    max_ground_truth_sim = max((s for _, s in ground_truth_sims), default=0.0)

    is_better = (max_recommended_sim > max_ground_truth_sim)
    matched_any_gt = any(agent_id in ground_truth_agent_ids for agent_id, _ in recommended_sims)

    return {
        "recommended_sims": recommended_sims,
        "ground_truth_sims": ground_truth_sims,
        "max_recommended_sim": max_recommended_sim,
        "max_ground_truth_sim": max_ground_truth_sim,
        "is_better_than_gt": is_better,
        "matched_any_gt": matched_any_gt
    }
#8 Evaluation Loop
# Rank the positive examples by similarity and keep the 30%-40% band as the
# held-out evaluation slice.
df_filter = pd.read_csv("../data/positive_examples.csv")
df_filter = df_filter.sort_values(by='cosine_sim', ascending=False).reset_index(drop=True)
total_rows = len(df_filter)
start_index, end_index = int(0.3 * total_rows), int(0.4 * total_rows)
df_top_30_40pct = df_filter.iloc[start_index:end_index].copy()

# Keep only the digits of each agent id and store them as integers.
agent_digits = df_top_30_40pct['agntnum'].str.extract(r'(\d+)')
df_top_30_40pct['agntnum'] = agent_digits.astype(int)

# customer id -> set of agents observed for that customer in the slice
ground_truth_mapping = df_top_30_40pct.groupby('secuityno')['agntnum'].apply(set).to_dict()
test_customers = list(ground_truth_mapping)
ndcg_scores = []
k = 3

for cust_id in test_customers:
    recommended_df = recommend_agents_transformer(
        cust_id, df_customer, df_agent, customer_embeddings, agent_embeddings, top_k=k
    )
    # Customers without recommendations are left out of the average.
    if recommended_df is None:
        continue

    recommended_agents = recommended_df['agntnum'].tolist()
    ground_truth_agents = ground_truth_mapping[cust_id]

    ndcg_score = ndcg_at_k(recommended_agents, ground_truth_agents, k)
    ndcg_scores.append(ndcg_score)

    cmp_result = compare_recommendations_with_ground_truth(
        cust_id,
        recommended_agents,
        ground_truth_agents,
        df_customer,
        df_agent,
        customer_embeddings,
        agent_embeddings,
    )

    print(f"Customer {cust_id} -> recommended: {recommended_agents}, ground_truth: {ground_truth_agents}")
    print(f"  NDCG@{k} = {ndcg_score:.3f}")
    if "error" in cmp_result:
        print(f"  Error: {cmp_result['error']}")
    else:
        print(f"  Max recommended similarity: {cmp_result['max_recommended_sim']:.4f}")
        print(f"  Max ground truth similarity: {cmp_result['max_ground_truth_sim']:.4f}")
        print(f"  Is recommended better than ground truth? {cmp_result['is_better_than_gt']}")
        print(f"  Overlap with GT agents? {cmp_result['matched_any_gt']}")
    print("-" * 50)

# Report the average only if at least one customer was scored.
if ndcg_scores:
    mean_ndcg = np.mean(ndcg_scores)
    print(f"\nMean NDCG@{k}: {mean_ndcg:.3f}")