In [99]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.losses import CosineSimilarity
import os
import random


In [100]:
# Fix all random seeds to ensure consistent results
SEED = 42
# os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [101]:
# The field weights (total = 1.0)
feature_weights = {
        "Language": 0.25,
        "Google rating": 0.20,
        "Success rate": 0.15,
        "Charge": 0.10,
        "Visa type": 0.10,
        "Experience_years": 0.05,
        "Booking preference": 0.05,
        "Location": 0.03,
        "Availability": 0.03,
        "Employment Type": 0.04
    }

In [108]:
# Load and preprocess data
df = pd.read_csv("../Dataset/requirements_data_5.6.csv")
names_df = df[['Full_name', 'MARN']]
names_df[["Full_name", "MARN"]].to_csv("./agent_ids.csv", index=False)
df__cc = df.copy()
df = df.drop(columns=['Full_name', 'MARN'], errors='ignore').dropna()

names_df[["Full_name", "MARN"]].to_csv("model/agent_ids.csv", index=False)

categorical_cols = df.select_dtypes(include='object').columns.tolist()
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

scaler = MinMaxScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Split data
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
input_dim = df.shape[1]

# Create  training data
def build_triplet_data(data, num_triplets=5000):
    anchors, positives, negatives = [], [], []

    feature_order = list(data.columns)
    weight_vector = np.array([feature_weights.get(col, 1.0) for col in feature_order], dtype=np.float32)

    for _ in range(num_triplets):
        a, p = np.random.choice(len(data), 2, replace=False)
        n = np.random.choice([i for i in range(len(data)) if i != a])

        anchor_vec = data.iloc[a].values.astype(np.float32) * weight_vector
        pos_vec = data.iloc[p].values.astype(np.float32) * weight_vector
        neg_vec = data.iloc[n].values.astype(np.float32) * weight_vector

        anchors.append(anchor_vec)
        positives.append(pos_vec)
        negatives.append(neg_vec)

    return np.array(anchors), np.array(positives), np.array(negatives)

X_anchor, X_positive, X_negative = build_triplet_data(train_df, num_triplets=5000)

# save training data
import os
os.makedirs("train", exist_ok=True)

pd.DataFrame(X_anchor, columns=df.columns).to_csv("train/train_anchor_samples.csv", index=False)
pd.DataFrame(X_positive, columns=df.columns).to_csv("train/train_positive_samples.csv", index=False)
pd.DataFrame(X_negative, columns=df.columns).to_csv("train/train_negative_samples.csv", index=False)

class L2Normalization(tf.keras.layers.Layer):
    def call(self, inputs):
        return tf.math.l2_normalize(inputs, axis=1)


# Define embedding tower 
def build_tower(input_dim):
    inp = layers.Input(shape=(input_dim,))
    x = layers.Dense(128, activation='relu')(inp)
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dense(32)(x)
    x = L2Normalization()(x)
    return inp, x


anchor_in, anchor_vec = build_tower(input_dim)
pos_in, pos_vec = build_tower(input_dim)
neg_in, neg_vec = build_tower(input_dim)



# Triplet loss 
def triplet_loss(y_true, y_pred):
    a, p, n = y_pred[:, :32], y_pred[:, 32:64], y_pred[:, 64:]
    pos_dist = tf.reduce_sum(tf.square(a - p), axis=1)
    neg_dist = tf.reduce_sum(tf.square(a - n), axis=1)
    return tf.reduce_mean(tf.maximum(pos_dist - neg_dist + 0.2, 0))

merged = layers.Concatenate()([anchor_vec, pos_vec, neg_vec])
model = Model(inputs=[anchor_in, pos_in, neg_in], outputs=merged)
model.compile(optimizer='adam', loss=triplet_loss)

# Train model
model.fit([X_anchor, X_positive, X_negative], np.zeros(len(X_anchor)), epochs=10, batch_size=64, verbose=1)

# Export tower model and generate agent embeddings 
tower_model = Model(inputs=anchor_in, outputs=anchor_vec)
agent_embeddings = tower_model.predict(df.to_numpy())



OSError: Cannot save file into a non-existent directory: 'model'

In [103]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,classification_report, confusion_matrix


#  save testing data
os.makedirs("test", exist_ok=True)
pd.DataFrame(X_anchor, columns=df.columns).to_csv("test/test_anchor_samples.csv", index=False)
pd.DataFrame(X_positive, columns=df.columns).to_csv("test/test_positive_samples.csv", index=False)
pd.DataFrame(X_negative, columns=df.columns).to_csv("test/test_negative_samples.csv", index=False)


# The model outputs the feature vector and calculates the similarity
y_pred_triplet = model.predict([X_anchor, X_positive, X_negative])
a_vec, p_vec, n_vec = y_pred_triplet[:, :32], y_pred_triplet[:, 32:64], y_pred_triplet[:, 64:]


cos_sim_ap = np.sum(a_vec * p_vec, axis=1)
cos_sim_an = np.sum(a_vec * n_vec, axis=1)

# Construct labels and scores
y_true = np.concatenate([np.ones_like(cos_sim_ap), np.zeros_like(cos_sim_an)])
y_scores = np.concatenate([cos_sim_ap, cos_sim_an])
y_pred_binary = (y_scores > 0.5).astype(int)

# Output evaluation indicators
acc = accuracy_score(y_true, y_pred_binary)
prec = precision_score(y_true, y_pred_binary)
rec = recall_score(y_true, y_pred_binary)
f1 = f1_score(y_true, y_pred_binary)
auc = roc_auc_score(y_true, y_scores)
cm = confusion_matrix(y_true, y_pred_binary)

print("\n=== score ===")
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1 Score:  {f1:.4f}")
# print(f"AUC:       {auc:.4f}")
print(f"Confusion Matrix:\n{cm}")
print("\nClassification Report:\n", classification_report(y_true, y_pred_binary))



[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 582us/step

=== score ===
Accuracy:  0.9788
Precision: 1.0000
Recall:    0.9576
F1 Score:  0.9783
Confusion Matrix:
[[5000    0]
 [ 212 4788]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.96      1.00      0.98      5000
         1.0       1.00      0.96      0.98      5000

    accuracy                           0.98     10000
   macro avg       0.98      0.98      0.98     10000
weighted avg       0.98      0.98      0.98     10000



In [104]:
def recommend_with_weighted_match(user_input_dict, top_k, agent_df, label_encoders, feature_weights):
    explanations = []

    for _, agent_data in agent_df.iterrows():
        explanation = {
            "Full_name": agent_data["Full_name"],
            "MARN": agent_data["MARN"],
            "Matched Fields": [],
            "Match Score": 0.0,
        }
        match_weight = 0.0

        for field, user_val in user_input_dict.items():
            if field not in feature_weights:
                continue
            field_weight = feature_weights[field]
            agent_val = agent_data[field]

            # If it is a categorical field
            if field in label_encoders:
                le = label_encoders[field]
                if user_val in le.classes_:
                    user_encoded = le.transform([user_val])[0]
                    agent_encoded = agent_val
                    if user_encoded == agent_encoded:
                        explanation["Matched Fields"].append(field)
                        match_weight += field_weight
            else:
                # Numeric field: Use isclose
                if isinstance(agent_val, (int, float, np.number)) and isinstance(user_val, (int, float, np.number)):
                    if np.isclose(agent_val, user_val):
                        explanation["Matched Fields"].append(field)
                        match_weight += field_weight

        explanation["Match Score"] = round(match_weight, 4)
        explanations.append(explanation)

    # Sort by the matching score
    sorted_explanations = sorted(explanations, key=lambda x: x["Match Score"], reverse=True)
    return sorted_explanations[:top_k]


In [105]:
def explain_recommendation(user_input_dict, top_agents_df, label_encoders, df_raw, top_k=3):
    explanations = []

    for i in range(top_k):
        agent_row = top_agents_df.iloc[i]
        agent_index = df_raw[df_raw["MARN"] == agent_row["MARN"]].index[0]
        agent_data = df_raw.loc[agent_index]
        
        matched_fields = []
        for field, user_val in user_input_dict.items():
            if field in label_encoders:
                le = label_encoders[field]
                if user_val in le.classes_:
                    user_encoded = le.transform([user_val])[0]
                    agent_encoded =  agent_data[field] 
                    if user_encoded == agent_encoded:
                        matched_fields.append(field)
            else:
                # Numeric fields compare whether they are equal
                try:
                    user_val = float(user_val)
                    agent_val = float(agent_data[field])
                    if abs(user_val - agent_val) < 1e-3:
                        matched_fields.append(field)
                except:
                    continue
        
        explanation = {
            "Full_name": agent_row["Full_name"],
            "MARN": agent_row["MARN"],
            "Match Score": agent_row["Match Score"],
            "Matched Fields": matched_fields
        }
        explanations.append(explanation)

    return explanations


In [106]:
# df.columns

In [107]:
# 假设你训练好的模型变量名是 model
# from tensorflow.keras.models import save_model
tower_model.save("towers1111_model.keras")


