In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.losses import CosineSimilarity

In [2]:
# Load and preprocess data
df = pd.read_csv("requirements_data_5.6.csv")
names_df = df[['Full_name', 'MARN']]
df = df.drop(columns=['Full_name', 'MARN'], errors='ignore').dropna()

categorical_cols = df.select_dtypes(include='object').columns.tolist()
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

scaler = MinMaxScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Split data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
input_dim = df.shape[1]

# Create triplet training data
def build_triplet_data(data, num_triplets=5000):
    anchors, positives, negatives = [], [], []
    for _ in range(num_triplets):
        a, p = np.random.choice(len(data), 2, replace=False)
        n = np.random.choice([i for i in range(len(data)) if i != a])
        anchors.append(data.iloc[a].values)
        positives.append(data.iloc[p].values)
        negatives.append(data.iloc[n].values)
    return np.array(anchors), np.array(positives), np.array(negatives)

X_anchor, X_positive, X_negative = build_triplet_data(train_df)


# save training data
import os
os.makedirs("train", exist_ok=True)

pd.DataFrame(X_anchor, columns=df.columns).to_csv("train/train_anchor_samples.csv", index=False)
pd.DataFrame(X_positive, columns=df.columns).to_csv("train/train_positive_samples.csv", index=False)
pd.DataFrame(X_negative, columns=df.columns).to_csv("train/train_negative_samples.csv", index=False)


# Define embedding tower 
def build_tower(input_dim):
    inp = layers.Input(shape=(input_dim,))
    x = layers.Dense(128, activation='relu')(inp)
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dense(32)(x)
    x = layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1))(x)

    return inp, x

anchor_in, anchor_vec = build_tower(input_dim)
pos_in, pos_vec = build_tower(input_dim)
neg_in, neg_vec = build_tower(input_dim)




# Triplet loss 
def triplet_loss(y_true, y_pred):
    a, p, n = y_pred[:, :32], y_pred[:, 32:64], y_pred[:, 64:]
    pos_dist = tf.reduce_sum(tf.square(a - p), axis=1)
    neg_dist = tf.reduce_sum(tf.square(a - n), axis=1)
    return tf.reduce_mean(tf.maximum(pos_dist - neg_dist + 0.2, 0))

merged = layers.Concatenate()([anchor_vec, pos_vec, neg_vec])
model = Model(inputs=[anchor_in, pos_in, neg_in], outputs=merged)
model.compile(optimizer='adam', loss=triplet_loss)

# Train model
model.fit([X_anchor, X_positive, X_negative], np.zeros(len(X_anchor)), epochs=10, batch_size=64, verbose=1)

# Export tower model and generate agent embeddings 
tower_model = Model(inputs=anchor_in, outputs=anchor_vec)
agent_embeddings = tower_model.predict(df.to_numpy())




Epoch 1/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 674us/step - loss: 0.0163 
Epoch 2/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 746us/step - loss: 0.0000e+00
Epoch 3/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 758us/step - loss: 0.0000e+00
Epoch 4/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 710us/step - loss: 0.0000e+00
Epoch 5/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 750us/step - loss: 0.0000e+00
Epoch 6/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 766us/step - loss: 0.0000e+00
Epoch 7/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 752us/step - loss: 0.0000e+00
Epoch 8/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 755us/step - loss: 0.0000e+00
Epoch 9/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 741us/step - loss: 0.0000e+00
Epoch 10/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

In [3]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,classification_report, confusion_matrix

# Construct the evaluation set
def build_triplet_data(data, num_triplets=1000):
    anchors, positives, negatives = [], [], []
    for _ in range(num_triplets):
        a, p = np.random.choice(len(data), 2, replace=False)
        n = np.random.choice([i for i in range(len(data)) if i != a])
        anchors.append(data.iloc[a].values)
        positives.append(data.iloc[p].values)
        negatives.append(data.iloc[n].values)
    return np.array(anchors), np.array(positives), np.array(negatives)

X_anchor_test, X_pos_test, X_neg_test = build_triplet_data(test_df)


#  save testing data
os.makedirs("test", exist_ok=True)
pd.DataFrame(X_anchor_test, columns=df.columns).to_csv("test/test_anchor_samples.csv", index=False)
pd.DataFrame(X_pos_test, columns=df.columns).to_csv("test/test_positive_samples.csv", index=False)
pd.DataFrame(X_neg_test, columns=df.columns).to_csv("test/test_negative_samples.csv", index=False)


# The model outputs the feature vector and calculates the similarity
y_pred_triplet = model.predict([X_anchor_test, X_pos_test, X_neg_test])
a_vec, p_vec, n_vec = y_pred_triplet[:, :32], y_pred_triplet[:, 32:64], y_pred_triplet[:, 64:]


cos_sim_ap = np.sum(a_vec * p_vec, axis=1)
cos_sim_an = np.sum(a_vec * n_vec, axis=1)

# Construct labels and scores
y_true = np.concatenate([np.ones_like(cos_sim_ap), np.zeros_like(cos_sim_an)])
y_scores = np.concatenate([cos_sim_ap, cos_sim_an])
y_pred_binary = (y_scores > 0.5).astype(int)

# Output evaluation indicators
acc = accuracy_score(y_true, y_pred_binary)
prec = precision_score(y_true, y_pred_binary)
rec = recall_score(y_true, y_pred_binary)
f1 = f1_score(y_true, y_pred_binary)
auc = roc_auc_score(y_true, y_scores)
cm = confusion_matrix(y_true, y_pred_binary)

print("\n=== score ===")
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1 Score:  {f1:.4f}")
print(f"AUC:       {auc:.4f}")
print(f"Confusion Matrix:\n{cm}")
print("\nClassification Report:\n", classification_report(y_true, y_pred_binary))



[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 

=== score ===
Accuracy:  0.9850
Precision: 1.0000
Recall:    0.9700
F1 Score:  0.9848
AUC:       1.0000
Confusion Matrix:
[[1000    0]
 [  30  970]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.97      1.00      0.99      1000
         1.0       1.00      0.97      0.98      1000

    accuracy                           0.98      2000
   macro avg       0.99      0.98      0.98      2000
weighted avg       0.99      0.98      0.98      2000



In [4]:
# Recommendation function 
def recommend(user_input_dict, top_k=3):
    user_df = df.iloc[:1].copy()
    for key, value in user_input_dict.items():
        if key in label_encoders:
            le = label_encoders[key]
            user_df[key] = le.transform([value])[0] if value in le.classes_ else 0
        else:
            user_df[key] = value
    user_df[numerical_cols] = scaler.transform(user_df[numerical_cols])
    user_vec = tower_model.predict(user_df.to_numpy())
    sims = tf.linalg.matmul(user_vec, agent_embeddings.T).numpy().flatten()
    top_k_idx = sims.argsort()[-top_k:][::-1]
    return names_df.iloc[top_k_idx].reset_index(drop=True)


In [5]:
df.columns

Index(['Experience_years', 'Charge', 'Visa type', 'Booking preference',
       'Location', 'Success rate', 'Language', 'Employment Type',
       'Google rating', 'Availability'],
      dtype='object')

In [6]:
# Example usage
example_input = {
    'Experience_years': 5,
    'Success rate': '80-90%',
    'Language': 'Chinese',
    'Booking preference': 'Online',
    'Charge':'201-500 AUD',
    'Visa type':'491',
    'Location':'VIC',
    'Employment Type':'Independent',
    'Google rating':'4.6',
    'Availability':'1 month'
}
top_agents = recommend(example_input)
print("Top 3 Recommended Agents:")
print(top_agents)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
Top 3 Recommended Agents:
         Full_name     MARN
0  Sadhav Aggarwal  1805183
1   Rajaneesh Shah  1387046
2   Bhawana Thakur  1684280
