In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.losses import CosineSimilarity
import os
import random

In [2]:
# Fix all random seeds to ensure consistent results
SEED = 42
# os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [3]:
# The field weights (total = 1.0) -- lauguage first
feature_weights_1 = {
        "Language": 0.25,
        "Google rating": 0.20,
        "Success rate": 0.15,
        "Charge": 0.10,
        "Visa type": 0.10,
        "Experience_years": 0.05,
        "Booking preference": 0.05,
        "Location": 0.03,
        "Availability": 0.03,
        "Employment Type": 0.04
    }

In [4]:
# The field weights (total = 1.0) --- price-sensitive users
feature_weights_2 = {
    "Charge": 0.25,
    "Success rate": 0.20,
    "Google rating": 0.15,
    "Language": 0.15,
    "Visa type": 0.08,
    "Experience_years": 0.05,
    "Booking preference": 0.05,
    "Employment Type": 0.03,
    "Location": 0.02,
    "Availability": 0.02,
    }

In [5]:
# The field weights (total = 1.0) -- Handle demands quickly
feature_weights_3 = {
    "Availability": 0.25,
    "Success rate": 0.20,
    "Google rating": 0.15,
    "Language": 0.10,
    "Visa type": 0.10,
    "Experience_years": 0.05,
    "Booking preference": 0.05,
    "Charge": 0.05,
    "Employment Type": 0.03,
    "Location": 0.02,
    }

In [6]:
# The field weights (total = 1.0) -- rating
feature_weights_4 = {
    "Google rating": 0.30,
    "Success rate": 0.25,
    "Language": 0.15,
    "Visa type": 0.10,
    "Charge": 0.05,
    "Experience_years": 0.05,
    "Booking preference": 0.03,
    "Employment Type": 0.03,
    "Location": 0.02,
    "Availability": 0.02,
    }

In [7]:
# The field weights (total = 1.0)-- more balance
feature_weights_5 = {
    "Language": 0.15,
    "Google rating": 0.15,
    "Success rate": 0.15,
    "Charge": 0.10,
    "Visa type": 0.10,
    "Experience_years": 0.10,
    "Booking preference": 0.08,
    "Employment Type": 0.07,
    "Location": 0.05,
    "Availability": 0.05,
    }

In [8]:
# Load and preprocess data
df = pd.read_csv("requirements_data_5.6.csv")
names_df = df[['Full_name', 'MARN']]
df = df.drop(columns=['Full_name', 'MARN'], errors='ignore').dropna()

categorical_cols = df.select_dtypes(include='object').columns.tolist()
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

scaler = MinMaxScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Split data

train_df, test_df = train_test_split(df, test_size=0.4, random_state=42)
input_dim = df.shape[1]

# Create  training data
def build_triplet_data(data, num_triplets=5000):
    anchors, positives, negatives = [], [], []

    feature_order = list(data.columns)
    weight_vector = np.array([feature_weights_5.get(col, 1.0) for col in feature_order], dtype=np.float32)

    for _ in range(num_triplets):
        a, p = np.random.choice(len(data), 2, replace=False)
        n = np.random.choice([i for i in range(len(data)) if i != a])

        anchor_vec = data.iloc[a].values.astype(np.float32)  * weight_vector
        pos_vec = data.iloc[p].values.astype(np.float32)  * weight_vector
        neg_vec = data.iloc[n].values.astype(np.float32) * weight_vector

        anchors.append(anchor_vec)
        positives.append(pos_vec)
        negatives.append(neg_vec)

    return np.array(anchors), np.array(positives), np.array(negatives)

X_anchor, X_positive, X_negative = build_triplet_data(train_df, num_triplets=5000)

# save training data
import os
os.makedirs("train", exist_ok=True)

pd.DataFrame(X_anchor, columns=df.columns).to_csv("train/train_anchor_samples.csv", index=False)
pd.DataFrame(X_positive, columns=df.columns).to_csv("train/train_positive_samples.csv", index=False)
pd.DataFrame(X_negative, columns=df.columns).to_csv("train/train_negative_samples.csv", index=False)


# Define embedding tower 
def build_tower(input_dim):
    inp = layers.Input(shape=(input_dim,))
    x = layers.Dense(128, activation='relu')(inp)
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dense(32)(x)
    x = layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1))(x)

    return inp, x

anchor_in, anchor_vec = build_tower(input_dim)
pos_in, pos_vec = build_tower(input_dim)
neg_in, neg_vec = build_tower(input_dim)



# Triplet loss 
def triplet_loss(y_true, y_pred):
    a, p, n = y_pred[:, :32], y_pred[:, 32:64], y_pred[:, 64:]
    pos_dist = tf.reduce_sum(tf.square(a - p), axis=1)
    neg_dist = tf.reduce_sum(tf.square(a - n), axis=1)
    return tf.reduce_mean(tf.maximum(pos_dist - neg_dist + 0.2, 0))

merged = layers.Concatenate()([anchor_vec, pos_vec, neg_vec])
model = Model(inputs=[anchor_in, pos_in, neg_in], outputs=merged)
model.compile(optimizer='adam', loss=triplet_loss)

# Train model
model.fit([X_anchor, X_positive, X_negative], np.zeros(len(X_anchor)), epochs=10, batch_size=64, verbose=1)

# Export tower model and generate agent embeddings 
tower_model = Model(inputs=anchor_in, outputs=anchor_vec)
agent_embeddings = tower_model.predict(df.to_numpy())



Epoch 1/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 658us/step - loss: 0.0223 
Epoch 2/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 744us/step - loss: 0.0000e+00
Epoch 3/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 769us/step - loss: 0.0000e+00
Epoch 4/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 760us/step - loss: 0.0000e+00
Epoch 5/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0000e+00
Epoch 6/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 712us/step - loss: 0.0000e+00
Epoch 7/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 744us/step - loss: 0.0000e+00
Epoch 8/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 716us/step - loss: 0.0000e+00
Epoch 9/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 709us/step - loss: 0.0000e+00
Epoch 10/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,classification_report, confusion_matrix
from datetime import datetime

#  save testing data
os.makedirs("test", exist_ok=True)
pd.DataFrame(X_anchor, columns=df.columns).to_csv("test/test_anchor_samples.csv", index=False)
pd.DataFrame(X_positive, columns=df.columns).to_csv("test/test_positive_samples.csv", index=False)
pd.DataFrame(X_negative, columns=df.columns).to_csv("test/test_negative_samples.csv", index=False)


# The model outputs the feature vector and calculates the similarity
y_pred_triplet = model.predict([X_anchor, X_positive, X_negative])
a_vec, p_vec, n_vec = y_pred_triplet[:, :32], y_pred_triplet[:, 32:64], y_pred_triplet[:, 64:]


cos_sim_ap = np.sum(a_vec * p_vec, axis=1)
cos_sim_an = np.sum(a_vec * n_vec, axis=1)

# Construct labels and scores
y_true = np.concatenate([np.ones_like(cos_sim_ap), np.zeros_like(cos_sim_an)])
y_scores = np.concatenate([cos_sim_ap, cos_sim_an])
y_pred_binary = (y_scores > 0.5).astype(int)

# Output evaluation indicators
acc = accuracy_score(y_true, y_pred_binary)
prec = precision_score(y_true, y_pred_binary)
rec = recall_score(y_true, y_pred_binary)
f1 = f1_score(y_true, y_pred_binary)
auc = roc_auc_score(y_true, y_scores)
cm = confusion_matrix(y_true, y_pred_binary)

print("\n=== score ===")
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1 Score:  {f1:.4f}")
# print(f"AUC:       {auc:.4f}")
print(f"Confusion Matrix:\n{cm}")
print("\nClassification Report:\n", classification_report(y_true, y_pred_binary))



now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
remark = "权重组合5"  


record = {
    "time": now,
    "accuracy": acc,
    "precision": prec,
    "recall": rec,
    "f1_score": f1,
    "remark": remark,
    "training": "60%",
    "testing": "40%",
    "weight" : "feature_weights_5"
}


csv_file = "evaluation_log.csv"


try:
    df = pd.read_csv(csv_file)
    df = pd.concat([df, pd.DataFrame([record])], ignore_index=True)
except FileNotFoundError:
    df = pd.DataFrame([record])


df.to_csv(csv_file, index=False)

print("saved evaluation_log.csv")




[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 484us/step

=== score ===
Accuracy:  0.9588
Precision: 1.0000
Recall:    0.9176
F1 Score:  0.9570
Confusion Matrix:
[[5000    0]
 [ 412 4588]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.92      1.00      0.96      5000
         1.0       1.00      0.92      0.96      5000

    accuracy                           0.96     10000
   macro avg       0.96      0.96      0.96     10000
weighted avg       0.96      0.96      0.96     10000

记录已保存到 evaluation_log.csv
