# Appendix

## 1. Baseline model

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report

In [2]:
from scipy.io import arff
import pandas as pd

# Read the speed-dating ARFF file. loadarff returns the records together with
# a metadata object describing the attributes.
data, meta = arff.loadarff("speeddating.arff")
# Wrap the records in a DataFrame. As the preview shows, nominal attributes
# arrive as bytes (e.g. b'female', b'0'), not str.
df = pd.DataFrame(data)

# Bare last expression: the notebook renders a preview of the frame.
df

Unnamed: 0,has_null,wave,gender,age,age_o,d_age,d_d_age,race,race_o,samerace,...,d_expected_num_interested_in_me,d_expected_num_matches,like,guess_prob_liked,d_like,d_guess_prob_liked,met,decision,decision_o,match
0,b'0',1.0,b'female',21.0,27.0,6.0,b'[4-6]',b'Asian/Pacific Islander/Asian-American',b'European/Caucasian-American',b'0',...,b'[0-3]',b'[3-5]',7.0,6.0,b'[6-8]',b'[5-6]',0.0,b'1',b'0',b'0'
1,b'0',1.0,b'female',21.0,22.0,1.0,b'[0-1]',b'Asian/Pacific Islander/Asian-American',b'European/Caucasian-American',b'0',...,b'[0-3]',b'[3-5]',7.0,5.0,b'[6-8]',b'[5-6]',1.0,b'1',b'0',b'0'
2,b'1',1.0,b'female',21.0,22.0,1.0,b'[0-1]',b'Asian/Pacific Islander/Asian-American',b'Asian/Pacific Islander/Asian-American',b'1',...,b'[0-3]',b'[3-5]',7.0,,b'[6-8]',b'[0-4]',1.0,b'1',b'1',b'1'
3,b'0',1.0,b'female',21.0,23.0,2.0,b'[2-3]',b'Asian/Pacific Islander/Asian-American',b'European/Caucasian-American',b'0',...,b'[0-3]',b'[3-5]',7.0,6.0,b'[6-8]',b'[5-6]',0.0,b'1',b'1',b'1'
4,b'0',1.0,b'female',21.0,24.0,3.0,b'[2-3]',b'Asian/Pacific Islander/Asian-American',b'Latino/Hispanic American',b'0',...,b'[0-3]',b'[3-5]',6.0,6.0,b'[6-8]',b'[5-6]',0.0,b'1',b'1',b'1'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8373,b'1',21.0,b'male',25.0,26.0,1.0,b'[0-1]',b'European/Caucasian-American',b'Latino/Hispanic American',b'0',...,b'[0-3]',b'[3-5]',2.0,5.0,b'[0-5]',b'[5-6]',0.0,b'0',b'1',b'0'
8374,b'1',21.0,b'male',25.0,24.0,1.0,b'[0-1]',b'European/Caucasian-American',b'Other',b'0',...,b'[0-3]',b'[3-5]',4.0,4.0,b'[0-5]',b'[0-4]',0.0,b'0',b'0',b'0'
8375,b'1',21.0,b'male',25.0,29.0,4.0,b'[4-6]',b'European/Caucasian-American',b'Latino/Hispanic American',b'0',...,b'[0-3]',b'[3-5]',6.0,5.0,b'[6-8]',b'[5-6]',0.0,b'0',b'0',b'0'
8376,b'1',21.0,b'male',25.0,22.0,3.0,b'[2-3]',b'European/Caucasian-American',b'Asian/Pacific Islander/Asian-American',b'0',...,b'[0-3]',b'[3-5]',5.0,5.0,b'[0-5]',b'[5-6]',0.0,b'0',b'1',b'0'


In [3]:
# List the attribute names reported by the ARFF metadata.
print(meta.names())

['has_null', 'wave', 'gender', 'age', 'age_o', 'd_age', 'd_d_age', 'race', 'race_o', 'samerace', 'importance_same_race', 'importance_same_religion', 'd_importance_same_race', 'd_importance_same_religion', 'field', 'pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence', 'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests', 'd_pref_o_attractive', 'd_pref_o_sincere', 'd_pref_o_intelligence', 'd_pref_o_funny', 'd_pref_o_ambitious', 'd_pref_o_shared_interests', 'attractive_o', 'sinsere_o', 'intelligence_o', 'funny_o', 'ambitous_o', 'shared_interests_o', 'd_attractive_o', 'd_sinsere_o', 'd_intelligence_o', 'd_funny_o', 'd_ambitous_o', 'd_shared_interests_o', 'attractive_important', 'sincere_important', 'intellicence_important', 'funny_important', 'ambtition_important', 'shared_interests_important', 'd_attractive_important', 'd_sincere_important', 'd_intellicence_important', 'd_funny_important', 'd_ambtition_important', 'd_shared_interests_important', 'attractive', 'sincere', '

In [4]:
# Normalise the column labels in one pass: trim surrounding whitespace first,
# then remove any carriage returns or line feeds left inside a label.
df.columns = (
    df.columns.str.strip()
    .str.replace('\r', '', regex=False)
    .str.replace('\n', '', regex=False)
)

def safe_to_int(x):
    """Convert a single cell value to ``int``.

    Handles three input shapes:

    * ``bytes`` -- decoded as UTF-8 and parsed.
    * ``str`` -- stripped of surrounding whitespace; if the text looks like
      the repr of a bytes literal (``b'...'``) the wrapper is removed before
      parsing.
    * anything else -- passed straight to ``int``.

    Raises whatever ``int`` raises when the value cannot be parsed
    (e.g. ``ValueError`` for non-numeric text).
    """
    if isinstance(x, bytes):
        return int(x.decode("utf-8"))
    if not isinstance(x, str):
        return int(x)

    text = x.strip()
    # A stringified bytes value looks like "b'1'"; keep only the payload.
    looks_like_bytes_repr = text.startswith("b'") and text.endswith("'")
    return int(text[2:-1] if looks_like_bytes_repr else text)

# Turn both decision columns into plain integers, then print the first rows
# and the resulting dtypes as a sanity check.
decision_cols = ["decision", "decision_o"]
for col in decision_cols:
    df[col] = df[col].map(safe_to_int)

decision_view = df[decision_cols]
print(decision_view.head())
print(decision_view.dtypes)

   decision  decision_o
0         1           0
1         1           0
2         1           1
3         1           1
4         1           1
decision      int64
decision_o    int64
dtype: object


In [5]:
# For each substring of interest, print a heading followed by every column
# label that contains it. This is a plain substring match, so "_o" also hits
# labels such as pref_o_attractive.
for heading, needle in (
    ("columns contain _o：", "_o"),
    ("columns contain partner：", "partner"),
):
    print(heading)
    print([label for label in df.columns if needle in label])

columns contain _o：
['age_o', 'race_o', 'pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence', 'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests', 'd_pref_o_attractive', 'd_pref_o_sincere', 'd_pref_o_intelligence', 'd_pref_o_funny', 'd_pref_o_ambitious', 'd_pref_o_shared_interests', 'attractive_o', 'sinsere_o', 'intelligence_o', 'funny_o', 'ambitous_o', 'shared_interests_o', 'd_attractive_o', 'd_sinsere_o', 'd_intelligence_o', 'd_funny_o', 'd_ambitous_o', 'd_shared_interests_o', 'decision_o']
columns contain partner：
['attractive_partner', 'sincere_partner', 'intelligence_partner', 'funny_partner', 'ambition_partner', 'shared_interests_partner', 'd_attractive_partner', 'd_sincere_partner', 'd_intelligence_partner', 'd_funny_partner', 'd_ambition_partner', 'd_shared_interests_partner']


In [6]:
# Feature columns used to model `decision`: the six "*_partner" columns.
# Presumably these hold the participant's ratings of their partner --
# TODO confirm against the dataset documentation.
features_decision = [
    "attractive_partner",
    "sincere_partner",
    "intelligence_partner",
    "funny_partner",
    "ambition_partner",
    "shared_interests_partner",
]

In [7]:
# ---------- decision ----------
# Baseline for `decision`: keep only the rows where every selected feature
# and the label are present.
cols_dec = [*features_decision, "decision"]
df_dec = df.loc[:, cols_dec].dropna()

print("Original data size", len(df), " After cleaning (decision) data size:", len(df_dec))

X_dec = df_dec.loc[:, features_decision].astype(float)
y_dec = df_dec["decision"]

# Stratified 70/30 hold-out split with a fixed seed.
split_dec = train_test_split(
    X_dec, y_dec, test_size=0.3, random_state=42, stratify=y_dec
)
X_train_dec, X_test_dec, y_train_dec, y_test_dec = split_dec

# Logistic Regression on standardised features
log_reg_pipe_dec = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=1000)),
])
log_reg_pipe_dec.fit(X_train_dec, y_train_dec)
y_pred_dec_lr = log_reg_pipe_dec.predict(X_test_dec)
# Column 1 of predict_proba is the probability of the second class (label 1).
y_prob_dec_lr = log_reg_pipe_dec.predict_proba(X_test_dec)[:, 1]

print("\n=== decision: Logistic Regression ===")
for metric_name, metric_value in (
    ("Accuracy:", accuracy_score(y_test_dec, y_pred_dec_lr)),
    ("F1:", f1_score(y_test_dec, y_pred_dec_lr)),
    ("ROC-AUC:", roc_auc_score(y_test_dec, y_prob_dec_lr)),
):
    print(metric_name, metric_value)
print(classification_report(y_test_dec, y_pred_dec_lr))

Original data size 8378  After cleaning (decision) data size: 7040

=== decision: Logistic Regression ===
Accuracy: 0.7476325757575758
F1: 0.6993795826283136
ROC-AUC: 0.8253947725596399
              precision    recall  f1-score   support

           0       0.77      0.80      0.78      1201
           1       0.72      0.68      0.70       911

    accuracy                           0.75      2112
   macro avg       0.74      0.74      0.74      2112
weighted avg       0.75      0.75      0.75      2112



In [8]:
# Random Forest
# 300 trees with no depth limit, fitted on the same split as the logistic
# regression; the features are passed in unscaled.
rf_dec = RandomForestClassifier(
    n_estimators=300, max_depth=None, random_state=42, n_jobs=-1
).fit(X_train_dec, y_train_dec)

y_pred_dec_rf = rf_dec.predict(X_test_dec)
# Column 1 of predict_proba is the probability of the second class (label 1).
y_prob_dec_rf = rf_dec.predict_proba(X_test_dec)[:, 1]

print("\n=== decision: Random Forest ===")
for metric_name, metric_value in (
    ("Accuracy:", accuracy_score(y_test_dec, y_pred_dec_rf)),
    ("F1:", f1_score(y_test_dec, y_pred_dec_rf)),
    ("ROC-AUC:", roc_auc_score(y_test_dec, y_prob_dec_rf)),
):
    print(metric_name, metric_value)
print(classification_report(y_test_dec, y_pred_dec_rf))



=== decision: Random Forest ===
Accuracy: 0.7135416666666666
F1: 0.658770445572476
ROC-AUC: 0.7848353594836356
              precision    recall  f1-score   support

           0       0.74      0.77      0.75      1201
           1       0.68      0.64      0.66       911

    accuracy                           0.71      2112
   macro avg       0.71      0.70      0.71      2112
weighted avg       0.71      0.71      0.71      2112



In [9]:
# Clean the column labels before any column is looked up by name.
df.columns = df.columns.str.strip()
df.columns = df.columns.str.replace('\r', '', regex=False)
df.columns = df.columns.str.replace('\n', '', regex=False)

# Safeguard for re-runs: coerce the decision columns only when they are not
# already integer-typed. Both the check and the conversion operate on `df`;
# no other frame exists in this notebook. safe_to_int is used instead of
# astype(str).astype(int) because str() of a bytes value yields "b'1'",
# which int() rejects.
for col in ["decision", "decision_o"]:
    if not pd.api.types.is_integer_dtype(df[col]):
        df[col] = df[col].map(safe_to_int)

# Show every column whose label contains "_o" (plain substring match).
print([c for c in df.columns if "_o" in c])

# Feature columns used to model `decision_o`. The spellings "sinsere_o" and
# "ambitous_o" are intentional: they match the dataset's own column labels.
features_decision_o = [
    "attractive_o",
    "sinsere_o",
    "intelligence_o",
    "funny_o",
    "ambitous_o",
    "shared_interests_o",
]

# Keep only the rows where every selected feature and the label are present.
cols_deco = features_decision_o + ["decision_o"]
df_deco = df[cols_deco].dropna()

print("\n Original data size:", df.shape[0], " After cleaning (decision_o) data size:", df_deco.shape[0])

X_deco = df_deco[features_decision_o].astype(float)
y_deco = df_deco["decision_o"]

['age_o', 'race_o', 'pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence', 'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests', 'd_pref_o_attractive', 'd_pref_o_sincere', 'd_pref_o_intelligence', 'd_pref_o_funny', 'd_pref_o_ambitious', 'd_pref_o_shared_interests', 'attractive_o', 'sinsere_o', 'intelligence_o', 'funny_o', 'ambitous_o', 'shared_interests_o', 'd_attractive_o', 'd_sinsere_o', 'd_intelligence_o', 'd_funny_o', 'd_ambitous_o', 'd_shared_interests_o', 'decision_o']

 Original data size: 8378  After cleaning (decision_o) data size: 7031


In [10]:
# Stratified 70/30 hold-out split with a fixed seed.
split_deco = train_test_split(
    X_deco, y_deco, test_size=0.3, random_state=42, stratify=y_deco
)
X_train_deco, X_test_deco, y_train_deco, y_test_deco = split_deco

# Logistic Regression on standardised features
log_reg_pipe_deco = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=1000)),
])
log_reg_pipe_deco.fit(X_train_deco, y_train_deco)
y_pred_deco_lr = log_reg_pipe_deco.predict(X_test_deco)
# Column 1 of predict_proba is the probability of the second class (label 1).
y_prob_deco_lr = log_reg_pipe_deco.predict_proba(X_test_deco)[:, 1]

print("\n=== decision_o: Logistic Regression ===")
for metric_name, metric_value in (
    ("Accuracy:", accuracy_score(y_test_deco, y_pred_deco_lr)),
    ("F1:", f1_score(y_test_deco, y_pred_deco_lr)),
    ("ROC-AUC:", roc_auc_score(y_test_deco, y_prob_deco_lr)),
):
    print(metric_name, metric_value)
print(classification_report(y_test_deco, y_pred_deco_lr))




=== decision_o: Logistic Regression ===
Accuracy: 0.7568720379146919
F1: 0.7086882453151618
ROC-AUC: 0.8340173708606423
              precision    recall  f1-score   support

           0       0.77      0.81      0.79      1199
           1       0.73      0.68      0.71       911

    accuracy                           0.76      2110
   macro avg       0.75      0.75      0.75      2110
weighted avg       0.76      0.76      0.76      2110



In [11]:
# Random Forest
# 300 trees with no depth limit, fitted on the same split as the logistic
# regression; the features are passed in unscaled.
rf_deco = RandomForestClassifier(
    n_estimators=300, max_depth=None, random_state=42, n_jobs=-1
).fit(X_train_deco, y_train_deco)

y_pred_deco_rf = rf_deco.predict(X_test_deco)
# Column 1 of predict_proba is the probability of the second class (label 1).
y_prob_deco_rf = rf_deco.predict_proba(X_test_deco)[:, 1]

print("\n=== decision_o: Random Forest ===")
for metric_name, metric_value in (
    ("Accuracy:", accuracy_score(y_test_deco, y_pred_deco_rf)),
    ("F1:", f1_score(y_test_deco, y_pred_deco_rf)),
    ("ROC-AUC:", roc_auc_score(y_test_deco, y_prob_deco_rf)),
):
    print(metric_name, metric_value)
print(classification_report(y_test_deco, y_pred_deco_rf))



=== decision_o: Random Forest ===
Accuracy: 0.7298578199052133
F1: 0.6840354767184036
ROC-AUC: 0.7969209613939168
              precision    recall  f1-score   support

           0       0.76      0.77      0.76      1199
           1       0.69      0.68      0.68       911

    accuracy                           0.73      2110
   macro avg       0.72      0.72      0.72      2110
weighted avg       0.73      0.73      0.73      2110

