## 4. Baseline + partner_pref

In [1]:
from scipy.io import arff
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [2]:
# Read the ARFF file: `data` holds the records, `meta` the attribute metadata.
data, meta = arff.loadarff("speeddating.arff")
df = pd.DataFrame(data)
# Preview the frame; nominal attributes appear as bytes (e.g. b'female', b'0').
df

Unnamed: 0,has_null,wave,gender,age,age_o,d_age,d_d_age,race,race_o,samerace,...,d_expected_num_interested_in_me,d_expected_num_matches,like,guess_prob_liked,d_like,d_guess_prob_liked,met,decision,decision_o,match
0,b'0',1.0,b'female',21.0,27.0,6.0,b'[4-6]',b'Asian/Pacific Islander/Asian-American',b'European/Caucasian-American',b'0',...,b'[0-3]',b'[3-5]',7.0,6.0,b'[6-8]',b'[5-6]',0.0,b'1',b'0',b'0'
1,b'0',1.0,b'female',21.0,22.0,1.0,b'[0-1]',b'Asian/Pacific Islander/Asian-American',b'European/Caucasian-American',b'0',...,b'[0-3]',b'[3-5]',7.0,5.0,b'[6-8]',b'[5-6]',1.0,b'1',b'0',b'0'
2,b'1',1.0,b'female',21.0,22.0,1.0,b'[0-1]',b'Asian/Pacific Islander/Asian-American',b'Asian/Pacific Islander/Asian-American',b'1',...,b'[0-3]',b'[3-5]',7.0,,b'[6-8]',b'[0-4]',1.0,b'1',b'1',b'1'
3,b'0',1.0,b'female',21.0,23.0,2.0,b'[2-3]',b'Asian/Pacific Islander/Asian-American',b'European/Caucasian-American',b'0',...,b'[0-3]',b'[3-5]',7.0,6.0,b'[6-8]',b'[5-6]',0.0,b'1',b'1',b'1'
4,b'0',1.0,b'female',21.0,24.0,3.0,b'[2-3]',b'Asian/Pacific Islander/Asian-American',b'Latino/Hispanic American',b'0',...,b'[0-3]',b'[3-5]',6.0,6.0,b'[6-8]',b'[5-6]',0.0,b'1',b'1',b'1'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8373,b'1',21.0,b'male',25.0,26.0,1.0,b'[0-1]',b'European/Caucasian-American',b'Latino/Hispanic American',b'0',...,b'[0-3]',b'[3-5]',2.0,5.0,b'[0-5]',b'[5-6]',0.0,b'0',b'1',b'0'
8374,b'1',21.0,b'male',25.0,24.0,1.0,b'[0-1]',b'European/Caucasian-American',b'Other',b'0',...,b'[0-3]',b'[3-5]',4.0,4.0,b'[0-5]',b'[0-4]',0.0,b'0',b'0',b'0'
8375,b'1',21.0,b'male',25.0,29.0,4.0,b'[4-6]',b'European/Caucasian-American',b'Latino/Hispanic American',b'0',...,b'[0-3]',b'[3-5]',6.0,5.0,b'[6-8]',b'[5-6]',0.0,b'0',b'0',b'0'
8376,b'1',21.0,b'male',25.0,22.0,3.0,b'[2-3]',b'European/Caucasian-American',b'Asian/Pacific Islander/Asian-American',b'0',...,b'[0-3]',b'[3-5]',5.0,5.0,b'[0-5]',b'[5-6]',0.0,b'0',b'1',b'0'


In [3]:
# List the attribute names reported by the ARFF metadata.
print(meta.names())

['has_null', 'wave', 'gender', 'age', 'age_o', 'd_age', 'd_d_age', 'race', 'race_o', 'samerace', 'importance_same_race', 'importance_same_religion', 'd_importance_same_race', 'd_importance_same_religion', 'field', 'pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence', 'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests', 'd_pref_o_attractive', 'd_pref_o_sincere', 'd_pref_o_intelligence', 'd_pref_o_funny', 'd_pref_o_ambitious', 'd_pref_o_shared_interests', 'attractive_o', 'sinsere_o', 'intelligence_o', 'funny_o', 'ambitous_o', 'shared_interests_o', 'd_attractive_o', 'd_sinsere_o', 'd_intelligence_o', 'd_funny_o', 'd_ambitous_o', 'd_shared_interests_o', 'attractive_important', 'sincere_important', 'intellicence_important', 'funny_important', 'ambtition_important', 'shared_interests_important', 'd_attractive_important', 'd_sincere_important', 'd_intellicence_important', 'd_funny_important', 'd_ambtition_important', 'd_shared_interests_important', 'attractive', 'sincere', '

In [4]:
def safe_to_int(x):
    """Convert a raw cell value to ``int``.

    Accepted inputs:

    * ``bytes`` -- decoded as UTF-8, then parsed (``b'1'`` -> ``1``).
    * ``str`` -- surrounding whitespace is stripped; if the text is the
      printed form of a bytes literal (``"b'1'"``) the ``b'...'`` wrapper is
      removed before parsing.
    * anything else -- passed straight to ``int()``.

    Raises whatever ``int()`` raises when the value cannot be parsed.
    """
    if isinstance(x, bytes):
        return int(x.decode("utf-8"))
    if not isinstance(x, str):
        return int(x)

    text = x.strip()
    wrapped_like_bytes = text.startswith("b'") and text.endswith("'")
    # Drop the leading b' and the trailing quote when present.
    return int(text[2:-1] if wrapped_like_bytes else text)

# Turn both decision columns (loaded as bytes such as b'1') into plain integers.
target_columns = ("decision", "decision_o")
for target in target_columns:
    df[target] = df[target].apply(safe_to_int)

# Sanity check: first rows and resulting dtypes of the converted columns.
converted = df[list(target_columns)]
print(converted.head())
print(converted.dtypes)

   decision  decision_o
0         1           0
1         1           0
2         1           1
3         1           1
4         1           1
decision      int64
decision_o    int64
dtype: object


In [5]:
# Baseline predictors for `decision`: the six *_partner rating columns.
features_decision_base = [
    "attractive_partner", "sincere_partner", "intelligence_partner",
    "funny_partner", "ambition_partner", "shared_interests_partner",
]

# Baseline predictors for `decision_o`: the six *_o rating columns.
# "sinsere_o" and "ambitous_o" are spelled exactly as in the dataset header.
features_decision_o_base = [
    "attractive_o", "sinsere_o", "intelligence_o",
    "funny_o", "ambitous_o", "shared_interests_o",
]

# Extra predictors added on top of each baseline: the six pref_o_* columns.
partner_pref = [
    "pref_o_attractive", "pref_o_sincere", "pref_o_intelligence",
    "pref_o_funny", "pref_o_ambitious", "pref_o_shared_interests",
]

In [6]:
# Extended feature sets: each baseline list followed by the `partner_pref` columns.
features_decision_plus   = features_decision_base + partner_pref
features_decision_o_plus = features_decision_o_base + partner_pref

# Labels name the group that was actually added (partner_pref); the previous
# "+self" wording did not match the columns being appended.
print("\nDecision baseline features:", features_decision_base)
print("Decision +partner_pref features:", features_decision_plus)
print("\nDecision_o baseline features:", features_decision_o_base)
print("Decision_o +partner_pref features:", features_decision_o_plus)


Decision baseline features: ['attractive_partner', 'sincere_partner', 'intelligence_partner', 'funny_partner', 'ambition_partner', 'shared_interests_partner']
Decision +self features: ['attractive_partner', 'sincere_partner', 'intelligence_partner', 'funny_partner', 'ambition_partner', 'shared_interests_partner', 'pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence', 'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests']

Decision_o baseline features: ['attractive_o', 'sinsere_o', 'intelligence_o', 'funny_o', 'ambitous_o', 'shared_interests_o']
Decision_o +self features: ['attractive_o', 'sinsere_o', 'intelligence_o', 'funny_o', 'ambitous_o', 'shared_interests_o', 'pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence', 'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests']


In [7]:
def run_models(df, feature_list, target_col, label):
    """Train and report a logistic regression and a random forest.

    Rows with a missing value in any selected feature or in the target are
    dropped. The remaining rows are split 70/30 (stratified on the target,
    ``random_state=42``) and both models are scored on the held-out part.

    Parameters
    ----------
    df : DataFrame containing the feature and target columns.
    feature_list : list of column names used as predictors.
    target_col : name of the target column.
    label : heading printed above the results.

    Returns
    -------
    None. Accuracy, F1 and ROC-AUC are printed for each model.
    """
    complete_rows = df[feature_list + [target_col]].dropna()
    features = complete_rows[feature_list].astype(float)
    target = complete_rows[target_col]

    print(f"\n===== {label} =====")
    print("Data size after cleaning:", complete_rows.shape[0])

    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=0.3, random_state=42, stratify=target
    )

    # (printed heading, estimator) pairs, evaluated in this order.
    candidates = (
        (
            "\n-- Logistic Regression --",
            Pipeline([
                ("scaler", StandardScaler()),
                ("clf", LogisticRegression(max_iter=1000)),
            ]),
        ),
        (
            "\n-- Random Forest --",
            RandomForestClassifier(
                n_estimators=300,
                max_depth=None,
                random_state=42,
                n_jobs=-1,
            ),
        ),
    )

    for heading, model in candidates:
        model.fit(features_train, target_train)
        predicted = model.predict(features_test)
        # Second column of predict_proba is used as the score for ROC-AUC.
        scores = model.predict_proba(features_test)[:, 1]

        print(heading)
        print("Accuracy:", accuracy_score(target_test, predicted))
        print("F1:",       f1_score(target_test, predicted))
        print("ROC-AUC:",  roc_auc_score(target_test, scores))


In [8]:
# Four runs: each target is evaluated with its baseline feature set and with
# the baseline extended by the partner_pref columns.

# 1) decision, baseline
run_models(df, features_decision_base, "decision", "decision (baseline)")

# 2) decision, baseline + partner_pref
run_models(df, features_decision_plus, "decision", "decision (+ partner_preferences)")

# 3) decision_o, baseline
run_models(df, features_decision_o_base, "decision_o", "decision_o (baseline)")

# 4) decision_o, baseline + partner_pref
run_models(df, features_decision_o_plus, "decision_o", "decision_o (+ partner_preferences)")



===== decision (baseline) =====
Data size after cleaning: 7040

-- Logistic Regression --
Accuracy: 0.7476325757575758
F1: 0.6993795826283136
ROC-AUC: 0.8253947725596399

-- Random Forest --
Accuracy: 0.7135416666666666
F1: 0.658770445572476
ROC-AUC: 0.7848353594836356

===== decision (+ partner_preferences) =====
Data size after cleaning: 6935

-- Logistic Regression --
Accuracy: 0.7462758289283998
F1: 0.7013574660633484
ROC-AUC: 0.8302198837461255

-- Random Forest --
Accuracy: 0.7424315233061028
F1: 0.6908881199538639
ROC-AUC: 0.8207843839204123

===== decision_o (baseline) =====
Data size after cleaning: 7031

-- Logistic Regression --
Accuracy: 0.7568720379146919
F1: 0.7086882453151618
ROC-AUC: 0.8340173708606423

-- Random Forest --
Accuracy: 0.7298578199052133
F1: 0.6840354767184036
ROC-AUC: 0.7969209613939168

===== decision_o (+ parner_preferences) =====
Data size after cleaning: 6924

-- Logistic Regression --
Accuracy: 0.7343599615014437
F1: 0.6860068259385665
ROC-AUC: 0.80