## 6. Basic (baseline + self_rating + self_pref)

In [1]:
from scipy.io import arff
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score


In [2]:
# Read the ARFF file (path is relative to the notebook's working directory).
# loadarff hands back two objects: the records and a separate metadata object.
data, meta = arff.loadarff("speeddating.arff")
# Wrap the loaded records in a DataFrame; the rest of the notebook works on `df`.
df = pd.DataFrame(data)

def safe_to_int(x):
    """Convert a raw cell value to ``int``.

    Accepted inputs:
      * ``bytes``  -- decoded as UTF-8, then parsed as an integer;
      * ``str``    -- surrounding whitespace is stripped and, if the text looks
                      like the repr of a bytes literal (``b'...'``), the
                      ``b'`` prefix and closing quote are removed before parsing;
      * anything else is passed straight to ``int()``.

    Raises whatever ``int()`` raises when the value cannot be parsed
    (e.g. ``ValueError`` for non-numeric text).
    """
    if isinstance(x, bytes):
        return int(x.decode("utf-8"))

    if not isinstance(x, str):
        return int(x)

    token = x.strip()
    # Text such as "b'1'" is a stringified bytes literal: keep only the payload.
    looks_like_bytes_repr = token[:2] == "b'" and token[-1:] == "'"
    if looks_like_bytes_repr:
        token = token[2:-1]
    return int(token)

# Turn both target columns into plain integers, one column at a time.
target_columns = ["decision", "decision_o"]
for column in target_columns:
    df[column] = df[column].map(safe_to_int)

# Sanity check: first rows and resulting dtypes of the converted targets.
target_preview = df[target_columns]
print(target_preview.head())
print(target_preview.dtypes)

   decision  decision_o
0         1           0
1         1           0
2         1           1
3         1           1
4         1           1
decision      int64
decision_o    int64
dtype: object


In [3]:
# NOTE: every string below is used verbatim as a DataFrame column key, so the
# unusual spellings (e.g. "sinsere_o", "ambitous_o", "intellicence_important",
# "ambtition_important") must be kept exactly as written -- do not "correct" them.

# Baseline features for the `decision` target.
# Presumably the participant's ratings of the partner -- confirm against the
# dataset description.
features_decision_base = [
    "attractive_partner",
    "sincere_partner",
    "intelligence_partner",
    "funny_partner",
    "ambition_partner",
    "shared_interests_partner",
]

# Baseline features for the `decision_o` target.
# Presumably the partner's ratings of the participant -- confirm against the
# dataset description.
features_decision_o_base = [
    "attractive_o",
    "sinsere_o",             # [sic] column key, keep spelling
    "intelligence_o",
    "funny_o",
    "ambitous_o",            # [sic] column key, keep spelling
    "shared_interests_o",
]

# Step 2: extra self-reported features that are appended to the baseline lists.
self_ratings = [
    "attractive",            # rate yourself
    "sincere",
    "intelligence",
    "funny",
    "ambition",
]

self_prefs = [
    "attractive_important",  # what do you look for in a partner
    "sincere_important",
    "intellicence_important",  # [sic] column key, keep spelling
    "funny_important",
    "ambtition_important",     # [sic] column key, keep spelling
    "shared_interests_important",
]



In [4]:
# Extended feature sets: each baseline list followed by the self ratings and
# then the stated preferences (order matters for the printed listing).
features_decision_plus = [*features_decision_base, *self_ratings, *self_prefs]
features_decision_o_plus = [*features_decision_o_base, *self_ratings, *self_prefs]

# Echo all four feature sets so the exact columns used are visible in the output.
for caption, columns in (
    ("\nDecision baseline features:", features_decision_base),
    ("Decision +self features:", features_decision_plus),
    ("\nDecision_o baseline features:", features_decision_o_base),
    ("Decision_o +self features:", features_decision_o_plus),
):
    print(caption, columns)



Decision baseline features: ['attractive_partner', 'sincere_partner', 'intelligence_partner', 'funny_partner', 'ambition_partner', 'shared_interests_partner']
Decision +self features: ['attractive_partner', 'sincere_partner', 'intelligence_partner', 'funny_partner', 'ambition_partner', 'shared_interests_partner', 'attractive', 'sincere', 'intelligence', 'funny', 'ambition', 'attractive_important', 'sincere_important', 'intellicence_important', 'funny_important', 'ambtition_important', 'shared_interests_important']

Decision_o baseline features: ['attractive_o', 'sinsere_o', 'intelligence_o', 'funny_o', 'ambitous_o', 'shared_interests_o']
Decision_o +self features: ['attractive_o', 'sinsere_o', 'intelligence_o', 'funny_o', 'ambitous_o', 'shared_interests_o', 'attractive', 'sincere', 'intelligence', 'funny', 'ambition', 'attractive_important', 'sincere_important', 'intellicence_important', 'funny_important', 'ambtition_important', 'shared_interests_important']


In [5]:
def _report_metrics(title, model, X_test, y_test):
    """Print accuracy, F1 and ROC-AUC of an already fitted classifier on the test split."""
    y_pred = model.predict(X_test)
    # ROC-AUC is computed from the score in column 1 of predict_proba.
    y_prob = model.predict_proba(X_test)[:, 1]

    print(f"\n-- {title} --")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("F1:",       f1_score(y_test, y_pred))
    print("ROC-AUC:",  roc_auc_score(y_test, y_prob))


def run_models(df, feature_list, target_col, label, test_size=0.3, random_state=42):
    """Fit and evaluate a logistic regression and a random forest on one feature set.

    Rows with a missing value in any selected feature or in the target are
    dropped, the remainder is split into stratified train/test parts, and both
    models are trained on the train part. Metrics are printed, nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; must contain every column in ``feature_list`` and ``target_col``.
    feature_list : sequence of str
        Column names used as model inputs (cast to float).
    target_col : str
        Name of the label column.
    label : str
        Heading printed above the results of this run.
    test_size : float, default 0.3
        Fraction of the cleaned rows held out for testing.
    random_state : int, default 42
        Seed used for both the train/test split and the random forest.

    Notes
    -----
    Because rows are dropped per feature set, runs with different feature lists
    may be evaluated on different subsets of ``df``.

    NOTE(review): the split is row-wise and random. If several rows belong to
    the same participant (not visible from this function), participant-level
    features could leak between train and test -- confirm before comparing runs.
    """
    feature_list = list(feature_list)  # accept any sequence, not only lists
    sub = df[feature_list + [target_col]].dropna()

    X = sub[feature_list].astype(float)
    y = sub[target_col]

    print(f"\n===== {label} =====")
    print("Data size after cleaning:", sub.shape[0])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Logistic Regression (features are standardised inside the pipeline, so
    # the scaler is fitted on the training part only)
    log_reg = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=1000))
    ])
    log_reg.fit(X_train, y_train)
    _report_metrics("Logistic Regression", log_reg, X_test, y_test)

    # Random Forest (unscaled features, unlimited depth)
    rf = RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        random_state=random_state,
        n_jobs=-1
    )
    rf.fit(X_train, y_train)
    _report_metrics("Random Forest", rf, X_test, y_test)


In [6]:
# Four runs, in this order:
#   1) decision,   baseline features
#   2) decision,   baseline + self ratings & preferences
#   3) decision_o, baseline features
#   4) decision_o, baseline + self ratings & preferences
experiments = [
    (features_decision_base, "decision", "decision (baseline)"),
    (features_decision_plus, "decision", "decision (+ self rating & preferences)"),
    (features_decision_o_base, "decision_o", "decision_o (baseline)"),
    (features_decision_o_plus, "decision_o", "decision_o (+ self rating & preferences)"),
]

for experiment_features, experiment_target, experiment_label in experiments:
    run_models(df, experiment_features, experiment_target, experiment_label)



===== decision (baseline) =====
Data size after cleaning: 7040

-- Logistic Regression --
Accuracy: 0.7476325757575758
F1: 0.6993795826283136
ROC-AUC: 0.8253947725596399

-- Random Forest --
Accuracy: 0.7135416666666666
F1: 0.658770445572476
ROC-AUC: 0.7848353594836356

===== decision (+ self rating & preferences) =====
Data size after cleaning: 6906

-- Logistic Regression --
Accuracy: 0.7495173745173745
F1: 0.7079347214406303
ROC-AUC: 0.8215033960974544

-- Random Forest --
Accuracy: 0.7857142857142857
F1: 0.7480136208853575
ROC-AUC: 0.8752452737050451

===== decision_o (baseline) =====
Data size after cleaning: 7031

-- Logistic Regression --
Accuracy: 0.7568720379146919
F1: 0.7086882453151618
ROC-AUC: 0.8340173708606423

-- Random Forest --
Accuracy: 0.7298578199052133
F1: 0.6840354767184036
ROC-AUC: 0.7969209613939168

===== decision_o (+ self rating & preferences) =====
Data size after cleaning: 6915

-- Logistic Regression --
Accuracy: 0.7450602409638554
F1: 0.6933333333333334
