## 7.Complete model

In [1]:
from scipy.io import arff
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_curve, f1_score, accuracy_score, roc_auc_score, recall_score, precision_score, classification_report
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

In [2]:
data, meta = arff.loadarff("speeddating.arff")
df = pd.DataFrame(data)

def safe_to_int(x):
    if isinstance(x, bytes):
        return int(x.decode("utf-8"))
    elif isinstance(x, str):
        s = x.strip()
        if s.startswith("b'") and s.endswith("'"):
            s = s[2:-1]      
        return int(s)
    else:
        return int(x)

for col in ["decision", "decision_o"]:
    df[col] = df[col].map(safe_to_int)

print(df[["decision", "decision_o"]].head())
print(df[["decision", "decision_o"]].dtypes)

   decision  decision_o
0         1           0
1         1           0
2         1           1
3         1           1
4         1           1
decision      int64
decision_o    int64
dtype: object


In [3]:
features_decision_base = [
    "attractive_partner",
    "sincere_partner",
    "intelligence_partner",
    "funny_partner",
    "ambition_partner",
    "shared_interests_partner",
]

self_ratings = [
    "attractive",            # rate yourself
    "sincere",
    "intelligence",
    "funny",
    "ambition",
]

self_prefs = [
    "attractive_important",  # what do you look for in a partner
    "sincere_important",
    "intellicence_important",  
    "funny_important",
    "ambtition_important",     
    "shared_interests_important",
]

more_features = [
    "interests_correlate",
    "d_age",
    "samerace",
    "importance_same_race",
    "expected_happy_with_sd_people",
    "expected_num_interested_in_me",
    "expected_num_matches",
    "like",
    "guess_prob_liked",
    "met",
]

In [4]:
features_decision_complete = features_decision_base   + self_ratings + self_prefs + more_features # complete model
features_decision_basic = features_decision_base   + self_ratings + self_prefs  ### basic model
print("\nDecision baseline features:", features_decision_base)
print("\nDecision basic features:", features_decision_basic)
print("Decision complete features:", features_decision_complete)


Decision baseline features: ['attractive_partner', 'sincere_partner', 'intelligence_partner', 'funny_partner', 'ambition_partner', 'shared_interests_partner']

Decision basic features: ['attractive_partner', 'sincere_partner', 'intelligence_partner', 'funny_partner', 'ambition_partner', 'shared_interests_partner', 'attractive', 'sincere', 'intelligence', 'funny', 'ambition', 'attractive_important', 'sincere_important', 'intellicence_important', 'funny_important', 'ambtition_important', 'shared_interests_important']
Decision complete features: ['attractive_partner', 'sincere_partner', 'intelligence_partner', 'funny_partner', 'ambition_partner', 'shared_interests_partner', 'attractive', 'sincere', 'intelligence', 'funny', 'ambition', 'attractive_important', 'sincere_important', 'intellicence_important', 'funny_important', 'ambtition_important', 'shared_interests_important', 'interests_correlate', 'd_age', 'samerace', 'importance_same_race', 'expected_happy_with_sd_people', 'expected_num

In [5]:
def run_models(df, feature_list, target_col, label, rf_threshold=0.5):
    cols = feature_list + [target_col]
    sub = df[cols].dropna()  
    
    X = sub[feature_list].astype(float)
    y = sub[target_col]

    print(f"\n===== {label} =====")
    print("Data size after cleaning:", sub.shape[0])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # Logistic Regression
    log_reg = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=1000))
    ])
    log_reg.fit(X_train, y_train)
    y_pred_lr = log_reg.predict(X_test)
    y_prob_lr = log_reg.predict_proba(X_test)[:, 1]

    print("\n-- Logistic Regression --")
    print("Accuracy:", accuracy_score(y_test, y_pred_lr))
    print("F1:",       f1_score(y_test, y_pred_lr))
    print("ROC-AUC:",  roc_auc_score(y_test, y_prob_lr))
    # Random Forest
    rf = RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        random_state=42,
        n_jobs=-1
    )
    rf.fit(X_train, y_train)
    y_prob_rf = rf.predict_proba(X_test)[:, 1]
    y_pred_rf = (y_prob_rf >= rf_threshold).astype(int)

    print(f"\n-- Random Forest (thr={rf_threshold}) --")
    print("Accuracy:", accuracy_score(y_test, y_pred_rf))
    print("F1:",       f1_score(y_test, y_pred_rf))
    print("ROC-AUC:",  roc_auc_score(y_test, y_prob_rf))


In [6]:
# 1) decision, basic 
run_models(df, features_decision_basic, "decision", "decision (basic)")

# 2) decision, complete 
run_models(df, features_decision_complete, "decision", "decision (+ self rating & preferences and more features)")



===== decision (basic) =====
Data size after cleaning: 6906

-- Logistic Regression --
Accuracy: 0.7495173745173745
F1: 0.7079347214406303
ROC-AUC: 0.8215033960974544

-- Random Forest (thr=0.5) --
Accuracy: 0.7852316602316602
F1: 0.7478753541076487
ROC-AUC: 0.8752452737050449

===== decision (+ self rating & preferences and more features) =====
Data size after cleaning: 1237

-- Logistic Regression --
Accuracy: 0.7795698924731183
F1: 0.7320261437908496
ROC-AUC: 0.8642643540669857

-- Random Forest (thr=0.5) --
Accuracy: 0.8494623655913979
F1: 0.8041958041958042
ROC-AUC: 0.9210077751196173


In [7]:
def show_feature_importance(df, feature_list, target_col, label):
    print(f"\n===== Feature Importance for {label} =====")

    sub = df[feature_list + [target_col]].dropna()
    X = sub[feature_list].astype(float)
    y = sub[target_col]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # ---------- Logistic Regression ----------
    lr = LogisticRegression(max_iter=1000)
    lr.fit(X_scaled, y)
    lr_coef = lr.coef_[0]

    print("\n-- Logistic Regression Coefficients (standardized) --")
    lr_importance = pd.DataFrame({
        "feature": feature_list,
        "importance": lr_coef
    }).sort_values("importance", ascending=False)
    print(lr_importance)

    # ---------- Random Forest ----------
    rf = RandomForestClassifier(n_estimators=300, random_state=42)
    rf.fit(X, y)
    rf_imp = rf.feature_importances_

    print("\n-- Random Forest Feature Importances --")
    rf_importance = pd.DataFrame({
        "feature": feature_list,
        "importance": rf_imp
    }).sort_values("importance", ascending=False)
    print(rf_importance)


In [8]:
show_feature_importance(df, features_decision_base, "decision", "decision_basic")


===== Feature Importance for decision_basic =====

-- Logistic Regression Coefficients (standardized) --
                    feature  importance
0        attractive_partner    1.067682
5  shared_interests_partner    0.576657
3             funny_partner    0.517945
2      intelligence_partner    0.043375
1           sincere_partner   -0.193530
4          ambition_partner   -0.299748

-- Random Forest Feature Importances --
                    feature  importance
0        attractive_partner    0.258599
5  shared_interests_partner    0.183196
3             funny_partner    0.166673
4          ambition_partner    0.138926
1           sincere_partner    0.135570
2      intelligence_partner    0.117035


In [9]:
show_feature_importance(df, features_decision_complete, "decision", "decision_complete")


===== Feature Importance for decision_complete =====

-- Logistic Regression Coefficients (standardized) --
                          feature  importance
24                           like    1.602445
0              attractive_partner    1.014890
16     shared_interests_important    0.691443
14                funny_important    0.617105
12              sincere_important    0.556226
25               guess_prob_liked    0.533873
11           attractive_important    0.387515
23           expected_num_matches    0.304149
13         intellicence_important    0.302411
3                   funny_partner    0.239474
9                           funny    0.200255
5        shared_interests_partner    0.103269
21  expected_happy_with_sd_people    0.097377
8                    intelligence    0.085144
15            ambtition_important    0.070322
19                       samerace    0.054567
22  expected_num_interested_in_me    0.017131
10                       ambition    0.009848
18               

In [10]:
def compute_vif(df, feature_list):
    """
    df: your dataframe (after cleaning)
    feature_list: list of feature names you want to test
    """
    # Subset
    X = df[feature_list].dropna().astype(float)

    # Standardize to improve numerical stability
    X_scaled = StandardScaler().fit_transform(X)

    # Compute VIF
    vif_data = pd.DataFrame()
    vif_data["feature"] = feature_list
    vif_data["VIF"] = [
        variance_inflation_factor(X_scaled, i)
        for i in range(X_scaled.shape[1])
    ]

    return vif_data

In [11]:
vif_full = compute_vif(df, features_decision_complete)
print(vif_full)

                          feature        VIF
0              attractive_partner   2.011872
1                 sincere_partner   1.985822
2            intelligence_partner   2.320038
3                   funny_partner   2.509931
4                ambition_partner   1.787224
5        shared_interests_partner   2.483558
6                      attractive   2.062769
7                         sincere   1.895553
8                    intelligence   1.997377
9                           funny   1.577636
10                       ambition   2.095113
11           attractive_important  12.356225
12              sincere_important   5.492183
13         intellicence_important   4.998042
14                funny_important   4.192010
15            ambtition_important   3.249428
16     shared_interests_important   4.626467
17            interests_correlate   1.106615
18                          d_age   1.063018
19                       samerace   1.091931
20           importance_same_race   1.115281
21  expect

In [12]:
def run_pca(df, feature_list, n_components=2):

    X = df[feature_list].dropna().astype(float)

    valid_index = X.index

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X_scaled)

    print("\n=== PCA explained variance ratio ===")
    print(pca.explained_variance_ratio_)
    print("Total explained variance:", pca.explained_variance_ratio_.sum())

    loadings = pd.DataFrame(
        pca.components_.T,
        columns=[f"PC{i+1}" for i in range(n_components)],
        index=feature_list
    )
    print("\n=== PCA Loadings ===")
    print(loadings)
    
    df["pca_trait1"] = np.nan
    df["pca_trait2"] = np.nan

    df.loc[valid_index, "pca_trait1"] = X_pca[:, 0]
    if n_components > 1:
        df.loc[valid_index, "pca_trait2"] = X_pca[:, 1]

    print("\nPCA traits added to df: ['pca_trait1', 'pca_trait2']")

    return ["pca_trait1", "pca_trait2"], loadings


In [13]:
pca_traits, pca_loadings = run_pca(df, features_decision_complete)


=== PCA explained variance ratio ===
[0.1867502  0.10392325]
Total explained variance: 0.2906734463558368

=== PCA Loadings ===
                                    PC1       PC2
attractive_partner             0.259216 -0.193001
sincere_partner                0.274071 -0.187203
intelligence_partner           0.280089 -0.171791
funny_partner                  0.311579 -0.234857
ambition_partner               0.256596 -0.168489
shared_interests_partner       0.323788 -0.167259
attractive                     0.213395  0.357518
sincere                        0.098687 -0.120866
intelligence                   0.227721  0.274224
funny                          0.162817  0.229196
ambition                       0.215696  0.233719
attractive_important           0.054811  0.308683
sincere_important             -0.040668 -0.242554
intellicence_important        -0.114116 -0.174385
funny_important                0.029485  0.062049
ambtition_important            0.049202  0.019457
shared_interests_impo

In [14]:
def get_xy(df, feature_list, target_col):
    sub = df[feature_list + [target_col]].dropna()
    X = sub[feature_list].astype(float)
    y = sub[target_col].astype(int)
    return X, y

In [15]:
X_dec, y_dec = get_xy(df, features_decision_complete, "decision")

param_dist = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestClassifier(random_state=42)

rf_search = RandomizedSearchCV(
    rf, param_dist, 
    n_iter=20,
    scoring='f1',
    cv=3,
    n_jobs=-1
)

rf_search.fit(X_dec, y_dec)
print("Best params:", rf_search.best_params_)
print("Best F1:", rf_search.best_score_)


Best params: {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 5}
Best F1: 0.6982241326350088


In [16]:
def run_models_newcomplete(df, feature_list, target_col, label):
    cols = feature_list + [target_col]
    sub = df[cols].dropna()  
    
    X = sub[feature_list].astype(float)
    y = sub[target_col]

    print(f"\n===== {label} =====")
    print("Data size after cleaning:", sub.shape[0])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )
    # Random Forest
    rf = RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        min_samples_split=10,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1
    )
    rf.fit(X_train, y_train)
    y_pred_rf = rf.predict(X_test)
    y_prob_rf = rf.predict_proba(X_test)[:, 1]

    print("\n-- Random Forest (tuned) --")
    print("Accuracy:", accuracy_score(y_test, y_pred_rf))
    print("F1:",       f1_score(y_test, y_pred_rf))
    print("ROC-AUC:",  roc_auc_score(y_test, y_prob_rf))


In [17]:
run_models(df, features_decision_complete,  "decision",   "decision + self & prefs + more features (before tuned RF)")
run_models_newcomplete(df, features_decision_complete,  "decision",   "decision + self & prefs + more features (tuned RF)")


===== decision + self & prefs + more features (before tuned RF) =====
Data size after cleaning: 1237

-- Logistic Regression --
Accuracy: 0.7795698924731183
F1: 0.7320261437908496
ROC-AUC: 0.8642643540669857

-- Random Forest (thr=0.5) --
Accuracy: 0.8494623655913979
F1: 0.8041958041958042
ROC-AUC: 0.9210077751196173

===== decision + self & prefs + more features (tuned RF) =====
Data size after cleaning: 1237

-- Random Forest (tuned) --
Accuracy: 0.8279569892473119
F1: 0.7762237762237763
ROC-AUC: 0.9115430622009569


In [18]:
def get_train_test(df, feature_list, target_col, test_size=0.3):
    sub = df[feature_list + [target_col]].dropna()
    X = sub[feature_list].astype(float)
    y = sub[target_col].astype(int)

    return train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )


In [19]:
features = features_decision_complete
X_train, X_test, y_train, y_test = get_train_test(df, features, "decision")


In [20]:
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [21]:
y_prob_rf = rf.predict_proba(X_test)[:, 1]

In [22]:
thresholds = np.linspace(0.1, 0.9, 17)  # 0.1, 0.15, ..., 0.9

records = []

for thr in thresholds:
    y_pred_thr = (y_prob_rf >= thr).astype(int)
    
    acc = accuracy_score(y_test, y_pred_thr)
    prec = precision_score(y_test, y_pred_thr, zero_division=0)
    rec = recall_score(y_test, y_pred_thr, zero_division=0)
    f1 = f1_score(y_test, y_pred_thr, zero_division=0)
    
    records.append({
        "threshold": thr,
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "F1": f1
    })

thr_df = pd.DataFrame(records)
thr_df_sorted = thr_df.sort_values("F1", ascending=False)

thr_df_sorted


Unnamed: 0,threshold,accuracy,precision,recall,F1
7,0.45,0.849462,0.807692,0.828947,0.818182
6,0.4,0.830645,0.754286,0.868421,0.807339
5,0.35,0.819892,0.722513,0.907895,0.804665
8,0.5,0.849462,0.858209,0.756579,0.804196
9,0.55,0.849462,0.880952,0.730263,0.798561
4,0.3,0.784946,0.671429,0.927632,0.779006
3,0.25,0.763441,0.640351,0.960526,0.768421
2,0.2,0.739247,0.614108,0.973684,0.753181
10,0.6,0.822581,0.877193,0.657895,0.75188
11,0.65,0.80914,0.90099,0.598684,0.719368


In [23]:
run_models(df, features_decision_complete, "decision",
           "decision + self & prefs + more features (thr=0.5)")
run_models(df, features_decision_complete, "decision",
           "decision + self & prefs + more features (thr=0.45)",
           rf_threshold=0.45)



===== decision + self & prefs + more features (thr=0.5) =====
Data size after cleaning: 1237

-- Logistic Regression --
Accuracy: 0.7795698924731183
F1: 0.7320261437908496
ROC-AUC: 0.8642643540669857

-- Random Forest (thr=0.5) --
Accuracy: 0.8494623655913979
F1: 0.8041958041958042
ROC-AUC: 0.9210077751196173

===== decision + self & prefs + more features (thr=0.45) =====
Data size after cleaning: 1237

-- Logistic Regression --
Accuracy: 0.7795698924731183
F1: 0.7320261437908496
ROC-AUC: 0.8642643540669857

-- Random Forest (thr=0.45) --
Accuracy: 0.8440860215053764
F1: 0.8129032258064516
ROC-AUC: 0.9210077751196173
