## 2. Feature importance

In [1]:
from scipy.io import arff
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [2]:
# Read the ARFF file (path is relative to the notebook's working directory).
# loadarff's result is unpacked into the records (`data`) and the file's
# metadata (`meta`).
data, meta = arff.loadarff("speeddating.arff")
# Wrap the loaded records in a DataFrame for the column-wise work below.
df = pd.DataFrame(data)

def safe_to_int(x):
    """Coerce a raw cell value to ``int``.

    Handles three input shapes:

    * ``bytes``  -- decoded as UTF-8, then parsed.
    * ``str``    -- surrounding whitespace is stripped; if what remains looks
      like the textual form of a bytes literal (``b'...'``), the ``b'`` prefix
      and closing quote are removed before parsing.
    * anything else -- handed straight to ``int()``.

    Raises whatever ``int()`` raises when the value cannot be parsed.
    """
    if isinstance(x, bytes):
        return int(x.decode("utf-8"))

    if not isinstance(x, str):
        return int(x)

    text = x.strip()
    # A stringified bytes value such as "b'1'" keeps its literal wrapper;
    # peel it off so only the payload reaches int().
    is_bytes_repr = text.startswith("b'") and text.endswith("'")
    return int(text[2:-1] if is_bytes_repr else text)

# Convert both target columns to plain ints, overwriting them in `df`.
for col in ["decision", "decision_o"]:
    df[col] = df[col].map(safe_to_int)

# Sanity check: show the first rows and confirm the resulting dtypes.
print(df[["decision", "decision_o"]].head())
print(df[["decision", "decision_o"]].dtypes)

   decision  decision_o
0         1           0
1         1           0
2         1           1
3         1           1
4         1           1
decision      int64
decision_o    int64
dtype: object


In [3]:
# Columns used as predictors when the target is `decision`.
features_decision_base = [
    "attractive_partner",
    "sincere_partner",
    "intelligence_partner",
    "funny_partner",
    "ambition_partner",
    "shared_interests_partner",
]

# Columns used as predictors when the target is `decision_o`.
# NOTE: "sinsere_o" and "ambitous_o" are spelled this way on purpose -- they
# are used verbatim as DataFrame column names, so do not "correct" them.
features_decision_o_base = [
    "attractive_o",
    "sinsere_o",
    "intelligence_o",
    "funny_o",
    "ambitous_o",            # (sic) see note above the list
    "shared_interests_o",
]

In [4]:
def show_feature_importance(df, feature_list, target_col, label):
    """Print two feature rankings for predicting ``target_col``.

    Rows with a missing value in any of ``feature_list`` or ``target_col`` are
    dropped first, and the features are cast to float. Two models are then fit
    on *all* remaining rows (there is no train/test split here):

    1. Logistic regression on standardized features; the first row of
       ``coef_`` is reported.
    2. Random forest (300 trees, fixed seed) on the unscaled features; its
       ``feature_importances_`` are reported.

    Both tables are sorted by the signed value, descending, so negative
    logistic coefficients end up at the bottom regardless of magnitude.

    Parameters
    ----------
    df : DataFrame holding the feature and target columns.
    feature_list : list of feature column names.
    target_col : name of the target column.
    label : text shown in the printed section header.

    Returns
    -------
    None -- results are printed only.
    """

    def _ranked(scores):
        # Pair each feature name with its score, highest value first.
        table = pd.DataFrame({
            "feature": feature_list,
            "importance": scores,
        })
        return table.sort_values("importance", ascending=False)

    print(f"\n===== Feature Importance for {label} =====")

    # Keep only fully observed rows for the columns involved.
    complete_rows = df[feature_list + [target_col]].dropna()
    features = complete_rows[feature_list].astype(float)
    target = complete_rows[target_col]

    # ---------- Logistic Regression ----------
    # Standardize so the coefficients are on a common scale across features.
    standardized = StandardScaler().fit_transform(features)
    logit = LogisticRegression(max_iter=1000)
    logit.fit(standardized, target)

    print("\n-- Logistic Regression Coefficients (standardized) --")
    print(_ranked(logit.coef_[0]))

    # ---------- Random Forest ----------
    # Fit on the unscaled features, as the original analysis does.
    forest = RandomForestClassifier(n_estimators=300, random_state=42)
    forest.fit(features, target)

    print("\n-- Random Forest Feature Importances --")
    print(_ranked(forest.feature_importances_))


In [5]:
# Print both rankings of the six "*_partner" columns against `decision`.
show_feature_importance(df, features_decision_base, "decision", "decision baseline")


===== Feature Importance for decision baseline =====

-- Logistic Regression Coefficients (standardized) --
                    feature  importance
0        attractive_partner    1.067682
5  shared_interests_partner    0.576657
3             funny_partner    0.517945
2      intelligence_partner    0.043375
1           sincere_partner   -0.193530
4          ambition_partner   -0.299748

-- Random Forest Feature Importances --
                    feature  importance
0        attractive_partner    0.258599
5  shared_interests_partner    0.183196
3             funny_partner    0.166673
4          ambition_partner    0.138926
1           sincere_partner    0.135570
2      intelligence_partner    0.117035


In [6]:
# Print both rankings of the six "*_o" columns against `decision_o`.
show_feature_importance(df, features_decision_o_base, "decision_o", "decision_o baseline")


===== Feature Importance for decision_o baseline =====

-- Logistic Regression Coefficients (standardized) --
              feature  importance
0        attractive_o    1.067911
5  shared_interests_o    0.575888
3             funny_o    0.517960
2      intelligence_o    0.040599
1           sinsere_o   -0.191567
4          ambitous_o   -0.299551

-- Random Forest Feature Importances --
              feature  importance
0        attractive_o    0.258363
5  shared_interests_o    0.183742
3             funny_o    0.165956
4          ambitous_o    0.138627
1           sinsere_o    0.136196
2      intelligence_o    0.117115
