## 8.Sensitivity analysis

In [1]:
from scipy.io import arff
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_curve, f1_score, accuracy_score, roc_auc_score, recall_score, precision_score, classification_report
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

In [2]:
data, meta = arff.loadarff("speeddating.arff")
df = pd.DataFrame(data)

def safe_to_int(x):
    if isinstance(x, bytes):
        return int(x.decode("utf-8"))
    elif isinstance(x, str):
        s = x.strip()
        if s.startswith("b'") and s.endswith("'"):
            s = s[2:-1]      
        return int(s)
    else:
        return int(x)

for col in ["decision", "decision_o"]:
    df[col] = df[col].map(safe_to_int)

print(df[["decision", "decision_o"]].head())
print(df[["decision", "decision_o"]].dtypes)

   decision  decision_o
0         1           0
1         1           0
2         1           1
3         1           1
4         1           1
decision      int64
decision_o    int64
dtype: object


In [3]:
features_decision_base = [
    "attractive_partner",
    "sincere_partner",
    "intelligence_partner",
    "funny_partner",
    "ambition_partner",
    "shared_interests_partner",
]

self_ratings = [
    "attractive",            # rate yourself
    "sincere",
    "intelligence",
    "funny",
    "ambition",
]

self_prefs = [
    "attractive_important",  # what do you look for in a partner
    "sincere_important",
    "intellicence_important",  
    "funny_important",
    "ambtition_important",     
    "shared_interests_important",
]

more_features = [
    "interests_correlate",
    "d_age",
    "samerace",
    "importance_same_race",
    "expected_happy_with_sd_people",
    "expected_num_interested_in_me",
    "expected_num_matches",
    "like",
    "guess_prob_liked",
    "met",
]

In [4]:
features_decision_complete = features_decision_base   + self_ratings + self_prefs + more_features # complete model
print("Decision complete features:", features_decision_complete)

Decision complete features: ['attractive_partner', 'sincere_partner', 'intelligence_partner', 'funny_partner', 'ambition_partner', 'shared_interests_partner', 'attractive', 'sincere', 'intelligence', 'funny', 'ambition', 'attractive_important', 'sincere_important', 'intellicence_important', 'funny_important', 'ambtition_important', 'shared_interests_important', 'interests_correlate', 'd_age', 'samerace', 'importance_same_race', 'expected_happy_with_sd_people', 'expected_num_interested_in_me', 'expected_num_matches', 'like', 'guess_prob_liked', 'met']


In [5]:
def run_models(df, feature_list, target_col, label, rf_threshold=0.5):
    cols = feature_list + [target_col]
    sub = df[cols].dropna()  
    
    X = sub[feature_list].astype(float)
    y = sub[target_col]

    print(f"\n===== {label} =====")
    print("Data size after cleaning:", sub.shape[0])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # Logistic Regression
    log_reg = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=1000))
    ])
    log_reg.fit(X_train, y_train)
    y_pred_lr = log_reg.predict(X_test)
    y_prob_lr = log_reg.predict_proba(X_test)[:, 1]

    print("\n-- Logistic Regression --")
    print("Accuracy:", accuracy_score(y_test, y_pred_lr))
    print("F1:",       f1_score(y_test, y_pred_lr))
    print("ROC-AUC:",  roc_auc_score(y_test, y_prob_lr))
    # Random Forest
    rf = RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        random_state=42,
        n_jobs=-1
    )
    rf.fit(X_train, y_train)
    y_prob_rf = rf.predict_proba(X_test)[:, 1]
    y_pred_rf = (y_prob_rf >= rf_threshold).astype(int)

    print(f"\n-- Random Forest (thr={rf_threshold}) --")
    print("Accuracy:", accuracy_score(y_test, y_pred_rf))
    print("F1:",       f1_score(y_test, y_pred_rf))
    print("ROC-AUC:",  roc_auc_score(y_test, y_prob_rf))


In [6]:
run_models(df, features_decision_complete, "decision",
           "decision + self & prefs + more features (thr=0.45)",
           rf_threshold=0.45)



===== decision + self & prefs + more features (thr=0.45) =====
Data size after cleaning: 1237

-- Logistic Regression --
Accuracy: 0.7795698924731183
F1: 0.7320261437908496
ROC-AUC: 0.8642643540669857

-- Random Forest (thr=0.45) --
Accuracy: 0.8440860215053764
F1: 0.8129032258064516
ROC-AUC: 0.9210077751196173


In [7]:
cols = features_decision_complete + ["decision"]
sub = df[cols].dropna()
X = sub[features_decision_complete].astype(float)
y = sub["decision"].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    random_state=42,
    n_jobs=-1,
)
rf.fit(X_train, y_train)

i = 23
x0 = X_test.iloc[i].copy()
base_prob = rf.predict_proba([x0])[0, 1]

def bump_feature(x, feat, delta, min_val=0, max_val=10):
    x_new = x.copy()
    x_new[feat] = np.clip(x_new[feat] + delta, min_val, max_val)
    return x_new

features_to_check = [
    "attractive_partner",
    "sincere_partner",
    "intelligence_partner",
    "funny_partner",
    "ambition_partner",
    "shared_interests_partner",
    "attractive_important",  # what do you look for in a partner
    "sincere_important",
    "intellicence_important",  
    "funny_important",
    "ambtition_important",     
    "shared_interests_important",
    "interests_correlate",
    
]

records = []
for feat in features_to_check:
    x_plus1 = bump_feature(x0, feat, +1)
    prob_plus1 = rf.predict_proba([x_plus1])[0, 1]
    records.append({
        "feature": feat,
        "base_prob": base_prob,
        "prob_plus1": prob_plus1,
        "delta_prob": prob_plus1 - base_prob,
    })

local_sens_df = pd.DataFrame(records).sort_values("delta_prob", ascending=False)
local_sens_df




Unnamed: 0,feature,base_prob,prob_plus1,delta_prob
0,attractive_partner,0.176667,0.233333,0.056667
3,funny_partner,0.176667,0.21,0.033333
12,interests_correlate,0.176667,0.203333,0.026667
8,intellicence_important,0.176667,0.19,0.013333
1,sincere_partner,0.176667,0.18,0.003333
7,sincere_important,0.176667,0.18,0.003333
11,shared_interests_important,0.176667,0.18,0.003333
2,intelligence_partner,0.176667,0.176667,0.0
5,shared_interests_partner,0.176667,0.176667,0.0
6,attractive_important,0.176667,0.176667,0.0
