In [13]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectKBest, f_regression, f_classif
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr

# Load and merge data
features = pd.read_csv("features.csv")
scores = pd.read_csv("scores.csv")
data = pd.merge(features, scores, left_on="id", right_on="Participant")

# Simulated demographic columns for testing
np.random.seed(42)
n = len(data)
data["Gender"] = np.random.choice(["Male", "Female"], size=n)
data["Race"] = np.random.choice(["White", "African American", "Hispanic"], size=n)

# Used 'Overall' as a proxy for PHQ-8 score
X = data.drop(columns=["id", "Participant", "Overall", "Excited", "Gender", "Race"])
y = data["Overall"].values

# Standardized the numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# Selected top-k features using f_regression
def select_top_k(X, y, k):
    selector = SelectKBest(score_func=f_regression, k=k)
    X_new = selector.fit_transform(X, y)
    selected_columns = X.columns[selector.get_support()]
    return pd.DataFrame(X_new, columns=selected_columns), selected_columns.tolist()

# Evaluated performance metrics for demographic subgroups
def evaluate_by_group(data, preds, true):
    data = data.copy()
    data["preds"] = preds
    data["true"] = true
    for group_cols in [["Gender"], ["Race"], ["Gender", "Race"]]:
        print("\nGroup:", " x ".join(group_cols))
        for group, group_df in data.groupby(group_cols):
            r = pearsonr(group_df["preds"], group_df["true"])[0]
            re = np.mean(np.abs(group_df["preds"] - group_df["true"]) / np.max(true))
            print(f"{group}: r = {r:.4f}, RE = {re:.4f}")

# Found the group with the highest relative error
def find_worst_group(data, preds, true):
    data = data.copy()
    data["preds"] = preds
    data["true"] = true
    worst_group = None
    worst_re = -1
    for group, group_df in data.groupby(["Gender", "Race"]):
        re = np.mean(np.abs(group_df["preds"] - group_df["true"]) / np.max(true))
        if re > worst_re:
            worst_re = re
            worst_group = group
    print("\nWorst performing group:", worst_group, "with RE =", round(worst_re, 4))

# Ran model with multiple k values
for k in [5, 10, 'all']:
    print(f"\nRunning model with top {k} features")
    X_k, selected_features = select_top_k(X_scaled_df, y, k=k)
    print("Selected features:", selected_features)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    all_preds = np.zeros_like(y)

    for train_idx, test_idx in kf.split(X_k):
        X_train, X_test = X_k.iloc[train_idx], X_k.iloc[test_idx]
        y_train = y[train_idx]
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)
        all_preds[test_idx] = model.predict(X_test)

    avg_r = pearsonr(all_preds, y)[0]
    avg_re = np.mean(np.abs(all_preds - y) / np.max(y))
    print(f"Average r: {avg_r:.4f}")
    print(f"Average RE: {avg_re:.4f}")

    evaluate_by_group(data, all_preds, y)
    find_worst_group(data, all_preds, y)



Running model with top 5 features
Selected features: ['filler%', 'speaker_balance', 'word_length_2', 'word_length_6', '8+ words']
Average r: 0.2398
Average RE: 0.0771

Group: Gender
('Female',): r = 0.2958, RE = 0.0763
('Male',): r = 0.1586, RE = 0.0781

Group: Race
('African American',): r = 0.2273, RE = 0.0736
('Hispanic',): r = 0.2106, RE = 0.0872
('White',): r = 0.2869, RE = 0.0685

Group: Gender x Race
('Female', 'African American'): r = 0.1149, RE = 0.0744
('Female', 'Hispanic'): r = 0.3485, RE = 0.0819
('Female', 'White'): r = 0.3502, RE = 0.0729
('Male', 'African American'): r = 0.3737, RE = 0.0720
('Male', 'Hispanic'): r = 0.0843, RE = 0.0919
('Male', 'White'): r = 0.1938, RE = 0.0624

Worst performing group: ('Male', 'Hispanic') with RE = 0.0919

Running model with top 10 features
Selected features: ['filler%', 'speaker_balance', 'total_word_count', 'word_length_2', 'word_length_3', 'word_length_4', 'word_length_6', 'word_length_7', '8+ words', 'maximum_sentence_sentiment']


In [14]:
import torch
import torch.nn as nn
import torch.optim as optim

# Convert to tensors
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32).view(-1, 1)

# Defined a basic feedforward neural network
class SimpleNN(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        return self.net(x)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
nn_preds = np.zeros_like(y)

# Ran 5-fold CV with the model
for train_idx, test_idx in kf.split(X_tensor):
    X_tr, X_te = X_tensor[train_idx], X_tensor[test_idx]
    y_tr, y_te = y_tensor[train_idx], y_tensor[test_idx]

    model = SimpleNN(X_tensor.shape[1])
    opt = optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.MSELoss()

    for _ in range(1000):  # Reduced training epochs
        model.train()
        opt.zero_grad()
        out = model(X_tr)
        loss = loss_fn(out, y_tr)
        loss.backward()
        opt.step()

    model.eval()
    with torch.no_grad():
        nn_preds[test_idx] = model(X_te).squeeze().numpy()

# Evaluated performance
r = pearsonr(nn_preds, y)[0]
re = np.mean(np.abs(nn_preds - y) / np.max(y))
print(f"\nDeep Learning Model — Avg r = {r:.4f}, Avg RE = {re:.4f}")

evaluate_by_group(data, nn_preds, y)
find_worst_group(data, nn_preds, y)



Deep Learning Model — Avg r = 0.1138, Avg RE = 0.1035

Group: Gender
('Female',): r = 0.1090, RE = 0.1061
('Male',): r = 0.0996, RE = 0.1002

Group: Race
('African American',): r = 0.3119, RE = 0.0839
('Hispanic',): r = 0.0531, RE = 0.1203
('White',): r = 0.0815, RE = 0.0997

Group: Gender x Race
('Female', 'African American'): r = 0.1776, RE = 0.0826
('Female', 'Hispanic'): r = 0.0992, RE = 0.1349
('Female', 'White'): r = 0.1607, RE = 0.1005
('Male', 'African American'): r = 0.4614, RE = 0.0864
('Male', 'Hispanic'): r = -0.0412, RE = 0.1073
('Male', 'White'): r = 0.1938, RE = 0.0985

Worst performing group: ('Female', 'Hispanic') with RE = 0.1349


In my analysis, I used both a Random Forest (tree-based) model and a simple feedforward neural network (deep learning) to estimate depression severity based on behavioral features. The Random Forest model consistently performed better than the neural network across different settings — it gave higher Pearson correlation values and lower relative errors. Based on the results, I found that the tree-based model was more effective for this dataset, probably because it handles smaller datasets and nonlinear patterns more reliably without needing much fine-tuning.



In [15]:
from sklearn.feature_selection import f_classif, f_regression

def top_feats(X, y, k=10, func=f_regression):
    sel = SelectKBest(score_func=func, k=k)
    sel.fit(X, y)
    return X.columns[sel.get_support()].tolist()

phq_feats = top_feats(X_scaled_df, data["Overall"], k=10, func=f_regression)
gender_feats = top_feats(X_scaled_df, data["Gender"], k=10, func=f_classif)
race_feats = top_feats(X_scaled_df, data["Race"], k=10, func=f_classif)

print("\nTop features for PHQ-8 (Overall):", phq_feats)
print("Top features for Gender:", gender_feats)
print("Top features for Race:", race_feats)

print("\nOverlap with Gender:", set(phq_feats) & set(gender_feats))
print("Overlap with Race:", set(phq_feats) & set(race_feats))



Top features for PHQ-8 (Overall): ['filler%', 'speaker_balance', 'total_word_count', 'word_length_2', 'word_length_3', 'word_length_4', 'word_length_6', 'word_length_7', '8+ words', 'maximum_sentence_sentiment']
Top features for Gender: ['filler%', 'speaker_balance', 'total_word_count', 'word_length_2', 'word_length_4', 'word_length_5', 'word_length_6', '8+ words', 'average_sentence_sentiment', 'maximum_sentence_sentiment']
Top features for Race: ['speaker_balance', 'total_word_count', 'word_length_2', 'word_length_4', 'word_length_5', 'word_length_6', 'word_length_7', '8+ words', 'minimum_sentence_sentiment', 'maximum_sentence_sentiment']

Overlap with Gender: {'total_word_count', 'speaker_balance', 'word_length_4', 'word_length_6', 'filler%', '8+ words', 'word_length_2', 'maximum_sentence_sentiment'}
Overlap with Race: {'total_word_count', 'speaker_balance', 'word_length_4', 'word_length_6', '8+ words', 'word_length_2', 'word_length_7', 'maximum_sentence_sentiment'}
