In [19]:
from numba import cuda

def is_gpu_available():
    try:
        return cuda.is_available()
    except:
        return False

gpu_available = is_gpu_available()
print(f"GPU is available: {gpu_available}")

GPU is available: True


In [20]:
if gpu_available:
    import cudf as pd
    from cuml.preprocessing import LabelEncoder, StandardScaler
    from cuml.model_selection import train_test_split
else:
    import pandas as pd
    from sklearn.preprocessing import LabelEncoder, StandardScaler
    from sklearn.model_selection import train_test_split

# Read Data
use cudf to read data into GPU

- delete unuseful colums(id, Name, City)
- Combine[(Academic Pressure, Work Pressure), (Working professional or student, Profession), (Study Satisfaction, Job Satisfaction)]
- Normalize or mapping

In [21]:
df = pd.read_csv("/kaggle/input/playground-series-s4e11/train.csv")


# drop 'id' , 'Name' (Not revelant), 'city'
df.drop(['id', 'Name'], axis=1, inplace=True)
df.drop(['City'], axis=1, inplace=True)
# combine 'working professional or student' and 'Profession'
df['Working Professional or Student'] = df['Profession'].fillna(df['Working Professional or Student'])
df.drop(['Profession'], axis=1, inplace=True)
# combine 'Academic Pressure' and 'Work Pressure'
df['Academic Pressure'] = df['Work Pressure'].fillna(df['Academic Pressure'])
df.drop(['Work Pressure'], axis=1, inplace=True)
# combine 'Study Satisfaction' and 'Job Satisfaction'
df['Study Satisfaction'] = df['Job Satisfaction'].fillna(df['Study Satisfaction'])
df.drop(['Job Satisfaction'], axis=1, inplace=True)
# most of 'CGPA' leave null, drop it
df.drop(['CGPA'], axis=1, inplace=True)

# one - hot encoder
df['Working Professional or Student'] = LabelEncoder().fit_transform(df['Working Professional or Student'])
df['Degree'] = LabelEncoder().fit_transform(df['Degree'])
df['Academic Pressure'] = LabelEncoder().fit_transform(df['Academic Pressure'])


# bool
df['Have you ever had suicidal thoughts ?'] = df['Have you ever had suicidal thoughts ?'].map({'Yes': 1, 'No': 0})
df['Family History of Mental Illness'] = df['Family History of Mental Illness'].map({'Yes': 1, 'No': 0})
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
df['Sleep Duration'] = df['Sleep Duration'].map({'Less than 5 hours': 0, '5-6 hours': 1, '7-8 hours': 2, 'More than 8 hours': 3})
df['Dietary Habits'] = df['Dietary Habits'].map({'Healthy': 0, 'Moderate': 1, 'Unhealthy': 2})

# normalize all constant value
cols_to_scale = [
    'Age', 'Academic Pressure', 'Study Satisfaction',
    'Work/Study Hours', 'Financial Stress'
]
scaler = StandardScaler()
df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])


df.head(10)

Unnamed: 0,Gender,Age,Working Professional or Student,Academic Pressure,Study Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,0.69536,10,1.407041,-0.689253,3,0,33,0,-1.363057,-0.699607,0,0
1,1,-1.161867,55,0.693745,0.022383,0,2,63,1,0.193928,0.007793,0,1
2,1,-0.596624,54,1.407041,-0.689253,1,0,21,1,-0.844062,-1.407006,0,1
3,1,-1.484863,55,1.407041,-1.400889,0,1,28,1,0.972421,-1.407006,1,1
4,0,-0.838871,9,-1.446144,-1.400889,1,2,28,1,0.712923,0.715193,1,0
5,0,1.50285,26,-0.732848,1.445655,1,0,82,0,0.193928,1.422593,0,0
6,1,0.533862,11,1.407041,-0.689253,2,1,83,0,-0.06557,-0.699607,0,0
7,1,-0.192879,55,-0.019551,0.734019,2,2,21,0,0.972421,0.007793,1,0
8,0,-1.323365,54,-0.732848,1.445655,1,1,36,0,-0.844062,-0.699607,1,0
9,0,0.130117,22,0.693745,-1.400889,1,0,84,1,0.193928,-0.699607,1,0


In [22]:
# delete any row with null value
df = df.dropna()

X = df.drop(columns=['Depression'])
y = df['Depression']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [23]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

def evaluate_model(y_true, y_pred):

    def to_cpu_array(x):
        if hasattr(x, 'to_numpy'):  # cuDF Series
            return x.to_numpy()
        # elif hasattr(x, 'get'):     # cupy array
        #     return x.get()
        # elif hasattr(x, 'cpu'):     # torch tensor
        #     return x.cpu().numpy()
        # elif isinstance(x, (list, np.ndarray)):
        #     return np.array(x)
        else:
            return x  # fallback

    y_true = to_cpu_array(y_true)
    y_pred = to_cpu_array(y_pred)

    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("Classification Report:\n", classification_report(y_true, y_pred))

# Machine Learning Method

In [24]:
if gpu_available:
    from cuml.linear_model import LogisticRegression
    from cuml.ensemble  import RandomForestClassifier
    from cuml.svm import SVC
    from cuml.naive_bayes import GaussianNB
    from cuml.neighbors import KNeighborsClassifier
else:
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier

# Logistic Regression
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("-" * 50 + "Losgistic Regression" + "-" * 50)
evaluate_model(y_test, y_pred)


# Random Forest
model = RandomForestClassifier(criterion="log_loss",
                               n_estimators=120, random_state=102)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("-" * 50 + "Random Forest" + "-" * 50)
evaluate_model(y_test, y_pred)


# SVM
model = SVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("-" * 50 + "SVM" + "-" * 50)
evaluate_model(y_test, y_pred)

# To-do: still debugging
# # Naive Bayes
# model = GaussianNB()
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# print("-" * 50 + "Naive Bayes" + "-" * 50)
# evaluate_model(y_test, y_pred)

# KNN
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("-" * 50 + "KNN" + "-" * 50)
evaluate_model(y_test, y_pred)



--------------------------------------------------Losgistic Regression--------------------------------------------------
Accuracy: 0.9292803528867704
Confusion Matrix:
 [[22177   828]
 [ 1160  3946]]
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.96      0.96     23005
           1       0.83      0.77      0.80      5106

    accuracy                           0.93     28111
   macro avg       0.89      0.87      0.88     28111
weighted avg       0.93      0.93      0.93     28111



  return init_func(self, *args, **kwargs)


--------------------------------------------------Random Forest--------------------------------------------------
Accuracy: 0.9366084450926684
Confusion Matrix:
 [[22236   769]
 [ 1013  4093]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.97      0.96     23005
           1       0.84      0.80      0.82      5106

    accuracy                           0.94     28111
   macro avg       0.90      0.88      0.89     28111
weighted avg       0.94      0.94      0.94     28111

--------------------------------------------------SVM--------------------------------------------------
Accuracy: 0.9283198747821138
Confusion Matrix:
 [[22290   715]
 [ 1300  3806]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.96     23005
           1       0.84      0.75      0.79      5106

    accuracy                           0.93     28111
   macro avg       0.89      0.

In [25]:
# XGBoost
from xgboost import XGBClassifier
from collections import Counter
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

counter = Counter(y_train.to_numpy())
scale = counter[0] / counter[1]
print(scale)
tree_method = 'gpu_hist' if gpu_available else 'hist'
predictor = 'gpu_predictor' if gpu_available else 'cpu_predictor'

model = XGBClassifier(scale_pos_weight=scale * 0.4, n_estimators=150,
    learning_rate=0.05, max_depth=20,
    subsample=0.8, colsample_bytree=0.8,
    eval_metric='aucpr',
    use_label_encoder=False,
    random_state=42,
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("-" * 50 + "XGBoost" + "-" * 50)
evaluate_model(y_test, y_pred)


4.505679592636114
--------------------------------------------------XGBoost--------------------------------------------------
Accuracy: 0.9366440183558038
Confusion Matrix:
 [[22117   888]
 [  893  4213]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96     23005
           1       0.83      0.83      0.83      5106

    accuracy                           0.94     28111
   macro avg       0.89      0.89      0.89     28111
weighted avg       0.94      0.94      0.94     28111



# Neural Network

In [26]:
import torch
from torch.utils import dlpack
import cudf

def cudf_to_torch(df: cudf.DataFrame) ->torch.Tensor:
    dlpack_capsule = df.astype("float32").to_dlpack() 
    return torch_dlpack.from_dlpack(dlpack_capsule)
    
X_train = cudf_to_torch(X_train)
X_test = cudf_to_torch(X_test)
y_train = cudf_to_torch(y_train)
y_test = cudf_to_torch(y_test)

from torch.utils.data import TensorDataset, DataLoader

train_ds = TensorDataset(X_train, y_train)
test_ds  = TensorDataset(X_test,  y_test)

batch_size = 256
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_ds,  batch_size=batch_size, shuffle=False)

In [30]:
# MLP
import torch.nn as nn
import torch.nn.functional as F
import torchmetrics
from tqdm import tqdm

class MLP(nn.Module):
    def __init__(self, in_features, hidden=[256, 128, 64], dropout=0.3):
        super().__init__()
        layers = []
        dim = in_features
        for h in hidden:
            layers += [nn.Linear(dim, h), nn.ReLU(), nn.Dropout(dropout)] # Linear → ReLU → Dropout
            dim = h
        layers.append(nn.Linear(dim, 1)) # Output logic 1 dim         
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x).squeeze(1)


device = torch.device("cuda") if gpu_available else torch.device("cpu")
model  = MLP(X_train.shape[1]).to(device)

# pos_weight = N_neg / N_pos
pos_weight = torch.tensor(
    (y_train == 0).sum() / (y_train == 1).sum(),
    dtype=torch.float32, device=device
)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.5, patience=3, verbose=True
)

best_loss, best_state = float("inf") , None
epochs = 10
for epoch in range(1, epochs + 1):
    # --- train ---
    model.train()
    running_loss = 0.0
    for xb, yb in tqdm(train_loader, desc=f"Epoch {epoch}", leave=False):      
        optimizer.zero_grad(set_to_none=True)
        loss = criterion(model(xb), yb)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * xb.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"[Epoch {epoch}] train_loss = {epoch_loss:.4f}")
    scheduler.step(epoch_loss)

    if epoch_loss < best_loss:
        best_loss, best_state = epoch_loss, model.state_dict()

model.load_state_dict(best_state)
model.eval()
with torch.no_grad():
    y_prob = torch.sigmoid(torch.cat([model(xb) for xb, _ in test_loader])).cpu().numpy()

# --- 3.1 计算最优阈值 (最大 F1) ---
from sklearn.metrics import precision_recall_curve, classification_report
prec, rec, thr = precision_recall_curve(y_test.cpu().numpy(), y_prob)
f1   = 2 * prec * rec / (prec + rec + 1e-9)
best_thr = thr[f1.argmax()]
print(f"best_thr = {best_thr:.4f},  best F1 = {f1.max():.4f}")

y_pred = (y_prob >= best_thr).astype(int)
evaluate_model(y_test.cpu().numpy(), y_pred)

  pos_weight = torch.tensor(
                                                           

[Epoch 1] train_loss = 0.7059


                                                           

[Epoch 2] train_loss = 0.4406


                                                           

[Epoch 3] train_loss = 0.4089


                                                           

[Epoch 4] train_loss = 0.3989


                                                           

[Epoch 5] train_loss = 0.3918


                                                           

[Epoch 6] train_loss = 0.3873


                                                           

[Epoch 7] train_loss = 0.3846


                                                           

[Epoch 8] train_loss = 0.3841


                                                           

[Epoch 9] train_loss = 0.3820


                                                            

[Epoch 10] train_loss = 0.3802
best_thr = 0.7494,  best F1 = 0.8134
Accuracy: 0.9292447796236348
Confusion Matrix:
 [[21788  1217]
 [  772  4334]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.97      0.95      0.96     23005
         1.0       0.78      0.85      0.81      5106

    accuracy                           0.93     28111
   macro avg       0.87      0.90      0.88     28111
weighted avg       0.93      0.93      0.93     28111

