In [93]:
import pandas as pd 
import numpy as np
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier, Pool, cv


### Load Data

In [33]:
# Directory that holds the CSV files, relative to the notebook location.
path_data = Path("../data")

# Read the training frame and the test frame from that directory.
train, test = (
    pd.read_csv(path_data / file_name)
    for file_name in ("train.csv", "test.csv")
)

In [3]:
# Preview the first rows of the raw training frame (columns and sample values).
train.head()

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


Work/Study Hours
10.0    9450
11.0    8555
9.0     8315
0.0     8132
12.0    7657
2.0     7083
6.0     7029
7.0     6646
1.0     6525
3.0     6261
5.0     6118
4.0     6079
8.0     5950
Name: count, dtype: int64

### Prepare Data

In [94]:
# Column groups: each group gets its own imputation / encoding below.
numerical_columns = ['Age', 'Work/Study Hours']
quantized_columns = ['CGPA']
categorical_columns = [
    'Gender', 'Working Professional or Student', 'Academic Pressure',
    'Work Pressure', 'Study Satisfaction', 'Job Satisfaction',
    'Have you ever had suicidal thoughts ?', 'Financial Stress',
    'Family History of Mental Illness'
]
text_columns = [
    'Name', 'City', 'Profession', 'Sleep Duration',
    'Dietary Habits', 'Degree'
]

# Build the working frames without touching the raw `train` / `test` frames, so
# this cell gives the same result every time it is re-run.
# Only training rows are de-duplicated: every test row has to be kept, because
# the predictions are later assigned into the sample submission as a plain
# array and a shorter array would no longer fit it.
train_preprocessed = train.drop_duplicates().drop(columns='id')
test_preprocessed = test.drop(columns='id')

# Numeric gaps (including CGPA, which is binned further down) are filled with
# the TRAIN median in both frames, so a missing value means the same thing at
# training time and at prediction time.
for col in numerical_columns + quantized_columns:
    train_median = train_preprocessed[col].median()
    train_preprocessed[col] = train_preprocessed[col].fillna(train_median)
    test_preprocessed[col] = test_preprocessed[col].fillna(train_median)

# After imputation the plain numeric columns hold no NaN and can be cast to int.
for col in numerical_columns:
    train_preprocessed[col] = train_preprocessed[col].astype(int)
    test_preprocessed[col] = test_preprocessed[col].astype(int)

for col in categorical_columns:
    # Fill gaps with the most frequent TRAIN value in both frames.
    train_mode = train_preprocessed[col].mode()[0]
    train_preprocessed[col] = train_preprocessed[col].fillna(train_mode)
    test_preprocessed[col] = test_preprocessed[col].fillna(train_mode)

    # Fit the encoder on the union of both frames (same approach as the text
    # columns below): LabelEncoder.transform raises on a value it has not seen
    # during fit, which would happen for any test-only category.
    le = LabelEncoder()
    le.fit(pd.concat([train_preprocessed[col], test_preprocessed[col]]))
    train_preprocessed[col] = le.transform(train_preprocessed[col])
    test_preprocessed[col] = le.transform(test_preprocessed[col])

    # Convert to 'category' data type
    train_preprocessed[col] = train_preprocessed[col].astype('category')
    test_preprocessed[col] = test_preprocessed[col].astype('category')

for col in text_columns:
    # Missing text becomes its own explicit level before encoding.
    train_preprocessed[col] = train_preprocessed[col].fillna('missing')
    test_preprocessed[col] = test_preprocessed[col].fillna('missing')

    # One shared integer code per distinct string across train and test.
    le = LabelEncoder()
    le.fit(pd.concat([train_preprocessed[col], test_preprocessed[col]]))
    train_preprocessed[col] = le.transform(train_preprocessed[col])
    test_preprocessed[col] = le.transform(test_preprocessed[col])

# Quantize 'CGPA' into three equal-width bins. The bin edges are derived from
# the training values only and then reused for the test frame, so that 'Low' /
# 'Medium' / 'High' cover the same CGPA ranges in both frames.
cgpa_labels = ['Low', 'Medium', 'High']
train_preprocessed['CGPA'], cgpa_edges = pd.cut(
    train_preprocessed['CGPA'].astype(float),
    bins=len(cgpa_labels),
    labels=cgpa_labels,
    retbins=True
)
# Open the outermost edges so a test value outside the training range still
# lands in 'Low' / 'High' instead of becoming NaN.
cgpa_edges[0], cgpa_edges[-1] = -np.inf, np.inf
test_preprocessed['CGPA'] = pd.cut(
    test_preprocessed['CGPA'].astype(float),
    bins=cgpa_edges,
    labels=cgpa_labels
)

In [89]:
# Spot-check one preprocessed training row (position 6) to see the encoded values.
train_preprocessed.iloc[6]

Name                                        391
Gender                                        1
Age                                          47
City                                        108
Working Professional or Student               1
Profession                                   14
Academic Pressure                             2
Work Pressure                                 4
CGPA                                     Medium
Study Satisfaction                            3
Job Satisfaction                              1
Sleep Duration                               24
Dietary Habits                               22
Degree                                      116
Have you ever had suicidal thoughts ?         0
Work/Study Hours                              6
Financial Stress                              1
Family History of Mental Illness              0
Depression                                    0
Name: 6, dtype: object

In [None]:
# --- Hold-out split ---------------------------------------------------------
# Every preprocessed column except the target is a feature.
X = train_preprocessed.drop(columns='Depression')
y = train_preprocessed['Depression']

# 80 % of the rows for fitting, 20 % for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

# --- CatBoost pools ---------------------------------------------------------
# The same list of categorical feature names is used for every Pool.
cat_features = categorical_columns + text_columns + quantized_columns

X_train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
X_val_pool = Pool(data=X_val, label=y_val, cat_features=cat_features)

# --- Cross-validation -------------------------------------------------------
# Hyper-parameters shared by the cross-validation run and the final model.
params = dict(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    eval_metric='AUC',
    verbose=100,
    random_seed=42,
)

# 5-fold CV on the training pool only; the validation split is not part of it.
cv_results = cv(
    params=params,
    pool=X_train_pool,
    fold_count=5,
    plot=True,
    verbose=False
)

# --- Final model ------------------------------------------------------------
# Fit on the training pool and evaluate on the held-out validation pool.
model = CatBoostClassifier(**params)
model.fit(X_train_pool, eval_set=X_val_pool)

# --- Test-set predictions ---------------------------------------------------
X_test = test_preprocessed
X_test_pool = Pool(data=X_test, cat_features=cat_features)

# NOTE(review): `y_test_pred` is computed here but not written anywhere in this
# cell; only the second column of `predict_proba` goes into the submission.
# Confirm which of the two the scorer expects.
y_test_pred = model.predict(X_test_pool)
y_test_pred_proba = model.predict_proba(X_test_pool)[:, 1]

# --- Submission file --------------------------------------------------------
# Start from the sample submission and overwrite its 'Depression' column.
submission = pd.read_csv(path_data / "sample_submission.csv")
submission['Depression'] = y_test_pred_proba
submission.to_csv(path_data / "submission.csv", index=False)

submission.head()




MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]

bestTest = 0.9742180333
bestIteration = 290

Training on fold [1/5]
