# Import Library 

In [1]:
import pandas as pd
import pickle
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Read Data

In [2]:
train_df = pd.read_csv('../data/train_clean.csv')
test_df = pd.read_csv('../data/test_clean.csv')

# Data Split

In [3]:
X = train_df.drop(["Depression",'id'],axis=1)
y = train_df['Depression']

In [4]:
x_train,x_test,y_train,y_test = train_test_split(X,y,train_size=0.8,random_state=42)

# Searching Best Parameter

In [5]:
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.5, 0.7, 1]
}

In [6]:
xgb_model = XGBClassifier()

In [7]:
grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='accuracy')

In [8]:
grid_search.fit(x_train, y_train)

In [9]:
print("Best set of hyperparameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

Best set of hyperparameters:  {'learning_rate': 0.1, 'max_depth': 5, 'subsample': 0.7}
Best score:  0.9384207779474156


# Model Training

In [10]:
best_params = grid_search.best_params_
xgb_best = XGBClassifier(**best_params)

In [11]:
xgb_best.fit(x_train, y_train)

In [12]:
y_pred = xgb_best.predict(x_test)

In [13]:
with open('xgb_model.pkl', 'wb') as file:
    pickle.dump(xgb_best, file)

# Model Evaluation

In [14]:
print("Accuracy on test set: ", accuracy_score(y_test, y_pred))

Accuracy on test set:  0.9382264083881287


In [15]:
target_names = ['Depressed', 'Not Depressed', ]
print(classification_report(y_test, y_pred, target_names=target_names))

               precision    recall  f1-score   support

    Depressed       0.96      0.96      0.96     23017
Not Depressed       0.83      0.83      0.83      5118

     accuracy                           0.94     28135
    macro avg       0.90      0.90      0.90     28135
 weighted avg       0.94      0.94      0.94     28135



# Submission XGB

In [16]:
x_test = test_df.drop(['id'], axis=1)

In [17]:
y_pred = xgb_best.predict(x_test)

In [18]:
submission_data = pd.DataFrame({'id':test_df['id'],'Depression':y_pred})

In [19]:
submission_data.to_csv('../data/xgb_pred.csv', index=False)