In [1]:
import re
import string

import joblib
import pandas as pd
import plotly.express as px
import sklearn.metrics as m
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from plotly import graph_objects as go
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Importing the dataset

In [2]:
df = pd.read_csv('dataset/dataset.csv', names=['text', 'is_depression'], skiprows=1)
df.head()

Unnamed: 0,text,is_depression
0,we understand that most people who reply immed...,1
1,welcome to r depression s check in post a plac...,1
2,anyone else instead of sleeping more when depr...,1
3,i ve kind of stuffed around a lot in my life d...,1
4,sleep is my greatest and most comforting escap...,1


In [3]:
depression_count = df.is_depression.value_counts()
fig = px.bar(depression_count, x=depression_count.index, y=depression_count.values, color=depression_count.index,
             title='Data is balanced', height=600, width=600)
fig.show()

# Data Preprocessing pipleline

In [4]:
def remove_url(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text)


def remove_html(text):
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)


def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


def remove_punct(text):
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)


def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    return ' '.join(filtered_sentence)


def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    word_tokens = word_tokenize(text)
    lemmatized_sentence = [lemmatizer.lemmatize(w) for w in word_tokens]
    return ' '.join(lemmatized_sentence)


def preprocess(text):
    text = text.lower()
    text = remove_url(text)
    text = remove_html(text)
    text = remove_emoji(text)
    text = remove_punct(text)
    text = remove_stopwords(text)
    text = lemmatize(text)
    return text

In [5]:
df['text_cleaned'] = df['text'].apply(lambda x: preprocess(x))
X = df['text_cleaned']
y = df['is_depression']

# model pipeline with grid search

In [6]:
log_reg_params = {
    'clf__penalty': ['l1', 'l2'],
    'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'clf__solver': ['liblinear']
}

nb_params = {
    'clf__alpha': [0.1, 1, 10, 100]
}

svc_params = {
    'clf__C': [0.1, 1, 10, 100],
    'clf__kernel': ['linear', 'rbf']
}

rf_params = {
    'clf__n_estimators': [100, 200, 300],
    'clf__max_depth': [5, 10, 15, 20],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4]
}


ada_params = {
    'clf__n_estimators': [100, 200, 300],
    'clf__learning_rate': [0.05, 0.1, 0.2, 0.5]
}


In [7]:
log_reg_pipe_1 = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', LogisticRegression())
])

log_reg_pipe_2 = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', LogisticRegression())
])

svc_pipe_1 = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', SVC())
])

svc_pipe_2 = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', SVC())
])

rf_pipe_1 = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', RandomForestClassifier())
])

rf_pipe_2 = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', RandomForestClassifier())
])

ada_pipe_1 = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', AdaBoostClassifier())
])

ada_pipe_2 = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', AdaBoostClassifier())
])

# grid search

In [8]:
log_reg_gs_1 = GridSearchCV(log_reg_pipe_1, log_reg_params, cv=3, n_jobs=-1, verbose=1)
log_reg_gs_2 = GridSearchCV(log_reg_pipe_2, log_reg_params, cv=3, n_jobs=-1, verbose=1)
svc_gs_1 = GridSearchCV(svc_pipe_1, svc_params, cv=3, n_jobs=-1, verbose=1)
svc_gs_2 = GridSearchCV(svc_pipe_2, svc_params, cv=3, n_jobs=-1, verbose=1)
rf_gs_1 = GridSearchCV(rf_pipe_1, rf_params, cv=3, n_jobs=-1, verbose=1)
rf_gs_2 = GridSearchCV(rf_pipe_2, rf_params, cv=3, n_jobs=-1, verbose=1)
ada_gs_1 = GridSearchCV(ada_pipe_1, ada_params, cv=3, n_jobs=-1, verbose=1)
ada_gs_2 = GridSearchCV(ada_pipe_2, ada_params, cv=3, n_jobs=-1, verbose=1)

# split data and train

In [9]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)
print(f'Xtrain shape: {Xtrain.shape}')
print(f'Xtest shape: {Xtest.shape}')
print(f'ytrain shape: {ytrain.shape}')
print(f'ytest shape: {ytest.shape}')
print("LogisticRegression with CountVectorizer")
log_reg_gs_1.fit(Xtrain, ytrain)
print("LogisticRegression with TfidfVectorizer")
log_reg_gs_2.fit(Xtrain, ytrain)
print("SVC with CountVectorizer")
svc_gs_1.fit(Xtrain, ytrain)
print("SVC with TfidfVectorizer")
svc_gs_2.fit(Xtrain, ytrain)
print("RandomForest with CountVectorizer")
rf_gs_1.fit(Xtrain, ytrain)
print("RandomForest with TfidfVectorizer")
rf_gs_2.fit(Xtrain, ytrain)
print("AdaBoost with CountVectorizer")
ada_gs_1.fit(Xtrain, ytrain)
print("AdaBoost with TfidfVectorizer")
ada_gs_2.fit(Xtrain, ytrain)

Xtrain shape: (6184,)
Xtest shape: (1547,)
ytrain shape: (6184,)
ytest shape: (1547,)
LogisticRegression with CountVectorizer
Fitting 3 folds for each of 14 candidates, totalling 42 fits
LogisticRegression with TfidfVectorizer
Fitting 3 folds for each of 14 candidates, totalling 42 fits
SVC with CountVectorizer
Fitting 3 folds for each of 8 candidates, totalling 24 fits
SVC with TfidfVectorizer
Fitting 3 folds for each of 8 candidates, totalling 24 fits
RandomForest with CountVectorizer
Fitting 3 folds for each of 108 candidates, totalling 324 fits
RandomForest with TfidfVectorizer
Fitting 3 folds for each of 108 candidates, totalling 324 fits
AdaBoost with CountVectorizer
Fitting 3 folds for each of 12 candidates, totalling 36 fits
AdaBoost with TfidfVectorizer
Fitting 3 folds for each of 12 candidates, totalling 36 fits


# best model configuration for each model to be used in voting classifier

In [10]:
print(f'LogisticRegression with CountVectorizer: {log_reg_gs_1.best_params_}')
print(f'LogisticRegression with TfidfVectorizer: {log_reg_gs_2.best_params_}')
print(f'SVC with CountVectorizer: {svc_gs_1.best_params_}')
print(f'SVC with TfidfVectorizer: {svc_gs_2.best_params_}')
print(f'RandomForest with CountVectorizer: {rf_gs_1.best_params_}')
print(f'RandomForest with TfidfVectorizer: {rf_gs_2.best_params_}')
print(f'AdaBoost with CountVectorizer: {ada_gs_1.best_params_}')
print(f'AdaBoost with TfidfVectorizer: {ada_gs_2.best_params_}')

LogisticRegression with CountVectorizer: {'clf__C': 1, 'clf__penalty': 'l2', 'clf__solver': 'liblinear'}
LogisticRegression with TfidfVectorizer: {'clf__C': 10, 'clf__penalty': 'l2', 'clf__solver': 'liblinear'}
SVC with CountVectorizer: {'clf__C': 10, 'clf__kernel': 'rbf'}
SVC with TfidfVectorizer: {'clf__C': 1, 'clf__kernel': 'linear'}
RandomForest with CountVectorizer: {'clf__max_depth': 20, 'clf__min_samples_leaf': 4, 'clf__min_samples_split': 2, 'clf__n_estimators': 300}
RandomForest with TfidfVectorizer: {'clf__max_depth': 20, 'clf__min_samples_leaf': 4, 'clf__min_samples_split': 2, 'clf__n_estimators': 100}
AdaBoost with CountVectorizer: {'clf__learning_rate': 0.2, 'clf__n_estimators': 300}
AdaBoost with TfidfVectorizer: {'clf__learning_rate': 0.5, 'clf__n_estimators': 100}



# voting classifier

In [11]:
voting_pipe_1 = VotingClassifier([
    ('log_reg_1', log_reg_gs_1.best_estimator_),
    ('svc_1', svc_gs_1.best_estimator_),
    ('rf_1', rf_gs_1.best_estimator_),
    ('ada_1', ada_gs_1.best_estimator_)
])

voting_pipe_2 = VotingClassifier([
    ('log_reg_2', log_reg_gs_2.best_estimator_),
    ('svc_2', svc_gs_2.best_estimator_),
    ('rf_2', rf_gs_2.best_estimator_),
    ('ada_2', ada_gs_2.best_estimator_)
])

In [12]:
voting_pipe_1.fit(Xtrain, ytrain)
print(f'voting_pipe_1 score: {voting_pipe_1.score(Xtest, ytest)}')

voting_pipe_1 score: 0.9547511312217195


In [13]:
voting_pipe_2.fit(Xtrain, ytrain)
print(f'voting_pipe_2 score: {voting_pipe_2.score(Xtest, ytest)}')


voting_pipe_2 score: 0.9547511312217195


# evaluate

In [14]:
def evaluate(model, Xtest, ytest):
    return {'model': model.estimator.get_params()['clf'].__class__.__name__,
            'accuracy': model.score(Xtest, ytest),
            'f1_score': m.f1_score(ytest, model.predict(Xtest), average='weighted'),
            'precision_score': m.precision_score(ytest, model.predict(Xtest), average='weighted'),
            'recall_score': m.recall_score(ytest, model.predict(Xtest), average='weighted'),
            'roc_auc_score': m.roc_auc_score(ytest, model.predict(Xtest), average='weighted'), }

In [15]:
results = []
results.append(evaluate(log_reg_gs_1, Xtest, ytest))
results.append(evaluate(log_reg_gs_2, Xtest, ytest))
results.append(evaluate(svc_gs_1, Xtest, ytest))
results.append(evaluate(svc_gs_2, Xtest, ytest))
results.append(evaluate(rf_gs_1, Xtest, ytest))
results.append(evaluate(rf_gs_2, Xtest, ytest))
results.append(evaluate(ada_gs_1, Xtest, ytest))
results.append(evaluate(ada_gs_2, Xtest, ytest))
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,model,accuracy,f1_score,precision_score,recall_score,roc_auc_score
0,LogisticRegression,0.95863,0.958578,0.96023,0.95863,0.958258
1,LogisticRegression,0.955398,0.955386,0.955606,0.955398,0.955256
2,SVC,0.961215,0.961163,0.962971,0.961215,0.960828
3,SVC,0.955398,0.955384,0.955659,0.955398,0.95524
4,RandomForestClassifier,0.885585,0.883972,0.905677,0.885585,0.884194
5,RandomForestClassifier,0.893342,0.892154,0.909171,0.893342,0.892111
6,AdaBoostClassifier,0.952812,0.952746,0.954603,0.952812,0.952416
7,AdaBoostClassifier,0.951519,0.951451,0.953304,0.951519,0.951123


In [16]:
results_df.sort_values(by='accuracy', ascending=False)

Unnamed: 0,model,accuracy,f1_score,precision_score,recall_score,roc_auc_score
2,SVC,0.961215,0.961163,0.962971,0.961215,0.960828
0,LogisticRegression,0.95863,0.958578,0.96023,0.95863,0.958258
1,LogisticRegression,0.955398,0.955386,0.955606,0.955398,0.955256
3,SVC,0.955398,0.955384,0.955659,0.955398,0.95524
6,AdaBoostClassifier,0.952812,0.952746,0.954603,0.952812,0.952416
7,AdaBoostClassifier,0.951519,0.951451,0.953304,0.951519,0.951123
5,RandomForestClassifier,0.893342,0.892154,0.909171,0.893342,0.892111
4,RandomForestClassifier,0.885585,0.883972,0.905677,0.885585,0.884194


In [17]:
# save models
path = 'models/'
joblib.dump(log_reg_gs_1, path + f'log_reg_cv_model.joblib')
joblib.dump(log_reg_gs_2, path + f'log_reg_tv_model.joblib')
joblib.dump(svc_gs_1, path + f'svc_cv_model.joblib')
joblib.dump(svc_gs_2, path + f'svc_tv_model.joblib')
joblib.dump(rf_gs_1, path + f'rf_cv_model.joblib')
joblib.dump(ada_gs_1, path + f'ada_cv_model.joblib')
joblib.dump(ada_gs_2, path + f'ada_tv_model.joblib')
print('models saved')

models saved


# grid search results and plotly visualization

In [18]:
def visualize_model_grid_search_results(model, model_name, param):
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=model.cv_results_[f'param_{param}'], y=model.cv_results_['mean_test_score'],
                             mode='lines+markers', name='mean_test_score'))
    fig.update_layout(title=f'{model_name} {param} grid search results',
                      xaxis_title=f'{param}',
                      yaxis_title='score')
    fig.show()

In [19]:
visualize_model_grid_search_results(log_reg_gs_1, 'LogisticRegression', 'clf__C')

In [20]:
visualize_model_grid_search_results(log_reg_gs_2, 'LogisticRegression', 'clf__C')

In [21]:
visualize_model_grid_search_results(svc_gs_1, 'SVC', 'clf__C')

In [22]:
visualize_model_grid_search_results(svc_gs_2, 'SVC', 'clf__C')

In [23]:
visualize_model_grid_search_results(rf_gs_1, 'RandomForest', 'clf__n_estimators')

In [24]:
visualize_model_grid_search_results(rf_gs_2, 'RandomForest', 'clf__n_estimators')

In [25]:
visualize_model_grid_search_results(ada_gs_1, 'AdaBoost', 'clf__n_estimators')

In [26]:
visualize_model_grid_search_results(ada_gs_2, 'AdaBoost', 'clf__n_estimators')

In [27]:
# save results
results_df.to_csv('results.csv')
print('results saved')

results saved


In [28]:
results_df.sort_values(by='accuracy', ascending=False)

Unnamed: 0,model,accuracy,f1_score,precision_score,recall_score,roc_auc_score
2,SVC,0.961215,0.961163,0.962971,0.961215,0.960828
0,LogisticRegression,0.95863,0.958578,0.96023,0.95863,0.958258
1,LogisticRegression,0.955398,0.955386,0.955606,0.955398,0.955256
3,SVC,0.955398,0.955384,0.955659,0.955398,0.95524
6,AdaBoostClassifier,0.952812,0.952746,0.954603,0.952812,0.952416
7,AdaBoostClassifier,0.951519,0.951451,0.953304,0.951519,0.951123
5,RandomForestClassifier,0.893342,0.892154,0.909171,0.893342,0.892111
4,RandomForestClassifier,0.885585,0.883972,0.905677,0.885585,0.884194
