<a href="https://colab.research.google.com/github/tariqshaban/suicide-detection/blob/master/Suicide%20Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Dependencies

In [None]:
# Environment bootstrap: every line below shells out (`!`), so this cell needs network access
# and a Debian-like host that provides `pip` and `apt` (e.g. Google Colab).
# NOTE(review): none of the packages are version-pinned, so results may drift between runs.

# Display generic output messages
!pip install colorama

# Huggingface related libraries
!pip install transformers datasets evaluate

# Download assets from the GitHub repository
# Only the `assets` folder is checked out; the dataset CSV inside it is read further down
# as Git LFS metadata and then replaced by the real file.
# NOTE(review): GitHub announced the retirement of its Subversion bridge (early 2024) -
# confirm this checkout still works, otherwise migrate to a git sparse checkout.
!apt install subversion
!svn checkout https://github.com/tariqshaban/suicide-detection/trunk/assets

import json
import evaluate
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import urllib
import tensorflow as tf

from colorama import Fore, Back, Style

from datasets import Dataset
from datasets import DatasetDict
from datasets import load_dataset

from IPython.display import Image, display, clear_output

from sklearn.metrics import confusion_matrix

from tensorflow.keras.callbacks import History, EarlyStopping

from transformers import AutoTokenizer
from transformers import create_optimizer
from transformers import DataCollatorWithPadding
from transformers import pipeline
from transformers import TFAutoModelForSequenceClassification
from transformers.keras_callbacks import KerasMetricCallback

from typing import Dict


# Acquire the dataset from its LFS source
#
# The CSV checked out above is expected to be a Git LFS pointer file (a few short
# "key value" lines holding the object id and size), not the data itself. The pointer
# is exchanged for a direct download link through the LFS batch endpoint, and the
# pointer file is then overwritten with the real CSV.

# `import urllib` alone does not guarantee that the `request` submodule is loaded.
import urllib.request

lfs_pointer_path = 'assets/dataset/Suicide_Detection.csv'

# Obtain LFS file metadata
# Only the first three lines are read: enough for a pointer file, and cheap if the
# cell is re-run after the pointer has already been replaced by the full dataset.
with open(lfs_pointer_path) as file:
    lines = [file.readline().rstrip() for _ in range(3)]

# Skip the download when the file is no longer a pointer (cell re-run), instead of
# parsing CSV rows as if they were LFS metadata.
if lines[0].startswith('version https://git-lfs'):
    sha = lines[1][lines[1].index(':') + 1:]
    size = lines[2][lines[2].index(' ') + 1:]

    # Acquire direct download link
    payload = json.dumps({
        'operation': 'download',
        'transfer': ['basic'],
        'objects': [{'oid': sha, 'size': int(size)}],
    })
    curl = (
        "curl -X POST "
        "-H 'Accept: application/vnd.git-lfs+json' "
        "-H 'Content-type: application/json' "
        f"-d '{payload}' "
        "https://github.com/tariqshaban/suicide-detection.git/info/lfs/objects/batch"
    )
    response = json.loads(os.popen(curl).read())
    file_url = response['objects'][0]['actions']['download']['href']

    # Replace LFS metadata with the actual file
    # (urlretrieve opens the target for writing, so the pointer is overwritten in place)
    urllib.request.urlretrieve(file_url, lfs_pointer_path)


clear_output()
print(Fore.GREEN + u'\u2713 ' + 'Successfully downloaded dependencies.')
print(Style.RESET_ALL)

# Defining Constants

In [None]:
# Filesystem locations (relative to the notebook's working directory)
DATASET_FILENAME = './assets/dataset/Suicide_Detection.csv'
PLOTS_OUTPUT = './assets/output/images'
MODEL_OUTPUT = './assets/output/model'

# Label vocabulary: the name -> id mapping is the single source, the rest is derived from it
LABEL2ID = {'non-suicide': 0, 'suicide': 1}
ID2LABEL = {label_id: label_name for label_name, label_id in LABEL2ID.items()}
LABELS = list(LABEL2ID)

# Fraction of the CSV rows that is kept before splitting (1 keeps every row)
SAMPLE_FRACTION = 1

# Split sizes, both expressed as fractions of the sampled rows
VALIDATION_SIZE = 0.01
TEST_SIZE = 0.005

# Training hyper-parameters
BATCH_SIZE = 2
EPOCHS = 300
EARLY_STOPPING_PATIENCE = 30
EARLY_STOPPING_MIN_DELTA = 0.001

# Helper Methods

### Get Tokenizer

In [None]:
def get_tokenizer() -> any:
    """Return the tokenizer that `AutoTokenizer` resolves for the `distilbert-base-uncased` checkpoint."""
    return AutoTokenizer.from_pretrained('distilbert-base-uncased')

### Get Pretrained Model

In [None]:
def get_pretrained_model() -> any:
    """
    Load `distilbert-base-uncased` as a TensorFlow sequence-classification model.

    The classification head is configured for two labels, using the notebook's
    ID2LABEL / LABEL2ID mappings.
    """
    head_config = dict(num_labels=2, id2label=ID2LABEL, label2id=LABEL2ID)
    return TFAutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', **head_config)

### Get Trained Model Pipeline

In [None]:
def get_trained_pipeline() -> any:
    """
    Build a `sentiment-analysis` pipeline from the artifacts stored in MODEL_OUTPUT.

    Both the model and the tokenizer are read from that folder, so a model must have
    been exported there beforehand.
    """
    return pipeline(task='sentiment-analysis', model=MODEL_OUTPUT, tokenizer=MODEL_OUTPUT)

### Prime Dataset

In [None]:
def prime_dataset(
        tokenizer: any,
        seed: int = None,
) -> [DatasetDict, pd.DataFrame]:
    """
    Read the dataset CSV, split it and tokenize the training/validation part.

    The CSV at DATASET_FILENAME must provide the columns `text`, `class` and `Unnamed: 0`.
    SAMPLE_FRACTION of the rows is kept, TEST_SIZE of those rows is held out untouched, and
    the remainder is split so that the validation part is VALIDATION_SIZE of the sampled rows.

    :param tokenizer: Callable tokenizer; invoked as `tokenizer(list_of_texts, truncation=True)`.
    :param seed: Optional seed applied to every random sampling/splitting step. The default
        (None) keeps the splits random on each call.
    :return: A two-item list: the tokenized `DatasetDict` (splits `train` and `test`, the latter
        serving as validation data; labels encoded through LABEL2ID) and the held-out test rows
        as a raw `DataFrame` that still carries the original `text` and `class` columns.
    """
    def preprocess_function(examples) -> any:
        encoded = tokenizer(examples['text'], truncation=True)
        # Return the encoded labels explicitly rather than mutating the incoming batch and
        # relying on `map` to carry that side effect over into the result.
        encoded['label'] = [LABEL2ID.get(e, e) for e in examples['label']]
        return encoded

    df = pd.read_csv(DATASET_FILENAME)
    df = df.sample(frac=SAMPLE_FRACTION, random_state=seed)

    # Rows that are not drawn here form the held-out test set
    df_train_valid = df.sample(frac=1 - TEST_SIZE, random_state=seed)
    df_test = df.drop(df_train_valid.index)

    suicide_detection = Dataset.from_pandas(df_train_valid)

    suicide_detection = suicide_detection.rename_column('class', 'label')
    suicide_detection = suicide_detection.remove_columns('Unnamed: 0')
    # The test rows are already removed, hence the rescaling of the validation fraction
    suicide_detection = suicide_detection.train_test_split(
        test_size=VALIDATION_SIZE / (1 - TEST_SIZE),
        seed=seed,
    )

    tokenized_suicide_detection = suicide_detection.map(preprocess_function, batched=True)

    return [tokenized_suicide_detection, df_test]

### Visualize Model Results

In [None]:
def visualize_model(
        test_dataset: pd.DataFrame,
        fitted_model: History,
):
    """
    Plot the training history and the confusion matrix of the exported model.

    Both figures are shown and also saved into PLOTS_OUTPUT (`loss_history.png` and
    `confusion_matrix.png`). Predictions come from the pipeline loaded from MODEL_OUTPUT,
    so the model must have been exported before calling this function.

    :param test_dataset: Held-out rows; the `text` column is classified and compared with
        the label names stored in the `class` column.
    :param fitted_model: Keras history whose `history` dict contains `loss` and `val_loss`.
    """
    trained_pipeline = get_trained_pipeline()

    # Do not depend on another cell having created the output folder
    os.makedirs(PLOTS_OUTPUT, exist_ok=True)

    loss_fig, loss_ax = plt.subplots()
    loss_ax.plot(fitted_model.history['loss'])
    loss_ax.plot(fitted_model.history['val_loss'])
    loss_ax.set_title('Model loss')
    loss_ax.set_ylabel('Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.legend(['Train', 'Valid'], loc='upper left')
    loss_fig.savefig(f'{PLOTS_OUTPUT}/loss_history.png')
    plt.show()

    # No explicit max_length: with truncation=True the tokenizer cuts inputs at its own
    # model limit, matching how the training data was tokenized. A larger max_length would
    # let through sequences longer than the model's position embeddings can address.
    predictions = trained_pipeline(test_dataset['text'].to_list(), truncation=True)
    y_predict = [prediction['label'] for prediction in predictions]
    y_true = test_dataset['class'].to_list()

    matrix_fig, matrix_ax = plt.subplots()
    sns.heatmap(
        confusion_matrix(y_true, y_predict, labels=LABELS),
        annot=True,
        cmap='Blues',
        fmt='g',
        ax=matrix_ax,
    )
    matrix_ax.set_title('Confusion Matrix')
    matrix_ax.set_xlabel('Predicted Values')
    matrix_ax.set_ylabel('Actual Values')
    matrix_ax.xaxis.set_ticklabels(LABELS)
    matrix_ax.yaxis.set_ticklabels(LABELS)
    matrix_ax.tick_params(axis='x', labelrotation=90)
    matrix_ax.tick_params(axis='y', labelrotation=0)
    matrix_fig.savefig(f'{PLOTS_OUTPUT}/confusion_matrix.png')
    plt.show()

### Train Model

In [None]:
def train_model(
        tokenizer: any,
        tokenized_dataset: DatasetDict,
        test_dataset: pd.DataFrame,
        visualize: bool = True,
        export_model: bool = True,
) -> any:
    """
    Fine-tune the pretrained classifier and optionally export and visualize it.

    Training runs for at most EPOCHS epochs with batches of BATCH_SIZE and stops early when
    `val_loss` has not improved by EARLY_STOPPING_MIN_DELTA for EARLY_STOPPING_PATIENCE epochs.
    Validation accuracy is computed after each epoch through `KerasMetricCallback`.

    :param tokenizer: Tokenizer used for padding the batches; saved next to the model on export.
    :param tokenized_dataset: Tokenized splits; `train` is fitted on, `test` is the validation set.
    :param test_dataset: Held-out raw rows, only forwarded to `visualize_model`.
    :param visualize: When True, plot the loss history and confusion matrix after training.
    :param export_model: When True, save tokenizer and model into MODEL_OUTPUT.
    :return: The fine-tuned model.
    """
    accuracy = evaluate.load('accuracy')

    def compute_metrics(eval_pred) -> dict:
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return accuracy.compute(predictions=predictions, references=labels)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='tf')

    # The schedule is expressed in optimizer steps (one per batch), so the sample count
    # has to be divided by the batch size; otherwise the decay is stretched BATCH_SIZE times.
    batches_per_epoch = len(tokenized_dataset['train']) // BATCH_SIZE
    total_train_steps = int(batches_per_epoch * EPOCHS)
    optimizer, _ = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

    model = get_pretrained_model()

    train_set = model.prepare_tf_dataset(
        tokenized_dataset['train'],
        shuffle=True,
        batch_size=BATCH_SIZE,
        collate_fn=data_collator,
    )

    validation_set = model.prepare_tf_dataset(
        tokenized_dataset['test'],
        shuffle=False,
        batch_size=BATCH_SIZE,
        collate_fn=data_collator,
    )

    # No loss is passed to compile(), so training uses whatever loss the model supplies itself
    model.compile(optimizer=optimizer)

    # NOTE(review): `restore_best_weights` is left at its default, so the weights kept (and
    # exported) are those of the last epoch run, not of the best one - confirm this is intended.
    es = EarlyStopping(
        monitor='val_loss',
        mode='auto',
        verbose=1,
        patience=EARLY_STOPPING_PATIENCE,
        min_delta=EARLY_STOPPING_MIN_DELTA,
    )

    metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=validation_set)

    callbacks = [es, metric_callback]

    fitted_model = model.fit(x=train_set, validation_data=validation_set, epochs=EPOCHS, callbacks=callbacks)

    loss = model.evaluate(validation_set)

    print(Fore.GREEN + u'\n\u2713 ' + f'Loss ==> {loss}')
    print(Fore.RESET)

    model.summary()

    if export_model:
        tokenizer.save_pretrained(MODEL_OUTPUT)
        model.save_pretrained(MODEL_OUTPUT)

    if visualize:
        visualize_model(test_dataset=test_dataset, fitted_model=fitted_model)

    # Callers assign the result of this function, so hand the trained model back
    return model

# Methods Invocation

In [None]:
# Create the output folders (including missing parents); existing folders are left untouched.
# Done in Python rather than through the shell so that it does not depend on a POSIX `mkdir`
# or on how the shell splits the interpolated path.
os.makedirs(PLOTS_OUTPUT, exist_ok=True)
os.makedirs(MODEL_OUTPUT, exist_ok=True)

In [None]:
# Load the tokenizer that is shared by preprocessing and training
tokenizer = get_tokenizer()

# Tokenized train/validation splits, plus the held-out test rows as a raw DataFrame
tokenized_dataset, test_dataset = prime_dataset(tokenizer=tokenizer)

# Fine-tune the classifier; `visualize` and `export_model` are left at their defaults (True),
# so the model is saved to MODEL_OUTPUT and the plots are written to PLOTS_OUTPUT
model = train_model(tokenizer=tokenizer, tokenized_dataset=tokenized_dataset, test_dataset=test_dataset)

In [None]:
# Reload the exported model and tokenizer from MODEL_OUTPUT as an inference pipeline
trained_pipeline = get_trained_pipeline()

In [None]:
# Sanity check on a text that is unrelated to the topic (a film review)
text = 'This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.'
# truncation=True alone cuts the input at the tokenizer's own model limit; an explicit
# max_length above that limit would let over-long inputs reach the model
trained_pipeline(text, truncation=True)

In [None]:
# Sanity check on a short first-person statement about depression
text = 'I am severly depressed'
# truncation=True alone cuts the input at the tokenizer's own model limit; an explicit
# max_length above that limit would let over-long inputs reach the model
trained_pipeline(text, truncation=True)