<img src="../assets/a_type_readme.gif" style="float:right ; margin: 10px ; width:300px;"> 
<h1><left>NLP Project</left></h1>
<h4><left>Using Natural Language Processing to better understand Depression & Anxiety</left></h4>
___

## 3. Analysis

In [1]:
import numpy as np
from numpy import core, array
# assert np.__version__ == "1.19.5"

import pandas as pd

from random import randint

# import seaborn as sns
# sns.set_style("darkgrid")

import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from pickle import dump

import matplotlib.pyplot as plt
%matplotlib inline

from time import time 

import logging 

import multiprocessing

from datasets import load_metric
 
# !pip install pandas transformers
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import random
from torch.onnx import export

In [2]:
logging.basicConfig(filename="../logs/6_finetune_classification.log",
                    format='%(asctime)s %(message)s',
                    filemode='w')
logger = logging.getLogger()

def print_time(intput_str, start_time=0):
    print("{}: {} min".format(input_str, round((time() - start_time) / 60, 2)))
    
# #Setting the threshold of logger to DEBUG
# logger.setLevel(logging.DEBUG)
  
# #Test messages
# logger.debug("Harmless debug Message")
# logger.info("Just an information")
# logger.warning("Its a Warning")
# logger.error("Did you try to divide by zero")
# logger.critical("Internet is down")

In [3]:
model_data = pd.read_csv('../data/data_for_model.csv', keep_default_na=False)
print(model_data.info())
model_data.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1930 entries, 0 to 1929
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   title                      1930 non-null   object
 1   selftext                   1930 non-null   object
 2   author                     1930 non-null   object
 3   score                      1930 non-null   int64 
 4   num_comments               1930 non-null   int64 
 5   is_anxiety                 1930 non-null   int64 
 6   url                        1930 non-null   object
 7   selftext_clean             1930 non-null   object
 8   selftext_broken_sentences  1930 non-null   object
 9   selftext_broken_words      1930 non-null   object
 10  title_clean                1930 non-null   object
 11  author_clean               1930 non-null   object
 12  megatext_clean             1930 non-null   object
dtypes: int64(3), object(10)
memory usage: 196.1+ KB
None


Unnamed: 0,title,selftext,author,score,num_comments,is_anxiety,url,selftext_clean,selftext_broken_sentences,selftext_broken_words,title_clean,author_clean,megatext_clean
0,Our most-broken and least-understood rules is ...,We understand that most people who reply immed...,SQLwitch,2319,175,0,https://www.reddit.com/r/depression/comments/d...,understand people reply immediately op invitat...,['we understand that most people who reply imm...,"['understand', 'people', 'reply', 'immediately...",broken least understood rule helper may invite...,sql witch,sql witch understand people reply immediately ...
1,"Regular Check-In Post, with important reminder...",Welcome to /r/depression's check-in post - a p...,SQLwitch,312,1136,0,https://www.reddit.com/r/depression/comments/m...,welcome r depression check post place take mom...,"[""welcome to /r/depression's check-in post - a...","['welcome', 'r', 'depression', 'check', 'post'...",regular check post important reminder private ...,sql witch,sql witch welcome r depression check post plac...
2,Low,I'm so low rn I can't even type anything coher...,RagingFlock89,263,43,0,https://www.reddit.com/r/depression/comments/n...,low rn even type anything coherent want expres...,"[""i'm so low rn i can't even type anything coh...","['low', 'rn', 'even', 'type', 'anything', 'coh...",low,raging flock 89,raging flock 89 low rn even type anything cohe...


In [4]:
data_column = "selftext_clean"
labels = {1: "anxiety", 0: "depression"}
labels_str2idx = {'depression': 0, 'anxiety': 1}
# model_data["megatext_clean"].to_csv(data_path, header=None, index=None, sep='\t', mode='a')

## Fine Tuning

### Classification

In [5]:
# max_length = 128
# max sequence length for each document/sentence sample
max_length = 512
EPOCHS = 10
TRAIN_BATCH_SIZE = 8
EVAL_BATCH_SIZE = 8

#### Prepare data and model

In [6]:
train_text, val_text, train_labels, val_labels = train_test_split(
    model_data[data_column].tolist(), 
    model_data["is_anxiety"].tolist(), 
    test_size=.2
)

In [7]:
model_name = "google/bert_uncased_L-4_H-512_A-8"
# load the tokenizer
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

# tokenize the dataset, truncate when passed `max_length`, 
# and pad with 0's when less than `max_length`
train_encodings = tokenizer(train_text, truncation=True, padding=True, max_length=max_length)
val_encodings = tokenizer(val_text, truncation=True, padding=True, max_length=max_length)

In [8]:
class RedditDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# convert our tokenized data into a torch Dataset
train_dataset = RedditDataset(train_encodings, train_labels)
val_dataset = RedditDataset(val_encodings, val_labels)

In [9]:
# load the model and pass to CUDA
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(labels)).to("cuda")

Some weights of the model checkpoint at google/bert_uncased_L-4_H-512_A-8 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification w

#### Train

In [10]:
training_args = TrainingArguments(
    output_dir='../models/bert_classification_lm', # output directory
    evaluation_strategy="epoch",                   # Evaluation is done at the end of each epoch.
    num_train_epochs=EPOCHS,                       # total number of training epochs
    per_device_train_batch_size=TRAIN_BATCH_SIZE,  # batch size per device during training
    per_device_eval_batch_size=EVAL_BATCH_SIZE,    # batch size for evaluation
    warmup_steps=500,                              # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                             # strength of weight decay
    logging_dir ='../logs/bert_classification_lm', # directory for storing logs
    logging_steps=200,                             # log & save weights each logging_steps
    load_best_model_at_end=True,                   # load the best model when finished training (default metric is loss) 
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    save_total_limit=1,                            # limit the total amount of checkpoints. Deletes the older checkpoints.    
)

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,No log,0.446038,1.1875,325.04
2,0.606900,0.370412,1.1886,324.739
3,0.374500,0.347635,1.196,322.729
4,0.294500,0.534144,1.1991,321.916
5,0.227300,0.571521,1.2018,321.177
6,0.121500,0.631738,1.2054,320.236
7,0.088700,0.716783,1.2076,319.642
8,0.062300,0.753405,1.2109,318.777
9,0.048100,0.778459,1.2107,318.82
10,0.041200,0.779626,1.2113,318.662


TrainOutput(global_step=1930, training_loss=0.1956141222326249, metrics={'train_runtime': 191.3326, 'train_samples_per_second': 10.087, 'total_flos': 1364356812472320, 'epoch': 10.0})

#### Evaluate

In [12]:
metric = load_metric("accuracy")

def compute_metrics1(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

def compute_metrics2(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # calculate accuracy using sklearn's function
    acc = accuracy_score(labels, preds)
    return {
      'accuracy': acc,
    }

In [13]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics1
)
trainer.evaluate()

{'eval_loss': 0.779626190662384,
 'eval_accuracy': 0.8730569948186528,
 'eval_runtime': 1.2255,
 'eval_samples_per_second': 314.975}

#### Save & Load

In [14]:
model_path = "../models/bert_classification_lm"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('../models/bert_classification_lm/tokenizer_config.json',
 '../models/bert_classification_lm/special_tokens_map.json',
 '../models/bert_classification_lm/vocab.txt',
 '../models/bert_classification_lm/added_tokens.json')

In [15]:
# # reload our model/tokenizer. Optional, only usable when in Python files instead of notebooks
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=len(labels)).to("cuda")
tokenizer = BertTokenizerFast.from_pretrained(model_path)

#### Examples

In [16]:
def get_prediction(text):
    # prepare our text into tokenized sequence
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to("cuda")
    # perform inference to our model
    outputs = model(**inputs)
    # get output probabilities by doing softmax
    probs = outputs[0].softmax(1)
    # executing argmax function to get the candidate label
#     return labels[probs.argmax().item()]
    return probs.argmax().item()

In [17]:
exp_count = 2
results = []

for label_str, label_int in labels_str2idx.items():
    data = model_data[model_data["is_anxiety"] == label_int][data_column]
    
    for i in range(exp_count):
        seed_text = data[randint(data.index[0], data.index[-1]+1)]

        generated = get_prediction(seed_text)
        
        model_results = {}
        model_results["seed_text"] = seed_text
        model_results["predicted"] = generated
        model_results["actual"] = label_int

        results.append(model_results) 

pd.DataFrame(results)

Unnamed: 0,seed_text,predicted,actual
0,tired tired fighting live care anything tired ...,0,0
1,know one perfect life bad like literally every...,0,0
2,17 ive always attracted woman think pocd destr...,1,1
3,love sport play never coached never played spo...,1,1
