<img src="../assets/a_type_readme.gif" style="float:right ; margin: 10px ; width:300px;"> 
<h1><left>NLP Project</left></h1>
<h4><left>Using Natural Language Processing to better understand Depression & Anxiety</left></h4>
___

## 3. Analysis

In [6]:
import numpy as np
from numpy import core, array
assert np.__version__ == "1.19.5"

import pandas as pd

import seaborn as sns
sns.set_style("darkgrid")

import torch

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from pickle import dump
from nltk import word_tokenize

import matplotlib.pyplot as plt
%matplotlib inline

from time import time 

import logging 

import multiprocessing
 
!pip install pandas transformers
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments

from torch.onnx import export

Collecting transformers
  Downloading transformers-4.7.0-py3-none-any.whl (2.5 MB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp38-cp38-win_amd64.whl (2.0 MB)
Collecting filelock
  Downloading filelock-3.0.12-py3-none-any.whl (7.6 kB)
Collecting huggingface-hub==0.0.8
  Downloading huggingface_hub-0.0.8-py3-none-any.whl (34 kB)
Installing collected packages: filelock, tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed filelock-3.0.12 huggingface-hub-0.0.8 sacremoses-0.0.45 tokenizers-0.10.3 transformers-4.7.0


In [8]:
# Send root-logger records to ../finetune.log; filemode 'w' truncates the file
# each time this cell configures logging.
logging.basicConfig(
    filename="../finetune.log",
    filemode='w',
    format='%(asctime)s %(message)s',
)
logger = logging.getLogger()

def print_time(input_str, start_time=0):
    """Print the minutes elapsed since ``start_time``, prefixed by a label.

    Args:
        input_str: Label printed in front of the elapsed time.
        start_time: Reference timestamp in seconds, as returned by ``time()``.
            With the default of 0 the printed value is the number of minutes
            since the epoch rather than a meaningful duration.
    """
    # The parameter used to be spelled ``intput_str`` while the body read
    # ``input_str``, so every call raised NameError.
    elapsed_min = round((time() - start_time) / 60, 2)
    print("{}: {} min".format(input_str, elapsed_min))
    
# #Setting the threshold of logger to DEBUG
# logger.setLevel(logging.DEBUG)
  
# #Test messages
# logger.debug("Harmless debug Message")
# logger.info("Just an information")
# logger.warning("Its a Warning")
# logger.error("Did you try to divide by zero")
# logger.critical("Internet is down")

In [9]:
# Load the prepared dataset. keep_default_na=False stops pandas from turning
# empty strings (and other default NA markers) into NaN in the text columns.
model_data = pd.read_csv('../data/data_for_model.csv', keep_default_na=False)
# DataFrame.info() prints its summary itself and returns None; wrapping it in
# print() only appended a stray "None" line to the cell output.
model_data.info()
model_data.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1930 entries, 0 to 1929
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   title                      1930 non-null   object
 1   selftext                   1930 non-null   object
 2   author                     1930 non-null   object
 3   score                      1930 non-null   int64 
 4   num_comments               1930 non-null   int64 
 5   is_anxiety                 1930 non-null   int64 
 6   url                        1930 non-null   object
 7   selftext_clean             1930 non-null   object
 8   selftext_broken_sentences  1930 non-null   object
 9   selftext_broken_words      1930 non-null   object
 10  title_clean                1930 non-null   object
 11  author_clean               1930 non-null   object
 12  megatext_clean             1930 non-null   object
dtypes: int64(3), object(10)
memory usage: 196.1+ KB
None


Unnamed: 0,title,selftext,author,score,num_comments,is_anxiety,url,selftext_clean,selftext_broken_sentences,selftext_broken_words,title_clean,author_clean,megatext_clean
0,Our most-broken and least-understood rules is ...,We understand that most people who reply immed...,SQLwitch,2319,175,0,https://www.reddit.com/r/depression/comments/d...,understand people reply immediately op invitat...,['we understand that most people who reply imm...,"['understand', 'people', 'reply', 'immediately...",broken least understood rule helper may invite...,sql witch,sql witch understand people reply immediately ...
1,"Regular Check-In Post, with important reminder...",Welcome to /r/depression's check-in post - a p...,SQLwitch,312,1136,0,https://www.reddit.com/r/depression/comments/m...,welcome r depression check post place take mom...,"[""welcome to /r/depression's check-in post - a...","['welcome', 'r', 'depression', 'check', 'post'...",regular check post important reminder private ...,sql witch,sql witch welcome r depression check post plac...
2,Low,I'm so low rn I can't even type anything coher...,RagingFlock89,263,43,0,https://www.reddit.com/r/depression/comments/n...,low rn even type anything coherent want expres...,"[""i'm so low rn i can't even type anything coh...","['low', 'rn', 'even', 'type', 'anything', 'coh...",low,raging flock 89,raging flock 89 low rn even type anything cohe...


In [10]:
# Name of the model_data column holding the text to model ("selftext_clean" is
# one of the cleaned-text columns listed by model_data.info()).
data_column = "selftext_clean"
# model_data[data_column][0]
# model_data["megatext_clean"]
# sents = [eval(sent) for sent in model_data["selftext_broken_words"]]
# dp = model_data["selftext_broken_words"].tolist()[1]
# literal_eval(dp)
# model_data["megatext_clean"].to_csv(data_path, header=None, index=None, sep='\t', mode='a')

## Fine Tuning

In [11]:
# Show the loaded frame as the cell output (the notebook renders a truncated
# preview of the first and last rows).
model_data

Unnamed: 0,title,selftext,author,score,num_comments,is_anxiety,url,selftext_clean,selftext_broken_sentences,selftext_broken_words,title_clean,author_clean,megatext_clean
0,Our most-broken and least-understood rules is ...,We understand that most people who reply immed...,SQLwitch,2319,175,0,https://www.reddit.com/r/depression/comments/d...,understand people reply immediately op invitat...,['we understand that most people who reply imm...,"['understand', 'people', 'reply', 'immediately...",broken least understood rule helper may invite...,sql witch,sql witch understand people reply immediately ...
1,"Regular Check-In Post, with important reminder...",Welcome to /r/depression's check-in post - a p...,SQLwitch,312,1136,0,https://www.reddit.com/r/depression/comments/m...,welcome r depression check post place take mom...,"[""welcome to /r/depression's check-in post - a...","['welcome', 'r', 'depression', 'check', 'post'...",regular check post important reminder private ...,sql witch,sql witch welcome r depression check post plac...
2,Low,I'm so low rn I can't even type anything coher...,RagingFlock89,263,43,0,https://www.reddit.com/r/depression/comments/n...,low rn even type anything coherent want expres...,"[""i'm so low rn i can't even type anything coh...","['low', 'rn', 'even', 'type', 'anything', 'coh...",low,raging flock 89,raging flock 89 low rn even type anything cohe...
3,I’m always amazed at how much energy healthy p...,When I wake up after 8 hours of decent sleep I...,cezzzie,1281,120,0,https://www.reddit.com/r/depression/comments/n...,wake 8 hour decent sleep still exhausted day e...,['when i wake up after 8 hours of decent sleep...,"['wake', '8', 'hour', 'decent', 'sleep', 'stil...",always amazed much energy healthy people,ce zzz ie,ce zzz ie wake 8 hour decent sleep still exhau...
4,30 and never lived a day in my life,I guess i have always been depressed but never...,ApprehensiveYou2385,36,5,0,https://www.reddit.com/r/depression/comments/n...,guess always depressed never really thought mu...,['i guess i have always been depressed but nev...,"['guess', 'always', 'depressed', 'never', 'rea...",30 never lived day life,apprehensive 2385,apprehensive 2385 guess always depressed never...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1925,Any tips on how not to panic during a midterm?,I have an Applied Statics midterm tomorrow. I ...,Anomalistic_Username,2,0,1,https://www.reddit.com/r/Anxiety/comments/n3p5...,applied static midterm tomorrow already failed...,['i have an applied statics midterm tomorrow.'...,"['applied', 'static', 'midterm', 'tomorrow', '...",tip panic midterm,mali stic username,mali stic username applied static midterm tomo...
1926,"I find myself apologizing really often, checki...",I've recently decided to stop smoking weed (so...,zedhenson,3,1,1,https://www.reddit.com/r/Anxiety/comments/n3n1...,recently decided stop smoking weed socially th...,"[""i've recently decided to stop smoking weed (...","['recently', 'decided', 'stop', 'smoking', 'we...",find apologizing really often checking behavior,zed henson,zed henson recently decided stop smoking weed ...
1927,I typed out my anxiety attack and thought I sh...,I recently got into a little habit where when ...,Tree-Nui-Tee,2,4,1,https://www.reddit.com/r/Anxiety/comments/n3oz...,recently got little habit thing get overwhelmi...,['i recently got into a little habit where whe...,"['recently', 'got', 'little', 'habit', 'thing'...",typed anxiety attack thought share,tree nui tee,tree nui tee recently got little habit thing g...
1928,something happened that just triggered my anxi...,i need someone to vent to please,Capzfan5,8,5,1,https://www.reddit.com/r/Anxiety/comments/n3gc...,need someone vent please,['i need someone to vent to please'],"['need', 'someone', 'vent', 'please']",something happened triggered anxiety really bad,cap z fan 5,cap z fan 5 need someone vent please something...


In [None]:
# This cell previously read from `training_data` with "query"/"title"/"label"
# columns; no such frame is defined in this notebook and `model_data` has no
# "query" or "label" column, so the cell raised NameError on a fresh kernel.
# NOTE(review): the mapping below is an assumption - confirm it matches intent:
#   first text segment  <- model_data["title_clean"]
#   second text segment <- model_data[data_column]
#   label               <- model_data["is_anxiety"]
# random_state pins the shuffle so the split is reproducible across runs.
train_queries, val_queries, train_docs, val_docs, train_labels, val_labels = train_test_split(
    model_data["title_clean"].tolist(),
    model_data[data_column].tolist(),
    model_data["is_anxiety"].tolist(),
    test_size=.2,
    random_state=42,
)

In [None]:
# Checkpoint used for both the tokenizer and (in a later cell) the classifier.
model_name = "google/bert_uncased_L-4_H-512_A-8"
tokenizer = BertTokenizerFast.from_pretrained(model_name)

# Both splits are encoded as text pairs with identical settings: truncate and
# pad every pair to exactly 128 tokens.
encode_options = dict(truncation=True, padding='max_length', max_length=128)
train_encodings = tokenizer(train_queries, train_docs, **encode_options)
val_encodings = tokenizer(val_queries, val_docs, **encode_options)

In [None]:
class Cord19Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to an indexable collection of
    per-example values; ``labels`` is an indexable collection with one entry
    per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = torch.tensor(field_values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels so they can be indexed per example.
train_dataset = Cord19Dataset(train_encodings, train_labels)
val_dataset = Cord19Dataset(val_encodings, val_labels)

In [None]:
# Load the sequence-classification model from the same checkpoint name that
# was used for the tokenizer.
# NOTE(review): the number of labels is left at the library default - confirm
# it matches the binary is_anxiety target.
model = BertForSequenceClassification.from_pretrained(model_name)

In [None]:
# Turn off gradient tracking for every parameter of the wrapped base model, so
# training does not update those weights.
for base_param in model.base_model.parameters():
    base_param.requires_grad = False

In [None]:
# warmup_steps=500 was longer than the whole run: with 1930 rows, a 20%
# validation split, batch size 16 and 3 epochs there are only about
# 1930 * 0.8 / 16 * 3 ~= 291 optimizer steps (single device assumed), so the
# learning rate never finished warming up. A ratio scales with the run length.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy="epoch",     # evaluation is done at the end of each epoch
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_ratio=0.1,                # fraction of total steps used for learning-rate warmup
    weight_decay=0.01,               # strength of weight decay
    save_total_limit=1,              # keep only the newest checkpoint; older ones are deleted
)

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

In [None]:
# Build the dummy inputs on whatever device the model currently lives on,
# instead of assuming CUDA is available.
device = next(model.parameters()).device

model_onnx_path = "model.onnx"

# The dummy inputs are handed to the model positionally, and BERT's forward()
# takes (input_ids, attention_mask, token_type_ids) in that order. The previous
# (input_ids, token_type_ids, attention_mask) ordering fed the token type ids in
# as the attention mask and vice versa, and labelled the exported graph inputs
# accordingly. Deriving the tuple from input_names keeps the two in step.
input_names = ["input_ids", "attention_mask", "token_type_ids"]
first_example = train_dataset[0]
dummy_input = tuple(
    first_example[name].unsqueeze(0).to(device)  # add a batch dimension of 1
    for name in input_names
)
output_names = ["logits"]
export(
    model, dummy_input, model_onnx_path, input_names = input_names, 
    output_names = output_names, verbose=False, opset_version=11
)