<img src="../assets/a_type_readme.gif" style="float:right ; margin: 10px ; width:300px;"> 
<h1>NLP Project</h1>
<h4>Using Natural Language Processing to better understand Depression &amp; Anxiety</h4>
___

## 3. Analysis

In [1]:
import numpy as np
from numpy import core, array
assert np.__version__ == "1.19.5"

import pandas as pd

import seaborn as sns
sns.set_style("darkgrid")

import sentencepiece as spm

import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models import Word2Vec, KeyedVectors
# from gensim.models import Word2Vec
assert gensim.__version__ == "4.0.1"

from sklearn.model_selection import KFold, train_test_split

import dataframe_image as dfi

from pickle import dump, load

from nltk import word_tokenize

import matplotlib.pyplot as plt
%matplotlib inline

from time import time 
from random import randint
import logging 
import multiprocessing
 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding



In [2]:
# Send INFO-level (and higher) records to this notebook's log file, each line
# prefixed with a timestamp.  The file is opened in 'w' mode, so an existing
# log is overwritten when this call configures the root logger.
logging.basicConfig(
    level=logging.INFO,
    filename="../logs/5_language-model.log",
    filemode='w',
    format='%(asctime)s > %(message)s',
)

def add_time(intput_str, start_time=0):
    """Return ``intput_str`` followed by the minutes elapsed since ``start_time``.

    Parameters
    ----------
    intput_str : str
        Label placed in front of the elapsed time.  (The parameter keeps its
        original spelling so the signature is unchanged.)
    start_time : float, default 0
        Reference timestamp in seconds, as returned by ``time()``.

    Returns
    -------
    str
        ``"<label>: <minutes> min"`` with the minutes rounded to 2 decimals.
    """
    # The body previously referenced an undefined name (`input_str`) instead of
    # the parameter, so every call raised NameError.
    elapsed_minutes = round((time() - start_time) / 60, 2)
    return "{}: {} min".format(intput_str, elapsed_minutes)

In [3]:
# Load the pre-processed posts.  keep_default_na=False stops pandas from
# turning empty / "NA"-like strings into NaN, so text columns stay strings.
model_data = pd.read_csv('../data/data_for_model.csv', keep_default_na=False)
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only appended a stray "None" to the cell output.
model_data.info()
# Column whose text is used to train and seed the language models below.
data_column = "selftext_clean"
model_data.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1930 entries, 0 to 1929
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   title                      1930 non-null   object
 1   selftext                   1930 non-null   object
 2   author                     1930 non-null   object
 3   score                      1930 non-null   int64 
 4   num_comments               1930 non-null   int64 
 5   is_anxiety                 1930 non-null   int64 
 6   url                        1930 non-null   object
 7   selftext_clean             1930 non-null   object
 8   selftext_broken_sentences  1930 non-null   object
 9   selftext_broken_words      1930 non-null   object
 10  title_clean                1930 non-null   object
 11  author_clean               1930 non-null   object
 12  megatext_clean             1930 non-null   object
dtypes: int64(3), object(10)
memory usage: 196.1+ KB
None


Unnamed: 0,title,selftext,author,score,num_comments,is_anxiety,url,selftext_clean,selftext_broken_sentences,selftext_broken_words,title_clean,author_clean,megatext_clean
0,Our most-broken and least-understood rules is ...,We understand that most people who reply immed...,SQLwitch,2319,175,0,https://www.reddit.com/r/depression/comments/d...,understand people reply immediately op invitat...,['we understand that most people who reply imm...,"['understand', 'people', 'reply', 'immediately...",broken least understood rule helper may invite...,sql witch,sql witch understand people reply immediately ...
1,"Regular Check-In Post, with important reminder...",Welcome to /r/depression's check-in post - a p...,SQLwitch,312,1136,0,https://www.reddit.com/r/depression/comments/m...,welcome r depression check post place take mom...,"[""welcome to /r/depression's check-in post - a...","['welcome', 'r', 'depression', 'check', 'post'...",regular check post important reminder private ...,sql witch,sql witch welcome r depression check post plac...
2,Low,I'm so low rn I can't even type anything coher...,RagingFlock89,263,43,0,https://www.reddit.com/r/depression/comments/n...,low rn even type anything coherent want expres...,"[""i'm so low rn i can't even type anything coh...","['low', 'rn', 'even', 'type', 'anything', 'coh...",low,raging flock 89,raging flock 89 low rn even type anything cohe...


## Language Model

### Data

In [4]:
def organize_token_seqs(data, length=20+1):
    """Slice every record into all contiguous windows of ``length`` tokens.

    Parameters
    ----------
    data : iterable of str
        Texts to process; each one is tokenized with ``word_tokenize``.
    length : int, default 21
        Number of tokens per window.

    Returns
    -------
    list of str
        One space-joined string per window, in record order.  Records with
        fewer than ``length`` tokens contribute nothing.
    """
    logging.info("In organize_token_seqs-> sequence length={}".format(length))

    sequences = list()

    for record in data:
        tokens = word_tokenize(record)

        # `end` is the exclusive end of the window, so it must be allowed to
        # reach len(tokens); stopping one short (as before) dropped the final
        # window of every record and skipped records of exactly `length` tokens.
        for end in range(length, len(tokens) + 1):
            window = tokens[end - length:end]
            assert len(window) == length, (length, len(window), window)
            sequences.append(' '.join(window))

    return sequences

In [5]:
def save_doc(lines, filename):
    """Write ``lines`` to ``filename``, one entry per line.

    Parameters
    ----------
    lines : iterable of str
        Entries to write; they are joined with ``'\\n'`` (no trailing newline).
    filename : str
        Destination path; an existing file is overwritten.
    """
    logging.info("In save_doc-> filename={}".format(filename))
    data = '\n'.join(lines)
    # The context manager closes the handle even if write() raises.
    with open(filename, 'w') as file:
        file.write(data)
    logging.info("In save_doc-> file save at {}".format(filename))
    
    
def load_doc(filename):
    """Read ``filename`` and return its entire contents as one string.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    str
        The full file contents.
    """
    logging.info("In load_doc-> filename={}".format(filename))
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        text = file.read()
    logging.info("In load_doc-> file loaded from {}".format(filename))
    return text

### Model

In [6]:
def build_LM_A(vocab_size, seq_length, summary=False):
    """Build the (uncompiled) word-level LSTM language model.

    Architecture: Embedding(vocab_size, 50) -> LSTM(100, return_sequences)
    -> LSTM(100) -> Dense(100, relu) -> Dense(vocab_size, softmax).

    Parameters
    ----------
    vocab_size : int
        Size of the embedding input and of the softmax output layer.
    seq_length : int
        Value passed as the embedding layer's ``input_length``.
    summary : bool, default False
        When true, also print the model summary to stdout.

    Returns
    -------
    keras.models.Sequential
        The assembled model; compiling and fitting is left to the caller.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, 50, input_length=seq_length))
    model.add(LSTM(100, return_sequences=True))
    model.add(LSTM(100))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(vocab_size, activation='softmax'))

    # model.summary() prints and returns None, so the old code logged nothing
    # useful (and triggered a logging format error by passing an argument with
    # no placeholder).  Capture the lines through print_fn instead.
    summary_lines = []
    model.summary(print_fn=summary_lines.append)
    summary_text = '\n'.join(summary_lines)

    logging.info("In build_LM_A-> model summary\n%s", summary_text)
    if summary:
        print(summary_text)

    return model

In [7]:
def language_model(label, data, n_seq_words, model_path, tokenizer_path, processed_data_path, EPOCHS=100, BATCH=128, summary=False):
    """Prepare word sequences from ``data`` and train an LSTM next-word model.

    Parameters
    ----------
    label : str
        Name of the data subset; only used in the log message.
    data : iterable of str
        Texts to build training sequences from.
    n_seq_words : int
        Number of words per training sequence (the last word is the target).
    model_path, tokenizer_path : str
        Only written to the log here.  This function does not save the model
        or the tokenizer, and the fitted tokenizer is not returned; a caller
        that needs it has to re-fit one on the sequences written to
        ``processed_data_path``.
    processed_data_path : str
        File the space-joined word sequences are written to.
    EPOCHS, BATCH : int
        Passed to ``model.fit`` as ``epochs`` and ``batch_size``.
    summary : bool, default False
        Forwarded to ``build_LM_A``.

    Returns
    -------
    (model, history)
        The trained Keras model and the object returned by ``model.fit``.

    Raises
    ------
    ValueError
        If no sequence of exactly ``n_seq_words`` encoded words is available.
    """
    logging.info("In language_model-> label={}, n_seq_words={}, model_path={}, tokenizer_path={}, processed_data_path={}".format(label, n_seq_words, model_path, tokenizer_path, processed_data_path))

    # ------------------------------ PREPARE DATA -----------------------------------
    processed_data = organize_token_seqs(data, n_seq_words)
    save_doc(processed_data, processed_data_path)

    # integer encode sequences of words
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(processed_data)
    unchecked_sequences = tokenizer.texts_to_sequences(processed_data)

    msg = "BEFORE len(Sequences): {}\n sequences lengths={}".format(len(unchecked_sequences), {len(seq) for seq in unchecked_sequences})
    logging.info(msg)
    print("\n" + msg)

    # Keep only encodings with exactly n_seq_words ids so they can be stacked
    # into a rectangular array below.
    sequences = [seq for seq in unchecked_sequences if len(seq) == n_seq_words]
    msg = 'AFTER len(Sequences): {}\n sequences lengths={}'.format(len(sequences), {len(seq) for seq in sequences})
    logging.info(msg)
    print("\n" + msg)

    vocab_size = len(tokenizer.word_index) + 1
    msg = 'tokenizer vocab_size={}'.format(vocab_size)
    logging.info(msg)
    print("\n" + msg)

    # Without this guard an empty list becomes a 1-D array and the 2-D slicing
    # below fails with an opaque IndexError.
    if not sequences:
        raise ValueError(
            "No training sequences of {} words could be built for label '{}'".format(n_seq_words, label)
        )

    # separate into input and output
    sequences = array(sequences)
    msg = 'sequences.shape={}'.format(sequences.shape)
    logging.info(msg)
    print("\n" + msg)

    X, y = sequences[:, :-1], sequences[:, -1]

    y = to_categorical(y, num_classes=vocab_size)
    # NOTE(review): X has n_seq_words - 1 columns, yet the model is declared
    # with input_length = X.shape[1] + 1 (= n_seq_words).  That value matches
    # the length generate_seq() is called with later in this notebook, but not
    # the training input width -- confirm this is intended.
    seq_length = X.shape[1]+1
    msg = 'seq_length={}'.format(seq_length)
    logging.info(msg)
    print("\n" + msg)

    # ------------------------------ TRAIN MODEL -----------------------------------
    model = build_LM_A(vocab_size, seq_length, summary=summary)

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    history = model.fit(X, y, batch_size=BATCH, epochs=EPOCHS, verbose=1)

    return model, history

In [8]:
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    """Greedily generate ``n_words`` words that continue ``seed_text``.

    Parameters
    ----------
    model : keras model
        Trained model whose ``predict`` returns one score per vocabulary index.
    tokenizer : keras Tokenizer
        Tokenizer used to encode text; its ``word_index`` maps words to ids.
    seq_length : int
        Length the encoded input is padded / truncated to before prediction.
    seed_text : str
        Text the generation starts from.
    n_words : int
        Number of words to generate.

    Returns
    -------
    str
        The generated words joined by single spaces (the seed is not
        included).  A predicted id with no vocabulary entry yields ``''``.
    """
    # Build the id -> word map once instead of scanning word_index per word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    result = []
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # keep only the most recent seq_length ids (left-padding short inputs)
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # Sequential.predict_classes is deprecated/removed; the arg-max over
        # predict() is its documented replacement for softmax outputs.
        probabilities = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])
        # map predicted word index to word
        out_word = index_to_word.get(predicted_index, '')
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

In [15]:
# ------------------------------ CONFIGURATION -----------------------------------
EPOCHS = 100
BATCH = 128
n_seq_words = 20          # words per training sequence / generated continuation
exp_count = 4             # generated examples per label
summary = True
save = False              # train from scratch and persist model + tokenizer
load_bool = True          # load a previously saved model + tokenizer
truncation_length = 100   # seed text is cut to this many *characters*

labels = {'depression': 0, 'anxiety': 1}
results = []

# Without one of the two flags the loop would silently reuse whatever `model`
# and `tokenizer` happen to be left in the kernel (or fail on a fresh one).
if not (save or load_bool):
    raise ValueError("Enable `save` (train) and/or `load_bool` (load) to obtain a model")

for label_str, label_int in labels.items():
    # ------------------------------ PREPARE MODEL -----------------------------------
    data = model_data[model_data["is_anxiety"] == label_int][data_column]

    msg = "{} total data={}".format(label_str, len(data))
    logging.info(msg)
    print(msg)

    model_path = '../models/{}.language_model.h5'.format(label_str)
    tokenizer_path = '../models/{}.language_model.tokenizer.pkl'.format(label_str)
    processed_data_path = '../data/{}.language_model.processed_data.txt'.format(label_str)

    if save:
        model, history = language_model(label_str, data, n_seq_words, model_path, tokenizer_path, processed_data_path, EPOCHS, BATCH, summary)
        model.save(model_path)
        # language_model() does not hand back the tokenizer it fitted, so the
        # old `dump(tokenizer, ...)` used an undefined (or stale) name.  Re-fit
        # a tokenizer on the exact sequences language_model() just saved.
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(load_doc(processed_data_path).split('\n'))
        with open(tokenizer_path, 'wb') as tokenizer_file:
            dump(tokenizer, tokenizer_file)

    if load_bool:
        model = load_model(model_path)
        # NOTE: unpickling can execute arbitrary code -- only load tokenizer
        # files produced by this notebook.
        with open(tokenizer_path, 'rb') as tokenizer_file:
            tokenizer = load(tokenizer_file)

    # ------------------------------ GENERATE EXAMPLE -----------------------------------
    logging.info("Generating examples-> label={}, n_seq_words={}".format(label_str, n_seq_words))

    # Sample by position among the non-placeholder posts.  The previous
    # label-based lookup with randint(first, last + 1) could pick a label that
    # is not in this subset (randint includes its upper bound) -> KeyError.
    seed_candidates = data[data != "emptypost"]
    if seed_candidates.empty:
        raise ValueError("No non-empty posts available for label '{}'".format(label_str))

    for i in range(exp_count):
        seed_text = seed_candidates.iloc[randint(0, len(seed_candidates) - 1)]
        seed_text = seed_text[:truncation_length]

        generated = generate_seq(model, tokenizer, n_seq_words, seed_text, n_seq_words)

        results.append({
            "seed_text": seed_text,
            "generated": generated,
            "is_anxiety": label_int,
            "n_seq_words": n_seq_words,
        })

depression total data=932
anxiety total data=998


In [19]:
# Widen the column display so the seed / generated texts are shown in full,
# then tabulate the collected examples (one row per generated sample).
pd.options.display.max_colwidth = 1000
results = pd.DataFrame(results)
results

Unnamed: 0,seed_text,generated,is_anxiety,n_seq_words
0,im tired unmotivated time dont want use time way trying relax take bath every couple day dont smell,lie put dropout school want come september started couple week miss 16 year ago mediocre errand coming eric covid wa,0,20
1,hi lot depression stem health issue year mid twenty dealt two botched surgieries chronic pain chroni,morning made phone broke since sister lucas said depression sister mil leave found social anxiety reducing social people open wan,0,20
2,paper life great good job young 23 fun partying friend every weekend starting working consistently l,people either others focus inside charity motivated functioning grade time group plan deeply score really defeat attention feel important incredible,0,20
3,since childhood optimistic person lately feeling void chest real shit terrible verbally abusive rela,sit second used helping cleaning feeling sometimes mess good staring saw true ha aspect hate poured real registrar new green,0,20
4,worked restaurant entire life took past year covid baby time get job idea want cooking passion away,assistant jumping besides even wanting money need change condition closest exactly thought add hate overcome parent overcome know sense meltdown,1,20
5,ssri medication calm anxiety stop feeling uneasy mentally fragile thing happen life ruminating much,staring anytime bed self social face suck overcome dont anyone ready advice cure anxiety anybody anyone ha small anxious know,1,20
6,5 6 year going er 3 time probably year thinking im heart attack multiple scan never find anything wr,staring phoniness within month ago started working energy stomach doctor debilitating numb remove dose first fine 100 look smothered humbly,1,20
7,feel like lot post looking relationship advice nothing anxiety someone really suffers horrible anxie,disappear go across slipping doctor know feel like enhanced esteem healthy anyone know going get back cooked mind wa passion,1,20


In [20]:
# Export the `results` table as a PNG image under ../reports/images/.
dfi.export(results, '../reports/images/normal-lm_examples.png')