<img src="../assets/a_type_readme.gif" style="float:right ; margin: 10px ; width:300px;"> 
<h1><left>NLP Project</left></h1>
<h4><left>Using Natural Language Processing to better understand Depression & Anxiety</left></h4>
___

## 3. Analysis

In [33]:
import numpy as np
from numpy import core, array
assert np.__version__ == "1.19.5"

import pandas as pd

import seaborn as sns
sns.set_style("darkgrid")

import sentencepiece as spm

import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models import Word2Vec, KeyedVectors
# from gensim.models import Word2Vec
assert gensim.__version__ == "4.0.1"

from sklearn.model_selection import KFold, train_test_split
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from pickle import dump, load

from nltk import word_tokenize

import matplotlib.pyplot as plt
%matplotlib inline

from time import time 

from random import randint

import logging 

import multiprocessing
 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding

In [3]:
# Send log records to a per-notebook file; filemode 'w' truncates the file
# each time this cell configures logging.
logging.basicConfig(
    filename="../logs/5_language-model.log",
    filemode='w',
    format='%(asctime)s %(message)s',
)
# Root logger handle used by the rest of the notebook.
logger = logging.getLogger()

def print_time(intput_str, start_time=0):
    """Print the minutes elapsed since ``start_time``, prefixed by a label.

    Args:
        intput_str: Label printed before the elapsed time. (The parameter
            name keeps its original spelling so keyword callers still work.)
        start_time: Reference timestamp as returned by ``time()``. With the
            default of 0 the value printed is the time since the epoch.
    """
    # The body previously referenced the undefined name `input_str`, which
    # raised NameError on every call.
    elapsed_minutes = round((time() - start_time) / 60, 2)
    print("{}: {} min".format(intput_str, elapsed_minutes))
    
# #Setting the threshold of logger to DEBUG
# logger.setLevel(logging.DEBUG)
  
# #Test messages
# logger.debug("Harmless debug Message")
# logger.info("Just an information")
# logger.warning("Its a Warning")
# logger.error("Did you try to divide by zero")
# logger.critical("Internet is down")

In [4]:
# Load the pre-cleaned posts. keep_default_na=False stops pandas from turning
# empty strings and tokens such as "NA" into NaN, so text columns stay strings.
model_data = pd.read_csv('../data/data_for_model.csv', keep_default_na=False)

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
model_data.info()

# Column holding the text the language models are trained on.
data_column = "selftext_clean"
model_data.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1930 entries, 0 to 1929
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   title                      1930 non-null   object
 1   selftext                   1930 non-null   object
 2   author                     1930 non-null   object
 3   score                      1930 non-null   int64 
 4   num_comments               1930 non-null   int64 
 5   is_anxiety                 1930 non-null   int64 
 6   url                        1930 non-null   object
 7   selftext_clean             1930 non-null   object
 8   selftext_broken_sentences  1930 non-null   object
 9   selftext_broken_words      1930 non-null   object
 10  title_clean                1930 non-null   object
 11  author_clean               1930 non-null   object
 12  megatext_clean             1930 non-null   object
dtypes: int64(3), object(10)
memory usage: 196.1+ KB
None


Unnamed: 0,title,selftext,author,score,num_comments,is_anxiety,url,selftext_clean,selftext_broken_sentences,selftext_broken_words,title_clean,author_clean,megatext_clean
0,Our most-broken and least-understood rules is ...,We understand that most people who reply immed...,SQLwitch,2319,175,0,https://www.reddit.com/r/depression/comments/d...,understand people reply immediately op invitat...,['we understand that most people who reply imm...,"['understand', 'people', 'reply', 'immediately...",broken least understood rule helper may invite...,sql witch,sql witch understand people reply immediately ...
1,"Regular Check-In Post, with important reminder...",Welcome to /r/depression's check-in post - a p...,SQLwitch,312,1136,0,https://www.reddit.com/r/depression/comments/m...,welcome r depression check post place take mom...,"[""welcome to /r/depression's check-in post - a...","['welcome', 'r', 'depression', 'check', 'post'...",regular check post important reminder private ...,sql witch,sql witch welcome r depression check post plac...
2,Low,I'm so low rn I can't even type anything coher...,RagingFlock89,263,43,0,https://www.reddit.com/r/depression/comments/n...,low rn even type anything coherent want expres...,"[""i'm so low rn i can't even type anything coh...","['low', 'rn', 'even', 'type', 'anything', 'coh...",low,raging flock 89,raging flock 89 low rn even type anything cohe...


## Language Model

### Data

### Model

In [15]:
def organize_token_seqs(data, length=20+1):
    """Cut every record into overlapping, fixed-length word windows.

    Each record is tokenized with ``word_tokenize`` and a window of
    ``length`` tokens is slid over it one token at a time. Records with
    fewer than ``length`` tokens contribute nothing.

    Args:
        data: Iterable of text records (strings).
        length: Number of tokens per window.

    Returns:
        list[str]: The windows, each joined back into a space-separated line.
    """
    sequences = []

    for record in data:
        tokens = word_tokenize(record)

        # `end` is the exclusive end of the window, so it must be allowed to
        # reach len(tokens); stopping one short dropped the final window and
        # the last token of every record never appeared in any sequence.
        for end in range(length, len(tokens) + 1):
            sequences.append(' '.join(tokens[end - length:end]))

    return sequences

In [16]:
def save_doc(lines, filename):
    """Write ``lines`` to ``filename``, one item per line.

    Items are joined with a newline; no trailing newline is written. An
    existing file is overwritten.

    Args:
        lines: Iterable of strings to write.
        filename: Path of the file to create or overwrite.
    """
    data = '\n'.join(lines)
    # The context manager closes the handle even if write() raises.
    with open(filename, 'w') as file:
        file.write(data)
    
    
def load_doc(filename):
    """Return the full text content of ``filename``.

    Args:
        filename: Path of the text file to read.

    Returns:
        str: Everything in the file as a single string.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        return file.read()

In [17]:
def build_LM_A(vocab_size, seq_length, summary=False,
               embedding_dim=50, lstm_units=100, dense_units=100):
    """Build the (uncompiled) word-level LSTM language model.

    Architecture: Embedding -> LSTM (returns sequences) -> LSTM -> Dense(relu)
    -> Dense(softmax over the vocabulary).

    Args:
        vocab_size: Size of the vocabulary; used as the embedding input
            dimension and as the width of the softmax output.
        seq_length: Number of word ids in each input sequence.
        summary: When True, print the Keras model summary.
        embedding_dim: Width of the embedding vectors.
        lstm_units: Units in each of the two LSTM layers.
        dense_units: Units in the hidden dense layer.

    Returns:
        The Keras ``Sequential`` model; the caller is expected to compile it.
    """
    model = Sequential([
        Embedding(vocab_size, embedding_dim, input_length=seq_length),
        # The first LSTM must emit the full sequence so the second can read it.
        LSTM(lstm_units, return_sequences=True),
        LSTM(lstm_units),
        Dense(dense_units, activation='relu'),
        Dense(vocab_size, activation='softmax'),
    ])

    if summary:
        # summary() prints by itself and returns None; wrapping it in print()
        # appended a stray "None" line.
        model.summary()

    return model

In [38]:
def language_model(label, data, n_seq_words, EPOCHS=100, BATCH=128):
    """Train, save and return a next-word language model for one label.

    The records are cut into windows of ``n_seq_words`` context words plus
    the word to predict, integer-encoded with a Keras ``Tokenizer``, and used
    to fit the model from ``build_LM_A``. The windows, the fitted tokenizer
    and the trained model are written under ``../data`` and ``../models``
    using ``label`` in the file names.

    Args:
        label: Name used in the output file names (e.g. 'depression').
        data: Iterable of text records to train on.
        n_seq_words: Number of context words fed to the model per sample.
        EPOCHS: Number of training epochs.
        BATCH: Training batch size.

    Returns:
        The trained Keras model.

    Raises:
        ValueError: If no training sequence of the required length can be
            built from ``data``.
    """
    model_path = '../models/{}.language_model.h5'.format(label)
    tokenizer_path = '../models/{}.language_model.tokenizer.pkl'.format(label)
    processed_data_path = '../data/{}.language_model.processed_data.txt'.format(label)

    # ------------------------------ PREPARE DATA -----------------------------------
    # Each line needs n_seq_words inputs plus one target word. Previously the
    # windows were only n_seq_words long, so X had n_seq_words - 1 columns
    # while the model was declared (and later queried) with n_seq_words.
    window = n_seq_words + 1
    processed_data = organize_token_seqs(data, window)
    save_doc(processed_data, processed_data_path)

    # integer encode sequences of words
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(processed_data)
    unchecked_sequences = tokenizer.texts_to_sequences(processed_data)
    print('\nTotal before Sequences: %d' % len(unchecked_sequences))
    print("int sequences lengths", set([len(seq) for seq in unchecked_sequences]))

    # The Keras tokenizer can yield a different token count than
    # word_tokenize for the same line; keep only full-width rows so the
    # result forms a rectangular array.
    sequences = [seq for seq in unchecked_sequences if len(seq) == window]
    print('\nTotal after Sequences: %d' % len(sequences))
    print("int sequences lengths", set([len(seq) for seq in sequences]))

    if not sequences:
        raise ValueError(
            "no sequences of {} tokens could be built for label {!r}".format(window, label))

    # Index 0 is reserved by the tokenizer, hence the +1.
    vocab_size = len(tokenizer.word_index) + 1
    print("\nvocab_size", vocab_size)

    # separate into input and output
    sequences = array(sequences)
    print("\nsequences.shape", sequences.shape)

    X, y = sequences[:, :-1], sequences[:, -1]
    print("X\n", X)
    print("y\n", y)

    y = to_categorical(y, num_classes=vocab_size)
    # Declared input width equals the real width of X (== n_seq_words).
    seq_length = X.shape[1]
    print("seq_length", seq_length)

    # ------------------------------ TRAIN MODEL -----------------------------------
    model = build_LM_A(vocab_size, seq_length, summary=True)

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X, y, batch_size=BATCH, epochs=EPOCHS, verbose=1)

    model.save(model_path)
    # Close the pickle file deterministically instead of leaking the handle.
    with open(tokenizer_path, 'wb') as tokenizer_file:
        dump(tokenizer, tokenizer_file)

    return model

In [39]:
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    """Greedily generate ``n_words`` words that continue ``seed_text``.

    At every step the running text is encoded with ``tokenizer``, padded or
    left-truncated to ``seq_length`` ids, and the word with the highest
    predicted probability is appended to the running text.

    Args:
        model: Trained model whose ``predict`` returns one row of per-word
            probabilities per input sequence.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index``.
        seq_length: Number of word ids fed to the model.
        seed_text: Text used to start generation.
        n_words: Number of words to generate.

    Returns:
        str: The generated words joined by spaces (seed text not included).
        An index with no entry in ``word_index`` contributes an empty string.
    """
    # Build the reverse lookup once instead of scanning word_index per step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    result = []
    in_text = seed_text
    for _ in range(n_words):
        # encode the text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # keep only the last seq_length ids (left-pad if shorter)
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict_classes() is deprecated/removed in newer Keras releases;
        # argmax over predict() is the documented equivalent for softmax output.
        probabilities = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])
        out_word = index_to_word.get(predicted_index, '')
        # feed the prediction back in as context for the next step
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

In [66]:
# Experiment settings
EPOCHS = 100
BATCH = 128
n_seq_words = 20   # context width passed to generate_seq, and words generated per example
exp_count = 2      # generated examples per label

labels = {'depression': 0, 'anxiety': 1}
results = []

for label_str, label_int in labels.items():
    # Posts belonging to the current label only.
    data = model_data.loc[model_data["is_anxiety"] == label_int, data_column]

    # Uncomment to retrain instead of loading the saved model below.
    # model = language_model(label_str, data, n_seq_words, EPOCHS, BATCH)

    print(label_str, "total data =", len(data))

    model_path = '../models/{}.language_model.h5'.format(label_str)
    tokenizer_path = '../models/{}.language_model.tokenizer.pkl'.format(label_str)
    model = load_model(model_path)
    # NOTE: unpickling executes arbitrary code; only load tokenizer files
    # produced by this project.
    with open(tokenizer_path, 'rb') as tokenizer_file:
        tokenizer = load(tokenizer_file)

    for _ in range(exp_count):
        # Pick a random post by position. `data` keeps the original row
        # labels of the filtered frame, so label-based lookup with a random
        # integer could hit a missing label, and randint() includes its upper
        # bound, so the old `+ 1` could step past the last row.
        seed_text = data.iloc[randint(0, len(data) - 1)]

        generated = generate_seq(model, tokenizer, n_seq_words, seed_text, n_seq_words)

        results.append({
            "seed_text": seed_text,
            "generated": generated,
            "is_anxiety": label_int,
            "n_seq_words": n_seq_words,
        })

pd.set_option("display.max_colwidth", 1000)
results = pd.DataFrame(results)
results

depression total data = 932
anxiety total data = 998


Unnamed: 0,seed_text,generated,is_anxiety,n_seq_words
0,firmly believe social anxiety sole reason deal...,enjoy water mate major gave center shower deal...,0,20
1,much ha happened last year covid ha definitely...,away feel like shallowest blandest person thin...,0,20
2,think everyone understands fear missing fomo u...,neighbor passionate therapy deeper word trigge...,1,20
3,took first dose lexapro today doctor advised s...,anxiety fear thought way something wa talking ...,1,20


In [None]:
# Save the `results` table to an image file under ../reports/images.
# NOTE(review): `dfi` is not imported in any visible cell, so this raises
# NameError on a fresh kernel; presumably `import dataframe_image as dfi` is
# intended. Confirm and add it to the import cell.
dfi.export(results, '../reports/images/normal-lm_examples.png')