<img src="../assets/a_type_readme.gif" style="float:right ; margin: 10px ; width:300px;"> 
<h1>NLP Project</h1>
<h4>Using Natural Language Processing to better understand Depression &amp; Anxiety</h4>
___

## 3. Analysis

In [1]:
import numpy as np
from numpy import core, array
assert np.__version__ == "1.19.5"

import pandas as pd

import seaborn as sns
sns.set_style("darkgrid")

import sentencepiece as spm

import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models import Word2Vec, KeyedVectors
# from gensim.models import Word2Vec
assert gensim.__version__ == "4.0.1"

from sklearn.model_selection import KFold, train_test_split
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from pickle import dump
from nltk import word_tokenize

import matplotlib.pyplot as plt
%matplotlib inline

from time import time 

import logging 

import multiprocessing
 
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding



In [2]:
# pip install --upgrade --user numpy
# Show the installed NumPy version as this cell's output.
np.__version__

'1.19.5'

In [3]:
# Route log records to a file; filemode='w' opens the file for writing, so the
# log is overwritten each time this cell runs. Each record is formatted as
# "<timestamp> <message>".
logging.basicConfig(filename="../logs/4_tokenization.log",
                    format='%(asctime)s %(message)s',
                    filemode='w')
# getLogger() with no name returns the root logger; no level is set here.
logger = logging.getLogger()

def print_time(input_str, start_time=0):
    """Print the time elapsed since ``start_time`` in minutes, prefixed by a label.

    Parameters
    ----------
    input_str : str
        Label printed before the elapsed time.
    start_time : float, default 0
        Reference timestamp in seconds, as returned by ``time()``. With the
        default of 0 the printed value is the number of minutes since the epoch.
    """
    # The parameter was previously misspelled ("intput_str") while the body
    # read "input_str", so every call failed with a NameError.
    elapsed_min = round((time() - start_time) / 60, 2)
    print("{}: {} min".format(input_str, elapsed_min))
    
# #Setting the threshold of logger to DEBUG
# logger.setLevel(logging.DEBUG)
  
# #Test messages
# logger.debug("Harmless debug Message")
# logger.info("Just an information")
# logger.warning("Its a Warning")
# logger.error("Did you try to divide by zero")
# logger.critical("Internet is down")

In [4]:
# Load the modelling dataset. keep_default_na=False stops pandas from turning
# empty strings (and strings such as "NA") into NaN, so text columns stay text.
model_data = pd.read_csv('../data/data_for_model.csv', keep_default_na=False)
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
model_data.info()
model_data.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1930 entries, 0 to 1929
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   title                      1930 non-null   object
 1   selftext                   1930 non-null   object
 2   author                     1930 non-null   object
 3   score                      1930 non-null   int64 
 4   num_comments               1930 non-null   int64 
 5   is_anxiety                 1930 non-null   int64 
 6   url                        1930 non-null   object
 7   selftext_clean             1930 non-null   object
 8   selftext_broken_sentences  1930 non-null   object
 9   selftext_broken_words      1930 non-null   object
 10  title_clean                1930 non-null   object
 11  author_clean               1930 non-null   object
 12  megatext_clean             1930 non-null   object
dtypes: int64(3), object(10)
memory usage: 196.1+ KB
None


Unnamed: 0,title,selftext,author,score,num_comments,is_anxiety,url,selftext_clean,selftext_broken_sentences,selftext_broken_words,title_clean,author_clean,megatext_clean
0,Our most-broken and least-understood rules is ...,We understand that most people who reply immed...,SQLwitch,2319,175,0,https://www.reddit.com/r/depression/comments/d...,understand people reply immediately op invitat...,['we understand that most people who reply imm...,"['understand', 'people', 'reply', 'immediately...",broken least understood rule helper may invite...,sql witch,sql witch understand people reply immediately ...
1,"Regular Check-In Post, with important reminder...",Welcome to /r/depression's check-in post - a p...,SQLwitch,312,1136,0,https://www.reddit.com/r/depression/comments/m...,welcome r depression check post place take mom...,"[""welcome to /r/depression's check-in post - a...","['welcome', 'r', 'depression', 'check', 'post'...",regular check post important reminder private ...,sql witch,sql witch welcome r depression check post plac...
2,Low,I'm so low rn I can't even type anything coher...,RagingFlock89,263,43,0,https://www.reddit.com/r/depression/comments/n...,low rn even type anything coherent want expres...,"[""i'm so low rn i can't even type anything coh...","['low', 'rn', 'even', 'type', 'anything', 'coh...",low,raging flock 89,raging flock 89 low rn even type anything cohe...


In [5]:
# Name of the model_data column whose text is passed to tokenization() below.
data_column = "selftext_clean"
# model_data[data_column][0]
# model_data["megatext_clean"]
# sents = [eval(sent) for sent in model_data["selftext_broken_words"]]
# dp = model_data["selftext_broken_words"].tolist()[1]
# literal_eval(dp)
# model_data["megatext_clean"].to_csv(data_path, header=None, index=None, sep='\t', mode='a')

## Tokenization

In [19]:
def tokenization(vocab_size, data_column, n_splits=5):
    """Cross-validate a SentencePiece model and count unknown tokens on held-out posts.

    For every fold, the training rows of ``model_data[data_column]`` are written
    to a text file (one array element per row), a SentencePiece model with
    ``vocab_size`` pieces is trained on that file, and the held-out rows are
    encoded with it. The number of unknown-token ids found in each fold and the
    mean over all folds are printed.

    Parameters
    ----------
    vocab_size : int
        Vocabulary size passed to ``SentencePieceTrainer.train``.
    data_column : str
        Name of the ``model_data`` column holding the text to tokenize.
    n_splits : int, default 5
        Number of cross-validation folds; also the divisor of the mean.

    Returns
    -------
    float
        Mean number of unknown tokens per fold.

    Notes
    -----
    Reads the notebook-level ``model_data`` DataFrame and writes files under
    ``../data/tokenization/`` and ``../models/tokenization/``.
    """
    import os

    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    data_np = model_data[data_column].to_numpy()

    # Make sure the output locations exist before anything is written to them.
    for out_dir in ("../data/tokenization", "../models/tokenization"):
        os.makedirs(out_dir, exist_ok=True)

    total_unks_count = 0
    for i, (train_idx, test_idx) in enumerate(kfold.split(data_np), start=1):
        train_path = "../data/tokenization/train_vs{}_i{}.txt".format(vocab_size, i)
        # SentencePiece expects a UTF-8 corpus, while np.savetxt writes latin1
        # unless an encoding is given.
        np.savetxt(train_path, data_np[train_idx], fmt='%s', encoding='utf-8')

        model_prefix = '../models/tokenization/vs{}_i{}'.format(vocab_size, i)
        # Training writes <model_prefix>.model and <model_prefix>.vocab to disk.
        spm.SentencePieceTrainer.train(input=train_path, model_prefix=model_prefix, vocab_size=vocab_size)
        sp = spm.SentencePieceProcessor()
        sp.load('{}.model'.format(model_prefix))

        # Ask the model for its unknown-token id instead of assuming it is 0.
        unk_id = sp.unk_id()
        unks_count = sum(sp.encode_as_ids(post).count(unk_id) for post in data_np[test_idx])

        print("\t{}: {}".format(i, unks_count))
        total_unks_count += unks_count

    # KFold yields exactly n_splits folds, so this is the per-fold mean.
    mean_unks_count = total_unks_count / n_splits
    print("mean =", mean_unks_count)
    return mean_unks_count

In [20]:
# Run the cross-validated unknown-token count for each candidate vocabulary size.
# NOTE(review): the recorded output is identical for every vocab_size, which
# suggests the unknown-token count does not depend on vocab_size here - confirm
# whether this metric can actually discriminate between the candidates.
for vocab_size in [50, 200, 500, 1000, 2000, 4000]:
    print("\nvocab_size =", vocab_size)
    tokenization(vocab_size=vocab_size, data_column=data_column)


vocab_size = 50
	1: 65
	2: 70
	3: 128
	4: 64
	5: 127
mean = 90.8

vocab_size = 200
	1: 65
	2: 70
	3: 128
	4: 64
	5: 127
mean = 90.8

vocab_size = 500
	1: 65
	2: 70
	3: 128
	4: 64
	5: 127
mean = 90.8

vocab_size = 1000
	1: 65
	2: 70
	3: 128
	4: 64
	5: 127
mean = 90.8

vocab_size = 2000
	1: 65
	2: 70
	3: 128
	4: 64
	5: 127
mean = 90.8

vocab_size = 4000
	1: 65
	2: 70
	3: 128
	4: 64
	5: 127
mean = 90.8
