<img src="../assets/a_type_readme.gif" style="float:right ; margin: 10px ; width:300px;"> 
<h1>NLP Project</h1>
<h4>Using Natural Language Processing to better understand Depression &amp; Anxiety</h4>
___

## 3. Analysis

In [2]:
import numpy as np
from numpy import core, array
assert np.__version__ == "1.19.5"

import pandas as pd

import seaborn as sns
sns.set_style("darkgrid")
import dataframe_image as dfi

from sklearn.model_selection import KFold, train_test_split
import sentencepiece as spm

import matplotlib.pyplot as plt
%matplotlib inline

from time import time 
import logging 
import multiprocessing

In [3]:
# Route log records of level INFO and above to a file, each line prefixed with
# a timestamp. filemode='w' opens the log file for writing (truncating it)
# instead of appending to a previous run's log.
logging.basicConfig(filename="../logs/4_tokenization.log",
                    format='%(asctime)s %(message)s',
                    filemode='w',
                    level=logging.INFO)

def add_time(input_str, start_time=0):
    """Print how many minutes have elapsed since ``start_time``.

    Parameters
    ----------
    input_str : str
        Label printed in front of the elapsed time.
    start_time : float, optional
        Reference timestamp in seconds, as returned by ``time()``. With the
        default of 0 the printed value is the number of minutes since the
        epoch rather than a meaningful duration.

    Notes
    -----
    The parameter was previously misspelled ``intput_str`` while the body
    read ``input_str``, so every call raised ``NameError``.
    """
    elapsed_minutes = round((time() - start_time) / 60, 2)
    print("{}: {} min".format(input_str, elapsed_minutes))

In [4]:
# Load the prepared posts. keep_default_na=False stops pandas from turning
# empty cells (and strings such as "NA") into NaN, so text columns stay strings.
model_data = pd.read_csv('../data/data_for_model.csv', keep_default_na=False)

# Name of the text column that the tokenization experiments below operate on.
data_column = "selftext_clean"

# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only added a stray "None" line to the cell output.
model_data.info()
model_data.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1930 entries, 0 to 1929
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   title                      1930 non-null   object
 1   selftext                   1930 non-null   object
 2   author                     1930 non-null   object
 3   score                      1930 non-null   int64 
 4   num_comments               1930 non-null   int64 
 5   is_anxiety                 1930 non-null   int64 
 6   url                        1930 non-null   object
 7   selftext_clean             1930 non-null   object
 8   selftext_broken_sentences  1930 non-null   object
 9   selftext_broken_words      1930 non-null   object
 10  title_clean                1930 non-null   object
 11  author_clean               1930 non-null   object
 12  megatext_clean             1930 non-null   object
dtypes: int64(3), object(10)
memory usage: 196.1+ KB
None


Unnamed: 0,title,selftext,author,score,num_comments,is_anxiety,url,selftext_clean,selftext_broken_sentences,selftext_broken_words,title_clean,author_clean,megatext_clean
0,Our most-broken and least-understood rules is ...,We understand that most people who reply immed...,SQLwitch,2319,175,0,https://www.reddit.com/r/depression/comments/d...,understand people reply immediately op invitat...,['we understand that most people who reply imm...,"['understand', 'people', 'reply', 'immediately...",broken least understood rule helper may invite...,sql witch,sql witch understand people reply immediately ...
1,"Regular Check-In Post, with important reminder...",Welcome to /r/depression's check-in post - a p...,SQLwitch,312,1136,0,https://www.reddit.com/r/depression/comments/m...,welcome r depression check post place take mom...,"[""welcome to /r/depression's check-in post - a...","['welcome', 'r', 'depression', 'check', 'post'...",regular check post important reminder private ...,sql witch,sql witch welcome r depression check post plac...
2,Low,I'm so low rn I can't even type anything coher...,RagingFlock89,263,43,0,https://www.reddit.com/r/depression/comments/n...,low rn even type anything coherent want expres...,"[""i'm so low rn i can't even type anything coh...","['low', 'rn', 'even', 'type', 'anything', 'coh...",low,raging flock 89,raging flock 89 low rn even type anything cohe...


## Tokenization

In [5]:
def tokenization(data_np, vocab_size):
    """Cross-validate a word-level SentencePiece vocabulary of a given size.

    For every fold yielded by the module-level ``kfold`` splitter:

    1. the training posts are written to a text file,
    2. a SentencePiece model (``model_type='word'``) is trained on that file,
    3. the held-out posts are encoded and the share of tokens mapped to the
       unknown id is computed.

    One summary dict per fold is appended to the module-level ``results``
    list; the same summary is logged and printed.

    Parameters
    ----------
    data_np : numpy.ndarray
        Posts to split; indexed with the index arrays produced by
        ``kfold.split``. Each element is passed to ``encode_as_ids``.
    vocab_size : int
        Forwarded to the SentencePiece trainer as ``vocab_size``; also used in
        the names of the files written for each fold.

    Notes
    -----
    Reads the globals ``kfold`` and ``results``; both must exist before this
    function is called, and ``results`` must still be a list.
    """
    # Single source of truth for the id reserved for unknown tokens: it is
    # handed to the trainer and then counted in the encoded test posts.
    unk_id = 3

    logging.info("In tokenization-> vocab_size={}".format(vocab_size))

    for i, (train_idx, test_idx) in enumerate(kfold.split(data_np), start=1):
        train_data = data_np[train_idx]
        test_data = data_np[test_idx]

        # The trainer reads its corpus from disk, so dump the training fold.
        train_path = "../data/tokenization/train_vs{}_i{}.txt".format(vocab_size, i)
        np.savetxt(train_path, train_data, fmt='%s')

        model_prefix = '../models/tokenization/vs{}_i{}'.format(vocab_size, i)

        # Training writes <model_prefix>.model and <model_prefix>.vocab.
        spm.SentencePieceTrainer.train(
            input=train_path,
            model_prefix=model_prefix,
            vocab_size=vocab_size,
            unk_id=unk_id,
            model_type='word'
        )

        # The processor only loads the model file produced by the trainer.
        sp = spm.SentencePieceProcessor()
        sp.load('{}.model'.format(model_prefix))

        unks_count = 0
        tokens_count = 0
        for post in test_data:
            sp_encoded = sp.encode_as_ids(post)
            tokens_count += len(sp_encoded)
            unks_count += sp_encoded.count(unk_id)

        # A held-out fold made only of empty posts yields zero tokens; report
        # NaN instead of raising ZeroDivisionError and aborting the sweep.
        if tokens_count:
            percentage = unks_count / tokens_count * 100
        else:
            percentage = float("nan")

        results.append({
            "vocab_size": vocab_size,
            "iteration": i,
            "unks_count": unks_count,
            "all_tokens_count": tokens_count,
            "unks_percentage": percentage
        })

        msg = "\t vocab_size={} iteration={}: unks_count={}, all_tokens_count={} =>{}%".format(vocab_size, i, unks_count, tokens_count, percentage)
        logging.info(msg)
        print(msg)

In [6]:
# Accumulator read as a global by tokenization(), which appends one dict per
# fold to it. Reset here so re-running this cell starts from an empty list.
results = []

# Splitter read as a global by tokenization(). The fixed random_state makes
# every kfold.split() call produce the same folds, so all vocabulary sizes
# are evaluated on identical train/test partitions.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
data_np = model_data[data_column].to_numpy()

# Evaluate each candidate vocabulary size; a blank line separates the
# per-size groups in the printed output.
for vocab_size in [20, 100, 500, 1500, 4000, 9000]:
    tokenization(data_np, vocab_size=vocab_size)
    print("\n")

# NOTE: `results` is rebound from a list of dicts to a DataFrame here. Calling
# tokenization() again without re-running this cell from the top would append
# to the DataFrame rather than to a list.
results = pd.DataFrame(results)
# Last expression of the cell: displays the table with a colour gradient.
# The Styler is not assigned, so `results` itself stays a plain DataFrame.
results.style.background_gradient()

	 vocab_size=20 iteration=1: unks_count=3962, all_tokens_count=8302 =>47.72344013490725%
	 vocab_size=20 iteration=2: unks_count=4324, all_tokens_count=9085 =>47.59493670886076%
	 vocab_size=20 iteration=3: unks_count=4273, all_tokens_count=9001 =>47.47250305521609%
	 vocab_size=20 iteration=4: unks_count=4303, all_tokens_count=9033 =>47.6364441492306%
	 vocab_size=20 iteration=5: unks_count=4334, all_tokens_count=9145 =>47.392017495899395%


	 vocab_size=100 iteration=1: unks_count=6743, all_tokens_count=17627 =>38.2538151699098%
	 vocab_size=100 iteration=2: unks_count=7527, all_tokens_count=19958 =>37.7141998196212%
	 vocab_size=100 iteration=3: unks_count=7285, all_tokens_count=19052 =>38.237455385261384%
	 vocab_size=100 iteration=4: unks_count=7169, all_tokens_count=18789 =>38.155303635105646%
	 vocab_size=100 iteration=5: unks_count=7303, all_tokens_count=19214 =>38.00874362444051%


	 vocab_size=500 iteration=1: unks_count=6203, all_tokens_count=25842 =>24.003560095967806%
	 vo

Unnamed: 0,vocab_size,iteration,unks_count,all_tokens_count,unks_percentage
0,20,1,3962,8302,47.72344
1,20,2,4324,9085,47.594937
2,20,3,4273,9001,47.472503
3,20,4,4303,9033,47.636444
4,20,5,4334,9145,47.392017
5,100,1,6743,17627,38.253815
6,100,2,7527,19958,37.7142
7,100,3,7285,19052,38.237455
8,100,4,7169,18789,38.155304
9,100,5,7303,19214,38.008744


In [7]:
# Save the per-fold results table as a PNG for the report.
# NOTE(review): `results` is the plain DataFrame, not the Styler shown when the
# table was displayed, so presumably the colour gradient is not part of the
# exported image — confirm.
dfi.export(results, '../reports/images/token_vocab-size.png')

In [18]:
def best_tekinzer(data_np, vocab_size):
    """Train the final word-level SentencePiece model on all posts.

    The posts are written to ``../data/tokenization/all_data.txt`` and the
    trainer is run on that file with the model prefix
    ``../models/tokenization``. The resulting model file is then loaded into
    a processor.

    Parameters
    ----------
    data_np : numpy.ndarray
        Posts to train on; written one array element per ``savetxt`` row.
    vocab_size : int
        Forwarded to the SentencePiece trainer as ``vocab_size``.

    Returns
    -------
    sentencepiece.SentencePieceProcessor
        Processor with the freshly trained model loaded. Previously the
        loaded processor was discarded, so callers had to reload the model
        from disk themselves; callers that ignore the return value are
        unaffected.

    Notes
    -----
    The function name (including its spelling) is kept unchanged so existing
    calls keep working.
    """
    logging.info("In best_tekinzer-> vocab_size={}".format(vocab_size))

    # The trainer reads its corpus from disk, so dump every post to a file.
    data_path = "../data/tokenization/all_data.txt"
    np.savetxt(data_path, data_np, fmt='%s')

    model_prefix = '../models/tokenization'

    # Training writes <model_prefix>.model and <model_prefix>.vocab.
    spm.SentencePieceTrainer.train(
        input=data_path,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        unk_id=3,
        model_type='word'
    )

    # The processor only loads the model file produced by the trainer.
    sp = spm.SentencePieceProcessor()
    sp.load('{}.model'.format(model_prefix))
    return sp

In [19]:
# Train the final tokenizer on every post in data_np, using 9000 — the largest
# vocabulary size tried in the cross-validation sweep.
best_tekinzer(data_np, vocab_size=9000)