In [3]:
import re
import pandas as pd
import nltk
import os
import argparse
from tqdm import tqdm

from nltk.tokenize import TreebankWordTokenizer, PunktSentenceTokenizer, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.data import find

from transformers import BertTokenizer, BertModel
from transformers import T5Tokenizer, T5Model

import torch

In [8]:
# Read the raw train/test splits into DataFrames (paths are relative to the
# notebook's working directory).
df_train, df_test = map(
    pd.read_csv,
    ('raw-dataset/train.csv', 'raw-dataset/test.csv'),
)

# NLTK tokenizers shared by the cells below: Treebank rules for word-level
# tokens, Punkt for sentence boundaries.
tb_tokenizer, sentence_tokenizer = TreebankWordTokenizer(), PunktSentenceTokenizer()

In [5]:
# Word-tokenize every comment of each split into a standalone Series of
# token lists (the source DataFrames are not modified).
# Comments can carry U+FEFF (zero-width no-break space / byte-order mark).
# The Treebank tokenizer does not treat it as whitespace, so it would
# otherwise show up as its own token or stay glued to the last word; drop it
# before lower-casing and tokenizing.
tokenized_comments_test = (
    df_test['CONTENT']
    .str.replace('\ufeff', '', regex=False)
    .str.lower()
    .apply(tb_tokenizer.tokenize)
)
tokenized_comments_train = (
    df_train['CONTENT']
    .str.replace('\ufeff', '', regex=False)
    .str.lower()
    .apply(tb_tokenizer.tokenize)
)

In [9]:
# Add a TOKENIZED column holding the lower-cased Treebank word tokens of each
# comment. CONTENT itself is left untouched.
# Comments can carry U+FEFF (zero-width no-break space / byte-order mark).
# The tokenizer does not treat it as whitespace, so without this cleanup it
# ends up as a junk token (or attached to the last word) in the saved dataset.
df_test['TOKENIZED'] = (
    df_test['CONTENT']
    .str.replace('\ufeff', '', regex=False)
    .str.lower()
    .apply(tb_tokenizer.tokenize)
)
df_train['TOKENIZED'] = (
    df_train['CONTENT']
    .str.replace('\ufeff', '', regex=False)
    .str.lower()
    .apply(tb_tokenizer.tokenize)
)


In [13]:
# Write both splits (including any columns added above) to CSV files under
# ./tokenized_dataset/.
dir_name = './tokenized_dataset/'

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists; exist_ok keeps re-running this cell harmless.
os.makedirs(dir_name, exist_ok=True)

output_path_test = os.path.join(dir_name, 'test.csv')
output_path_train = os.path.join(dir_name, 'train.csv')

# NOTE: CSV stores list-valued cells as plain text, so a token-list column
# must be parsed back into lists when these files are reloaded.
df_test.to_csv(output_path_test)
df_train.to_csv(output_path_train)

\ufeff

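**TODO:** we need to handle the stray `\ufeff` (U+FEFF, zero-width no-break space / byte-order mark) characters in the comments — the tokenizers keep them, either as a separate token or attached to the last word.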

In [61]:
# Spot-check the sentence and word tokenizers on a few raw training comments.

# 1) A short comment: compare sentence-level and word-level tokenization.
test_sentence = df_train['CONTENT'][8]
print(f'Original sentence: {test_sentence}')
tokenized_sentence = sentence_tokenizer.tokenize(test_sentence)
tokenized_word = tb_tokenizer.tokenize(test_sentence)
print(f'Sentence Tokenized: {tokenized_sentence}')
print(f'Word Tokenized: {tokenized_word}\n\n')

# 2) Row 24 (presumably a spam comment, going by the variable name): word
#    tokens only.
spam = df_train['CONTENT'][24]
tokenized_word_spam = tb_tokenizer.tokenize(spam)
# Bare expression in the middle of a cell: it is evaluated but not displayed.
len(tokenized_word_spam)


# 3) A comment containing an emoji: check how both tokenizers treat it.
emojis = df_train['CONTENT'][77]
tokenized_word_emoji = tb_tokenizer.tokenize(emojis)
tokenized_sentence_emoji = sentence_tokenizer.tokenize(emojis)
print(emojis)
print(f'Emoji Sentence Tokenized: {tokenized_sentence_emoji}')
# Fixed: this previously referenced `tokenized_emojis`, a name no cell defines,
# which raises NameError on a fresh kernel.
print(f'Emoji Word Tokenized: {tokenized_word_emoji}\n\n')

Original sentence: Best for partying ﻿
Sentence Tokenized: ['Best for partying \ufeff']
Word Tokenized: ['Best', 'for', 'partying', '\ufeff']


Hiya😊 I just started YouTube and it would mean a lot if some of you could  subscribe and watch my first video?xx﻿
Emoji Sentence Tokenized: ['Hiya😊 I just started YouTube and it would mean a lot if some of you could  subscribe and watch my first video?xx\ufeff']
Emoji Word Tokenized: ['Hiya😊', 'I', 'just', 'started', 'YouTube', 'and', 'it', 'would', 'mean', 'a', 'lot', 'if', 'some', 'of', 'you', 'could', 'subscribe', 'and', 'watch', 'my', 'first', 'video', '?', 'xx\ufeff']




In [62]:
from transformers import BertTokenizer, BertModel

# Probe sentence for inspecting BERT's WordPiece tokenization.
# An earlier probe, "I am happi" (a misspelling), was split into the sub-word
# pieces 'ha', '##pp', '##i'. The previous dead assignment of that string was
# removed because it was overwritten immediately on the next line.
testing = "I am happiness"

# Pretrained uncased BERT vocabulary; the tokenizer lower-cases its input.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', clean_up_tokenization_spaces=True)

# encode() returns the token ids wrapped in the special [CLS] ... [SEP] tokens.
encoding = bert_tokenizer.encode(testing)
print(encoding)
# Map the ids back to their WordPiece strings to see how the text was split.
print(bert_tokenizer.convert_ids_to_tokens(encoding))

# Notes:
# - Idea: compare word frequencies to look for a correlation between spam and ham.
# - So far there is not much of a pattern between the two, other than spam
#   containing extra material (ham looks like a subset of the spam comments).



[101, 1045, 2572, 5292, 9397, 2072, 102]
['[CLS]', 'i', 'am', 'ha', '##pp', '##i', '[SEP]']


In [56]:
# Load the pretrained encoder that matches `bert_tokenizer`. No earlier cell
# defines `bert_model`, so without this line the cell fails with NameError on
# a fresh kernel.
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model.eval()  # inference only: make sure dropout is disabled

# Encode the probe sentence as PyTorch tensors.
# padding: pads shorter sequences so every sequence in a batch has equal length
# truncation: cuts inputs longer than max_length tokens
inputs = bert_tokenizer(testing, return_tensors='pt', padding=True, truncation=True, max_length=512)

# Forward pass without building the autograd graph (no training here).
with torch.no_grad():
    outputs = bert_model(**inputs)

# Take the hidden state at sequence position 0, i.e. the [CLS] token that the
# tokenizer puts first; it is commonly used as a sentence-level representation
# for classification tasks.
embeddings = outputs.last_hidden_state[:, 0, :]
embeddings.numpy()

array([[-1.18248709e-01,  3.26443076e-01, -3.30463529e-01,
        -1.60292730e-01, -5.07776916e-01, -4.28412519e-02,
         3.52290273e-01,  6.41876996e-01, -5.96303344e-02,
        -4.48829830e-01, -3.12051289e-02, -4.85484228e-02,
         1.88860983e-01,  4.01469767e-01,  2.85150647e-01,
        -2.07036763e-01, -1.18220814e-01,  7.14646518e-01,
         4.93387163e-01,  1.78066477e-01,  7.49408081e-02,
        -3.23495656e-01, -3.68121535e-01, -2.88429353e-02,
        -2.72361636e-01, -6.40994161e-02,  5.71594685e-02,
         4.18905839e-02, -2.15708792e-01, -3.82482201e-01,
         2.77274251e-02,  4.89031762e-01, -2.59544224e-01,
         4.19734195e-02,  4.19351071e-01, -1.70691758e-01,
         3.49435896e-01,  5.07596731e-02,  1.39876038e-01,
         1.68827742e-01,  3.38076726e-02,  1.25220031e-01,
         1.57566637e-01, -4.62828875e-02,  3.88313353e-01,
        -4.73999202e-01, -2.53984213e+00, -2.49018148e-01,
        -4.09508765e-01, -3.13705504e-01,  3.55963051e-0