In [1]:
# DataFrame
import pandas as pd

# nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Word2vec
import gensim

# Utility
import re
import logging
import pickle

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import scipy.sparse as sp
from tqdm import tqdm

In [11]:
# Set log
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

nltk.download('stopwords')
stop_words = set(stopwords.words("english"))
stemmer = SnowballStemmer("english")

# # DATASET
# DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
# DATASET_ENCODING = "ISO-8859-1"
# TRAIN_SIZE = 0.8

# TEXT CLENAING
# urlPattern = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
# userPattern = '@[^\s]+'
# punctuation = r'[^A-Za-z0-9]+'
PATTERN = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9']+"

# WORD2VEC
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 32
W2V_MIN_COUNT = 10

# KERAS
SEQUENCE_LENGTH = 60
EPOCHS = 10
BATCH_SIZE = 1024

# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
SENTIMENT_THRESHOLDS = (0.4, 0.7)

# EXPORT
KERAS_MODEL = "model.h5"
WORD2VEC_MODEL = "model.w2v"
TOKENIZER_MODEL = "tokenizer.pkl"
ENCODER_MODEL = "encoder.pkl"
CLEANED_TRAIN_CSV = 'cleaned_train_ds.csv'
CLEANED_TEST_CSV = 'cleaned_test_ds.csv'
# DATASET_CSV = './training.1600000.processed.noemoticon.csv'
W2V_DICT_PKL = './w2v_dict.pkl'

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/maghsk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def replace_url(text, label):
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, PATTERN, " ")
    return text, label

def preprocess(text, label):
    return ' '.join(stemmer.stem(token) for token in text.numpy().decode().split() if token not in stop_words) , label


def py_function_preprocess(text, label):
    return tf.py_function(preprocess, [text, label], [tf.string, tf.int32])

In [28]:
with tf.device("/cpu:0"):
    (train_ds, test_ds), ds_info = tfds.load(name='sentiment140', split=['train', 'test'], as_supervised=True, with_info=True)
    train_ds = train_ds.map(replace_url, num_parallel_calls=8).map(py_function_preprocess, num_parallel_calls=8).prefetch(buffer_size=tf.data.AUTOTUNE)
    test_ds = test_ds.map(replace_url, num_parallel_calls=8).map(py_function_preprocess, num_parallel_calls=8).prefetch(buffer_size=tf.data.AUTOTUNE)

2022-03-19 20:43:29,631 : INFO : Load dataset info from /Users/maghsk/tensorflow_datasets/sentiment140/1.0.0
2022-03-19 20:43:29,634 : INFO : Reusing dataset sentiment140 (/Users/maghsk/tensorflow_datasets/sentiment140/1.0.0)
2022-03-19 20:43:29,634 : INFO : Constructing tf.data.Dataset sentiment140 for split ['train', 'test'], from /Users/maghsk/tensorflow_datasets/sentiment140/1.0.0


In [26]:
# for i in train_ds.take(5):
#     print(i)

In [6]:
%%time
print("TRAIN size:", len(train_ds))
print("TEST size:", len(test_ds))

lines, labels = zip(*[x for x in tqdm(train_ds)])
lines = [x.numpy().decode() for x in lines]
labels = [x.numpy() for x in labels]

documents = [x.split() for x in lines]

w2v_model = gensim.models.word2vec.Word2Vec(vector_size=W2V_SIZE,
                                            window=W2V_WINDOW,
                                            min_count=W2V_MIN_COUNT,
                                            workers=8)

w2v_model.build_vocab(documents)

vocab_size = len(w2v_model.wv.index_to_key)
print("Vocab size", vocab_size)

TRAIN size: 1600000
TEST size: 498


  0%|          | 0/1600000 [00:00<?, ?it/s]2022-03-19 20:23:02.218310: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
100%|██████████| 1600000/1600000 [04:33<00:00, 5848.36it/s]
2022-03-19 20:27:45,044 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=300, alpha=0.025)', 'datetime': '2022-03-19T20:27:45.043767', 'gensim': '4.1.2', 'python': '3.9.10 | packaged by conda-forge | (main, Feb  1 2022, 21:27:43) \n[Clang 11.1.0 ]', 'platform': 'macOS-12.3-arm64-arm-64bit', 'event': 'created'}
2022-03-19 20:27:45,047 : INFO : collecting all words and their counts
2022-03-19 20:27:45,047 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-03-19 20:27:45,057 : INFO : PROGRESS: at sentence #10000, processed 74045 words, keeping 11439 word types
2022-03-19 20:27:45,066 : INFO : PROGRESS: at sentence #20000, processed 148364 words, keeping 17611 word types
2022-03-19 20:27:45,076 : INFO : PROGRESS: a

Vocab size 27085
CPU times: user 6min 41s, sys: 2min 58s, total: 9min 39s
Wall time: 4min 45s


In [7]:
%%time
w2v_model.train(documents, total_examples=len(train_ds), epochs=W2V_EPOCH)

w2v_dict = {k: w2v_model.wv[k] for k in w2v_model.wv.index_to_key}

pickle.dump(w2v_dict, open(W2V_DICT_PKL, 'wb'))

2022-03-19 20:27:47,209 : INFO : Word2Vec lifecycle event {'msg': 'training model with 8 workers on 27085 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=7 shrink_windows=True', 'datetime': '2022-03-19T20:27:47.209855', 'gensim': '4.1.2', 'python': '3.9.10 | packaged by conda-forge | (main, Feb  1 2022, 21:27:43) \n[Clang 11.1.0 ]', 'platform': 'macOS-12.3-arm64-arm-64bit', 'event': 'train'}
2022-03-19 20:27:48,232 : INFO : EPOCH 1 - PROGRESS: at 17.12% examples, 1755792 words/s, in_qsize 14, out_qsize 1
2022-03-19 20:27:49,234 : INFO : EPOCH 1 - PROGRESS: at 33.17% examples, 1710860 words/s, in_qsize 15, out_qsize 0
2022-03-19 20:27:50,251 : INFO : EPOCH 1 - PROGRESS: at 46.93% examples, 1609325 words/s, in_qsize 14, out_qsize 1
2022-03-19 20:27:51,255 : INFO : EPOCH 1 - PROGRESS: at 59.74% examples, 1539391 words/s, in_qsize 15, out_qsize 0
2022-03-19 20:27:52,257 : INFO : EPOCH 1 - PROGRESS: at 76.20% examples, 1572999 words/s, in_qsize 15, out_qsize 0
20

CPU times: user 16min 16s, sys: 13.7 s, total: 16min 30s
Wall time: 3min 30s


In [42]:
test_lines, test_labels = zip(*[x for x in tqdm(test_ds)])
test_lines = [x.numpy().decode() for x in test_lines]
test_labels = [x.numpy() for x in test_labels]

100%|██████████| 498/498 [00:00<00:00, 20523.34it/s]


In [8]:
max(len(x) for x in documents)

50

In [9]:
max(len(tf.strings.split(x)) for x, y in test_ds)

20

In [116]:
final_dict = {k: v+1 for k,v in w2v_model.wv.key_to_index.items()}
pickle.dump(final_dict, open('sentiment140_final_dict.pkl', 'wb'))

In [117]:
not_exist_id = len(final_dict) + 1    # skip 0
vocab_size = not_exist_id + 1
assert not_exist_id not in list(final_dict.values())
print(not_exist_id)

27086


In [120]:
train_tokens = [[final_dict[token] if token in final_dict else not_exist_id for token in document] for document in documents]
test_tokens = [[final_dict[token] if token in final_dict else not_exist_id for token in document.split()] for document in test_lines]

In [121]:
len(train_tokens)

1600000

In [122]:
train_tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(train_tokens, maxlen=SEQUENCE_LENGTH)
test_tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(test_tokens, maxlen=SEQUENCE_LENGTH)

In [123]:
train_tokens_padded.nbytes

384000000

In [124]:
sp.save_npz('sentiment140_x_train.csr.npz', sp.csr_matrix(train_tokens_padded))
sp.save_npz('sentiment140_x_test.csr.npz', sp.csr_matrix(test_tokens_padded))

In [125]:
np.save('sentiment140_y_train.npy', np.array(labels, dtype=np.uint8))
np.save('sentiment140_y_test.npy', np.array(test_labels, dtype=np.uint8))

In [126]:
!ls -l *.np*

-rw-r--r--  1 maghsk  staff   30720128 Mar 19 10:45 cifar10_x_test.npy
-rw-r--r--  1 maghsk  staff  153600128 Mar 19 10:45 cifar10_x_train.npy
-rw-r--r--  1 maghsk  staff      10128 Mar 19 10:45 cifar10_y_test.npy
-rw-r--r--  1 maghsk  staff      50128 Mar 19 10:45 cifar10_y_train.npy
-rw-r--r--  1 maghsk  staff   65008928 Mar 19 21:42 sentiment140_embedding.npy
-rw-r--r--  1 maghsk  staff      10323 Mar 19 21:46 sentiment140_x_test.csr.npz
-rw-r--r--  1 maghsk  staff   26507784 Mar 19 21:46 sentiment140_x_train.csr.npz
-rw-r--r--  1 maghsk  staff        626 Mar 19 21:46 sentiment140_y_test.npy
-rw-r--r--  1 maghsk  staff    1600128 Mar 19 21:46 sentiment140_y_train.npy


In [127]:
tmp = sp.load_npz('sentiment140_x_train.csr.npz').toarray()
assert np.allclose(tmp, train_tokens_padded)
tmp = sp.load_npz('sentiment140_x_test.csr.npz').toarray()
assert np.allclose(tmp, test_tokens_padded)

In [130]:
embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
for k,v in final_dict.items():
    embedding_matrix[v] = w2v_dict[k]
print(embedding_matrix.shape, embedding_matrix.nbytes)

(27087, 300) 65008800


In [131]:
np.save('sentiment140_embedding.npy', embedding_matrix)