In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
print(tf.__version__)
print(tf.config.list_physical_devices('GPU'))

2.1.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


## Dataset  
For the dataset, we use [Sentiment140](http://help.sentiment140.com/for-students)  
And we will build a simple model that decides positive and negative tweets

In [3]:
# download dataset
import zipfile
import urllib.request
if not os.path.exists("dataset"):
    os.makedirs("dataset")
if not os.path.exists(os.path.join("dataset", "sentiment140")):
    urllib.request.urlretrieve("http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip", os.path.join("dataset", "sentiment140.zip"))
    with zipfile.ZipFile(os.path.join("dataset", "sentiment140.zip"), 'r') as inFile:
        inFile.extractall(os.path.join("dataset", "sentiment140"))
    os.remove(os.path.join("dataset", "sentiment140.zip"))

In [4]:
FILE_PATH_TRAINING = os.path.join("dataset", "sentiment140", "training.1600000.processed.noemoticon.csv")
FILE_PATH_PROCESSED = os.path.join("dataset", "sentiment140", "processed.csv")
FILE_PATH_TEST_NO_USE = os.path.join("dataset", "sentiment140", "testdata.manual.2009.06.14.csv")

## Preprocessing  
1. remove extra space, characters, and process @words  
2. pad the sentences  

In [5]:
if not os.path.exists(FILE_PATH_PROCESSED):
    tmp = pd.read_csv(FILE_PATH_TRAINING, names=["Target", "ID", "Date", "QueryInfo", "UserName", "Text"], encoding="latin-1")
    tmp.to_csv(FILE_PATH_PROCESSED, encoding="utf-8", index=False)

In [6]:
training_data = pd.read_csv(FILE_PATH_PROCESSED, encoding="utf-8")
training_data = training_data[["Target", "Text"]]

In [7]:
import re
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
ch_range = list(range(97, 123)) + list(range(65, 91)) + [ord(' '), ord('\'')]
def process_str(raw_string):
    global ch_range
    global stemmer
    # first remove url, @username, etc
    raw_string = re.sub(r"(@|#)([A-Z]|[a-z]|[0-9]|_)+", "", raw_string)
    raw_string = re.sub(r"(http|https)://([A-Z]|[a-z]|[0-9]|/|\.)+", "", raw_string)
    # remove characters other than [a-z][A-Z][0-9]['!?] or empty space
    new_string = "".join([ch.lower() if ord(ch) in ch_range else ' ' for ch in list(raw_string)])
    # remove extra space, and also convert plural form to singular
    new_string = new_string.strip()
    new_string = " ".join([stemmer.stem(word) for word in new_string.split()])
    return new_string

In [8]:
training_data["Text"] = training_data["Text"].apply(process_str)
training_data["Text"].head(10)

0    awww that a bummer you shoulda got david carr ...
1    is upset that he can't updat his facebook by t...
2    i dive mani time for the ball manag to save th...
3         my whole bodi feel itchi and like it on fire
4    no it not behav at all i'm mad whi am i here b...
5                                   not the whole crew
6                                           need a hug
7    hey long time no see yes rain a bit onli a bit...
8                             nope they didn't have it
9                                         que me muera
Name: Text, dtype: object

In [9]:
# remove empty string rows, or with only one word
training_data.drop(training_data["Text"][training_data["Text"] == ""].index, inplace=True)
training_data = training_data[training_data["Text"].str.contains(" ")]
training_data.drop_duplicates(inplace=True)
training_data.reset_index(drop=True)
training_data.head(10)

Unnamed: 0,Target,Text
0,0,awww that a bummer you shoulda got david carr ...
1,0,is upset that he can't updat his facebook by t...
2,0,i dive mani time for the ball manag to save th...
3,0,my whole bodi feel itchi and like it on fire
4,0,no it not behav at all i'm mad whi am i here b...
5,0,not the whole crew
6,0,need a hug
7,0,hey long time no see yes rain a bit onli a bit...
8,0,nope they didn't have it
9,0,que me muera


In [10]:
training_data.to_csv(os.path.join("dataset", "sentiment140", "final.csv"), index=False)

In [11]:
X_train_processed = training_data["Text"].copy()
y_train = training_data["Target"].copy()

In [12]:
PAD_MAXLEN = 45
MAX_FEATURES = 20000
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(X_train_processed)
X_train_seq = tokenizer.texts_to_sequences(X_train_processed)
X_train_pad = tf.keras.preprocessing.sequence.pad_sequences(X_train_seq, padding="post", maxlen=PAD_MAXLEN)

In [13]:
TOKEN_VOCAB_SIZE = len(tokenizer.word_index) + 1
TOKEN_VOCAB_SIZE

219141

In [14]:
X_train_pad.shape

(1516624, 45)

In [15]:
X_train_pad[:2]

array([[ 481,   13,    4, 1175,    8, 3114,   46,  827, 7365,   12, 1772,
          27,    2,   35,    5,  317,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0],
       [   9,  711,   13,   91,   72,  321,  192,  550,  129,  465,    5,
           7,  301,  339,   87,    4, 1094,  150,   42,  275, 1097,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0]])

In [16]:
y_train_processed = y_train.replace(4, 1).to_numpy().ravel()
y_train_processed.shape

(1516624,)

In [17]:
# shuffle the dataset once to prepare for training
index_permut = np.random.permutation(len(y_train_processed))
X_train_final = np.array(X_train_pad)[index_permut]
y_train_final = np.array(y_train_processed)[index_permut]
print(X_train_final[:5])
print(y_train_final[:5])

[[   22   444  1586    30   151    52   562    14  1632   129  2052  2773
    481     1    55   155     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0]
 [  159   225  1007   158    75   325   178    10  5130     1  1078    49
     45     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0]
 [   20   275   217  3913   125    87     1    15   206    12    33     2
     35    34     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0]
 [   17    49    89   113    13   249    27   280   359     6   878    88
     85    52  5561     7   318    87    91    85    11   113   144    20
    191    10  

In [18]:
with open(os.path.join("dataset", "sentiment140", "data.pickle"), "wb") as outFile:
    pickle.dump([X_train_final, y_train_final, tokenizer], outFile)