In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
print(tf.__version__)
print(tf.config.list_physical_devices('GPU'))

2.1.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


## Dataset  
For the dataset, we use [Sentiment140](http://help.sentiment140.com/for-students)  
And we will build a simple model that decides positive and negative tweets

In [3]:
# download dataset
import zipfile
import urllib.request
if not os.path.exists("dataset"):
    os.makedirs("dataset")
if not os.path.exists(os.path.join("dataset", "sentiment140")):
    urllib.request.urlretrieve("http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip", os.path.join("dataset", "sentiment140.zip"))
    with zipfile.ZipFile(os.path.join("dataset", "sentiment140.zip"), 'r') as inFile:
        inFile.extractall(os.path.join("dataset", "sentiment140"))
    os.remove(os.path.join("dataset", "sentiment140.zip"))

In [4]:
FILE_PATH_TRAINING = os.path.join("dataset", "sentiment140", "training.1600000.processed.noemoticon.csv")
FILE_PATH_PROCESSED = os.path.join("dataset", "sentiment140", "processed.csv")
FILE_PATH_TEST_NO_USE = os.path.join("dataset", "sentiment140", "testdata.manual.2009.06.14.csv")

## Preprocessing  
1. remove extra space, characters, and process @words  
2. pad the sentences  

In [5]:
if not os.path.exists(FILE_PATH_PROCESSED):
    tmp = pd.read_csv(FILE_PATH_TRAINING, names=["Target", "ID", "Date", "QueryInfo", "UserName", "Text"], encoding="latin-1")
    tmp.to_csv(FILE_PATH_PROCESSED, encoding="utf-8", index=False)

In [6]:
training_data = pd.read_csv(FILE_PATH_PROCESSED, encoding="utf-8")

In [7]:
# select the needed columns
X_train = training_data["Text"].copy()
y_train = training_data["Target"].copy()
print(X_train.shape, y_train.shape)

(1600000,) (1600000,)


In [8]:
X_train.head(10)

0    @switchfoot http://twitpic.com/2y1zl - Awww, t...
1    is upset that he can't update his Facebook by ...
2    @Kenichan I dived many times for the ball. Man...
3      my whole body feels itchy and like its on fire 
4    @nationwideclass no, it's not behaving at all....
5                        @Kwesidei not the whole crew 
6                                          Need a hug 
7    @LOLTrish hey  long time no see! Yes.. Rains a...
8                 @Tatiana_K nope they didn't have it 
9                            @twittera que me muera ? 
Name: Text, dtype: object

In [9]:
y_train.head(10)

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: Target, dtype: int64

In [10]:
import re
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
ch_range = list(range(97, 123)) + list(range(65, 91)) + [ord(' '), ord('\'')]
def process_str(raw_string):
    global ch_range
    global stemmer
    # first remove url, @username, etc
    raw_string = re.sub(r"@([A-Z]|[a-z]|[0-9]|_)+", "", raw_string)
    raw_string = re.sub(r"(http|https)://([A-Z]|[a-z]|[0-9]|/|\.)+", "", raw_string)
    # remove characters other than [a-z][A-Z][0-9]['!?] or empty space
    new_string = "".join([ch.lower() if ord(ch) in ch_range else ' ' for ch in list(raw_string)])
    # remove extra space, and also convert plural form to singular
    new_string = new_string.strip()
    new_string = " ".join([stemmer.stem(word) for word in new_string.split()])
    return new_string

In [11]:
X_train_processed = X_train.apply(process_str)
X_train_processed.head(10)

0    awww that a bummer you shoulda got david carr ...
1    is upset that he can't updat his facebook by t...
2    i dive mani time for the ball manag to save th...
3         my whole bodi feel itchi and like it on fire
4    no it not behav at all i'm mad whi am i here b...
5                                   not the whole crew
6                                           need a hug
7    hey long time no see yes rain a bit onli a bit...
8                             nope they didn't have it
9                                         que me muera
Name: Text, dtype: object

In [12]:
# remove empty string rows
index_to_remove = X_train_processed[X_train_processed == ""].index
X_train_processed = X_train_processed.drop(index_to_remove)
y_train = y_train.drop(index_to_remove)
X_train_processed = X_train_processed.reset_index()
y_train = y_train.reset_index()

In [13]:
X_train_processed.head(10)

Unnamed: 0,index,Text
0,0,awww that a bummer you shoulda got david carr ...
1,1,is upset that he can't updat his facebook by t...
2,2,i dive mani time for the ball manag to save th...
3,3,my whole bodi feel itchi and like it on fire
4,4,no it not behav at all i'm mad whi am i here b...
5,5,not the whole crew
6,6,need a hug
7,7,hey long time no see yes rain a bit onli a bit...
8,8,nope they didn't have it
9,9,que me muera


In [14]:
y_train.head(10)

Unnamed: 0,index,Target
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
5,5,0
6,6,0
7,7,0
8,8,0
9,9,0


In [15]:
X_train_processed = X_train_processed["Text"]
y_train = y_train["Target"]

In [16]:
processed_data = pd.DataFrame(np.array([y_train, X_train_processed]).T, columns=["Target", "Text"])
processed_data.to_csv(os.path.join("dataset", "sentiment140", "final.csv"), index=False)

In [17]:
PAD_MAXLEN = 80
TOKENIZE_MAXWORDS = 30
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=TOKENIZE_MAXWORDS)
tokenizer.fit_on_texts(X_train_processed)
X_train_seq = tokenizer.texts_to_sequences(X_train_processed)
X_train_pad = tf.keras.preprocessing.sequence.pad_sequences(X_train_seq, padding="post", maxlen=PAD_MAXLEN)

In [18]:
TOKEN_VOCAB_SIZE = len(tokenizer.word_index) + 1
TOKEN_VOCAB_SIZE

225642

In [19]:
X_train_pad.shape

(1596353, 80)

In [20]:
X_train_pad[:2]

array([[13,  4,  8, 12, 26,  2,  5,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 9, 13,  5,  7,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])

In [21]:
y_train_processed = y_train.replace(4, 1).to_numpy().ravel()
y_train_processed.shape

(1596353,)

In [22]:
# shuffle the dataset once to prepare for training
index_permut = np.random.permutation(len(y_train_processed))
X_train_final = np.array(X_train_pad)[index_permut]
y_train_final = np.array(y_train_processed)[index_permut]
print(X_train_final[:5])
print(y_train_final[:5])

[[ 9 14 11 19 16 14  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0]
 [ 1 25 10  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0]
 [ 1  5  1 29  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0]
 [ 4 26  4 12  4 19 23  5 21  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0]
 [24

In [23]:
with open(os.path.join("dataset", "sentiment140", "data.pickle"), "wb") as outFile:
    pickle.dump([X_train_final, y_train_final, tokenizer], outFile)