In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
# Report the installed TensorFlow version and the GPUs it can see, one per line.
print(tf.__version__, tf.config.list_physical_devices('GPU'), sep="\n")

2.1.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


## Load Data

In [3]:
# Load the pickled bundle stored under dataset/sentiment140 and bind its first
# three items to the training inputs, the training labels and the tokenizer.
# NOTE: pickle.load can execute arbitrary code while reading, so this must only
# ever be pointed at a data.pickle produced by a trusted source.
data_path = os.path.join("dataset", "sentiment140", "data.pickle")
with open(data_path, "rb") as pickle_file:
    data = pickle.load(pickle_file)
X_train, y_train, tokenizer = data[0], data[1], data[2]

In [4]:
# Vocabulary size handed to the Embedding layer (its first argument).
MAX_FEATURES = 20000
# Number of token ids per sample; passed to the Embedding layer as input_length.
PAD_MAXLEN = 45

In [5]:
import re
from nltk.stem.snowball import SnowballStemmer
# English Snowball stemmer shared by process_str.
stemmer = SnowballStemmer("english")
# Code points process_str keeps: a-z, then A-Z, then space and apostrophe.
ch_range = (
    [code for code in range(ord('a'), ord('z') + 1)]
    + [code for code in range(ord('A'), ord('Z') + 1)]
    + [ord(' '), ord("'")]
)
def process_str(raw_string):
    """Clean a raw text string and return its stemmed, lower-cased form.

    Steps, in order:
      1. Delete ``@name`` mentions (letters, digits and underscores after ``@``).
      2. Delete ``http://`` / ``https://`` links made of letters, digits,
         ``/`` and ``.``.
      3. Replace every character whose code point is not in the module-level
         ``ch_range`` (ASCII letters, space, apostrophe) with a space and
         lower-case the rest. Digits and punctuation such as ``!`` or ``?``
         are therefore dropped.
      4. Split on whitespace, stem each word with the module-level ``stemmer``
         and re-join the stems with single spaces.

    Args:
        raw_string: The text to normalise.

    Returns:
        The normalised string; empty if nothing survives the filtering.
    """
    # Mentions go first, then links.
    without_mentions = re.sub(r"@([A-Z]|[a-z]|[0-9]|_)+", "", raw_string)
    without_links = re.sub(r"(http|https)://([A-Z]|[a-z]|[0-9]|/|\.)+", "", without_mentions)

    # Blank out everything that is not an allowed character.
    filtered_chars = []
    for ch in without_links:
        if ord(ch) in ch_range:
            filtered_chars.append(ch.lower())
        else:
            filtered_chars.append(' ')
    cleaned = "".join(filtered_chars).strip()

    # Whitespace runs collapse in split(); each remaining word is stemmed.
    return " ".join(map(stemmer.stem, cleaned.split()))

## Get Mini Data for Experiment

In [6]:
from sklearn.model_selection import train_test_split
# Draw a random 10% subsample of the training data for quick experiments.
# train_test_split returns (X_rest, X_mini, y_rest, y_mini); the 90% remainders
# are discarded via `_`.
# random_state pins the shuffle so the same subsample is drawn on every
# Restart & Run All; without it each run would experiment on different rows.
_, X_train_mini, _, y_train_mini = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)
print(X_train_mini.shape, y_train_mini.shape)

(159636, 45) (159636,)


## Create Model

In [16]:
# Reset Keras' global state so re-running this cell builds a fresh model.
tf.keras.backend.clear_session()

# Embedding -> 1D convolution -> global max pool -> dense classifier head.
model = tf.keras.Sequential([
    # Token ids -> 128-dim vectors; MAX_FEATURES ids, PAD_MAXLEN ids per sample.
    tf.keras.layers.Embedding(MAX_FEATURES, 128, input_length=PAD_MAXLEN),
    tf.keras.layers.Dropout(0.2),
    # 256 filters of width 5 without padding: 45 time steps shrink to 41.
    tf.keras.layers.Conv1D(256, 5, padding='valid', activation='relu', strides=1),
    # Keep the strongest response of each filter across the sequence.
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(200, activation="relu"),
    tf.keras.layers.Dropout(0.2),
    # Single sigmoid unit: binary output trained with binary cross-entropy.
    tf.keras.layers.Dense(1, activation="sigmoid"),
])
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
    loss="binary_crossentropy",
    metrics=['accuracy'],
)
model.summary()

# Train on the full training set, holding out 40% for validation.
# NOTE(review): Keras takes validation_split from the END of the arrays before
# shuffling, so this assumes the rows in data.pickle are already shuffled —
# TODO confirm.
model.fit(X_train, y_train, validation_split=0.4, epochs=5, batch_size=64, verbose=1)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 45, 128)           2560000   
_________________________________________________________________
dropout (Dropout)            (None, 45, 128)           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 41, 256)           164096    
_________________________________________________________________
global_max_pooling1d (Global (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 200)               51400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 2

<tensorflow.python.keras.callbacks.History at 0x19f96a01c18>

In [17]:
# Persist the trained model to models/cnn.h5.
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check.
os.makedirs("models", exist_ok=True)
# The ".h5" suffix makes Keras write an HDF5 file, so request that format
# explicitly; the previous save_format="tf" contradicted the filename.
model.save(os.path.join("models", "cnn.h5"), save_format="h5")