In [1]:
from keras import layers, models
from keras import initializers , preprocessing, utils
import numpy as np
import re
import math
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
# Alphabet
letters = "abcdefghijklmnopqrstuvwxyz"
numbers = "0123456789"
other_char = ",;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}" # original: -,;.!?:’’’/\|_@#$%ˆ&* ̃‘+-=<>()[]{}, changed: -,;.!?:’/\|_@#$%ˆ&* ̃‘+-=<>()[]{}
new_line = "\n"

alphabet = letters + numbers + other_char + new_line
print(alphabet)
print(len(alphabet))

abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'"/\|_@#$%^&*~`+-=<>()[]{}

69


In [3]:
# type of text
# __label__2 This is my first PDA/Organizer: I purchased this about 4 months ago and it really is easy to use especially if you are familiar with Microsoft Word and Excel. I use it to copy files from my PC that I want to have handy. The only thing I wish it had is a backlight. But that hasn't come into play too often. I usually have to charge it every 2 or 3 days. I recommend this product if you are not worried about having a lot accessories to go with it, because they are not that many available.
# __label__1 Piece of Crap: I have ordered thousands of items in my lifetime, and bar none, this is the biggest piece of crap I have ever received. It is supposed to come completely put together, but when it arrived, there were no less than 10 little parts that had come off the screen assembly and one screen that was loose and all of its parts dangling. The first thing that happenned is that I cut myself on one of the screens because the protective side had come off. I was bleeding for quite some time. The second thing I noticed was the extremely poor quality of the material and assembly. I am very handy by nature, but even I had to give up trying to put this piece of junk back together again. Plus it was not worth it, even if I had put it together again, all I would have then had was a piece of crap fireplace screen.
# data from  https://www.kaggle.com/bittlingmayer/amazonreviews

dir_name = './data/'
test_file_name = 'test.ft.txt'
train_file_name = 'train.ft.txt'


def getListOfLabelsAndLinesFromFile(file_name): # label 1 = 0, label 2 = 1
    
    labels = []
    lines = []
    total_labels = []
    count = 0
    file = open(file_name, 'r')
    for line in file:
        match = re.match('(__label__([0-9])) (.*)', line)
        if match:
            label = int(match.group(2)) - 1 # to make the label start in 0 (ZERO)
            text = match.group(3)
            if not label in total_labels: total_labels.append(label)
            labels.append(label)
            lines.append(text)
        else:
            print('---- ERROR ----')
        count += 1
    file.close()
    return { 'labels': labels, 'lines': lines, 'total_labels': total_labels, 'total_lines': count }

train_data_info = getListOfLabelsAndLinesFromFile(dir_name + train_file_name)
test_data_info = getListOfLabelsAndLinesFromFile(dir_name + test_file_name)

print(test_data_info['total_lines'])
print(train_data_info['total_lines'])
# ratio 9 

400000
3600000


In [4]:
# Constants

# Embedding
max_len_text = 1014
max_features_for_embedding = len(alphabet)
output_from_embedding = 128

# Large Model
initializer_large = { 'mean': 0.0, 'stddev': 0.02}
activation_conv = 'relu'
conv_output_dim = 256
pool_size = 3
kernel_size_7 = 7 # first_WITH_maxpool
kernel_size_3 = 3 # intermediate_NO_maxpool

# Small model
initializer_small = { 'mean': 0.0, 'stddev': 0.05}
activation_dens = 'relu'
dropout_rate = 0.5

In [5]:
# Create Model
# https://medium.com/@romannempyre/sentiment-analysis-using-1d-convolutional-neural-networks-part-1-f8b6316489a2
# https://missinglink.ai/guides/deep-learning-frameworks/keras-conv1d-working-1d-convolutional-neural-networks-keras/
# https://blog.goodaudience.com/introduction-to-1d-convolutional-neural-networks-in-keras-for-time-sequences-3a7ff801a2cf?gi=5c4a324fc922
# https://medium.com/@bramblexu/character-level-cnn-with-keras-50391c3adf33

model = models.Sequential() 

# AS the one hot approach, leave us with a very sparse and high dimensional matrix, we apply embedding
# We specify the maximum input length to our Embedding layer
# so we can later flatten the embedded inputs
# Embedding ( 
#    max_features/input_dim = we got just 70 different types of inputs / Size of the vocabulary, 
#    output_dim = learn N dimensional embeddings for each of the input_dim words/characters,
#    input_length needed to flatten = cut the words/characters of each element to that lenght so the max
#                                     qty of elements in each phrase will be that one
# )    
model.add(layers.Embedding(max_features_for_embedding, output_from_embedding, input_length=max_len_text))
# After the Embedding layer,
# our activations have shape `(samples, maxlen, 8)`.
# Output: (batch dimension, input_length, output_dim)

# Convolutional Layer 1D
# In text 
# Conv1D(
#    filters = dimensionality of the output space
#    kernel_size = window_size
#    stride = position jumps of the window (defaults to 1)
# )
initializer_large_model = initializers.RandomNormal(mean=0.0, stddev=0.02, seed=None)
model.add(layers.Conv1D(filters=conv_output_dim, kernel_size=kernel_size_7, activation=activation_conv, kernel_initializer=initializer_large_model))

# Max Pooling 1D
# MaxPool1D(
#    pool_size = window size
#    strides = by default equals pool_size, so each window doesn't overlap
# )
model.add(layers.MaxPool1D(pool_size=pool_size))
model.add(layers.Conv1D(filters=conv_output_dim, kernel_size=kernel_size_7, activation=activation_conv))
model.add(layers.MaxPool1D(pool_size=pool_size))
model.add(layers.Conv1D(filters=conv_output_dim, kernel_size=kernel_size_3, activation=activation_conv))
model.add(layers.Conv1D(filters=conv_output_dim, kernel_size=kernel_size_3, activation=activation_conv))
model.add(layers.Conv1D(filters=conv_output_dim, kernel_size=kernel_size_3, activation=activation_conv))
model.add(layers.Conv1D(filters=conv_output_dim, kernel_size=kernel_size_3, activation=activation_conv))
model.add(layers.MaxPool1D(pool_size=pool_size))

# Second part, full connected layers
initializer_small_model = initializers.RandomNormal(mean=0.0, stddev=0.05, seed=None)
model.add(layers.Flatten())
model.add(layers.Dense(1024, activation=activation_dens, kernel_initializer=initializer_small_model))
model.add(layers.Dropout(dropout_rate))
model.add(layers.Dense(1024, activation=activation_dens))
model.add(layers.Dropout(dropout_rate))

# Last layer, according to problem to solve
# (just have 2 classes)
# But, it has to be though a Dense(2,...) , with Dense(1,...) not working so neither do activation "sigmoid"
model.add(layers.Dense(2, activation='softmax')) 

model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1014, 128)         8832      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1008, 256)         229632    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 336, 256)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 330, 256)          459008    
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 110, 256)          0         
_________________________________________________________________
conv1d_3 (Conv1D)    

In [6]:
# add optimizer and loss function and metrics to return 
# https://medium.com/octavian-ai/which-optimizer-and-learning-rate-should-i-use-for-deep-learning-5acb418f9b2
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [7]:
def getAlphabetDict(alpha):
    d = {}; c = 1
    for a in alpha: d[a] = c; c += 1
    return d

In [8]:
def getDataAndClasses(tokenizer, info, alpha, max_len_text, batch_len = None, from_i = 0):
        
    lines = info['lines']
    labels = info['labels']
    if batch_len != None: 
        lines = lines[from_i:(from_i + batch_len)]
        labels = labels[from_i:(from_i + batch_len)]
    
    sequences = tokenizer.texts_to_sequences(lines)
    """EX: [[15, 9, 2, 5, 3, 1, 13, 12, 28, 1, 14, 17 ...""" # Each, number represents a letter in the alphabet
    data = preprocessing.sequence.pad_sequences(sequences, maxlen=max_len_text, padding='post')
    # Data of shape (quantity of info (sentences), max_len_text)
    """EX: array([[15,  9,  2, ...,  0,  0,  0],
       [ 6,  8,  2, ...,  0,  0,  0],
       [21,  5,  3, ...,  0,  0,  0]], dtype=int32)"""  
    
    classes = utils.to_categorical(labels, num_classes=len(info['total_labels']))
    """EX: array([[0., 1.],
       [0., 1.],
       [1., 0.]], dtype=float32)"""
    
    return {'data': data, 'classes': classes}

In [9]:
hist_arr = []
def runByBatches(model, train_info, test_info, alpha, max_len_text, batch_size, test_batch_size, until = None):
    
    # Preprocess Data 
    tokenizer = preprocessing.text.Tokenizer(char_level=True, lower=True, filters=alpha) # , oov_token= ?  
    tokenizer.word_index = getAlphabetDict(alpha)
    """EX: {'a': 1,
         'b': 2,
         'c': 3 ... """
    
    # After each epoch , the test data is validated
    # test_data = getDataAndClasses(tokenizer, test_info, alpha, max_len_text)
    test_data = getDataAndClasses(tokenizer, test_info, alpha, max_len_text, test_batch_size,0)
        
    total_train_len = train_info['total_lines']
    total_tst_len = test_info['total_lines']
    count = count_tst = 0
    
    while count < total_train_len and (until != None and count < until):
        
        # Get elements
        size = batch_size if batch_size < (total_train_len - count) else total_train_len - count
        size_tst = test_batch_size if test_batch_size < (total_tst_len - count_tst) else total_tst_len - count_tst
        train_data = getDataAndClasses(tokenizer, train_info, alpha, max_len_text, size, count)
        #test_data = getDataAndClasses(tokenizer, test_info, alpha, max_len_text, size_tst, count_tst)
        print(f'-----  doing from trining/test: {count} - {size + count}') # / {count_tst} - {size_tst}')
        count += batch_size   
        count_tst += test_batch_size
        
        history = model.fit(train_data['data'], 
                            train_data['classes'],
                            validation_data=(test_data['data'], test_data['classes']),
                            epochs=4, # 5000
                            batch_size=128,
                            verbose=2
                           )
        hist_arr.append(history.history)
        
        train_data = None
        #test_data = None

In [10]:
def getWindowOfBatchInArray(a, from_i, to_i):
    tot_len = len(a)
    until_pos = from_i + to_i
    remaining = 0
    if until_pos > tot_len: 
        remaining = until_pos - tot_len 
        return a[from_i:(tot_len + 1)]+a[0:remaining]
    else:
        return a[from_i:to_i]

In [11]:
test_batch_size = math.ceil(test_data_info['total_lines']/\
    (train_data_info['total_lines']/test_data_info['total_lines']))
test_size = 40000
train_size = 40000
n = 10
runByBatches(model, 
            train_data_info, 
            test_data_info, 
            alphabet, 
            max_len_text, 
            train_size, #test_data_info['total_lines'],
            test_size, # test batch size
            train_size * n # until n chunks of data (n repetitions)
           )


-----  doing from trining/test: 0 - 40000
Instructions for updating:
Use tf.cast instead.
Train on 40000 samples, validate on 40000 samples
Epoch 1/4
 - 402s - loss: 0.6303 - acc: 0.6198 - val_loss: 0.4988 - val_acc: 0.7622
Epoch 2/4
 - 362s - loss: 0.4479 - acc: 0.7941 - val_loss: 0.4718 - val_acc: 0.7666
Epoch 3/4
 - 377s - loss: 0.3468 - acc: 0.8493 - val_loss: 0.3554 - val_acc: 0.8433
Epoch 4/4
 - 368s - loss: 0.2731 - acc: 0.8875 - val_loss: 0.3206 - val_acc: 0.8631
-----  doing from trining/test: 40000 - 80000
Train on 40000 samples, validate on 40000 samples
Epoch 1/4
 - 372s - loss: 0.3139 - acc: 0.8672 - val_loss: 0.3053 - val_acc: 0.8682
Epoch 2/4
 - 387s - loss: 0.2446 - acc: 0.9004 - val_loss: 0.3084 - val_acc: 0.8773
Epoch 3/4
 - 372s - loss: 0.1932 - acc: 0.9241 - val_loss: 0.3088 - val_acc: 0.8762
Epoch 4/4
 - 382s - loss: 0.1440 - acc: 0.9444 - val_loss: 0.3359 - val_acc: 0.8724
-----  doing from trining/test: 80000 - 120000
Train on 40000 samples, validate on 40000 sam