In [18]:
from keras import layers, models
from keras import initializers , preprocessing
import numpy as np
import re

In [4]:
# Alphabet

letters = "abcdefghijklmnopqrstuvwxyz"
numbers = "0123456789"
other_char = "-,;.!?:’/\|_@#$%ˆ&* ̃‘+-=<>()[]{}" # original: -,;.!?:’’’/\|_@#$%ˆ&* ̃‘+-=<>()[]{}
new_line = "\n"

final_string = letters + numbers + other_char + new_line
print(final_string)
print(len(final_string))

abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:’/\|_@#$%ˆ&* ̃‘+-=<>()[]{}

70


In [5]:
# Get text

text = "hello\nallo"

In [19]:
# Character Quantization

# [70 rows (the alphabet), text lenght columns]

def quantize_text(alphabet, text):
    alph_len = len(alphabet)
    results = np.zeros((alph_len, len(text)))
    for i, char in enumerate(text):
        if char in alphabet:
            results[alphabet.index(char), i] = 1
    return results

quantized_text = quantize_text(final_string, text)

In [20]:
print(quantized_text[0])
print(quantized_text[1])

[0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [12]:
# type of text
# __label__2 This is my first PDA/Organizer: I purchased this about 4 months ago and it really is easy to use especially if you are familiar with Microsoft Word and Excel. I use it to copy files from my PC that I want to have handy. The only thing I wish it had is a backlight. But that hasn't come into play too often. I usually have to charge it every 2 or 3 days. I recommend this product if you are not worried about having a lot accessories to go with it, because they are not that many available.
# __label__1 Piece of Crap: I have ordered thousands of items in my lifetime, and bar none, this is the biggest piece of crap I have ever received. It is supposed to come completely put together, but when it arrived, there were no less than 10 little parts that had come off the screen assembly and one screen that was loose and all of its parts dangling. The first thing that happenned is that I cut myself on one of the screens because the protective side had come off. I was bleeding for quite some time. The second thing I noticed was the extremely poor quality of the material and assembly. I am very handy by nature, but even I had to give up trying to put this piece of junk back together again. Plus it was not worth it, even if I had put it together again, all I would have then had was a piece of crap fireplace screen.
# data from  https://www.kaggle.com/bittlingmayer/amazonreviews

dir_name = './data/'
test_file_name = 'test.ft.txt'
train_file_name = 'train.ft.txt'

def getListOfLabelsAndLinesFromFile(file_name): # label 1 = 0, label 2 = 1
    
    labels = []
    lines = []
    total_labels = []
    count = 0
    file = open(file_name, 'r')
    for line in file:
        match = re.match('(__label__([0-9])) (.*)', line)
        if match:
            label = match.group(2)
            text = match.group(3)
            if not label in total_labels: total_labels.append(label)
            labels.append(label - 1)
            lines.append(text)
        else:
            print('---- ERROR ----')
        count += 1
    file.close()
    return { 'labels': labels, 'lines': lines, 'total_labels': total_labels, 'total_lines': count }

train_data_info = getListOfLabelsAndLinesFromFile(dir_name + train_file_name)
test_data_info = getListOfLabelsAndLinesFromFile(dir_name + test_file_name)

In [15]:
# Create Model
# https://medium.com/@romannempyre/sentiment-analysis-using-1d-convolutional-neural-networks-part-1-f8b6316489a2
# https://github.com/chaitjo/character-level-cnn
# https://missinglink.ai/guides/deep-learning-frameworks/keras-conv1d-working-1d-convolutional-neural-networks-keras/
# https://blog.goodaudience.com/introduction-to-1d-convolutional-neural-networks-in-keras-for-time-sequences-3a7ff801a2cf?gi=5c4a324fc922

# Embedding
max_len_text = 1014
max_features_for_embedding = 70 
output_from_embedding = 128

# Large Model
initializer_large = { 'mean': 0.0, 'stddev': 0.02}
activation_conv = 'relu'
conv_output_dim = 256
pool_size = 3
kernel_size_7 = 7 # first_WITH_maxpool
kernel_size_3 = 3 # intermediate_NO_maxpool

# Small model
initializer_small = { 'mean': 0.0, 'stddev': 0.05}
activation_dens = 'relu'
dropout_rate = 0.5

model = models.Sequential() 

# AS the one hot approach, leave us with a very sparse and high dimensional matrix, we apply embedding
# We specify the maximum input length to our Embedding layer
# so we can later flatten the embedded inputs
# Embedding ( 
#    max_features/input_dim = we got just 70 different types of inputs / Size of the vocabulary, 
#    output_dim = learn N dimensional embeddings for each of the input_dim words/characters,
#    input_length needed to flatten = cut the words/characters of each element to that lenght so the max
#                                     qty of elements in each phrase will be that one
# )    
model.add(layers.Embedding(max_features_for_embedding, output_from_embedding, input_length=max_len_text))
# After the Embedding layer,
# our activations have shape `(samples, maxlen, 8)`.
# Output: (batch dimension, input_length, output_dim)

# Convolutional Layer 1D
# In text 
# Conv1D(
#    filters = dimensionality of the output space
#    kernel_size = window_size
#    stride = position jumps of the window (defaults to 1)
# )
initializer_large_model = initializers.RandomNormal(mean=0.0, stddev=0.02, seed=None)
model.add(layers.Conv1D(filters=conv_output_dim, kernel_size=kernel_size_7, activation=activation_conv, kernel_initializer=initializer_large_model))

# Max Pooling 1D
# MaxPool1D(
#    pool_size = window size
#    strides = by default equals pool_size, so each window doesn't overlap
# )
model.add(layers.MaxPool1D(pool_size=pool_size))
model.add(layers.Conv1D(filters=conv_output_dim, kernel_size=kernel_size_7, activation=activation_conv))
model.add(layers.MaxPool1D(pool_size=pool_size))
model.add(layers.Conv1D(filters=conv_output_dim, kernel_size=kernel_size_3, activation=activation_conv))
model.add(layers.Conv1D(filters=conv_output_dim, kernel_size=kernel_size_3, activation=activation_conv))
model.add(layers.Conv1D(filters=conv_output_dim, kernel_size=kernel_size_3, activation=activation_conv))
model.add(layers.Conv1D(filters=conv_output_dim, kernel_size=kernel_size_3, activation=activation_conv))
model.add(layers.MaxPool1D(pool_size=pool_size))

# Second part, full connected layers
initializer_large_model = initializers.RandomNormal(mean=0.0, stddev=0.05, seed=None)
model.add(layers.Flatten())
model.add(layers.Dense(1024, activation=activation_dens, kernel_initializer=initializer_large_model))
model.add(layers.Dropout(dropout_rate))
model.add(layers.Dense(1024, activation=activation_dens))
model.add(layers.Dropout(dropout_rate))

# Last layer, according to problem to solve
# (just have 2 classes)
model.add(layers.Dense(1, activation='sigmoid'))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1014, 128)         8960      
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 1008, 256)         229632    
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 336, 256)          0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 330, 256)          459008    
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 110, 256)          0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 108, 256)          196864    
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 106, 256)          196864    
__________

In [17]:
# add optimizer and loss function and metrics to return 
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1014, 128)         8960      
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 1008, 256)         229632    
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 336, 256)          0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 330, 256)          459008    
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 110, 256)          0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 108, 256)          196864    
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 106, 256)          196864    
__________

In [23]:
# treat the data before passing it to the model
# train_data_info 
# test_data_info 
# { 'labels': , 'lines': , 'total_labels': , 'total_lines':  }
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(train_data_info['lines'])

print(tokenizer)
sequences = tokenizer.texts_to_sequences(train_data_info['lines'])
print(sequences)

#one_hot_results = tokenizer.texts_to_matrix(train_data_info['lines'], mode='binary')

KeyboardInterrupt: 

In [None]:
# run the model
history = model.fit()