https://www.nltk.org/api/nltk.tokenize.treebank.html

In [None]:
# from gensim.models import KeyedVectors

# # Replace 'path/to/word2vec/model.bin' with the path to your pretrained Word2Vec model
# model_path = 'path/to/word2vec/model.bin'
# word2vec_model = KeyedVectors.load_word2vec_format(model_path, binary=True)

# # Access the vector for a specific word
# vector_for_word = word2vec_model['example']


In [1]:
import re
import pandas as pd

import plotly.express as px

import matplotlib.pyplot as plt
import numpy as np
import fasttext
import fasttext.util
fasttext.util.download_model('en', if_exists='ignore')


import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from highlight_text import HighlightText, ax_text, fig_text

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, classification_report


import nltk
from nltk.tokenize import TreebankWordTokenizer

# from sklearn_crfsuite import CRF
from sklearn.model_selection import train_test_split

# Data preparation

## Tokenize sentence and aspect BIO encoding class

In [2]:
class SentenceToken:
  '''
    SentenceToken

  '''
  def __init__(self, sentence, aspect_type=None, aspects=None, sentence_id=None):
    
    if sentence_id is not None:
      print(sentence_id)

    self.sentence_id = sentence_id
    self.sentence = sentence.replace(u"\u00A0", " ")
                            
    self.aspect_bio_tags = None
    self.unified_aspect_bio_tags = None
    self.token_span = None
    self.space_pre_token = None

    # Tokenize sentence
    self.__tokenize_sentence(self.sentence)

    if aspect_type == 'dict':
      self.set_aspect_tagging_from_dict(aspects)
    elif aspect_type == 'bio':
      self.set_aspect_bio_tags(aspects)
    elif aspect_type == 'unified bio':
      self.set_aspect_unified_bio_tags(aspects)
  
  def __tokenize_sentence(self, sentence):
    # self.sentence = sentence
    
    token_span = list(TreebankWordTokenizer().span_tokenize(sentence))

    new_token_span = []
    
    for k in token_span:
      token_start = k[0]
      token_end = k[1]

      token = sentence[token_start:token_end]
      sub_tokens = re.split(r'([^\w,\d])', token)
      
      sub_token_start = token_start
      for sub_token in sub_tokens:
        if len(sub_token) != 0:
          sub_token_end = sub_token_start + len(sub_token)
          new_token_span.append((sub_token_start, sub_token_end))
          sub_token_start = sub_token_end
    
    self.token_span = new_token_span
    self.space_pre_token = [True if sentence[k[0]-1:k[0]] == ' ' else False for i,k in enumerate(new_token_span)]


  def set_aspect_tagging_from_dict(self, aspects):
    polarity_map = {'positive':'POS'
              ,'negative': 'NEG'
              ,'conflict': 'CON'
              ,'neutral': 'NEU'}
    
    bio_tags = ['O'] * len(self.token_span)
    unified_bio_tags = bio_tags

    for x in aspects:
      if x['term'] != '':
        aspect_from = int(x['from'])
        aspect_to = int(x['to'])
        polarity = '-' + polarity_map[x['polarity']]
        aspect_token_ids =  [i for i, v in enumerate(self.token_span) if (v[0] >= aspect_from) & (v[1] <= aspect_to)]
        aspect_from_index = min(aspect_token_ids)
        aspect_to_index = max(aspect_token_ids)
        aspect_length = aspect_to_index - aspect_from_index
        bio_tags = bio_tags[:aspect_from_index] + ['B'] + ['I'] * (aspect_length) + bio_tags[aspect_to_index+1:]
        unified_bio_tags = unified_bio_tags[:aspect_from_index] + ['B' + polarity] + ['I'+ polarity] * (aspect_length) + unified_bio_tags[aspect_to_index+1:]
    
    self.set_aspect_bio_tags(bio_tags)
    self.set_aspect_unified_bio_tags(unified_bio_tags)

  def rebuild_sentence_from_token(self):
    return ''.join([(' ' if self.space_pre_token[i] else '') + self.sentence[k[0]:k[1]] for i, k in enumerate(self.token_span)])
  
  def get_sentence_token_with_aspect_bio_tag(self, unified_bio_tag=False):
    if (unified_bio_tag == False) & (self.aspect_bio_tags is None):
      raise Exception('No BIO tags provided. Use "SentenceToken.set_aspect_bio_tags()" method to add bio_tags')
    elif (unified_bio_tag == True) & (self.aspect_unified_bio_tags is None):
      raise Exception('No Unified BIO tags provided. Use "SentenceToken.set_aspect_unified_bio_tags()" method to add unified_bio_tags')
    else:
      return [(self.sentence[k[0]:k[1]], self.aspect_unified_bio_tags[i] if unified_bio_tag else self.aspect_bio_tags[i]) for i, k in enumerate(self.token_span)]

  def set_aspect_bio_tags(self, aspect_bio_tags):
    self.aspect_bio_tags = aspect_bio_tags
    self.aspect_unified_bio_tags = aspect_bio_tags

  def set_aspect_unified_bio_tags(self, aspect_unified_bio_tags):
    self.aspect_unified_bio_tags = aspect_unified_bio_tags
    self.aspect_bio_tags = [k[0:1] for k in aspect_unified_bio_tags]

  def check_rebuild_sentence_from_token(self):
    return re.sub(r'\s+', ' ',self.sentence.strip()) == self.rebuild_sentence_from_token().strip()
  
  def get_tokens(self):
    return [self.sentence[k[0]:k[1]] for k in self.token_span]

  def check_rebuild_aspect_terms(self, aspect_dict):
    aspect_dict = sorted(aspect_dict, key=lambda d: int(d['from']))
    aspect_input = ', '.join([k['term'] for k in aspect_dict])
    aspect_computed = ''.join([(', ' if k == 'B' else '') + self.sentence[self.token_span[i][0]:self.token_span[i][1]] for i,k in enumerate(self.aspect_bio_tags) if k in ['B','I'] ])[2:]
    
    return (aspect_input == aspect_computed, aspect_input, aspect_computed)

  def __str__(self):
    return self.rebuild_sentence_from_token()


In [3]:
df_train = pd.read_json('data/laptop/train.json')
# df_train.set_index('id', inplace=True).reset_index()
print('df_train shape: ', df_train.shape)

df_val = pd.read_json('data/laptop/validate.json') # This will only be used for the very last step to evaluate how well the model is, but is input now for validating the BIO tagging to ensure the function works properly
# df_val.set_index('id', inplace=True).reset_index()
print('df_val shape: ', df_val.shape)

# First, I will need to drop some duplicated data in our training dataset, as identified in the EDA process.
df_train.drop_duplicates(subset='text', inplace=True)

# We have removed 12 duplicated records in our training dataset
print(df_train.shape)

df_train shape:  (3048, 3)
df_val shape:  (800, 3)
(3036, 3)


In [4]:
df_train['sentence_token'] = df_train.apply(lambda x: SentenceToken(x['text'], 'dict', x['aspects']), axis=1)
df_train['sentence_check'] = df_train.apply(lambda x: x['sentence_token'].check_rebuild_sentence_from_token(), axis=1)
df_train['aspect_check'] = df_train.apply(lambda x: x['sentence_token'].check_rebuild_aspect_terms(x['aspects']), axis=1)

df_val['sentence_token'] = df_val.apply(lambda x: SentenceToken(x['text'], 'dict', x['aspects']), axis=1)
df_val['sentence_check'] = df_val.apply(lambda x: x['sentence_token'].check_rebuild_sentence_from_token(), axis=1)
df_val['aspect_check'] = df_val.apply(lambda x: x['sentence_token'].check_rebuild_aspect_terms(x['aspects']), axis=1)

In [5]:
print('# of df_train records having tokenizing issues: ', len(df_train[df_train['sentence_check']==False]))
print('# of df_train records having aspect bio tagging issues: ', len(df_train[df_train['aspect_check']==False]))
print('# of df_val records having tokenizing issues: ', len(df_val[df_val['sentence_check']==False]))
print('# of df_val records having  aspect bio tagging issues: ', len(df_val[df_val['aspect_check']==False]))

# of df_train records having tokenizing issues:  0
# of df_train records having aspect bio tagging issues:  0
# of df_val records having tokenizing issues:  0
# of df_val records having  aspect bio tagging issues:  0


# EDA

## Sentence length

In [None]:
df_train['sentence_length'] = df_train.apply(lambda x: len(x['sentence_token'].get_tokens()), axis=1)

In [None]:
fig = px.ecdf(df_train['sentence_length'], marginal="histogram", title='90% reviews have <= 30 words')
fig. update_layout(showlegend=False,
                   xaxis_title="# of words in reviews",
                   yaxis_title="Review counts")
fig.show()

# CRF with FastText embeddings

In [None]:
def sentence_embeddings_to_dict (sentence_embeddings, window = 5):
  sentence_features = []
  for i in range(0, len(sentence_embeddings)):
    word_features = {}
    word_embeddings = sentence_embeddings[i]
    word_features.update(word_embeddings_to_dict(word_embeddings,f'word'))
    
    if i > 0:
      for k in range(1, min(window, i)+1):
          # prev_word = tokens[i - k]
          prev_word_embeddings = sentence_embeddings[i-k]
          # print('prev_word: ', prev_word)
          word_features.update(word_embeddings_to_dict(prev_word_embeddings,f'-{k}:word'))
    else:
        word_features['BOS'] = True  # Beginning of sentence

    if i < len(sentence_embeddings) - 1:
      for k in range(1, min(window, len(sentence_embeddings) - i - 1)+1):
        # next_word = tokens[i + k]
        next_word_embeddings = sentence_embeddings[i+k]
        # print('next_word: ', next_word)
        # print(next_word_embeddings)
        word_features.update(word_embeddings_to_dict(next_word_embeddings,f'+{k}:word'))
    else:
        word_features['EOS'] = True  # End of sentence

    sentence_features.append(word_features)
    
  return sentence_features

def word_embeddings_to_dict(embeddings, feature_prefix='word'):
  
  
  word_features = {}
  for iv,value in enumerate(embeddings):
    word_features[f'{feature_prefix}:v_{iv}'] = value
  
  return word_features

In [None]:
fasttext_model = fasttext.load_model('cc.en.300.bin')
X_train_word_embeddings = [[fasttext_model.get_word_vector(token) for token in sentence_token.get_tokens()] for sentence_token in df_train['sentence_token']]

In [None]:
X_train_dict = [sentence_embeddings_to_dict(sentence_embeddings, 5) for sentence_embeddings in X_train_word_embeddings]

In [None]:
y_train = df_train['sentence_token'].apply(lambda x: x.aspect_unified_bio_tags)

In [None]:
X_train_crf, X_test_crf, y_train, y_test = train_test_split(X_train_dict, y_train, test_size=0.2, random_state=42)

In [None]:
# Create and train CRF model
crf_model = CRF(algorithm='lbfgs',
                max_iterations=100,
                c1=0.5,
                c2=0.05)

# There is this error existing with this library: 'CRF' object has no attribute 'keep_tempfiles'
# which has not been resolved and we can bypass it using this trick.
try:
  crf_model.fit(X_train_crf, y_train)
except AttributeError:
  pass

In [None]:
y_train_crf_pred = crf_model.predict(X_train_crf)

y_train_flat = [tag for sentence in y_train for tag in sentence]
y_train_crf_pred_flat = [tag for sentence in y_train_crf_pred for tag in sentence]


# Evaluate the model
accuracy = accuracy_score(y_train_flat, y_train_crf_pred_flat)
classification_rep = classification_report(y_train_flat, y_train_crf_pred_flat)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_rep)

# Compute confusion matrix
conf_matrix = confusion_matrix(y_train_flat, y_train_crf_pred_flat, labels=crf_model.classes_)

# Plot the confusion matrix with seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=crf_model.classes_, yticklabels=crf_model.classes_, vmax=100, vmin=1)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
y_test_crf_pred = crf_model.predict(X_test_crf)

y_test_flat = [tag for sentence in y_test for tag in sentence]
y_test_crf_pred_flat = [tag for sentence in y_test_crf_pred for tag in sentence]


# Evaluate the model
accuracy = accuracy_score(y_test_flat, y_test_crf_pred_flat)
classification_rep = classification_report(y_test_flat, y_test_crf_pred_flat)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_rep)

# Compute confusion matrix
conf_matrix = confusion_matrix(y_test_flat, y_test_crf_pred_flat, labels=crf_model.classes_)

# Plot the confusion matrix with seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=crf_model.classes_, yticklabels=crf_model.classes_, vmax=100, vmin=1)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()


In [None]:
missing_words = set(token  for sentence_token in df_train['sentence_token'] for token in sentence_token.get_tokens() if token not in fasttext_model.words)
len(missing_words)

# Bi-LSTM

In [6]:
import tensorflow as tf
from tf2CRF import CRF
from tensorflow.keras.layers import Input, Embedding, Bidirectional, GRU, Dense
from tensorflow.keras.models import Model
from tf2crf import CRF, ModelWithCRFLoss

inputs = Input(shape=(None,), dtype='int32')
output = Embedding(100, 40, trainable=True, mask_zero=True)(inputs)
output = Bidirectional(GRU(64, return_sequences=True))(output)
crf = CRF(units=9, type='float32')
output = crf(output)
base_model = Model(inputs, output)
model = ModelWithCRFLoss(base_model, sparse_target=True)
model.compile(optimizer='adam')

x = [[5, 2, 3] * 3] * 10
y = [[1, 2, 3] * 3] * 10

model.fit(x=x, y=y, epochs=2, batch_size=2)
model.save('tests/1')

2024-01-03 22:37:22.537588: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


ModuleNotFoundError: No module named 'tf2CRF'

In [None]:
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam
# fix random seed for reproducibility
tf.random.set_seed(7)

In [None]:
X_words = df_train['sentence_token'].apply(lambda x: x.get_tokens())

In [None]:
# Create a tokenizer
tokenizer = Tokenizer(char_level=False, lower=True)
# Fit on your list of words
tokenizer.fit_on_texts(X_words)

word_to_number = tokenizer.word_index
number_to_word = tokenizer.index_word
fasttext_model = fasttext.load_model('cc.en.300.bin')
word_to_features = dict((word_to_number[token], fasttext_model.get_word_vector(token)) for token in word_to_number.keys())
all_words = list(word_to_number.keys())

In [None]:

y_words = df_train['sentence_token'].apply(lambda x: x.aspect_unified_bio_tags)
# Create a tokenizer
y_tokenizer = Tokenizer(char_level=False, lower=False)
# Fit on your list of words
y_tokenizer.fit_on_texts(y_words)


tag_to_number = y_tokenizer.word_index
number_to_tag = y_tokenizer.index_word

all_tags = list(tag_to_number.keys())

print(f"Tags size: {len(all_tags)}")

y_id = y_tokenizer.texts_to_sequences(y_words)
y = sequence.pad_sequences(y_id, maxlen=max_review_length, padding='post')

In [None]:
X_train_words, X_test_words, y_train, y_test = train_test_split(X_words, y, test_size=0.2, random_state=42)

In [None]:

max_review_length = 30
word_to_features[0] = [0]*300
X_train = tokenizer.texts_to_sequences(X_train_words)
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length, padding='post')
X_train = np.array([[word_to_features[word] for word in sentence] for sentence in X_train])

X_test = tokenizer.texts_to_sequences(X_test_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length, padding='post')
X_test = np.array([[word_to_features[word] for word in sentence] for sentence in X_test])

In [None]:
number_of_classes = len(all_words) + 1 #Important adjustment
number_of_tags = len(all_tags) + 1

print(number_of_classes)
RNN_wordlevel = Sequential([
    layers.InputLayer(input_shape=( max_review_length,300,)),
    # # embedding layer, 8-dimensional
    # layers.Embedding(number_of_classes, 8),
    layers.Bidirectional(layers.LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)),
    layers.BatchNormalization(),
    layers.Dropout(0.15),
    # # the intermediate recurrent layers should return full sequences
    # layers.GRU(64, activation='relu', return_sequences=True),
    # layers.BatchNormalization(),
    # layers.Dropout(0.15),

    # # the last recurrent layer only returns the final output
    # layers.GRU(32, activation='relu', return_sequences=False),
    # layers.BatchNormalization(),
    # layers.Dropout(0.15),

    # output layer
    layers.Dense(16, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.15),
    layers.Dense(number_of_tags, activation='softmax')], name="RNN_wordlevel")

In [None]:
# Compile model
RNN_wordlevel.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy']
)

In [None]:
history = RNN_wordlevel.fit(X_train, y_train,
        batch_size=1024,
        epochs=25,
        validation_data=(X_test, y_test))

In [None]:
epochs = range(1, 26)

plt.figure(figsize=(20, 6))
plt.subplot(1, 2, 1)
plt.plot(epochs, history.history["accuracy"], label="training", marker="o")
plt.plot(epochs, history.history["val_accuracy"], label="validation", marker="o")
plt.xlabel("Epochs")
plt.xticks(epochs[::5])
plt.ylabel("Accuracy")
plt.legend()


plt.subplot(1, 2, 2)
plt.plot(epochs, history.history["loss"], label="training", marker="o")
plt.plot(epochs, history.history["val_loss"], label="validation", marker="o")
plt.xlabel("Epochs")
plt.xticks(epochs[::5])
plt.ylabel("Loss")
plt.legend()
plt.show()

In [None]:
# print("Evaluate on test data")
# results = model.evaluate(X_test, y_test, batch_size=128)
# print("test loss: {} ".format(results[0]))
# print("test accuracy: {} ".format(results[1]))

i = np.random.randint(0, X_test.shape[0])
print("This is sentence:",i)
p = model.predict(np.array([X_test[i]]))
p = np.argmax(p, axis=-1)
number_to_word[0] = ''
number_to_tag[0] = ''

print("{:15}{:5}\t {}\n".format("Word", "True", "Pred"))
print("-" *30)
for w, true, pred in zip(X_test_words.iloc[i], y_test[i], p[0]):
    print("{:15}{}\t{}".format(w, number_to_tag[true], number_to_tag[pred]))


In [None]:
y_test_pred_raw = RNN_wordlevel.predict(X_test)
y_test_pred = np.argmax(y_test_pred_raw, axis=-1)
y_test_flat = [tag for sentence in y_test for tag in sentence]
y_test_pred_flat = [tag for sentence in y_test_pred for tag in sentence]


# Evaluate the model
accuracy = accuracy_score(y_test_flat, y_test_pred_flat)
classification_rep = classification_report(y_test_flat, y_test_pred_flat)


print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_rep)

# Compute confusion matrix
conf_matrix = confusion_matrix(y_test_flat, y_test_pred_flat)
conf_matrix

# Plot the confusion matrix with seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['None'] + list(tag_to_number.keys()), yticklabels=['None']+list(tag_to_number.keys()), vmax=100, vmin=1)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()


In [None]:
list(tag_to_number.keys())

# RNN

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

X_train_words = [sentence_token.get_tokens() for sentence_token in df_train['sentence_token']]

# Create a tokenizer
tokenizer = Tokenizer(char_level=False, lower=True)
# Fit on your list of words
tokenizer.fit_on_texts(X_train_words)

word_to_number = tokenizer.word_index
number_to_word = tokenizer.index_word

all_words = list(word_to_number.keys())

print(f"Vocabulary size: {len(all_words)}")

dataset = tokenizer.texts_to_sequences(X_train_words)

print(dataset[0])


In [None]:
word_frequency = pd.DataFrame(data=tokenizer.word_counts.items(), columns=["word", "count"])
word_frequency = word_frequency.sort_values("count", ascending=False)[:25]
word_frequency.set_index("word").plot(kind="bar", rot=90, title="Word count")

In [None]:
y_train_words = df_train['sentence_token'].apply(lambda x: x.aspect_unified_bio_tags)

In [None]:

# Create a tokenizer
y_tokenizer = Tokenizer(char_level=False, lower=False)
# Fit on your list of words
y_tokenizer.fit_on_texts(y_train_words)


tag_to_number = y_tokenizer.word_index
number_to_tag = y_tokenizer.index_word

all_tags = list(tag_to_number.keys())

print(f"Tags size: {len(all_tags)}")

y_train = y_tokenizer.texts_to_sequences(y_train_words)


word_frequency = pd.DataFrame(data=y_tokenizer.word_counts.items(), columns=["word", "count"])
word_frequency = word_frequency.sort_values("count", ascending=False)[:25]
word_frequency.set_index("word").plot(kind="bar", rot=90, title="Word count")

# Simple RNN?!?! From study material

In [None]:
##### THIS NEEDS REVISE LOGIC - CURRENTLY IS JUST TO SAMPLE ########
X = []
y = []
SEQUENCE_LENGTH = 5
for nth_sentence, sentence in enumerate(dataset):
    for window_start_idx in range(len(sentence)-SEQUENCE_LENGTH):
        window_end_idx = window_start_idx + SEQUENCE_LENGTH
        X.append(sentence[window_start_idx: window_end_idx])
        y.append(y_train[nth_sentence][window_start_idx])

X = np.array(X)
y = np.array(y)

# Let's look at the shapes
print(X.shape)
print(y.shape)

In [None]:
for i in range(20):
    print("X:", [number_to_word[num] for num in X[i]])
    print("y:", number_to_tag[y[i]])
    print("*******")

In [None]:
# split train and validation set
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.1)

print(X_train.shape, y_train.shape)
print(X_validation.shape, y_validation.shape)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, GRU, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import plot_model

In [None]:
number_of_classes = len(all_words) + 1 #Important adjustment
number_of_tags = len(all_tags) + 1
print(number_of_classes)
RNN_wordlevel = Sequential([

    # embedding layer, 8-dimensional
    Embedding(number_of_classes, 8),

    # the intermediate recurrent layers should return full sequences
    GRU(64, activation='relu', return_sequences=True),
    BatchNormalization(),
    Dropout(0.15),

    # the last recurrent layer only returns the final output
    GRU(32, activation='relu', return_sequences=False),
    BatchNormalization(),
    Dropout(0.15),

    # output layer
    Dense(16, activation='relu'),
    BatchNormalization(),
    Dropout(0.15),
    Dense(number_of_tags, activation='softmax')], name="RNN_wordlevel")

In [None]:
# Compile model
RNN_wordlevel.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy']
)

In [None]:
history = RNN_wordlevel.fit(X_train, y_train,
        batch_size=1024,
        epochs=25,
        validation_data=(X_validation, y_validation))

In [None]:
epochs = range(1, 26)

plt.figure(figsize=(20, 6))
plt.subplot(1, 2, 1)
plt.plot(epochs, history.history["accuracy"], label="training", marker="o")
plt.plot(epochs, history.history["val_accuracy"], label="validation", marker="o")
plt.xlabel("Epochs")
plt.xticks(epochs[::5])
plt.ylabel("Accuracy")
plt.legend()


plt.subplot(1, 2, 2)
plt.plot(epochs, history.history["loss"], label="training", marker="o")
plt.plot(epochs, history.history["val_loss"], label="validation", marker="o")
plt.xlabel("Epochs")
plt.xticks(epochs[::5])
plt.ylabel("Loss")
plt.legend()
plt.show()

In [None]:
train_loss, train_acc = RNN_wordlevel.evaluate(X_train,  y_train, verbose=2)
val_loss, val_acc = RNN_wordlevel.evaluate(X_validation,  y_validation, verbose=2)
print('\nTrain accuracy:', train_acc)
print('\nVal accuracy:', val_acc)

In [None]:
df_train['text'][0]

In [None]:
input_phrase = ['I', 'charge', 'it', 'at', 'night', 'and', 'skip', 'taking', 'the', 'cord', 'with', 'me', 'because', 'of', 'the'] # good battery life.'

# process for the model
processed_phrase = tokenizer.texts_to_sequences([input_phrase])[0]


# extract last 5 words
network_input = np.array(processed_phrase[-SEQUENCE_LENGTH:], dtype=np.float32)
network_input = network_input.reshape((1, SEQUENCE_LENGTH)) # shape: 1 x 5

# the RNN gives the probability of each word as the next one
predict_proba = RNN_wordlevel.predict(network_input, verbose=0)[0] # shape (4855,)
predict_label = number_to_tag[np.argmax(predict_proba)]
# # sample one word using these chances
# predicted_index = np.random.choice(number_of_classes, 1, p=predict_proba)[0]

# # add new index at the end of our list
# processed_phrase.append(predicted_index)

# # progress indicator
# print(i, end="\r")

# indices mapped to words - the method expects a list of lists so we need the extra bracket
output_phrase = tokenizer.sequences_to_texts([processed_phrase])[0]

print(output_phrase)
print(predict_label)

In [None]:
# def plot_image(i, predictions_array, true_label, img):
#   # true_label, img = true_label[i], img[i]
#   # plt.grid(False)
#   # plt.xticks([])
#   # plt.yticks([])

#   # plt.imshow(img, cmap=plt.cm.binary)

#   predicted_label = np.argmax(predictions_array)
#   if predicted_label == true_label:
#     color = 'blue'
#   else:
#     color = 'red'

#   plt.xlabel("{} {:2.0f}% ({})".format(number_to_tag[predicted_label],
#                                 100*np.max(predictions_array),
#                                 number_to_tag[true_label]),
#                                 color=color)



In [None]:
# Evaluate the model's performance on the test data
score = RNN_wordlevel.evaluate(X_validation, y_validation, verbose=1)

print('Test loss:', score[0])
print('Test accuracy:', score[1])