In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os

# Report how many GPUs TensorFlow can see before any heavy work starts.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

In [None]:
# Download the balanced review CSV once; get_file stores it directly under
# ./data (empty cache_subdir) and returns the local path.
url = 'https://drive.google.com/uc?export=download&id=1ru8jlaqGYD7C_dZnAelqWHODFHYeA7y3'
dataset = tf.keras.utils.get_file(
    fname="balanced_dataset.csv",
    origin=url,
    cache_subdir='',
    cache_dir='data',
)

In [None]:
# Read the downloaded CSV and shuffle its rows once, up front.
# A fixed random_state makes the shuffled order (and therefore every
# positional train/val/test split taken from the saved file) reproducible
# across kernel restarts.
SHUFFLE_SEED = 42
df = pd.read_csv(dataset)
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [None]:
# Persist the shuffled frame (without the index column) next to the download.
df.to_csv(path_or_buf='data/balanced_dataset_shuffle.csv', index=False)

In [None]:
# Path of the shuffled CSV. Built with os.path.join so it resolves on
# every OS instead of relying on a hard-coded Windows "\\" separator.
dataset = os.path.join("data", "balanced_dataset_shuffle.csv")

In [None]:
# Load the CSV as a batched tf.data pipeline of (features, label) pairs,
# using the 'overall' column as the label.
#
# shuffle=False is deliberate: the notebook later splits this dataset
# positionally with take()/skip(). With shuffle=True the reader may reshuffle
# every time a new iterator is created (cache() only pins the order after one
# complete pass), so the "train", "val" and "test" slices could contain
# overlapping rows. The file read here is expected to be the copy that was
# already shuffled with pandas, so no further shuffling is needed for the
# split to be random.
ds = tf.data.experimental.make_csv_dataset(
    dataset,
    batch_size=64,
    label_name='overall',
    num_epochs=1,
    shuffle=False,
)
# tf.data.AUTOTUNE is the non-deprecated spelling of
# tf.data.experimental.AUTOTUNE.
AUTOTUNE = tf.data.AUTOTUNE
ds = ds.cache().prefetch(buffer_size=AUTOTUNE)


In [None]:
# Positional split of the batched dataset. Sizes are counted in batches
# (ds yields batches, not single rows).
TRAIN_BATCHES = 5000
VAL_BATCHES = 500
TEST_BATCHES = 500

train_ds = ds.take(TRAIN_BATCHES)
val_ds = ds.skip(TRAIN_BATCHES).take(VAL_BATCHES)
test_ds = ds.skip(TRAIN_BATCHES + VAL_BATCHES).take(TEST_BATCHES)

In [None]:
# Text preprocessing configuration: cap on vocabulary size and the fixed
# number of token ids produced for every review.
VOCAB_SIZE = 50000
SEQUENCE_LENGTH = 500

# Integer-sequence vectorizer; its vocabulary is learned later via adapt().
text_vectorizer = tf.keras.layers.TextVectorization(
    output_mode='int',
    max_tokens=VOCAB_SIZE,
    output_sequence_length=SEQUENCE_LENGTH,
)

In [None]:
def _review_text_only(features, _label):
    """Keep just the raw review text from a (features, label) element."""
    return features['reviewText']


# Learn the vocabulary from the training split only.
train_text = train_ds.map(_review_text_only)
text_vectorizer.adapt(train_text)

In [None]:
# Map the star ratings 1..5 onto contiguous class ids 0..4.
# Ratings not listed in `classes` fall back to -1 (the table's default).
classes = [1, 2, 3, 4, 5]
LABEL_KEY_DTYPE = tf.int32
table = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(
        tf.constant(classes, dtype=LABEL_KEY_DTYPE),
        tf.range(len(classes)),
    ),
    default_value=-1,
)


def _encode_label(y):
    """Look up the class id for a batch of ratings.

    The CSV reader infers the label column's dtype, so the labels are cast to
    the table's key dtype first; without the cast a float or int64 column
    would fail the lookup with a dtype mismatch.
    """
    return table.lookup(tf.cast(y, LABEL_KEY_DTYPE))


label_encoder = tf.keras.layers.Lambda(_encode_label)

# NOTE: these lines rebind train_ds / val_ds / test_ds. Running this cell a
# second time would encode the already-encoded labels again (0 -> -1, 1 -> 0,
# ...), so re-create the splits before re-running it.
train_ds = train_ds.map(lambda x, y: (x, label_encoder(y)))
val_ds = val_ds.map(lambda x, y: (x, label_encoder(y)))
test_ds = test_ds.map(lambda x, y: (x, label_encoder(y)))

In [None]:
# Stop training once val_loss has not improved for 3 consecutive epochs and
# roll the model back to the best weights seen. Without
# restore_best_weights=True the model would keep the weights of the last
# (already degraded) epoch.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Raw strings -> token ids -> embeddings -> average over tokens -> dense
# classifier producing a 5-way softmax (one output per rating class).
model = tf.keras.Sequential([
    tf.keras.Input(shape=(1,), dtype=tf.string),
    text_vectorizer,
    tf.keras.layers.Embedding(VOCAB_SIZE + 1, 64),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax'),
])

# Sparse categorical cross-entropy matches integer class-id labels paired
# with a softmax output.
# NOTE(review): 'mae' is computed between the integer class id and the
# 5 softmax probabilities, so it is probably not a meaningful rating error;
# kept so the reported metrics stay the same - confirm whether it is wanted.
model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy', 'mae'])


In [None]:
# Visualise the distribution of (encoded) labels in the training split.
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('darkgrid')
sns.set(font_scale=1.5)
sns.set_palette('bright')

# Collect the label tensor of every training batch into one flat array.
label_batches = [labels for _features, labels in train_ds]
train_labels = np.concatenate(label_batches, axis=0)

fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(x=train_labels, ax=ax)
ax.set_xlabel('Review Rating')
plt.show()


In [None]:
# Train for at most 10 epochs, validating after each one; `callback` is
# passed in so it can act on the validation results.
history = model.fit(
    x=train_ds,
    validation_data=val_ds,
    epochs=10,
    callbacks=[callback],
)

In [None]:
# Score the trained model on the held-out test split.
results = model.evaluate(x=test_ds)

In [None]:
# Sanity-check the model on a few hand-written reviews.
# The texts are wrapped in a string tensor because not every Keras version
# accepts a bare Python list of strings as predict() input.
sample_reviews = tf.constant([
    "This is a great product. I love it. I would recommend it to anyone.",
    "This is a bad product. I hate it. I would not recommend it to anyone.",
    "This is decent product. I would recommend it to some people.",
])
res = model.predict(sample_reviews)

# argmax yields the encoded class id (0..4); translate it back through
# `classes` so the printed values are the actual star ratings (1..5).
predicted_ratings = np.asarray(classes)[np.argmax(res, axis=1)]
print(predicted_ratings)
print(res)

In [None]:
# Print the layer-by-layer architecture and parameter counts of the model.
model.summary()