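# Basic Transformer implementation based on Vaswani et al., "Attention Is All You Need", for sentiment analysis using TensorFlow/Keras

**Note:** the model defined in Step 4 is a simplified classifier (word embeddings, global average pooling, and dense layers); it does not include the self-attention layers described in the paper.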

## STEP 1: IMPORT NECESSARY LIBRARIES

In [3]:
# Import necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
from sklearn.model_selection import train_test_split





## STEP 2: LOAD AND INSPECT DATASET

In [4]:
# Read the reviews CSV; the path is relative to the current working directory
dataset_path = 'reviews.csv'
df = pd.read_csv(filepath_or_buffer=dataset_path)

# Preview the first five rows of the loaded frame
df.head(n=5)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [5]:
# Summarize the frame: column names, non-null counts, dtypes, and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568428 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB


In [6]:
# Peek at the raw review text column that is tokenized in the next step
df['Text'].head()


0    I have bought several of the Vitality canned d...
1    Product arrived labeled as Jumbo Salted Peanut...
2    This is a confection that has been around a fe...
3    If you are looking for the secret ingredient i...
4    Great taffy at a great price.  There was a wid...
Name: Text, dtype: object

## STEP 3: DATA PREPROCESSING

### Tokenization and padding

In [7]:
# Build the vocabulary from the review text. The tokenizer is capped via
# num_words=10000 and words outside that cap are mapped to the "<OOV>" token.
# NOTE: the tokenizer is fit on the full dataset, i.e. before the train/test split.
review_texts = df['Text']
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(review_texts)

# Encode every review as a list of integer word indices
sequences = tokenizer.texts_to_sequences(review_texts)

# Fixed sequence length fed to the model; adjust this based on your dataset
max_length = 128

# Pad short reviews at the end and cut long reviews off at the end so that
# every row has exactly max_length entries
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_length,
    padding='post',
    truncating='post',
)

# Binary sentiment target: 1 when Score > 3, otherwise 0
is_positive = df['Score'] > 3
df['Sentiment'] = np.where(is_positive, 1, 0)
labels = df['Sentiment'].values


### Splitting the data

In [8]:
# Hold out 20% of the examples for testing; the fixed random_state keeps the
# split the same from run to run
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
)


## STEP 4: DEFINE THE MODEL ARCHITECTURE

In [9]:
# Seed TensorFlow's global random generator so that weight initialisation is
# repeatable across runs (hardware-level nondeterminism may still remain)
tf.random.set_seed(42)

# Baseline sentiment classifier: learned word embeddings are averaged over the
# sequence and passed through a small dense head.
# NOTE: this architecture contains no self-attention layers, so it is a
# simplified stand-in rather than a Transformer as described by Vaswani et al.
model = Sequential([
    # input_dim must cover every index the tokenizer can emit, so it is taken
    # from the tokenizer's num_words instead of repeating the literal value
    Embedding(input_dim=tokenizer.num_words, output_dim=64, input_length=max_length),
    # Collapse the sequence axis by averaging the embedding vectors
    GlobalAveragePooling1D(),
    Dense(24, activation='relu'),
    # Single sigmoid unit: probability of the positive class
    Dense(1, activation='sigmoid')
])

# Binary cross-entropy matches the 0/1 sentiment labels; accuracy is tracked
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])






## STEP 5: TRAIN THE MODEL

In [10]:
# Fit for 10 epochs. The held-out split (X_test, y_test) is passed as the
# validation data, so the per-epoch validation metrics are computed on it.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    validation_data=(X_test, y_test),
)


Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## STEP 6: EVALUATE THE MODEL

In [12]:
# Score the trained model on the held-out split; with the metrics compiled
# above, evaluate() yields the loss followed by the accuracy
evaluation = model.evaluate(X_test, y_test, verbose=2)
test_loss, test_acc = evaluation
print(f'\nTest Accuracy: {test_acc}')


3553/3553 - 7s - loss: 0.2368 - accuracy: 0.9109 - 7s/epoch - 2ms/step

Test Accuracy: 0.9108988642692566


## STEP 7: TEST THE MODEL WITH NEW DATA

### Preprocess new data

In [14]:
# New text data for testing
new_reviews = ["An amazing product! Highly recommend.", "Horrible, I would never buy this again."]

# Encode the new reviews with the already-fitted tokenizer
new_sequences = tokenizer.texts_to_sequences(new_reviews)

# Pad/truncate exactly as the training data was prepared. truncating='post'
# is given explicitly because pad_sequences defaults to truncating='pre',
# which would drop the START of any review longer than max_length, whereas
# the training sequences were cut at the END.
new_padded_sequences = pad_sequences(
    new_sequences,
    maxlen=max_length,
    padding='post',
    truncating='post',
)


### Make predictions