In [1]:
import pandas as pd

dataset_path = 'reviews.csv'
df = pd.read_csv(dataset_path)


In [2]:
# Display the first few rows of the dataframe
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [3]:
!pip show tensorflow pandas scikit-learn matplotlib

Name: tensorflow
Version: 2.15.0
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: C:\Users\TECHMALANG\anaconda3\Lib\site-packages
Requires: tensorflow-intel
Required-by: 
---
Name: pandas
Version: 2.1.4
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: https://pandas.pydata.org
Author: 
Author-email: The Pandas Development Team <pandas-dev@python.org>
License: BSD 3-Clause License

Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
All rights reserved.

Copyright (c) 2011-2023, Open source contributors.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditi

In [5]:
import tensorflow as tf
print(tf.__version__)



2.15.0


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Initialize the tokenizer
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['Text'])

# Convert text to sequences of integers
sequences = tokenizer.texts_to_sequences(df['Text'])

# Pad the sequences to ensure uniform length
max_length = max(len(x) for x in sequences)  # You might want to set a specific max_length
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')


In [7]:
import numpy as np

# Convert scores to binary labels
df['Sentiment'] = np.where(df['Score'] > 3, 1, 0)  # 1 for positive, 0 for negative
labels = df['Sentiment'].values


In [8]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)


In [9]:
import numpy as np
import tensorflow as tf

def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
    return pos * angle_rates

def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :],
                            d_model)
    # Apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    # Apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    
    pos_encoding = angle_rads[np.newaxis, ...]
    return tf.cast(pos_encoding, dtype=tf.float32)


In [10]:
from tensorflow.keras.layers import Embedding, Dense, LayerNormalization, Dropout
from tensorflow.keras.models import Sequential

# Model hyperparameters (you may experiment with these)
vocab_size = 10000
max_length = 128  # Adjust based on your padding length
d_model = 64  # The dimensionality of the embedding/output vectors

model = Sequential([
    # Embedding layer
    Embedding(vocab_size, d_model, input_length=max_length),
    # Positional Encoding (addition operation)
    # Note: TensorFlow doesn't have a built-in layer for positional encoding; you might need to add it manually or customize a layer.
    # Simplified for demonstration: we'll skip adding it directly in this code snippet.
    
    # Here you would include the attention and feedforward layers, which are omitted for simplicity
    
    # Output layer
    Dense(1, activation='sigmoid')  # Assuming binary classification
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])






In [11]:
class PositionalEncodingLayer(tf.keras.layers.Layer):
    def __init__(self, position, d_model):
        super(PositionalEncodingLayer, self).__init__()
        self.pos_encoding = positional_encoding(position, d_model)

    def call(self, x):
        return x + self.pos_encoding[:, :tf.shape(x)[1], :]

position = max_length  # This should match your input sequence length
d_model = 64  # Dimensionality of the embedding/output vectors


In [12]:
from tensorflow.keras.layers import Input, Embedding, Dense, Attention, GlobalAveragePooling1D, Dropout
from tensorflow.keras.models import Model

# Inputs
inputs = Input(shape=(max_length,))

# Embeddings
embedding_layer = Embedding(vocab_size, d_model)(inputs)

# Positional Encoding
pos_encoding_layer = PositionalEncodingLayer(position=max_length, d_model=d_model)(embedding_layer)

# Self-Attention (simplified)
attention_output = Attention()([pos_encoding_layer, pos_encoding_layer])

# Pooling + Output
x = GlobalAveragePooling1D()(attention_output)
x = Dropout(0.1)(x)
outputs = Dense(1, activation='sigmoid')(x)

# Model
model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 128)]                0         []                            
                                                                                                  
 embedding_1 (Embedding)     (None, 128, 64)              640000    ['input_1[0][0]']             
                                                                                                  
 positional_encoding_layer   (None, 128, 64)              0         ['embedding_1[0][0]']         
 (PositionalEncodingLayer)                                                                        
                                                                                                  
 attention (Attention)       (None, 128, 64)              0         ['positional_encoding_laye

In [14]:
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))


MemoryError: Unable to allocate 5.94 GiB for an array with shape (454763, 3507) and data type int32

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define a sensible maximum sequence length
max_length = 128

# Pad the sequences
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')


In [None]:
from sklearn.model_selection import train_test_split

# Assuming 'labels' contains your binary sentiment labels
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)


In [None]:
from tensorflow.keras.layers import Input, Embedding, Dense, GlobalAveragePooling1D, Dropout
from tensorflow.keras.models import Model

# Inputs
inputs = Input(shape=(max_length,))

# Embeddings
embedding_layer = Embedding(vocab_size, d_model, input_length=max_length)(inputs)

# Add any additional layers you might have defined earlier
# For example, Positional Encoding and Attention layers (omitted here for brevity)

# Output layer
x = GlobalAveragePooling1D()(embedding_layer)  # Adjusted for simplicity; replace with your actual layers
x = Dropout(0.1)(x)
outputs = Dense(1, activation='sigmoid')(x)

# Model compilation
model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()


In [None]:
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))


In [None]:
from tensorflow.keras.layers import MultiHeadAttention

# Assuming you have defined `d_model` and `num_heads`
num_heads = 8  # Number of attention heads

# In your model architecture
multi_head_attention = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
attention_output = multi_head_attention(query=embedding_layer, value=embedding_layer, key=embedding_layer)


In [None]:
from tensorflow.keras.layers import Layer, Dense, LayerNormalization
from tensorflow.keras.models import Sequential

class TransformerEncoderLayer(Layer):
    def __init__(self, d_model, num_heads):
        super(TransformerEncoderLayer, self).__init__()
        self.multi_head_attention = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.dense_proj = Sequential([Dense(d_model, activation='relu'), Dense(d_model)])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        
    def call(self, inputs):
        attn_output = self.multi_head_attention(inputs, inputs, inputs)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.dense_proj(out1)
        return self.layernorm2(out1 + ffn_output)

# Example usage
num_encoder_layers = 4  # Number of encoder layers

# Initialize a list to hold the encoder layers
encoder_layers = [TransformerEncoderLayer(d_model=d_model, num_heads=num_heads) for _ in range(num_encoder_layers)]

# In your model definition, add each encoder layer
x = embedding_layer  # Assuming `embedding_layer` is defined
for encoder_layer in encoder_layers:
    x = encoder_layer(x)


In [None]:
new_reviews = ["This product was great, I loved it!", "Not what I expected, quite disappointing."]


In [None]:
# Tokenize the new reviews
new_sequences = tokenizer.texts_to_sequences(new_reviews)

# Pad the new sequences
new_padded_sequences = pad_sequences(new_sequences, maxlen=max_length, padding='post', truncating='post')


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Assuming 'df' is your DataFrame and 'Text' is the column containing text data
texts = df['Text'].astype(str).tolist()  # Convert the text column to a list of strings

# Initialize the tokenizer
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)  # Fit the tokenizer on the texts


In [None]:
predictions = model.predict(new_padded_sequences)
