In [1]:
import pandas as pd
import numpy as np
import json
from nltk.tokenize import word_tokenize
import nltk
# Fetch the NLTK data packages this notebook relies on: the 'punkt'
# tokenizer data and the 'stopwords' corpus.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Read the Yelp review dump (one JSON object per line) and binarise the star
# rating: 4 stars and above -> 1, anything lower -> 0.
with open('yelp_academic_dataset_review.json', encoding='utf-8') as f:
    parsed = (json.loads(line) for line in f)
    reviews = [(rec['text'], 1 if rec['stars'] >= 4 else 0) for rec in parsed]

df = pd.DataFrame(reviews, columns=['text', 'sentiment'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eliej\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eliej\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [3]:
# Preview the first rows of the text/sentiment frame built above.
df.head()

Unnamed: 0,text,sentiment
0,"If you decide to eat here, just be aware it is...",0
1,I've taken a lot of spin classes over the year...,1
2,Family diner. Had the buffet. Eclectic assortm...,0
3,"Wow! Yummy, different, delicious. Our favo...",1
4,Cute interior and owner (?) gave us tour of up...,1


In [4]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stop-word list held as a set; preprocess_text checks each token
# for membership in it.
stop_words = set(stopwords.words('english'))

# Patterns and the punctuation table are built once at definition time:
# preprocess_text is applied to every review, so rebuilding them per call is
# wasted work.

# Require a tag name after '<' so that plain text such as "<3" or
# "a < b ... c > d" is not mistaken for markup and deleted.
_HTML_TAG_RE = re.compile(r"</?[A-Za-z][^<>]*>")
_URL_RE = re.compile(r"https?://\S+|www\.\S+")
# NOTE(review): the last range (U+24C2-U+1F251) is very wide and also covers
# non-emoji scripts such as CJK ideographs; kept as in the original so the
# cleaned text stays the same for those inputs.
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def preprocess_text(text):
    """Clean a raw review string for tokenisation by the model pipeline.

    Steps, in order: strip HTML tags, strip URLs, strip emoji/symbol
    characters, delete ASCII punctuation, lowercase and tokenise with
    ``word_tokenize``, then drop stop words and non-alphabetic tokens.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The surviving tokens joined by single spaces (possibly an empty
        string).
    """
    text = _HTML_TAG_RE.sub("", text)
    text = _URL_RE.sub("", text)
    text = _EMOJI_RE.sub("", text)

    # Punctuation is deleted, not replaced by a space, so "I've" -> "ive".
    text = text.translate(_PUNCT_TABLE)

    tokens = word_tokenize(text.lower())

    # Keep purely alphabetic tokens that are not stop words.
    kept = [w for w in tokens if w not in stop_words and w.isalpha()]
    return " ".join(kept)

In [5]:
# Run preprocess_text over every review and store the cleaned string in a
# new 'processed_text' column on the same frame.
df['processed_text'] = df['text'].apply(preprocess_text)
# Show two rows to compare the raw and the cleaned text side by side.
df.head(2)

Unnamed: 0,text,sentiment,processed_text
0,"If you decide to eat here, just be aware it is...",0,decide eat aware going take hours beginning en...
1,I've taken a lot of spin classes over the year...,1,ive taken lot spin classes years nothing compa...


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_text'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit the vocabulary on the training split only, so the test split does not
# influence the word index.
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Texts -> integer sequences, for both splits.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)

# Pad/truncate every sequence to 200 entries.
X_train_pad, X_test_pad = (
    pad_sequences(seq, maxlen=200) for seq in (X_train_seq, X_test_seq)
)




In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense

# Embedding -> GRU -> single sigmoid unit for binary sentiment.
# The 5000 / 200 arguments mirror the tokenizer's num_words and the padding
# length used when the sequences were built.
model = Sequential([
    Embedding(5000, 16, input_length=200),
    GRU(32),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 16)           80000     
                                                                 
 gru (GRU)                   (None, 32)                4800      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 84833 (331.38 KB)
Trainable params: 84833 (331.38 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [10]:
# NOTE(review): the validation data is the same (X_test_pad, y_test) split
# that is passed to model.evaluate afterwards, so the per-epoch validation
# numbers and the final test accuracy come from the same held-out data.
history = model.fit(
    X_train_pad,
    y_train,
    batch_size=64,
    epochs=5,
    validation_data=(X_test_pad, y_test),
)

Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [11]:
# Score the trained model on the held-out split; evaluate returns the loss
# followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(X_test_pad, y_test)
print('Test accuracy:', accuracy)

Test accuracy: 0.9320864081382751


In [16]:
def preprocess_and_predict(text):
    """Classify a single raw review as 'Positive' or 'Negative'.

    The text is cleaned with ``preprocess_text``, encoded with the fitted
    ``tokenizer``, padded to 200 entries and passed to ``model``.

    Args:
        text: Raw review text.

    Returns:
        'Positive' when the model's output is above 0.5, otherwise 'Negative'.
    """
    cleaned = preprocess_text(text)

    # The tokenizer and the model both work on batches, so wrap the single
    # review in a one-element list.
    encoded = tokenizer.texts_to_sequences([cleaned])
    batch = pad_sequences(encoded, maxlen=200)

    score = model.predict(batch)[0][0]
    if score > 0.5:
        return 'Positive'
    return 'Negative'

In [17]:
# Test with a positive review
# Smoke test on two hand-written reviews: one clearly favourable, one clearly
# unfavourable.
input_text = "The food was absolutely wonderful, from preparation to presentation, very pleasing."
predicted_sentiment = preprocess_and_predict(input_text)
print(f'Review is: {predicted_sentiment}')

# Test with a negative review
negative_input_text = "Unfortunately, the experience was disappointing. The service was slow and the food was bland. Definitely not worth the price."
negative_predicted_sentiment = preprocess_and_predict(negative_input_text)
print(f'Review is: {negative_predicted_sentiment}')

Review is: Positive
Review is: Negative


In [15]:
# Persist the trained model under 'yelp_sentiment_model.tf'.
# NOTE(review): only the model is saved here; preprocess_and_predict also
# needs the fitted tokenizer, which this call does not persist.
model.save('yelp_sentiment_model.tf')

INFO:tensorflow:Assets written to: yelp_sentiment_model.tf\assets


INFO:tensorflow:Assets written to: yelp_sentiment_model.tf\assets
