In [2]:
import pandas as pd
import numpy as np
import json
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')

# Load Yelp reviews data
reviews = []
with open('yelp_academic_dataset_review.json', encoding='utf-8') as f:
    for line in f:
        review = json.loads(line)
        sentiment = 1 if review['stars'] >= 4 else 0
        reviews.append((review['text'], sentiment))

df = pd.DataFrame(reviews, columns=['text', 'sentiment'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df.head()

Unnamed: 0,text,sentiment
0,My wife took me here on my birthday for breakf...,1
1,I have no idea why some people give bad review...,1
2,love the gyro plate. Rice is so good and I als...,1
3,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",1
4,General Manager Scott Petello is a good egg!!!...,1


In [4]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
def preprocess_text(text):
    # Remove HTML tags
    text = re.sub(r"<.*?>", "", text)

    # Remove URLs
    text = re.sub(r"https?://\S+|www\.\S+", "", text)

    # Remove random characters
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (IOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE,
    )
    text = emoji_pattern.sub("", text)

    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Tokenise
    tokens = word_tokenize(text.lower())

    # Remove stopwords
    filtered_tokens = [w for w in tokens if not w in stop_words and w.isalpha()]

    # Do stemming
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

    return " ".join(stemmed_tokens)

In [5]:
df['processed_text'] = df['text'].apply(preprocess_text)
df.head(2)

Unnamed: 0,text,sentiment,processed_text
0,My wife took me here on my birthday for breakf...,1,wife took birthday breakfast excel weather per...
1,I have no idea why some people give bad review...,1,idea peopl give bad review place goe show plea...


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['processed_text'], df['sentiment'], test_size=0.2, random_state=42)

In [7]:
from collections import Counter

def counter_word(text):
    count = Counter()
    for i in text.values:
        for word in i.split():
            count[word] += 1
    return count

counter = counter_word(df.processed_text)
num_words = len(counter)
print(num_words)
counter

173921


Counter({'wife': 8814,
         'took': 18854,
         'birthday': 5052,
         'breakfast': 17945,
         'excel': 15995,
         'weather': 2485,
         'perfect': 17413,
         'made': 28697,
         'sit': 16361,
         'outsid': 13946,
         'overlook': 904,
         'ground': 2591,
         'absolut': 8782,
         'pleasur': 1337,
         'waitress': 11253,
         'food': 142972,
         'arriv': 10080,
         'quickli': 6150,
         'semibusi': 4,
         'saturday': 7638,
         'morn': 7232,
         'look': 51420,
         'like': 128345,
         'place': 170500,
         'fill': 10687,
         'pretti': 40076,
         'earlier': 1626,
         'get': 107898,
         'better': 33294,
         'favor': 1422,
         'bloodi': 1144,
         'mari': 1428,
         'phenomen': 1437,
         'simpli': 4155,
         'best': 44819,
         'ive': 47278,
         'ever': 25008,
         'im': 53424,
         'sure': 26973,
         'use': 27637,


In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
tokenizer.fit_on_texts(df.processed_text)
vocab_length = len(tokenizer.word_index)+1

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_pad = pad_sequences(X_train_seq, maxlen=200)

X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_pad = pad_sequences(X_test_seq, maxlen=200)

In [9]:
import gensim.downloader as api
#pretrained_embedding_model = api.load("fasttext-wiki-news-subwords-300")
#pretrained_embedding_model = api.load("glove-wiki-gigaword-100")
pretrained_embedding_model = api.load("word2vec-google-news-300")



In [10]:
embedding_dim = pretrained_embedding_model.vector_size

embedding_matrix = np.zeros((vocab_length, embedding_dim))

for word, i in tokenizer.word_index.items():
    if word in pretrained_embedding_model:
        embedding_matrix[i] = pretrained_embedding_model[word]

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense

model = Sequential()

model.add(Embedding(input_dim=vocab_length,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    input_length=200,
                    trainable=False))
model.add(GRU(50))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [12]:
history = model.fit(X_train_pad, y_train, epochs=5, validation_data=(X_test_pad, y_test), batch_size=64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [13]:
loss, accuracy = model.evaluate(X_test_pad, y_test)
print('Test accuracy:', accuracy)

Test accuracy: 0.843960702419281


In [14]:
def preprocess_and_predict(text):
    # Preprocess the text
    processed_text = preprocess_text(text)

    # Convert to a sequence
    sequence = tokenizer.texts_to_sequences([processed_text])
    padded_sequence = pad_sequences(sequence, maxlen=200)

    # Predict
    prediction = model.predict(padded_sequence)

    return 'Positive' if prediction[0][0] > 0.5 else 'Negative'

In [15]:
# Test with a positive review
input_text = "The food was absolutely wonderful, from preparation to presentation, very pleasing."
predicted_sentiment = preprocess_and_predict(input_text)
print(f'Review is: {predicted_sentiment}')

# Test with a negative review
negative_input_text = "Unfortunately, the experience was disappointing. The service was slow and the food was bland. Definitely not worth the price."
negative_predicted_sentiment = preprocess_and_predict(negative_input_text)
print(f'Review is: {negative_predicted_sentiment}')

Review is: Positive
Review is: Negative
