<a href="https://www.kaggle.com/code/yacharki/training-testing-the-yelp-model?scriptVersionId=191565979" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# Downloads
!pip install contractions
!pip install textsearch
!pip install tqdm

import nltk
nltk.download('punkt')

# Fundamental classes
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np

# Time
import time
import datetime

# Preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder
import contractions
from bs4 import BeautifulSoup
import re
import tqdm
import unicodedata

# Seed NumPy's global random generator so NumPy-driven randomness in this
# notebook is repeatable from run to run.
# NOTE(review): only NumPy is seeded here; there is no TensorFlow seed call
# in the visible code -- confirm that is intentional.
seed = 3541
np.random.seed(seed)

In [None]:
# Placeholder loss kept around for loading the saved model.
# NOTE(review): nothing in the visible code passes this function to
# load_model() or compile() (the model is loaded with compile=False) --
# confirm whether it is still needed.
def dummy_loss(y_true, y_pred):
    """Return the mean of the element-wise difference ``y_pred - y_true``.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values.

    Returns:
        The result of ``tf.reduce_mean`` over the signed difference.
    """
    signed_difference = y_pred - y_true
    return tf.reduce_mean(signed_difference)

# Load the CNN trained on Yelp reviews. It is loaded uncompiled
# (compile=False) and compiled explicitly in the next statement.
modelYelp = keras.models.load_model(
    '/kaggle/input/pre-trained-model-binary-cnn-nlp-yelpreviews/tensorflow1/pre-trained-model-binary-cnn-nlp-yelp-reviews/1/Binary_Classification_90_Yelp_Reviews_CNN.h5',
    compile=False
)

# Compile with binary cross-entropy (explicit reduction) and track accuracy,
# so that evaluate() reports [loss, accuracy].
modelYelp.compile(
    optimizer='adam',
    loss=keras.losses.BinaryCrossentropy(reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE),
    metrics=['accuracy']
)

# Loading Yelp test data
dataset_test_Yelp = pd.read_csv('/kaggle/input/yelp-reviews-for-sentianalysis-binary-np-csv/yelp_review_sa_binary_csv/test.csv')

# Loading Yelp train data (a small sample of it is used further down)
dataset_train_Yelp = pd.read_csv('/kaggle/input/yelp-reviews-for-sentianalysis-binary-np-csv/yelp_review_sa_binary_csv/train.csv')

# Shuffle the rows of both frames (frac=1 keeps every row).
test_Yelp = dataset_test_Yelp.sample(frac=1)
train_Yelp = dataset_train_Yelp.sample(frac=1)

# Keep only 100 training rows. The slice is taken from the *shuffled* frame
# (train_Yelp), not from dataset_train_Yelp, so the shuffle above actually
# takes effect and the sample is not just the head of the CSV.
# NOTE(review): later in this file these rows are used to fit both the
# tokenizer and the label encoder, not only the label encoder.
train_Yelp = train_Yelp.iloc[:100, :]

# Taking only necessary columns
y_test_Yelp = test_Yelp['class_index'].values
X_train_Yelp = train_Yelp['review_text'].values
y_train_Yelp = train_Yelp['class_index'].values

# Preprocess corpus function
def pre_process_corpus(corpus):
    """Clean raw review texts into lowercase, letters-and-whitespace strings.

    Each document is processed in this order:
      1. contractions are expanded with ``contractions.fix``;
      2. HTML markup is removed via BeautifulSoup's ``get_text``;
      3. the text is NFKD-normalised and every non-ASCII character dropped;
      4. every character that is not an ASCII letter or whitespace is removed;
      5. the result is lowercased and stripped of surrounding whitespace.

    Args:
        corpus: Iterable of document strings. A tqdm progress bar is shown
            while iterating over it.

    Returns:
        A list with one cleaned string per input document, in input order.
    """
    # Compile once, outside the loop, and pass the flags by keyword.
    # re.sub's fourth positional parameter is `count`, so
    # re.sub(pattern, '', doc, re.I | re.A) treats the flag value (258) as a
    # substitution limit: only the first 258 matches are removed and the
    # flags are never applied.
    # NOTE(review): if the model's training pipeline cleaned text with the
    # count-limited call, documents with more than 258 non-letter characters
    # are now cleaned more thoroughly than at training time -- confirm.
    non_letters = re.compile(r'[^a-zA-Z\s]', flags=re.I | re.A)

    processed_corpus = []
    for doc in tqdm.tqdm(corpus):
        doc = contractions.fix(doc)
        doc = BeautifulSoup(doc, "html.parser").get_text()
        # NFKD splits accented characters into base letter + combining mark;
        # the ASCII round-trip then discards everything that is not ASCII.
        doc = unicodedata.normalize('NFKD', doc).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        doc = non_letters.sub('', doc)
        doc = doc.lower()
        doc = doc.strip()
        processed_corpus.append(doc)
    return processed_corpus

# Clean the raw review text of both splits (test first, then train).
X_test_Yelp = pre_process_corpus(test_Yelp['review_text'].values)
X_train_Yelp = pre_process_corpus(X_train_Yelp)

# Build the vocabulary from the cleaned training texts. Words missing from
# it are mapped to the '<UNK>' token; index 0 is registered as '<PAD>'.
# NOTE(review): the tokenizer is fitted on whatever rows X_train_Yelp holds
# at this point; the word indices only make sense to modelYelp if they match
# the vocabulary used when the model was trained -- confirm.
t = Tokenizer(oov_token='<UNK>')
t.fit_on_texts(X_train_Yelp)
t.word_index['<PAD>'] = 0

# Convert each text to a sequence of word indices and pad it to a fixed
# length of 220 in a single step per split.
X_test_Yelp = sequence.pad_sequences(t.texts_to_sequences(X_test_Yelp), maxlen=220)
X_train_Yelp = sequence.pad_sequences(t.texts_to_sequences(X_train_Yelp), maxlen=220)

# Encode the labels: the encoder is fitted on the training labels and then
# applied unchanged to the test labels.
num_classes = 2  # positive -> 1, negative -> 0
le = LabelEncoder()
y_train_Yelp = le.fit_transform(y_train_Yelp)
y_test_Yelp = le.transform(y_test_Yelp)

In [None]:
# Evaluating the models

# Testing Yelp Classifier on Yelp Test Data
print(" Testing Yelp Classifier on Yelp Test Data")
# evaluate() returns the loss followed by the compiled metrics; the model is
# compiled with metrics=['accuracy'], so scores[1] is the accuracy.
scores = modelYelp.evaluate(X_test_Yelp, y_test_Yelp, verbose=1)
# "\n" is the newline escape; the previous "/n" printed the two literal
# characters '/' and 'n' after the percentage.
print("Accuracy: %.2f%% \n" % (scores[1] * 100))