# Data Processing

In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import urllib.request

# Downloading nltk stopwords
nltk.download('stopwords')

# Set of stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
import pandas as pd

# Load the dataset from the local CSV file
try:
    df = pd.read_csv('/content/yelp.csv')
    print("Dataset loaded successfully from /content/yelp.csv:")
    display(df.head())
except FileNotFoundError:
    print("Error: /content/yelp.csv not found. Please make sure the file exists at this location.")
except Exception as e:
    print(f"An error occurred while loading the file: {e}")

Dataset loaded successfully from /content/yelp.csv:


Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [3]:
df.shape

(10000, 10)

### Text Preprocessing

In [6]:
import tensorflow as tf
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import urllib.request

# Downloading nltk stopwords
nltk.download('stopwords')

# Set of stopwords
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """
    Preprocess a single review:
    - Lowercasing
    - Removing links
    - Removing punctuation
    - Removing non-alphanumerical characters
    - Removing stopwords
    """
    text = text.lower()
    text = re.sub(r'http\S+', '', text)  # Remove links
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\W*\b\w{1,2}\b', '', text)  # Remove words with 1 or 2 letters
    text = re.sub(r"[^a-zA-Z.,&!?]+", r" ", text) # Ensure only alphanumerical characters
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Load the Yelp dataset
yelp_data = df

# Preprocess the reviews
yelp_data['processed_reviews'] = yelp_data['text'].apply(preprocess_text)

yelp_data

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny,processed_reviews
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0,wife took birthday breakfast excellent weather...
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0,idea people give bad reviews place goes show p...
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0,love gyro plate rice good also dig candy selec...
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0,rosie dakota love chaparral dog park convenien...
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0,general manager scott petello good egg detail ...
...,...,...,...,...,...,...,...,...,...,...,...
9995,VY_tvNUCCXGXQeSvJl757Q,2012-07-28,Ubyfp2RSDYW0g7Mbr8N3iA,3,First visit...Had lunch here today - used my G...,review,_eqQoPtQ3e3UxLE4faT6ow,1,2,0,first visithad lunch today used groupon ordere...
9996,EKzMHI1tip8rC1-ZAy64yg,2012-01-18,2XyIOQKbVFb6uXQdJ0RzlQ,4,Should be called house of deliciousness!\n\nI ...,review,ROru4uk5SaYc3rg8IU7SQw,0,0,0,called house deliciousness could item item bla...
9997,53YGfwmbW73JhFiemNeyzQ,2010-11-16,jyznYkIbpqVmlsZxSDSypA,4,I recently visited Olive and Ivy for business ...,review,gGbN1aKQHMgfQZkqlsuwzg,0,0,0,recently visited olive ivy business last week ...
9998,9SKdOoDHcFoxK5ZtsgHJoA,2012-12-02,5UKq9WQE1qQbJ0DJbc-B6Q,2,My nephew just moved to Scottsdale recently so...,review,0lyVoNazXa20WzUyZPLaQQ,0,0,0,nephew moved scottsdale recently bunch friends...


In [7]:

# Tokenizing the reviews
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(yelp_data['processed_reviews'])

# Get word frequencies
word_freq = tokenizer.word_counts

# Create a new tokenizer with words that have a minimum frequency of 5
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.word_index = {word: index for word, index in tokenizer.word_index.items() if word_freq[word] >= 5}

# Re-fit the tokenizer to the reviews
tokenizer.fit_on_texts(yelp_data['processed_reviews'])

# Converting reviews to sequences of IDs
sequences = tokenizer.texts_to_sequences(yelp_data['processed_reviews'])

# Filter out empty sequences
sequences = [seq for seq in sequences if seq]

# Padding sequences to ensure equal length
padded_sequences = pad_sequences(sequences, padding='post')