In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Installing weights and bias
!pip install -qqq wandb datasets transformers

In [None]:
#import required libraries
import torch 
import wandb
import sys
from pathlib import Path
from datasets import load_dataset, Dataset
from torch import tensor,nn,device,cuda
from transformers import AutoTokenizer, TrainingArguments, Trainer,AutoModelForSequenceClassification,DataCollatorWithPadding
from transformers.trainer_callback import EarlyStoppingCallback,TrainerCallback
from huggingface_hub import HfFolder
from datasets import load_metric
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset,Dataset,load_metric
import tensorflow as tf

Log in to Hugging Face and Weights & Biases (W&B)

In [None]:
!pip install huggingface_hub

In [None]:
from huggingface_hub import login

# Authenticate this session with the Hugging Face Hub.
login()

In [None]:
# Read the W&B API key from Kaggle's secret store and log in with it,
# so the key itself never appears in the notebook.
from kaggle_secrets import UserSecretsClient
import wandb

user_secrets = UserSecretsClient()
my_secret = user_secrets.get_secret("wandb_api_key")
wandb.login(key=my_secret)

Set up Weights & Biases parameters


In [None]:
# ---- Weights & Biases project settings ----
PROJECT_NAME="restaurant_review"
DATASET_TYPE='dataset'
MODEL_TYPE='model'

# W&B job types, one per pipeline stage
RAW_DATA_JOB_TYPE='fetch_raw_data'
DATA_PROCESSING_JOB_TYPE='process-data'  # was misspelled 'preocess-data'
SPLIT_DATA_JOB_TYPE='split-data'
MODEL_TRAINING_JOB_TYPE='model-training'
MODEL_INFERENCE_JOB_TYPE='model-inference'

# W&B artifact names
RAW_DATA_ARTIFACT='restaurant_raw_data'
PROCESSED_DATA_ARTIFACT='processed_data'
TRAIN_DATA_ARTIFACT='restaurant_train_data'
TEST_DATA_ARTIFACT='restaurant_test_data'

# Data folders (all under the same 'restaurant-dataset' root)
RAW_DATA_FOLDER='restaurant-dataset/raw'  # was misspelled 'restaurent-dataset/raw'
PROCESSED_DATA_FOLDER='restaurant-dataset/processed'
TRAIN_DATA_FOLDER='restaurant-dataset/train'
TEST_DATA_FOLDER='restaurant-dataset/test'
MODEL_DATA_FOLDER='restaurant-dataset/model'

# ---- Transformers fine-tuning parameters ----
MODEL_NAME = "distilbert-base-uncased"
NUM_EPOCHS = 3
TRAIN_BATCH_SIZE = 32
EVAL_BATCH_SIZE = 32
WARMUP_STEPS = 500
LEARNING_RATE = 5e-5
FP16 = True

# ---- Hugging Face Hub parameters ----
PUSH_TO_HUB = True
# NOTE(review): this id mentions "complaints", not restaurant reviews —
# confirm it is the intended Hub repository.
HUB_MODEL_ID = "distilbert-complaints-wandb-product"
HUB_STRATEGY = "every_save"

Download and log raw data

In [None]:
# Start the run that records the raw dataset. Tag it with the raw-data job type
# (instead of the ad-hoc 'Baseline modeling' label) so it groups with the other
# pipeline stages defined in the configuration cell.
run=wandb.init(project=PROJECT_NAME,job_type=RAW_DATA_JOB_TYPE,save_code=True)


In [None]:
# Path of the raw reviews CSV inside the Kaggle input directory.
original_data='/kaggle/input/restaurant-reviews/Restaurant reviews.csv'


#text_dataset=load_dataset(original_data)

EDA


In [None]:
# Read the raw reviews CSV into a DataFrame and preview the first rows.
rest=pd.read_csv(original_data)

rest.head()

In [None]:
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that emitted a stray "None" line after the report).
rest.info()
rest.columns

In [None]:
# (rows, columns) of the frame as loaded.
rest.shape

In [None]:
# Frequency of each distinct value in the Rating column.
rest['Rating'].value_counts()

In [None]:
# Remove the rows whose Rating is the literal string 'Like'.
like_rows = rest.index[rest['Rating'] == 'Like']
rest = rest.drop(index=like_rows)
rest.shape

In [None]:
# Convert the Rating column to a numeric dtype; to_numeric raises if a value
# cannot be parsed as a number.
rest['Rating']=pd.to_numeric(rest['Rating'])
rest.columns

In [None]:
# Round half-star ratings up to the next whole star.
# The result is assigned back instead of calling replace(..., inplace=True) on
# the column selection: that form is chained assignment, which raises a
# FutureWarning in pandas 2.x and leaves `rest` unchanged under Copy-on-Write.
rest['Rating']=rest['Rating'].replace({1.5:2,2.5:3,3.5:4,4.5:5})

In [None]:
# Frequency of each distinct value in the Rating column at this point.
rest['Rating'].value_counts()

In [None]:
# Distribution of ratings: the dataset looks imbalanced, with rating 5 having
# the most records.
import matplotlib.pyplot as plt
rating_values = rest['Rating'].astype(float)
rating_values.hist()
plt.show()

In [None]:
# Create the raw-data artifact, attach the source CSV to it, log it, and close
# the run.
raw_artifact=wandb.Artifact(RAW_DATA_ARTIFACT,type=DATASET_TYPE)
# Previously no file was added (add_dir was commented out), so an empty
# artifact was logged and the raw data was not actually tracked.
raw_artifact.add_file(original_data)
run.log_artifact(raw_artifact)
run.finish()

In [None]:
# Drop the '7514' column.
# Reassigning with errors='ignore' keeps this cell safe to re-run; the in-place
# version raised KeyError once the column was already gone.
rest=rest.drop(columns=['7514'],errors='ignore')
rest.shape

Log data as a Table in W&B

In [None]:
# Log a table of the reviews to W&B in its own short run.
wandb.init(project=PROJECT_NAME,name="tables")

# Select the columns on the DataFrame itself and pass it via `dataframe=`.
# NOTE(review): when wandb.Table receives a DataFrame it appears to take its
# column list from the frame, so the previous `columns=[...]` argument did not
# restrict what was logged — confirm against the W&B Table reference.
table_columns=['Restaurant','Review','Rating']
wandb.log({"table":wandb.Table(dataframe=rest[table_columns])})
wandb.finish()

Process and log data


In [None]:
# Start the data-processing run.
run=wandb.init(project=PROJECT_NAME,job_type=DATA_PROCESSING_JOB_TYPE)

# Declaring the raw artifact as an input with use_artifact records it in W&B,
# so it can be tracked as part of the lineage.
run.use_artifact(f'{RAW_DATA_ARTIFACT}:latest')



In [None]:
# Preview rows that have a missing value in at least one column.
rest[rest.isna().any(axis=1)].head()


In [None]:
# Number of missing values in each column.
rest.isna().sum()

In [None]:
# Rows without review text cannot be used for modelling, so remove them.
# dropna(subset=...) drops those rows from the frame itself; the earlier
# rest['Review'].dropna(inplace=True) only acted on a temporary column
# selection and never changed `rest`.
rest=rest.dropna(subset=['Review'])
rest.shape

In [None]:
# Number of missing values in each column at this point.
rest.isna().sum()

In [None]:
# Count rows whose Review text repeats an earlier row (compares Review only).
rest['Review'].duplicated().sum()

In [None]:
# Drop rows that repeat the same (Restaurant, Review) pair, keeping the last
# occurrence of each. The index is not reset, so the surviving rows keep their
# original labels.
rest=rest.drop_duplicates(['Restaurant','Review'],keep='last')
rest.shape

In [None]:
# Select the model input (review text) and the target (rating); the actual
# train/validation/test splitting happens in later cells.
cols=['Review' ,'Restaurant']  # NOTE(review): not used within this cell
X=rest['Review']
y=rest['Rating']
#X.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))
X.head()

#There may be special characters and emojis in the reviews.

#Let's check whether any of the reviews contain emojis.

#Let's ask ChatGPT whether it can give us validated code to detect emojis and remove them.

I passed the following prompt to ChatGPT.

"You are an expert data scientist. Please solve my problem.
I have a pandas series with restaurant reviews. Some reviews contain emojis. I want to remove print reviews containing emojis.
Show me python code or pandas code to achieve this."

I got the following code. Let's check it.

In [None]:
import pandas as pd
import re

# Function to check if a string contains emojis
def contains_emoji(text):
    """Return True if ``text`` contains at least one emoji character.

    The character class lists emoji-bearing Unicode blocks explicitly. The
    previous catch-all range U+24C2..U+1F251 also spanned CJK ideographs,
    Hangul, full-width forms and other ordinary text, so reviews written in
    those scripts were reported as containing emojis. Every range below lies
    inside what the old pattern matched, so nothing new is flagged.

    Args:
        text: the review text. Non-string values (e.g. NaN) are treated as
            containing no emoji instead of raising TypeError.

    Returns:
        bool: True when an emoji is found, otherwise False.
    """
    if not isinstance(text, str):
        return False
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U0001F000-\U0001F251"  # tiles, cards, enclosed characters, flags
        "\U00002600-\U000026FF"  # Miscellaneous Symbols
        "\U00002700-\U000027BF"  # Dingbats
        "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows
        "\U0000FE00-\U0000FE0F"  # variation selectors (emoji presentation)
        "\U000025AA\U000025AB\U000025B6\U000025C0\U000025FB-\U000025FE"  # geometric-shape emoji
        "\U00003030\U0000303D\U00003297\U00003299"  # emoji located in CJK blocks
        "\U000024C2"  # circled M
        "]+",
        flags=re.UNICODE,
    )
    return bool(emoji_pattern.search(text))

# Keep only the reviews in which at least one emoji was detected
has_emoji = X.apply(contains_emoji)
X_emojis = X[has_emoji]



In [None]:
# Preview the first reviews flagged as containing emojis.
X_emojis.head()

In [None]:
# Look at one flagged review to see which emojis it contains.
# NOTE(review): 21 is presumably an index label carried over from the original
# frame rather than a position — confirm against X_emojis.head().
X_emojis[21]

#Let's ask ChatGPT whether it can give us validated code to remove them.

I passed the following prompt to ChatGPT.

"You are an expert data scientist. Please solve my problem.
I have a pandas series with restaurent reviews. Some reviews contain emojis. I want to remove emojis from each text that contains it.
Sometimes its attached with word so it becomes part of a word. In such cases ,I want to  remove only emojis and not the whole word.
Show me python code or pandas code to achieve this."

I got the following code. Let's check it.

In [None]:

# Function to remove emojis from a text while preserving attached words
def remove_emojis(text):
    """Return ``text`` with emoji characters removed.

    Only the emoji themselves are deleted, so a word an emoji was attached to
    stays intact. The character class lists emoji-bearing Unicode blocks
    explicitly. The previous catch-all range U+24C2..U+1F251 also spanned CJK
    ideographs, Hangul, full-width forms and other ordinary text, so such text
    was silently deleted from reviews. Every range below lies inside what the
    old pattern matched, so nothing new is removed.

    Args:
        text: the review text. Non-string values (e.g. NaN) are returned
            unchanged instead of raising TypeError.

    Returns:
        The text without emoji characters.
    """
    if not isinstance(text, str):
        return text
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U0001F000-\U0001F251"  # tiles, cards, enclosed characters, flags
        "\U00002600-\U000026FF"  # Miscellaneous Symbols
        "\U00002700-\U000027BF"  # Dingbats
        "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows
        "\U0000FE00-\U0000FE0F"  # variation selectors (emoji presentation)
        "\U000025AA\U000025AB\U000025B6\U000025C0\U000025FB-\U000025FE"  # geometric-shape emoji
        "\U00003030\U0000303D\U00003297\U00003299"  # emoji located in CJK blocks
        "\U000024C2"  # circled M
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub(r'', text)

# Strip emojis from every review
X_e = X.map(remove_emojis)

# Show the cleaned reviews
print(X_e)

In [None]:
# Check the same example again, this time in the emoji-stripped series.
X_e[21]

In [None]:
# Number of reviews in the emoji-stripped series.
X_e.shape

In [None]:
# Hold out 20% of the cleaned reviews as the test set, stratified by rating.
X_train, X_test, y_train, y_test = train_test_split(
    X_e,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Sanity-check the training split: sizes first, then a peek at features and labels.
for item in (X_train.shape, y_train.shape, X_train.head(), y_train.head()):
    print(item)

In [None]:
# Split the training portion again into train and validation sets (80/20,
# stratified by rating). The test set is kept aside for prediction.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

In [None]:
# Hyperparameters for the baseline model
vocab_size = 10000      # size passed to the tokenizer and the embedding layer
embedding_dim = 16      # width of each embedding vector
max_length = 120        # every sequence is padded/truncated to this length
trunc_type = padding_type = 'post'  # truncate and pad at the end of a sequence
oov_tok = '<OOV>'       # placeholder token for out-of-vocabulary words



In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Fit the vocabulary on the training split only, then encode and pad the
# training and validation texts with identical settings.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_tr)
word_index = tokenizer.word_index

pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

training_sequences = tokenizer.texts_to_sequences(X_tr)
train_padded = pad_sequences(training_sequences, **pad_options)

val_sequences = tokenizer.texts_to_sequences(X_val)
val_padded = pad_sequences(val_sequences, **pad_options)

In [None]:
# Spot-check one padded training sequence and one padded validation sequence.
print(train_padded[1])
print(val_padded[1])


In [None]:
# Encode and pad the test reviews with the same tokenizer and padding settings.
testing_sequences = tokenizer.texts_to_sequences(X_test)
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [None]:
# Spot-check one padded test sequence.
print(testing_padded[1])


In [None]:
# Baseline classifier: embedding -> bidirectional LSTM -> dense -> softmax.
# NOTE(review): the output layer has 6 units and the loss is sparse categorical
# cross-entropy, so the ratings are presumably used directly as class ids
# (1-5, leaving index 0 unused) — confirm against the label values.
model=tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size,embedding_dim,input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(24,activation='relu'),
    tf.keras.layers.Dense(6,activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# added a stray "None" line to the output.
model.summary()

In [None]:
num_epochs=10

# Convert inputs and labels to NumPy arrays for Keras.
training_padded=np.array(train_padded)
# Labels must come from the same split as train_padded (built from X_tr).
# y_train is the larger split taken before the validation rows were carved
# out, so it does not line up row-for-row with training_padded.
training_labels=np.array(y_tr)
val_labels=np.array(y_val)
testing_padded=np.array(testing_padded)
testing_labels=np.array(y_test)

# Train on the training split and monitor the validation split after each epoch.
history=model.fit(training_padded,
                  training_labels,
                  epochs=num_epochs,
                  validation_data=(val_padded, val_labels),
                  verbose=1)

Plot accuracy and losses

In [None]:
import matplotlib.pyplot as plt
def plot_graphs(history,string):
    """Plot a training metric and its validation counterpart against epochs.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch values (for example the value returned by ``model.fit``).
        string: name of the metric to plot; ``'val_' + string`` must also be a
            key of ``history.history``.
    """
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string])
    plt.xlabel('Epochs')
    plt.ylabel(string)
    # legend() expects the labels as ONE sequence. Passing the two names as
    # separate positional arguments made matplotlib read the first string as
    # the handles and the second as the labels.
    plt.legend([string,'val_'+string])
    plt.show()

# Draw the training vs. validation curves for accuracy and for loss.
plot_graphs(history,'accuracy')
plot_graphs(history,'loss')