# Workflow
1. [Data Content](#1)
2. [Import Library](#2)
3. [Data Preprocessing](#3)
4. [Data Visualization](#4)
5. [Train Test Validation Split](#5)
6. [Model Train](#6)
7. [Results Visualization](#7)

<a id='1'></a>
# Data Content

1. **Predict Review Rating**

1. **Topic Modeling on Reviews**

<a id='2'></a>
# Import Library

In [None]:
import numpy as np 
import pandas as pd 
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import emoji
# Fetch the NLTK resources: the stop-word list (read by clean_text), the Punkt
# tokenizer data, and WordNet (read by WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
# Notebook shell escape: extract the WordNet archive next to the zip file.
# NOTE(review): presumably required because the lemmatizer cannot read the
# zipped corpus on this Kaggle image - confirm before removing.
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import warnings
warnings.filterwarnings('ignore')


<a id='3'></a>
# Data Preprocessing

# The dataset is hosted on Kaggle; inside this notebook it is available at the following path:

/kaggle/input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv

# Related article on topic modeling with LDA:

https://blog.devgenius.io/nlp-topic-modeling-lda-latent-dirichlet-allocation-f87679750e34

In [None]:
# Load the TripAdvisor hotel reviews CSV from the Kaggle input directory.
df=pd.read_csv("/kaggle/input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv")

In [None]:
# Preview the first rows.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Number of fully duplicated rows.
df.duplicated().sum()

In [None]:
df.head()

In [None]:
# Number of distinct rating values.
df.Rating.nunique()

In [None]:
def label_encode(x):
    """Collapse a star rating into a sentiment class id.

    Ratings 1 and 2 become 0, rating 3 becomes 1, ratings 4 and 5 become 2.
    Any other value yields ``None``.
    """
    if x in (1, 2):
        return 0
    if x in (4, 5):
        return 2
    return 1 if x == 3 else None
    
def label2name(x):
    """Translate a sentiment class id (0, 1 or 2) into its display name.

    Returns "Negative", "Neutral" or "Positive"; any other value yields
    ``None``.
    """
    for class_id, class_name in enumerate(("Negative", "Neutral", "Positive")):
        if x == class_id:
            return class_name
    return None

In [None]:
# Peek at the raw rating values before they are mapped to sentiment classes.
df.Rating.values

In [None]:
# Derive the 3-class sentiment id from the star rating, then its readable name.
df["label"] = df["Rating"].apply(label_encode)
df["label_name"] = df["label"].apply(label2name)
df.head()

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _cleaning_resources():
    """Build the stop-word set and the lemmatizer once and reuse them for every review."""
    return frozenset(stopwords.words("english")), WordNetLemmatizer()


def clean_text(text):
    """Normalize a raw review into a lower-case, lemmatized, stop-word-free string.

    Steps, in order: emoji are replaced by their textual names, the text is
    lower-cased, every character that is not an ASCII letter, digit or
    whitespace is removed, digit runs are removed, English stop words are
    dropped and each remaining token is lemmatized with WordNet.

    Args:
        text: The raw review text.

    Returns:
        The cleaned tokens joined by single spaces ("" when nothing survives).
    """
    # Emoji have to be translated BEFORE the character filter: the filter
    # deletes every emoji, which made the original trailing demojize() call a
    # no-op. Space delimiters keep the emoji name separate from its neighbours.
    text = emoji.demojize(text, delimiters=(" ", " "))
    # Emoji names join their words with "_"; split them so the filter below
    # does not glue them together (this also splits snake_case tokens that
    # occur in the review itself).
    text = text.replace("_", " ").lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\d+', '', text)

    stop_words, lemmatizer = _cleaning_resources()
    return " ".join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )
    
    

In [None]:
# Store a cleaned, lemmatized copy of every review next to the raw text.
df["lemmatize_text"] = df["Review"].apply(clean_text)
df.head()

In [None]:
# Character count of each raw review. ``.str.len()`` is explicitly
# element-wise; ``Series.agg(len)`` is a reduction API that only produced
# per-row lengths through an element-wise fallback which newer pandas
# deprecates (there it evaluates to the number of rows instead).
# The column name keeps its original spelling so existing references still work.
df["text_lenght"]=df["Review"].str.len()
df.head()

<a id='4'></a>
# Data Visualization

In [None]:
from collections import Counter

# Count every whitespace-separated token across all raw reviews.
most_word=" ".join(df["Review"].values).split()
word_counts=Counter(most_word)

# Build the table from most_common() so rows are ordered by descending
# frequency. Iterating .items() yields first-seen order, which made any
# positional "top N" slice of this frame return arbitrary words.
most_common_words_list = [{'word': word, 'count': count} for word, count in word_counts.most_common()]

most_words_df=pd.DataFrame(data=most_common_words_list,columns=["word","count"])
most_words_df.head()

In [None]:
# Summary of the word-frequency table (it holds one row per distinct token).
most_words_df.info()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Select rows by frequency rather than by position, so the charts show the
# most used words whatever the row order of most_words_df is.
top_words = most_words_df.nlargest(20, "count")
top_pie_words = top_words.head(8)

# One figure with two stacked panels. Opening a second figure before the pie
# left each figure half empty and applied the styling below to the pie only.
fig = plt.figure(figsize=(15,15))
plt.subplot(2,1,1)
sns.barplot(data=top_words, y="word", x="count", palette="viridis")

labels=top_pie_words["word"].values
sizes=top_pie_words["count"].values
# Pull out only the first (most frequent) slice; sized from the data so the
# pie still renders when fewer than eight distinct words exist.
explode = tuple(0.1 if position == 0 else 0 for position in range(len(sizes)))
colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue',"Cyan","red","orange","Brown"]

plt.subplot(2,1,2)

plt.pie(sizes,explode=explode,labels=labels,colors=colors[:len(sizes)],autopct='%1.1f%%', shadow=True,textprops={'fontsize': 10},labeldistance=0.85, startangle=0)
plt.title("most used words")
# White disc in the centre turns the pie into a donut.
plt.gca().add_artist(plt.Circle((0,0),0.70,fc='white'))

fig.set_facecolor('#f0f0f0')

plt.tight_layout()
plt.show()

In [None]:
from wordcloud import WordCloud


# Join the reviews with spaces. str(list(...)) would hand the list's repr to
# the word cloud, i.e. the text plus brackets, quotes and escape sequences.
text=" ".join(df["Review"].astype(str))
# NOTE: rcParams is global, so this size also applies to figures created later.
plt.rcParams['figure.figsize'] = (15, 15)
wordcloud = WordCloud(background_color = 'white', width = 1200,  height = 1200, max_words = 121).generate(text)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

<a id='5'></a>
# Train Test Validation Split

In [None]:
from sklearn.model_selection import train_test_split

# First split: hold out 15% of all rows as the validation set.
train_text,val_text,train_label,val_label=train_test_split(df['lemmatize_text'], df['label'], test_size=0.15, random_state=42)

# Second split: take 10% of the remaining 85% (8.5% of all rows) as the test
# set, which leaves 76.5% for training. Both splits use the same seed and no
# stratify argument is passed.
train_text,test_text,train_label,test_label=train_test_split(train_text, train_label, test_size=0.1, random_state=42)

# Report the number of samples in each split.
print("train_text shape: ",train_text.shape)
print("val_text shape:",val_text.shape)
print("test_text shape: ",test_text.shape)

In [None]:
import torch
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def bert_tokenize(text):
    """Encode a collection of texts with the module-level ``tokenizer``.

    ``text`` must expose ``.tolist()``. The tokenizer is called with
    ``padding=True``, ``truncation=True`` and ``return_tensors="pt"``, and
    its result is returned unchanged.
    """
    sentences = text.tolist()
    encoding_options = {"padding": True, "truncation": True, "return_tensors": "pt"}
    return tokenizer(sentences, **encoding_options)

# Encode each split with the shared tokenizer (train, then validation, then test).
train_tokenized, val_tokenized, test_tokenized = (
    bert_tokenize(split) for split in (train_text, val_text, test_text)
)

# Replace the label Series with tensors.
train_label, val_label, test_label = (
    torch.tensor(split_labels.tolist())
    for split_labels in (train_label, val_label, test_label)
)


def _to_dataset(encoded, labels):
    """Bundle token ids, attention mask and labels into one TensorDataset."""
    return torch.utils.data.TensorDataset(
        encoded["input_ids"], encoded["attention_mask"], labels
    )


train_dataset = _to_dataset(train_tokenized, train_label)
val_dataset = _to_dataset(val_tokenized, val_label)
test_dataset = _to_dataset(test_tokenized, test_label)

batch_size = 32
# Training and validation batches are shuffled; test batches keep dataset order.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True
)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)
val_dataloader = torch.utils.data.DataLoader(
    val_dataset, batch_size=batch_size, shuffle=True
)

<a id='6'></a>
# Model Train

In [None]:
def train(model,train_loader,optimizer,criteron):
    """Run one training epoch and report the mean loss and accuracy.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``. It must own at least one parameter.
        train_loader: Iterable of ``(input_ids, attention_mask, labels)``
            batches.
        optimizer: Optimizer that is stepped once per batch.
        criteron: Loss callable invoked as ``criteron(logits, labels)``. The
            misspelled parameter name is kept for backward compatibility.

    Returns:
        ``(train_loss, train_accuracy)``: the loss averaged over batches and
        the percentage of correctly classified samples.

    Raises:
        ZeroDivisionError: If ``train_loader`` yields no samples.
    """
    model.train()
    # Follow the model's own device instead of depending on a global ``device``.
    device = next(model.parameters()).device
    train_loss = 0
    correct = 0
    total = 0

    for batch in train_loader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        optimizer.zero_grad()
        # Labels are not forwarded to the model: the loss comes from the
        # criterion below, so a model-internal loss would be computed only to
        # be discarded.
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Use the criterion that was passed in. The original body read a
        # global named ``criterion`` and silently ignored this argument.
        loss = criteron(logits, labels)

        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = logits.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

    train_accuracy = 100 * correct / total
    train_loss /= len(train_loader)
    return train_loss, train_accuracy

In [None]:
def validate(model,valid_loader,criterion):
    """Evaluate the model on a loader without updating its weights.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``. It must own at least one parameter.
        valid_loader: Iterable of ``(input_ids, attention_mask, labels)``
            batches.
        criterion: Loss callable invoked as ``criterion(logits, labels)``.

    Returns:
        ``(val_loss, val_accuracy)``: the loss averaged over batches and the
        percentage of correctly classified samples.

    Raises:
        ZeroDivisionError: If ``valid_loader`` yields no samples.
    """
    model.eval()
    # Follow the model's own device; the original relied on a global
    # ``device`` that is only defined in a later cell.
    device = next(model.parameters()).device
    val_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in valid_loader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, labels)

            val_loss += loss.item()
            _, predicted = logits.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

    val_accuracy = 100.0 * correct / total
    val_loss /= len(valid_loader)
    return val_loss, val_accuracy

In [None]:
from transformers import BertForSequenceClassification
import torch.optim as optim
from torch.nn.parallel import DataParallel


# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained BERT with a 3-output classification head, matching the three
# sentiment class ids (0, 1, 2) produced by label_encode.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3) 
# Wrap in DataParallel; the wrapped model stays reachable as ``model.module``.
model =DataParallel(model)
model = model.to(device)

# Adam over all model parameters, cross-entropy on the logits, five epochs.
optimizer= optim.Adam(model.parameters(),lr=2e-5)
criterion=torch.nn.CrossEntropyLoss()
epochs=5

In [None]:
# Per-epoch history, read by the result plots further down.
train_losses, validation_losses = [], []
train_accuracy, validation_accuracy = [], []

for epoch in range(epochs):
    # One pass over the training data, then one evaluation pass.
    train_loss, train_acc = train(model, train_dataloader, optimizer, criterion)
    val_loss, val_acc = validate(model, val_dataloader, criterion)

    train_losses.append(train_loss)
    validation_losses.append(val_loss)
    train_accuracy.append(train_acc)
    validation_accuracy.append(val_acc)

    print(f"Epoch {epoch+1}/{epochs}: Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f} Train Accuracy: {train_acc:.2f}%, Validation Accuracy: {val_acc:.2f}%")

# Persist the weights as they are after the final epoch.
torch.save(model.state_dict(), "bert_model.pth")


In [None]:
# Make sure the export directory exists (no error if it already does).
file_path = '/kaggle/working/modelsave/'  
os.makedirs(file_path, exist_ok=True)

In [None]:
# Unwrap the DataParallel container and export the underlying model with
# save_pretrained into the directory created above.
save1 = model.module
save1.save_pretrained("/kaggle/working/modelsave/")

<a id='7'></a>
# Results Visualization

In [None]:
import matplotlib.pyplot as plt
# Both curves share a single y-axis. The original drew them on a twin axis,
# called legend() on the empty primary axis, hid that axis' tick labels and
# set the title after tight_layout(), so the layout did not account for it.
epoch_numbers = np.arange(1, len(train_accuracy) + 1)

fig, ax1 = plt.subplots()
ax1.plot(epoch_numbers, np.array(validation_accuracy), label="Validation Acc", color="green")
ax1.plot(epoch_numbers, np.array(train_accuracy), label="Train Acc", color="red")
ax1.set_xlabel('Epoch')
# train() and validate() report accuracy as a percentage.
ax1.set_ylabel('Accuracy (%)')
ax1.set_xticks(epoch_numbers)
ax1.set_title("Train vs Validation Accuracy")
ax1.legend()

fig.tight_layout()
plt.show()

In [None]:
# Both curves share a single y-axis. The original drew them on a twin axis,
# called legend() on the empty primary axis, hid that axis' tick labels and
# set the title after tight_layout(), so the layout did not account for it.
epoch_numbers = np.arange(1, len(train_losses) + 1)

fig, ax1 = plt.subplots()
ax1.plot(epoch_numbers, np.array(validation_losses), label="Validation Loss", color="green")
ax1.plot(epoch_numbers, np.array(train_losses), label="Train Loss", color="red")
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.set_xticks(epoch_numbers)
ax1.set_title("Train vs Validation Loss")
ax1.legend()

fig.tight_layout()
plt.show()

In [None]:
# Restore the weights written by the training cell, then score the test split.
state_dict = torch.load("/kaggle/working/bert_model.pth")
model.load_state_dict(state_dict)
model.eval()

predictions, actual_labels = [], []

with torch.no_grad():
    for batch in test_dataloader:
        # Each batch is (input_ids, attention_mask, labels).
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        logits = model(input_ids, attention_mask=attention_mask).logits
        # Index of the highest logit is the predicted class id.
        predicted = logits.max(1)[1]

        predictions.extend(predicted.cpu().numpy())
        actual_labels.extend(labels.cpu().numpy())

results_df = pd.DataFrame({"Actual": actual_labels, "Predicted": predictions})
results_df

In [None]:
from sklearn.metrics import accuracy_score

# Compare ground truth with the predictions collected on the test split.
true_labels = results_df.Actual
predicted_labels = results_df.Predicted

correct_predictions = (true_labels == predicted_labels).sum()
accuracy = accuracy_score(true_labels, predicted_labels)

print(f"Total number of correct predictions: {correct_predictions}")
print(f"Accuracy Score: {accuracy}")

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

true_labels=results_df["Actual"]
predicted_labels=results_df["Predicted"]

# Fix the class order explicitly so the matrix is always 3x3 and lines up with
# the tick labels, even when a class is absent from this split.
class_ids = [0, 1, 2]
class_names = ["Negative", "Neutral", "Positive"]

confusionMatrix = confusion_matrix(true_labels, predicted_labels, labels=class_ids)
ax = sns.heatmap(confusionMatrix, annot=True, cmap='viridis', fmt='g', xticklabels=class_names, yticklabels=class_names)
# confusion_matrix puts the true classes on rows and the predictions on columns.
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
from transformers import BertConfig
# Read the configuration from the export directory (presumably written there
# by the earlier save_pretrained call).
config = BertConfig.from_json_file('/kaggle/working/modelsave/config.json')

# Rebind the global ``model`` and ``tokenizer`` to freshly loaded instances.
# NOTE(review): from here on ``model`` is no longer the DataParallel-wrapped
# training model and it is not moved to ``device`` - presumably intended for
# CPU inference in the cells below; confirm.
model =  BertForSequenceClassification.from_pretrained('/kaggle/working/modelsave/', config=config)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Look at the frame again before running sample predictions.
df.head()

In [None]:
# First cleaned review.
df.lemmatize_text[0]

In [None]:
# Distinct sentiment names present in the data.
df['label_name'].unique()

# Sentiment

In [None]:
df['label_name'].unique()

In [None]:
# Encoded sentiment label of the first review.
df.label.values[0]

In [None]:
# Spot-check the reloaded model on the first reviews of the dataset.
n_samples = 30
sample_texts = df.lemmatize_text.values[:n_samples]


def bert_tokenize(text):
    """Encode texts (anything exposing ``.tolist()``) with the global ``tokenizer``."""
    tokenized= tokenizer(text.tolist() ,padding=True, truncation=True, return_tensors="pt")
    return tokenized


encoded = bert_tokenize(sample_texts)

model.eval()
predictions = []
with torch.no_grad():
    # Iterate over what was actually encoded, so a frame with fewer than
    # n_samples rows still works.
    for i in range(len(sample_texts)):
        # Slice every tensor explicitly into a batch of one instead of relying
        # on slice indexing of the tokenizer's return object.
        single = {name: tensor[i:i + 1] for name, tensor in encoded.items()}
        model_output = model(**single)
        predicted = torch.softmax(model_output.logits, dim=1).tolist()[0]
        predictions.append(predicted)

# Order matches the class ids 0, 1, 2.
label_names=["Negative","Neutral","Positive"]
for i, predicted in enumerate(predictions):
    print("text {}: {}\n".format(i,df.lemmatize_text.iloc[i]))
    for j,label_name in enumerate(label_names):
        print(f"{label_name}: {predicted[j] * 100:.2f}%")

    print("\nactual label_name:{}, actual rating score:{}\n".format(df.label_name.iloc[i],df.Rating.iloc[i]))
    

# If you liked this notebook, you can support it with an upvote. :))