In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Install Transformers library
!pip install transformers

Mounted at /content/drive


In [3]:
import pandas as pd

# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CIS509/Restaurants_Data_2018.csv')

# Drop unnecessary columns
df = df[['text', 'name', 'is_open']]

# Group reviews by restaurant name
grouped_reviews = df.groupby('name')['text'].apply(lambda x: ' '.join(x)).reset_index()

# Merge with is_open column
grouped_data = grouped_reviews.merge(df[['name', 'is_open']], on='name').drop_duplicates()

# Convert is_open to binary labels
grouped_data['is_open'] = grouped_data['is_open'].astype(int)


In [4]:
from sklearn.model_selection import train_test_split

# Split the dataset into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(grouped_data['text'],
                                                                      grouped_data['is_open'],
                                                                      test_size=0.2,
                                                                      random_state=42)

In [5]:
from transformers import BertTokenizer

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize text data
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
import tensorflow as tf

# Convert labels to TensorFlow dataset
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), test_labels))


In [9]:
!pip install bertopic

import locale
locale.getpreferredencoding = lambda: "UTF-8"

from bertopic import BERTopic



In [11]:
from transformers import TFBertForSequenceClassification
learning_rates = [ 0.00002, 0.000002]
accuracies = []

for lr in learning_rates:
    # Fine-tune BERT model
    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    # Using Adam optimizer with specified learning rate
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    # Using sparse categorical cross-entropy loss function
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)  # Sparse categorical cross-entropy loss function

    metrics = ['accuracy']
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    # Fine-tune the model on the downstream task
    history = model.fit(train_dataset.shuffle(1000).batch(8), epochs=3, batch_size=8, verbose=0)

    # Evaluate the model
    test_loss, test_accuracy = model.evaluate(test_dataset.batch(16), verbose=0)
    accuracies.append(test_accuracy)

# Print accuracies for each learning rate
for lr, acc in zip(learning_rates, accuracies):
    print(f'Learning Rate: {lr}, Test Accuracy: {acc}')


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Learning Rate: 2e-05, Test Accuracy: 0.7401574850082397
Learning Rate: 2e-06, Test Accuracy: 0.7401574850082397


In [12]:
# Evaluate the model
test_loss, test_accuracy = model.evaluate(test_dataset.batch(8), verbose=2)
print(f'Test Accuracy: {test_accuracy}')


16/16 - 5s - loss: 0.5574 - accuracy: 0.7402 - 5s/epoch - 331ms/step
Test Accuracy: 0.7401574850082397


In [None]:
# Optionally, save the model for future use
# model.save_pretrained('/content/drive/My Drive/bert_model')


In [13]:
import numpy as np
from sklearn.metrics import classification_report

# Make predictions
predictions = model.predict(test_dataset.batch(16))
predicted_labels = np.argmax(predictions.logits, axis=1)

# Generate classification report
print(classification_report(test_labels, predicted_labels))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        33
           1       0.74      1.00      0.85        94

    accuracy                           0.74       127
   macro avg       0.37      0.50      0.43       127
weighted avg       0.55      0.74      0.63       127



In [17]:
topic_model = BERTopic(language="english")
topics, probabilities = topic_model.fit_transform(grouped_reviews['text'])


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [18]:
topic_freq = topic_model.get_topic_info()
top_5_topics = topic_freq.head(6)  # Includes the -1 topic which is for outliers

for topic in top_5_topics['Topic'][1:]:  # Excluding -1 if present
    print(f"Topic {topic} Overview:")
    print("Words and Weights:", topic_model.get_topic(topic))
    print("Representative Document:", topic_model.get_representative_docs(topic))
    print("\n")

Topic 0 Overview:
Words and Weights: [('taco', 0.05308446328229343), ('salsa', 0.03187793789097344), ('food', 0.03147385713370956), ('mexican', 0.027112915205173907), ('burrito', 0.025667169087116647), ('place', 0.024100052392378335), ('good', 0.02275853800884453), ('great', 0.01969361374967897), ('one', 0.016079618486939503), ('get', 0.015935989038223446)]


Topic 1 Overview:
Words and Weights: [('bowl', 0.031315302106876615), ('ramen', 0.02967151649632369), ('food', 0.029258254565647845), ('noodle', 0.027214879816444167), ('place', 0.026191512639439744), ('good', 0.02372244750736476), ('thai', 0.020828037559523876), ('rice', 0.018066172340120812), ('spicy', 0.017849202604052528), ('soup', 0.01733418125073179)]
Representative Document: ['great flavored food good service spring roll awesome seen review stating food greasy cooked perfectly fresh tasting good tom ka gai curry dish food overall pretty good like come thai fix pretty good consistent walk house mood thai definitely satisfies

In [19]:
topic_model.visualize_barchart(top_n_topics=5)


In [20]:
topic_model.visualize_topics()


**Topic modeling for closed restuarants reviews**

In [31]:
closed_restaurants_reviews = grouped_data[grouped_data['is_open']== 0]['text']

topic_model = BERTopic(language="english")
topic_model.calculate_probabilities = True
topics, probabilities = topic_model.fit_transform(closed_restaurants_reviews)

topic_freq = topic_model.get_topic_info()
top_5_topics = topic_freq.head(6)  # Includes the -1 topic which is for outliers

for topic in top_5_topics['Topic'][1:]:  # Excluding -1 if present
    print(f"Topic {topic} Overview:")
    print("Words and Weights:", topic_model.get_topic(topic))
    print("Representative Document:", topic_model.get_representative_docs(topic))
    print("\n")


topic_model.visualize_barchart(top_n_topics=5)


Topic 0 Overview:
Words and Weights: [('food', 0.05322277601516285), ('place', 0.04013378422408705), ('good', 0.038489046095754466), ('great', 0.03326437217830497), ('service', 0.02980050495091607), ('time', 0.02550333590536123), ('restaurant', 0.024654759988586763), ('back', 0.024144124420986582), ('one', 0.02392821106955637), ('like', 0.02384763353853834)]
Representative Document: ['yummmmmmmm boyfriend checked place whim smelled good walked got simple stuff even tho menu expansive phenomenal theyve homemade salad dressing bought bottle homemade daily soup waitress pam ray sunshine made whole experience much better plus place beautiful inside reasonably priced id definitely recommend wonderful food terrible customer service lied asked seated towards back said space sat u door cold went looked back room enough room back silverware gross waitress rude laura back game meal excellent service excellent great garlic bread well back often like location decor cute date food decent nothing sp

In [32]:
topic_model.visualize_hierarchy(top_n_topics=6)

In [33]:
print(topic_model.calculate_probabilities)
topic_model.calculate_probabilities = True

True


In [34]:
topic_model.visualize_distribution(probabilities[1],0.0002) #min_probability=0.0002

**topic modeling for Open restuarants reviews**

In [35]:
open_restaurants_reviews = grouped_data[grouped_data['is_open']== 0]['text']

topic_model = BERTopic(language="english")
topic_model.calculate_probabilities = True
topics, probabilities = topic_model.fit_transform(open_restaurants_reviews)

topic_freq = topic_model.get_topic_info()
top_5_topics = topic_freq.head(6)  # Includes the -1 topic which is for outliers

for topic in top_5_topics['Topic'][1:]:  # Excluding -1 if present
    print(f"Topic {topic} Overview:")
    print("Words and Weights:", topic_model.get_topic(topic))
    print("Representative Document:", topic_model.get_representative_docs(topic))
    print("\n")


topic_model.visualize_barchart(top_n_topics=5)


Topic 0 Overview:
Words and Weights: [('food', 0.052032687405809575), ('place', 0.04025956829347184), ('good', 0.03826316503467018), ('great', 0.033372005546988895), ('service', 0.02999981921508981), ('time', 0.025481322725764945), ('restaurant', 0.024666939632407164), ('like', 0.024292877567783835), ('back', 0.024051285052350598), ('one', 0.023510312627112215)]
Representative Document: ['yummmmmmmm boyfriend checked place whim smelled good walked got simple stuff even tho menu expansive phenomenal theyve homemade salad dressing bought bottle homemade daily soup waitress pam ray sunshine made whole experience much better plus place beautiful inside reasonably priced id definitely recommend wonderful food terrible customer service lied asked seated towards back said space sat u door cold went looked back room enough room back silverware gross waitress rude laura back game meal excellent service excellent great garlic bread well back often like location decor cute date food decent nothin

In [37]:
topic_model.visualize_hierarchy(top_n_topics=6)


In [38]:
topic_model.visualize_distribution(probabilities[1],0.0002) #min_probability=0.0002