In [40]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Install Transformers library
!pip install transformers

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:
import pandas as pd

# Load the dataset
df = pd.read_csv('/content/drive/My Drive/Restaurants_Data_2018.csv')

# Keep only the columns used below
df = df[['text', 'name', 'is_open']]

# Concatenate all review texts that share a restaurant name into one
# space-separated document per name.
# NOTE(review): ' '.join raises TypeError if any `text` value is missing (NaN);
# confirm the column has no nulls.
grouped_reviews = df.groupby('name')['text'].apply(lambda x: ' '.join(x)).reset_index()

# Attach the is_open label. The (name, is_open) pairs are de-duplicated BEFORE
# the merge: merging against the full frame would first create one copy of the
# whole concatenated document per original row and only then throw them away.
# The resulting rows and their order are the same either way.
# NOTE(review): a name that appears with both is_open values still produces two
# rows with identical text and opposite labels; decide how such names should
# be labelled.
restaurant_labels = df[['name', 'is_open']].drop_duplicates()
grouped_data = grouped_reviews.merge(restaurant_labels, on='name')

# Convert is_open to integer class labels
grouped_data['is_open'] = grouped_data['is_open'].astype(int)


In [42]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the grouped rows for testing; the fixed random_state keeps
# the partition the same across runs.
split_parts = train_test_split(
    grouped_data['text'],
    grouped_data['is_open'],
    test_size=0.2,
    random_state=42,
)
train_texts, test_texts, train_labels, test_labels = split_parts

In [43]:
from transformers import BertTokenizer

# Tokenizer matching the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode the train split, then the test split, with identical settings:
# truncation enabled and padding enabled.
train_encodings, test_encodings = (
    tokenizer(texts.tolist(), truncation=True, padding=True)
    for texts in (train_texts, test_texts)
)


In [44]:
import tensorflow as tf


def _as_tf_dataset(encodings, labels):
    """Pair tokenizer output with its labels as a sliced tf.data.Dataset.

    Each element is a ``(features, label)`` tuple, where ``features`` is the
    dict built from ``encodings``.
    """
    features = dict(encodings)
    return tf.data.Dataset.from_tensor_slices((features, labels))


# Unbatched datasets; batching is applied where they are consumed.
train_dataset = _as_tf_dataset(train_encodings, train_labels)
test_dataset = _as_tf_dataset(test_encodings, test_labels)


In [46]:
# Imported in this cell so it runs on a fresh kernel: no other visible cell
# brings TFBertForSequenceClassification into scope.
from transformers import TFBertForSequenceClassification

learning_rates = [0.00002, 0.000002]
accuracies = []

for lr in learning_rates:
    # Reload the pretrained checkpoint so every learning rate starts from the
    # same weights instead of continuing from the previous run.
    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    # Adam optimizer with the learning rate under test
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    # The model outputs raw logits and the labels are integer class ids,
    # hence sparse categorical cross-entropy with from_logits=True.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    metrics = ['accuracy']
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    # Fine-tune on the training set. The dataset is already batched via
    # .batch(8), so batch_size is not passed to fit(): Keras documents that it
    # must not be specified for tf.data.Dataset inputs.
    history = model.fit(train_dataset.shuffle(1000).batch(8), epochs=3, verbose=0)

    # Evaluate on the held-out test set
    test_loss, test_accuracy = model.evaluate(test_dataset.batch(16), verbose=0)
    accuracies.append(test_accuracy)

# NOTE: after the loop, `model` (and `history`) refer to the run with the LAST
# learning rate in the list, not necessarily the best one.

# Print accuracies for each learning rate
for lr, acc in zip(learning_rates, accuracies):
    print(f'Learning Rate: {lr}, Test Accuracy: {acc}')


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Learning Rate: 2e-05, Test Accuracy: 0.7401574850082397
Learning Rate: 2e-06, Test Accuracy: 0.7401574850082397


In [47]:
# Score the model currently bound to `model` on the test set, in batches of 8,
# with per-run progress output (verbose=2).
eval_batches = test_dataset.batch(8)
test_loss, test_accuracy = model.evaluate(eval_batches, verbose=2)
print(f'Test Accuracy: {test_accuracy}')


16/16 - 5s - loss: 0.5571 - accuracy: 0.7402 - 5s/epoch - 339ms/step
Test Accuracy: 0.7401574850082397


In [48]:
# Optionally, save the model for future use
# model.save_pretrained('/content/drive/My Drive/bert_model')


In [49]:
import numpy as np
from sklearn.metrics import classification_report

# Predict on the test set; the predicted class is the index of the largest logit
predictions = model.predict(test_dataset.batch(16))
predicted_labels = np.argmax(predictions.logits, axis=1)

# Per-class precision / recall / F1.
# Precision is undefined for a class that is never predicted; zero_division=0
# reports it as 0 explicitly instead of emitting a warning for each average.
print(classification_report(test_labels, predicted_labels, zero_division=0))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        33
           1       0.74      1.00      0.85        94

    accuracy                           0.74       127
   macro avg       0.37      0.50      0.43       127
weighted avg       0.55      0.74      0.63       127



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
%%shell
# Convert the notebook at the path below to HTML with nbconvert.
# NOTE(review): the path is hard-coded; it must match where the .ipynb is
# actually stored in this runtime.
jupyter nbconvert --to html /content/LA5_Tyagi_Upmanyu.ipynb