In [76]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [77]:
from datasets import load_dataset, DatasetDict

import re
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.losses import SparseCategoricalCrossentropy

from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.optimizers import Adam

In [78]:
# One shared seed for both TensorFlow and NumPy random number generators.
seed = 2023
tf.random.set_seed(seed)
np.random.seed(seed)

In [79]:
# Map each split name to its CSV file, then load all splits in one call.
data_files = {
    x: f'/content/drive/MyDrive/datasets/yelp-data-v2/yelp-classification-{x}.csv'
    for x in ['test', 'train', 'validation']
}
data = load_dataset(path="csv", data_files=data_files)



  0%|          | 0/3 [00:00<?, ?it/s]

In [80]:
# Checkpoint name shared by the tokenizer and the classifier below.
checkpoint = 'roberta-base'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Collator built on the same tokenizer; configured to return TensorFlow tensors.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    return_tensors="tf",
)

# Instantiate the 3-label sequence classifier under the GPU device scope.
with tf.device('GPU'):
    model = TFAutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=3,
    )

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

In [81]:
# Drop the columns that are not used for training.
data = data.remove_columns([
    'review_id',
    'user_id',
    'business_id',
    'stars',
    'useful',
    'funny',
    'cool',
    'text',
    'date',
    'count_review',
    'length_review',
])

In [82]:
# Upper bound on the number of tokens kept per review.
max_input_length = 400


def tokenizar_data(example):
    """Tokenize the 'new_text' field of `example` with the notebook tokenizer.

    Sequences are truncated and padded to exactly `max_input_length` tokens.

    Args:
        example: Mapping that provides the text(s) under the 'new_text' key.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    encode_options = dict(
        max_length=max_input_length,
        truncation=True,
        padding='max_length',
    )
    return tokenizer(example['new_text'], **encode_options)

In [83]:
# Apply the tokenization function to every split, processing rows in batches.
data = data.map(function=tokenizar_data, batched=True)

Map:   0%|          | 0/1228 [00:00<?, ? examples/s]

Map:   0%|          | 0/21960 [00:00<?, ? examples/s]



In [84]:
# Display the dataset splits with their columns and row counts.
data

DatasetDict({
    test: Dataset({
        features: ['labels', 'new_text', 'input_ids', 'attention_mask'],
        num_rows: 1228
    })
    train: Dataset({
        features: ['labels', 'new_text', 'input_ids', 'attention_mask'],
        num_rows: 21960
    })
    validation: Dataset({
        features: ['labels', 'new_text', 'input_ids', 'attention_mask'],
        num_rows: 1228
    })
})

In [85]:
def _make_tf_dataset(split, shuffle=False, batch_size=8):
    """Convert one split of `data` into a batched `tf.data.Dataset`.

    Args:
        split: Name of the split in `data` ('train', 'validation' or 'test').
        shuffle: Whether the examples are shuffled before batching.
        batch_size: Number of examples per batch.

    Returns:
        The dataset produced by `to_tf_dataset`, yielding the
        'attention_mask' and 'input_ids' features with 'labels' as the target.
    """
    return data[split].to_tf_dataset(
        columns=["attention_mask", "input_ids"],
        label_cols=["labels"],
        shuffle=shuffle,
        collate_fn=data_collator,
        batch_size=batch_size,
    )


# Shuffle only the training split so each epoch sees the examples in a
# different order; evaluation splits keep their original order.
tf_train_dataset = _make_tf_dataset("train", shuffle=True)
tf_validation_dataset = _make_tf_dataset("validation")
tf_test_dataset = _make_tf_dataset("test")

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [86]:
batch_size = 8
num_epochs = 2

# Total number of optimizer steps: batches per epoch times number of epochs.
num_train_steps = num_epochs * len(tf_train_dataset)

# Learning rate decays from 5e-5 down to 0.0 over the whole training run.
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5,
    decay_steps=num_train_steps,
    end_learning_rate=0.0,
)
opt = Adam(learning_rate=lr_scheduler)

# The loss is configured with from_logits=True, i.e. it is fed raw logits.
model.compile(
    loss=SparseCategoricalCrossentropy(from_logits=True),
    optimizer=opt,
    metrics=["accuracy"],
)

model.summary()

Model: "tf_roberta_for_sequence_classification_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFRobertaMainLayer  multiple                 124055040 
 )                                                               
                                                                 
 classifier (TFRobertaClassi  multiple                 592899    
 ficationHead)                                                   
                                                                 
Total params: 124,647,939
Trainable params: 124,647,939
Non-trainable params: 0
_________________________________________________________________


In [87]:
# Fine-tune on the training split, evaluating on the validation split each epoch.
model.fit(
    tf_train_dataset,
    epochs=num_epochs,
    validation_data=tf_validation_dataset,
    verbose=True,
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f55ddccd7b0>

In [88]:
# Persist the fine-tuned model to Google Drive.
# NOTE(review): only the model is saved here, not the tokenizer — confirm that is intended.
model.save_pretrained('/content/drive/MyDrive/datasets/yelp-model-v2')

In [96]:
def use_model(example, tokenizer=tokenizer, model=model, max_length=max_input_length):
    """Predict the label name for a single review text.

    Args:
        example: Review text to classify (one string).
        tokenizer: Tokenizer used to encode the text.
        model: Classifier whose output exposes a `.logits` attribute.
        max_length: Maximum number of tokens kept when encoding; defaults to
            the limit used when tokenizing the dataset.

    Returns:
        One of 'cool', 'funny' or 'useful'.
    """
    # Truncate like the training-time tokenization so an over-long review
    # cannot exceed the sequence length the model accepts.
    input_data = tokenizer(
        example,
        return_tensors='tf',
        truncation=True,
        max_length=max_length,
    )
    outputs = model(**input_data)

    # Take the first (only) row explicitly: calling int() on a 1-element
    # array is deprecated in recent NumPy releases.
    predicted_label = int(np.argmax(outputs.logits, axis=1)[0])
    # NOTE(review): index order presumably matches the label encoding used
    # in the training CSVs — confirm.
    labels = ['cool', 'funny', 'useful']
    return labels[predicted_label]

In [112]:
# Take one review from the test split, print it followed by a blank line,
# then display the predicted label as the cell output.
example = data['test'][64]['new_text']
print(f'REVIEW:\n{example}', end='\n\n')
use_model(example)

REVIEW:
The best car buying experience I have ever had. I submitted for a quote online and very quickly received a response from <PERSON> the internet sales manager. After checking several different dealerships and exploring different models I found that <PERSON> <PERSON> was by far the best deal on the market. Upon arriving at the dealership I was very sad to find out that someone had beat me to the car I wanted but <PERSON> and his team were great to honor the deal on any new car of the same model. I was set on one color and there were none on the lot but there was one just be delivered, so they brought a car off the delivery truck for me. It was so new, it did n't have air in the tires or oil in the engine yet! As we waited for them to prep the car I wanted, I test drove the same model in a different color. I told <PERSON> where my payments must be in order to make a deal and he worked very hard to get us there. When we arrived at a deal <PERSON> bought us lunch at the lunch counter

'cool'