In [None]:
import tensorflow as tf
from transformers import TFRobertaForSequenceClassification, RobertaTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd


In [1]:
import pandas as pd

In [5]:
!pip install transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Loading data

In [4]:
# Read the raw reviews CSV into a DataFrame. The path is relative, so it is
# resolved against the notebook's current working directory.
data_fake_reviews = pd.read_csv('../raw_data/fake reviews dataset.csv')

In [5]:
# (number of rows, number of columns) of the loaded frame.
data_fake_reviews.shape

(40432, 4)

In [6]:
# Preview the first rows of the loaded frame (head() shows five rows by default).
data_fake_reviews.head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


In [7]:
# Print a structural summary: index range, per-column non-null counts and dtypes,
# and approximate memory usage. Useful here to confirm there are no missing values.
data_fake_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40432 entries, 0 to 40431
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   category  40432 non-null  object 
 1   rating    40432 non-null  float64
 2   label     40432 non-null  object 
 3   text_     40432 non-null  object 
dtypes: float64(1), object(3)
memory usage: 1.2+ MB


In [8]:
# Distinct values found in the `label` column (the classes to be predicted).
data_fake_reviews["label"].unique()

array(['CG', 'OR'], dtype=object)

In [9]:
# Column labels of the loaded frame.
data_fake_reviews.columns

Index(['category', 'rating', 'label', 'text_'], dtype='object')

# Obtain the training dataset and validation dataset

In [50]:
# Mapping from the dataset's string labels to the integer class ids used for training.
dict_target = dict(CG=0, OR=1)


def encode_label(x):
    """Translate a string label into its integer class id.

    Args:
        x: A label value, expected to be one of the keys of ``dict_target``.

    Returns:
        The integer id registered for ``x`` in ``dict_target``, or ``-1`` when
        ``x`` is not a known label.
    """
    if x in dict_target:
        return dict_target[x]
    # Unknown labels are mapped to a sentinel rather than raising.
    return -1

In [51]:
# Add an integer `target` column by running every value of `label` through encode_label.
label_codes = data_fake_reviews["label"].apply(encode_label)
data_fake_reviews["target"] = label_codes

In [52]:
# Hold out 20% of the rows for validation. The rows are shuffled with a fixed
# seed so the split is reproducible; no stratification is requested.
train_dataset, valid_dataset = train_test_split(
    data_fake_reviews,
    test_size=0.2,
    random_state=2021,
    shuffle=True,
    stratify=None,
)

In [53]:
# Load the tokenizer for the 'roberta-base' checkpoint, i.e. the same checkpoint
# name that the classification model is loaded from later in this notebook.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [54]:
# Tokenize the review text of both splits with one shared set of options:
# sequences longer than 256 tokens are truncated, and padding is enabled.
tokenize_options = dict(truncation=True, padding=True, max_length=256)
train_encodings = tokenizer(train_dataset['text_'].tolist(), **tokenize_options)
valid_encodings = tokenizer(valid_dataset['text_'].tolist(), **tokenize_options)

In [55]:
# Build tf.data pipelines of (tokenizer features, integer target) pairs.
# NOTE: `train_dataset` / `valid_dataset` are DataFrames on entry and are rebound
# to tf.data.Dataset objects here, so this cell (and the tokenization cell) cannot
# be re-run without first re-running the train/validation split.
train_labels = train_dataset['target'].values
valid_labels = valid_dataset['target'].values
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))
valid_dataset = tf.data.Dataset.from_tensor_slices((dict(valid_encodings), valid_labels))

# Load the model

In [56]:
# Load the pretrained 'roberta-base' weights into a sequence-classification model.
# The classifier head is not part of the checkpoint and is freshly initialized
# (see the warning printed below), so the model must be fine-tuned before use.
# NOTE(review): no `num_labels` is passed, so the head size is the library default —
# presumably 2, matching the CG/OR targets; confirm.
model = TFRobertaForSequenceClassification.from_pretrained('roberta-base')



Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

In [57]:
# Configure training: Adam with a 1e-5 learning rate, sparse categorical
# cross-entropy computed on raw logits (integer targets), and accuracy as the metric.
adam = tf.keras.optimizers.Adam(learning_rate=1e-5)
logits_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=adam, loss=logits_loss, metrics=['accuracy'])

In [58]:
# Train the model for one epoch, validating on the held-out split.
# Batching is done on the tf.data pipelines themselves (`.batch(16)`), so
# `batch_size` is deliberately NOT passed to fit(): Keras documents that it must
# not be specified when the input is a dataset, because the dataset already
# yields batches.
# The returned History object is kept so the per-epoch loss/metrics can be
# inspected after training.
history = model.fit(
    train_dataset.shuffle(1000).batch(16),
    validation_data=valid_dataset.batch(16),
    epochs=1,
)



<keras.callbacks.History at 0x7f20f1b22f20>

In [62]:
from sklearn.metrics import classification_report
import numpy as np

# Make predictions on the validation dataset, in batches of 16.
valid_dataset_batched = valid_dataset.batch(16)
y_pred_raw = model.predict(valid_dataset_batched)

# Convert raw predictions (logits) to class predictions: the index of the
# largest logit in each row.
y_pred = np.argmax(y_pred_raw.logits, axis=1)

# Get the actual class labels. They are read batch-by-batch from the very same
# batched dataset that was used for prediction, so y_true and y_pred are aligned
# and far fewer Python-level iterations are needed than with one element at a time.
y_true = np.concatenate([labels.numpy() for _, labels in valid_dataset_batched])

# Generate the per-class precision/recall/F1 report and display it.
# target_names follows the integer encoding of the targets: 0 -> "CG", 1 -> "OR".
report = classification_report(y_true, y_pred, target_names=["CG", "OR"])
print(report)


              precision    recall  f1-score   support

          CG       0.96      0.99      0.97      4010
          OR       0.99      0.95      0.97      4077

    accuracy                           0.97      8087
   macro avg       0.97      0.97      0.97      8087
weighted avg       0.97      0.97      0.97      8087



In [65]:
# Persist the fine-tuned model to the relative path 'my_model_fake_reviews'.
# NOTE(review): the tokenizer is not saved alongside the model; whoever loads this
# artifact presumably has to recreate the 'roberta-base' tokenizer — confirm.
# NOTE(review): consider whether the library's `save_pretrained` would suit the
# intended loading path better than `save` — TODO confirm with consumers.
model.save('my_model_fake_reviews')



