In [37]:
import tensorflow as tf
from transformers import TFRobertaForSequenceClassification, RobertaTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd


# Loading data

In [48]:
# Location of the raw reviews CSV, relative to the notebook's working directory.
reviews_csv_path = '../raw_data/fake reviews dataset.csv'
data_fake_reviews = pd.read_csv(reviews_csv_path)

In [39]:
# Quick size check: (number of rows, number of columns) of the loaded frame.
data_fake_reviews.shape

(40432, 4)

In [40]:
# Peek at the first five rows to see what each column looks like.
data_fake_reviews.head(n=5)

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


In [41]:
# Per-column summary: names, non-null counts and dtypes, plus memory usage.
data_fake_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40432 entries, 0 to 40431
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   category  40432 non-null  object 
 1   rating    40432 non-null  float64
 2   label     40432 non-null  object 
 3   text_     40432 non-null  object 
dtypes: float64(1), object(3)
memory usage: 1.2+ MB


In [42]:
# Distinct values present in the label column (these are the classes to encode).
data_fake_reviews["label"].unique()

array(['CG', 'OR'], dtype=object)

In [43]:
# List the column names available in the frame.
data_fake_reviews.columns

Index(['category', 'rating', 'label', 'text_'], dtype='object')

# Obtain the training dataset and validation dataset

In [49]:
# Raw label string -> integer class id.
# NOTE(review): 'CG' / 'OR' presumably mean computer-generated vs. original
# reviews -- confirm against the dataset documentation.
dict_target = {'CG': 0, 'OR': 1}


def encode_label(x):
    """Translate a raw label into its integer class id.

    Args:
        x: A label value to look up in ``dict_target``.

    Returns:
        The integer stored under ``x`` in ``dict_target``, or the sentinel
        ``-1`` when ``x`` is not one of its keys.
    """
    if x in dict_target:
        return dict_target[x]
    return -1

In [50]:
# Encode every label as its integer class id. The function is passed directly;
# wrapping it in a lambda added nothing.
data_fake_reviews["target"] = data_fake_reviews["label"].map(encode_label)

# encode_label answers -1 for any label missing from dict_target. A negative
# class id is not a valid target for the sparse cross-entropy loss used at
# training time, so stop here instead of letting it leak into the dataset.
unmapped_labels = data_fake_reviews.loc[data_fake_reviews["target"] < 0, "label"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Labels missing from dict_target: {list(unmapped_labels)!r}")

In [51]:
# Hold out 20% of the rows for validation. Rows are shuffled before the cut and
# the fixed random_state makes the split repeatable; no stratification is applied.
train_dataset, valid_dataset = train_test_split(
    data_fake_reviews,
    test_size=0.2,
    random_state=2021,
    shuffle=True,
    stratify=None,
)

In [52]:
# Load the tokenizer that belongs to the 'roberta-base' checkpoint.
tokenizer_checkpoint = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(tokenizer_checkpoint)

In [53]:
# Tokenize both splits with one shared set of options so they cannot drift apart:
# sequences are truncated at 256 tokens and padded (padding=True pads to the
# longest sequence within each call, so the two splits may end up with different widths).
tokenize_options = dict(truncation=True, padding=True, max_length=256)

train_texts = train_dataset['text_'].tolist()
valid_texts = valid_dataset['text_'].tolist()

train_encodings = tokenizer(train_texts, **tokenize_options)
valid_encodings = tokenizer(valid_texts, **tokenize_options)

In [54]:
# Build (features, label) tf.data pipelines from the tokenizer output and the
# integer targets of each split.
#
# NOTE: train_dataset / valid_dataset are DataFrames on entry and are rebound to
# tf.data.Dataset objects here, so this cell cannot be re-run on its own --
# re-run the split cell first.
train_features = dict(train_encodings)
train_targets = train_dataset['target'].values
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_targets))

valid_features = dict(valid_encodings)
valid_targets = valid_dataset['target'].values
valid_dataset = tf.data.Dataset.from_tensor_slices((valid_features, valid_targets))


# Load the model

In [55]:
# Load the pretrained encoder with a freshly initialised classification head.
# The head size is taken from the label mapping instead of being left to the
# library default, so the number of output classes always matches the encoded
# targets (two classes with the current dict_target).
model = TFRobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=len(dict_target),
)



Downloading tf_model.h5:   0%|          | 0.00/657M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [56]:

# Configure training: Adam with a small learning rate, and a sparse
# cross-entropy loss built with from_logits=True (it is given raw scores,
# not probabilities). Accuracy is tracked as the reported metric.
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
sparse_ce_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(
    optimizer=adam_optimizer,
    loss=sparse_ce_loss,
    metrics=['accuracy'],
)

In [58]:

# Train the model for one epoch.
# Batching is done on the tf.data pipelines themselves, so `batch_size` is NOT
# passed to fit(): Keras documents that it must not be specified when the input
# is a dataset, because the dataset already yields batches.
BATCH_SIZE = 32
SHUFFLE_BUFFER = 1000  # shuffle window; smaller than the training set, so mixing is local

train_batches = train_dataset.shuffle(SHUFFLE_BUFFER).batch(BATCH_SIZE)
valid_batches = valid_dataset.batch(BATCH_SIZE)

# Keep the returned History object so the per-epoch metrics can be inspected later.
history = model.fit(train_batches, epochs=1, validation_data=valid_batches)

 162/1011 [===>..........................] - ETA: 5:25:59 - loss: 0.1194 - accuracy: 0.9547

KeyboardInterrupt: 