## Importing Libraries

In [68]:
import numpy as np
import pandas as pd

import warnings 
# Suppress every warning for the rest of the session. This also hides
# deprecation notices raised by the libraries used in later cells.
warnings.filterwarnings('ignore')

## Data Loading

In [5]:
from datasets import load_dataset

# Hub repository id of the review dataset used throughout this notebook.
dataset_repo = "theArijitDas/Fake-Reviews-Dataset"

# Fetch the dataset from the Hugging Face Hub; the result is shown in the next cell.
ds = load_dataset(path=dataset_repo)

README.md:   0%|          | 0.00/854 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/8.29M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/40526 [00:00<?, ? examples/s]

In [6]:
# Display the loaded object: available splits, column names and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['category', 'rating', 'text', 'label'],
        num_rows: 40526
    })
})

In [7]:
# Work on a copy so that `ds` keeps pointing at the freshly loaded data.
# NOTE(review): despite its name, `df` is not a pandas DataFrame; it is
# indexed by split name in the next cell (df['train']). Presumably this is a
# shallow, dict-style copy — confirm before relying on independence from `ds`.
df = ds.copy()

In [15]:
from datasets import DatasetDict

# Use the same seed for both splits so the partition is reproducible.
split_seed = 42

# Hold out 20% of all rows as the final test set.
train_test_split = df['train'].train_test_split(test_size=0.2, seed=split_seed)

# From the remaining 80%, take 12.5% for validation
# (0.125 * 0.8 = 0.1, i.e. 10% of the original dataset).
remaining_rows = train_test_split['train']
train_valid_split = remaining_rows.train_test_split(test_size=0.125, seed=split_seed)

# Gather the three partitions under their conventional split names.
dataset_splits = DatasetDict(
    {
        'train': train_valid_split['train'],        # ~70% of the data
        'validation': train_valid_split['test'],    # ~10% of the data
        'test': train_test_split['test'],           # ~20% of the data
    }
)

print(dataset_splits)

DatasetDict({
    train: Dataset({
        features: ['category', 'rating', 'text', 'label'],
        num_rows: 28367
    })
    validation: Dataset({
        features: ['category', 'rating', 'text', 'label'],
        num_rows: 4053
    })
    test: Dataset({
        features: ['category', 'rating', 'text', 'label'],
        num_rows: 8106
    })
})


In [16]:
# Only the review text and its label are used for modelling, so the
# 'category' and 'rating' columns are removed from every split.
unused_columns = ['category', 'rating']
dataset_clean = dataset_splits.remove_columns(column_names=unused_columns)
print(dataset_clean)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 28367
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 4053
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 8106
    })
})


## Preprocessing

In [None]:
# Hub id of the pretrained checkpoint that is fine-tuned in this notebook.
# Stored as a string so this cell is valid Python: the bare text
# FacebookAI/roberta-base would be evaluated as an arithmetic expression and
# raise NameError when the notebook is run from top to bottom.
MODEL_CHECKPOINT = "FacebookAI/roberta-base"

In [18]:
from transformers import AutoTokenizer

# The tokenizer must come from the same checkpoint as the model that is
# fine-tuned later, so both agree on the vocabulary.
tokenizer_checkpoint = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [21]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Every review is truncated or padded to exactly 128 tokens. The returned
    encoding is merged into the dataset by ``map`` as new columns.
    """
    # NOTE(review): padding to a fixed max_length here means the
    # DataCollatorWithPadding created later has nothing left to pad —
    # confirm whether fixed-length padding is intended.
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length", max_length=128)
    return encoded


# batched=True hands the function many rows at once instead of one at a time.
dataset_dict = dataset_clean.map(preprocess_function, batched=True)


Map:   0%|          | 0/28367 [00:00<?, ? examples/s]

Map:   0%|          | 0/4053 [00:00<?, ? examples/s]

Map:   0%|          | 0/8106 [00:00<?, ? examples/s]

In [22]:
# Return the model inputs and the label as PyTorch tensors so the splits can
# be passed straight to Trainer. This changes `dataset_dict` in place; the
# `text` column is still listed in the features afterwards (see the next output).
torch_columns = ["input_ids", "attention_mask", "label"]
dataset_dict.set_format("torch", columns=torch_columns)

In [23]:
# Inspect the tokenized splits: features now include input_ids and attention_mask.
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 28367
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 4053
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 8106
    })
})

In [24]:
def reorder_columns(ds, order):
    """Return ``ds`` restricted to the columns in ``order``, in that order.

    Args:
        ds: Dataset split to reshape; it is not modified.
        order: Column names to keep, listed in the desired output order.
            Columns that are not listed are dropped.

    Returns:
        A new dataset that contains only the requested columns.
    """
    # select_columns works on the existing table, so the data is not pulled
    # into Python objects and rebuilt from a dict, and the format configured
    # on `ds` (e.g. torch tensors) is kept on the result.
    return ds.select_columns(list(order))

# Keep only what the model consumes, in a fixed order; this drops `text`.
new_order = ["input_ids", "attention_mask", "label"]

# Apply the same column selection to every split.
for split_name in ("train", "validation", "test"):
    dataset_dict[split_name] = reorder_columns(dataset_dict[split_name], new_order)

print(dataset_dict["train"].column_names)


['input_ids', 'attention_mask', 'label']


In [25]:
# Confirm that every split now holds only input_ids, attention_mask and label.
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'label'],
        num_rows: 28367
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'label'],
        num_rows: 4053
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'label'],
        num_rows: 8106
    })
})

In [26]:
from transformers import DataCollatorWithPadding

# Collator that assembles individual examples into batches for the Trainer,
# using the tokenizer's padding rules.
data_collator = DataCollatorWithPadding(tokenizer)

2025-08-11 16:30:13.808867: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754929813.995040      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754929814.052687      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Model Building

In [30]:
from transformers import AutoModelForSequenceClassification

# Pretrained RoBERTa encoder with a newly initialised 2-class classification
# head; the head is trained in the cells below.
base_checkpoint = 'FacebookAI/roberta-base'
model = AutoModelForSequenceClassification.from_pretrained(base_checkpoint, num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
from transformers import TrainingArguments

# Training configuration. Checkpoints and logs are written under
# /kaggle/working/; change both paths when running outside Kaggle.
# (The earlier throw-away TrainingArguments("/kaggle/working/") instance was
# removed: it was overwritten by this one before ever being used.)
training_args = TrainingArguments(
    output_dir="/kaggle/working/",
    report_to="none",                      # do not send logs to any experiment tracker
    logging_dir="/kaggle/working/logs",
    save_strategy="epoch",                 # write a checkpoint at the end of each epoch
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [32]:
from transformers import Trainer

# Gather everything the Trainer needs in one place before constructing it.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer = Trainer(**trainer_kwargs)

In [33]:
# Fine-tune the model with the settings in `training_args`; the returned
# TrainOutput (global step, training loss, runtime metrics) is shown below.
trainer.train()

Step,Training Loss
500,0.272
1000,0.1562
1500,0.156
2000,0.0896
2500,0.0638
3000,0.0646
3500,0.0542
4000,0.0271
4500,0.0222
5000,0.0187


TrainOutput(global_step=5319, training_loss=0.08828039549418602, metrics={'train_runtime': 1125.4734, 'train_samples_per_second': 75.614, 'train_steps_per_second': 4.726, 'total_flos': 5597753480547840.0, 'train_loss': 0.08828039549418602, 'epoch': 3.0})

## Inference

In [34]:
# Run the fine-tuned model over the validation split and check the output
# shapes: one row of scores per example, one reference label per example.
predictions = trainer.predict(dataset_dict["validation"])
score_matrix, reference_ids = predictions.predictions, predictions.label_ids
print(score_matrix.shape, reference_ids.shape)

# To score the held-out test split instead, use:
#predictions = trainer.predict(dataset_dict["test"])
#print(predictions.predictions.shape, predictions.label_ids.shape)

(4053, 2) (4053,)


In [35]:
# Reference labels, and the predicted class per example (index of the
# highest score along the last axis).
labels = predictions.label_ids
preds = predictions.predictions.argmax(axis=-1)

In [36]:
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: fsspec, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.5.1
    Uninstalling fsspec-2025.5.1:
      Successfully uninstalled fsspec-2025.5.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
b

In [37]:
import evaluate

# Accuracy and weighted F1 of the validation predictions.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

acc_score = accuracy.compute(references=labels, predictions=preds)
f1_score = f1.compute(references=labels, predictions=preds, average="weighted")

# Precision and recall are currently disabled; uncomment to report them too.
#precision = evaluate.load("precision")
#recall = evaluate.load("recall")
#precision_score = precision.compute(predictions=preds, references=labels, average="weighted")
#recall_score = recall.compute(predictions=preds, references=labels, average="weighted")

# Report each metric on its own line.
for heading, value in (("Accuracy:", acc_score), ("F1 Score:", f1_score)):
    print(heading, value)
#print("Precision:", precision_score)
#print("Recall:", recall_score)


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Accuracy: {'accuracy': 0.9726128793486306}
F1 Score: {'f1': 0.9726027737747962}


In [38]:
# Persist the fine-tuned weights and the tokenizer files locally.
# NOTE(review): the two are written to separate directories, so each has to
# be loaded from its own path — confirm this is intended.
model_dir = "/kaggle/working/quality_check_model"
tokenizer_dir = "/kaggle/working/quality_check_tokenizer"

model.save_pretrained(model_dir)
tokenizer.save_pretrained(tokenizer_dir)

('/kaggle/working/quality_check_tokenizer/tokenizer_config.json',
 '/kaggle/working/quality_check_tokenizer/special_tokens_map.json',
 '/kaggle/working/quality_check_tokenizer/vocab.json',
 '/kaggle/working/quality_check_tokenizer/merges.txt',
 '/kaggle/working/quality_check_tokenizer/added_tokens.json',
 '/kaggle/working/quality_check_tokenizer/tokenizer.json')

## Push to Hugging Face

In [39]:
from huggingface_hub import notebook_login

# Show the interactive Hugging Face login widget so that Hub credentials are
# supplied at run time instead of being written into the notebook.
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [40]:
# Upload the model and its tokenizer to the same Hub repository so both can
# be loaded from a single repo id.
hub_repo_id = "suryaummadi/review-roberta-quality-scoring-analytics"

model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)

Uploading...:   0%|          | 0.00/499M [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/suryaummadi/review-roberta-quality-scoring-analytics/commit/33d6e893a4d70f5c8a1ba868d2c3f15bea5759a4', commit_message='Upload tokenizer', commit_description='', oid='33d6e893a4d70f5c8a1ba868d2c3f15bea5759a4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/suryaummadi/review-roberta-quality-scoring-analytics', endpoint='https://huggingface.co', repo_type='model', repo_id='suryaummadi/review-roberta-quality-scoring-analytics'), pr_revision=None, pr_num=None)

## Loading from Hugging Face and running inference

In [62]:
from transformers import pipeline

# Hub repository holding the fine-tuned fake vs original review classifier.
hub_model_id = "suryaummadi/review-roberta-quality-scoring-analytics"

# return_all_scores=True requests a score for every label rather than only
# the best one; classify_review picks the maximum itself.
# NOTE(review): newer transformers releases document `top_k=None` as the
# replacement for `return_all_scores` — confirm before upgrading.
pipe = pipeline(task="text-classification", model=hub_model_id, return_all_scores=True)

def classify_review(text, classifier=None, include_confidence=False):
    """Classify a review as authentic or computer-generated.

    Args:
        text: Review text to classify.
        classifier: Optional callable used instead of the module-level
            ``pipe``. It is called as ``classifier(text, truncation=True)``
            and must return a list whose first item is a list of
            ``{'label': ..., 'score': ...}`` dicts, one per class.
        include_confidence: When True, append the winning score as a
            percentage to the returned sentence.

    Returns:
        ``"Prediction: <verdict>."``, or
        ``"Prediction: <verdict> with confidence <pct>%."`` when
        ``include_confidence`` is True.
    """
    if classifier is None:
        classifier = pipe

    # truncation=True clips over-long reviews to the model's maximum input
    # length instead of passing the full token sequence to the model.
    outputs = classifier(text, truncation=True)
    scores = outputs[0]

    # Pick the label with the highest score.
    top = max(scores, key=lambda entry: entry['score'])
    label = top['label']  # 'LABEL_0' or 'LABEL_1'

    if label == 'LABEL_0':
        result = "Original (authentic) review"
    else:
        result = "Fake (computer-generated) review"

    if include_confidence:
        score = top['score'] * 100
        return f"Prediction: {result} with confidence {score:.1f}%."
    return f"Prediction: {result}."

# Example usage: two sample reviews, classified and printed one per line.
review = "I recently ordered a new pair of headphones for my son's new computer, and I was pretty nervous about them.  The headphones fit him fine, but he didn't like the way they felt.  After a couple of days, they started to feel very uncomfortable.  The sound quality is a bit muddy, but the bass is good and the mids are good.  I'd rather have my son use them for listening to music on his computer, but that's not what I wanted for his new computer.\n\nThe sound quality is okay, but it isn't bad either.  I've found that it is"
review2 = "This is the best product ever! Buy it now!!!"

for sample_review in (review, review2):
    print(classify_review(sample_review))

Device set to use cuda:0


Prediction: Fake (computer-generated) review.
Prediction: Original (authentic) review.


## End