In [1]:
import pandas as pd
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [3]:
def review_data(data_file):
    """Load review texts and their labels from a CSV file.

    Args:
        data_file: Path (or file-like object) handed to ``pandas.read_csv``.
            The file is read as UTF-8 and must contain a ``texts`` column
            and a ``labels`` column.

    Returns:
        A ``(texts, labels)`` tuple of two plain Python lists, in row order.
    """
    frame = pd.read_csv(data_file, encoding='utf-8')
    return frame['texts'].tolist(), frame['labels'].tolist()

In [4]:
# CSV file to load; review_data() reads its 'texts' and 'labels' columns.
data_file = 'cleaned.csv'
# texts and labels are parallel lists (one entry per CSV row) used by the split cell below.
texts, labels = review_data(data_file)

In [5]:
# Sanity check: both lists come from the same rows, so the two counts should match.
print(f"Total texts: {len(texts)}, Total labels: {len(labels)}")

Total texts: 40432, Total labels: 40432


In [6]:
#tokenizing, handling the seq_length, and providing with input IDs, attention masks, and labels
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes a single text on every lookup.

    Args:
        texts: Indexable collection of raw inputs.
        labels: Indexable collection of class ids, parallel to ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors='pt', max_length=...,
            padding='max_length', truncation=True)`` that returns a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every example is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded example at ``idx``.

        Returns:
            A dict with flattened ``'input_ids'`` and ``'attention_mask'``
            tensors plus a ``torch.long`` ``'labels'`` tensor, or ``None``
            when the tokenizer raises ``ValueError`` for this example (the
            skip is reported on stdout).
        """
        raw_text = self.texts[idx]
        target = self.labels[idx]

        tokenizer_kwargs = dict(
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        try:
            encoded = self.tokenizer(raw_text, **tokenizer_kwargs)
        except ValueError as e:
            # Best effort: report the bad example and let the caller drop it.
            print(f"Skipping example at index {idx}: {e}")
            return None

        # The tokenizer output carries a leading batch axis; collapse to 1-D.
        token_ids = encoded['input_ids'].reshape(-1)
        mask = encoded['attention_mask'].reshape(-1)
        return {
            'input_ids': token_ids,
            'attention_mask': mask,
            'labels': torch.tensor(target, dtype=torch.long),
        }

In [7]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        bert_model_name: Identifier passed to ``BertModel.from_pretrained``.
        num_classes: Number of logits produced per example.
        dropout_prob: Dropout probability applied to the pooled output before
            the linear head. Defaults to 0.1, the value that used to be
            hard-coded.
    """

    def __init__(self, bert_model_name, num_classes, dropout_prob=0.1):
        super().__init__()
        # Attribute names are kept as-is so saved state_dict keys stay valid.
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Compute unnormalised class scores.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Mask tensor forwarded to the encoder.

        Returns:
            The output of the linear head applied to the (dropout-regularised)
            ``pooler_output`` of the encoder, i.e. one row of ``num_classes``
            logits per example.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return self.fc(self.dropout(pooled_output))

In [8]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning class logits.
        data_loader: Iterable of batches. Each batch is a dict with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors,
            or ``None`` (which ``collate_fn`` returns when every example of
            a batch was skipped).
        optimizer: Optimizer stepped once per processed batch.
        scheduler: Learning-rate scheduler stepped once per processed batch.
        device: Device the batch tensors are moved to.

    Returns:
        None. The model is left in training mode with updated parameters.
    """
    model.train()
    # The loss module holds no per-batch state, so build it once per call.
    criterion = nn.CrossEntropyLoss()
    for batch in data_loader:
        if batch is None:
            # Nothing usable in this batch; indexing it would raise TypeError.
            continue
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

In [9]:
def evaluate(model, data_loader, device):
    """Score ``model`` on every batch of ``data_loader`` without gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning class logits.
        data_loader: Iterable of batches. Each batch is a dict with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors,
            or ``None`` (which ``collate_fn`` returns when every example of
            a batch was skipped).
        device: Device the batch tensors are moved to.

    Returns:
        A ``(accuracy, report)`` tuple: the ``accuracy_score`` and the
        ``classification_report`` computed from the collected true labels
        and the arg-max predictions.
    """
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            if batch is None:
                # Nothing usable in this batch; indexing it would raise TypeError.
                continue
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Index of the largest logit along the class axis is the prediction.
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)

In [10]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch, pad_token_id=None):
    """Merge dataset items into one batch, dropping skipped (``None``) items.

    Args:
        batch: List of items; each is either ``None`` or a dict with 1-D
            ``'input_ids'`` and ``'attention_mask'`` tensors and a scalar
            ``'labels'`` value.
        pad_token_id: Value used to right-pad ``input_ids``. When omitted,
            the notebook-level ``tokenizer.pad_token_id`` is used, which is
            the previous behaviour; pass it explicitly (for example through
            ``functools.partial``) to avoid depending on that global.

    Returns:
        ``None`` when no usable item remains, otherwise a dict with
        ``'input_ids'`` and ``'attention_mask'`` padded to the longest
        sequence in the batch (batch axis first) and a 1-D ``'labels'``
        tensor.
    """
    # Filter out examples the dataset could not tokenize.
    valid_items = [item for item in batch if item is not None]
    if not valid_items:
        return None

    if pad_token_id is None:
        # Backward-compatible fallback to the tokenizer defined in the notebook.
        pad_token_id = tokenizer.pad_token_id

    input_ids = pad_sequence(
        [item['input_ids'] for item in valid_items],
        batch_first=True,
        padding_value=pad_token_id,
    )
    # Padded positions must be masked out, hence padding_value=0.
    attention_mask = pad_sequence(
        [item['attention_mask'] for item in valid_items],
        batch_first=True,
        padding_value=0,
    )

    # Stack the per-example labels instead of rebuilding a tensor from a
    # Python list of tensors; as_tensor also accepts plain numbers.
    labels = torch.stack([torch.as_tensor(item['labels']) for item in valid_items])

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

In [11]:
def prediction(text, model, tokenizer, device, max_length=128):
    """Classify a single piece of text and return a readable label.

    Args:
        text: Input handed to ``tokenizer``.
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning class logits; it is switched to eval mode.
        tokenizer: Callable producing ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        device: Device the encoded tensors are moved to.
        max_length: Padding / truncation length used for tokenization.

    Returns:
        ``"Authentic Review"`` when the predicted class id is 1, otherwise
        ``"Computer Generated Review"``.
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    ids_on_device = encoded['input_ids'].to(device)
    mask_on_device = encoded['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids=ids_on_device, attention_mask=mask_on_device)
        predicted_class = logits.max(dim=1).indices

    if predicted_class.item() == 1:
        return "Authentic Review"
    return "Computer Generated Review"

In [12]:
# Set up parameters
bert_model_name = 'bert-base-uncased'  # pretrained checkpoint for tokenizer and encoder
num_classes = 2        # size of the classifier's output layer
max_length = 128       # tokens per example after padding / truncation
batch_size = 16        # examples per DataLoader batch
num_epochs = 4         # full passes over the training set
learning_rate = 1e-5   # optimizer learning rate

# Seed torch's global RNG so weight initialisation, dropout and DataLoader
# shuffling are repeatable on a fresh Restart & Run All.
seed = 42
torch.manual_seed(seed)

In [13]:
# First split: 60% of the examples go to training, the remaining 40% are held out.
# random_state is fixed so the split is repeatable; no `stratify` argument is passed.
train_texts, rem_texts, train_labels, rem_labels = train_test_split(texts, labels, train_size=0.6, random_state=42)

# Then split the remaining data into 50% validation and 50% test (which is 20% of total each)
val_texts, test_texts, val_labels, test_labels = train_test_split(rem_texts, rem_labels, test_size=0.5, random_state=42)

In [14]:
# Sanity check of the split sizes: texts and labels of each split should have equal length.
# Only the train and validation splits are printed here.
print(len(train_texts))
print(len(train_labels))
print(len(val_texts))
print(len(val_labels))

24259
24259
8086
8086


In [15]:
# Load tokenizer
# Load the tokenizer that matches the pretrained checkpoint.
# collate_fn reads this module-level `tokenizer` for its pad token id.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# Create datasets: each item is tokenized on access and padded/truncated to max_length.
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)

# Create dataloaders: only the training loader shuffles; both use collate_fn,
# which drops items the dataset returned as None.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn)

In [16]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Prints "true" when CUDA is available, "false" otherwise.
print("true" if torch.cuda.is_available() else "false")
# Build the classifier from the pretrained checkpoint and move it to the chosen device.
model = BERTClassifier(bert_model_name, num_classes).to(device)

true


In [17]:
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of torch.optim.AdamW. eps and weight_decay are set explicitly to the
# defaults of the transformers implementation so the update rule is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)
# train() steps the scheduler once per batch, so the schedule spans every batch of every epoch.
total_steps = len(train_dataloader) * num_epochs
# Linear decay of the learning rate over total_steps, with no warmup phase.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [18]:
# Train for num_epochs passes, reporting validation metrics after each one.
# NOTE(review): nothing is checkpointed inside the loop, so later cells use the
# weights from the last epoch even if an earlier epoch validated better.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    # One full pass over the training batches (optimizer and scheduler step per batch).
    train(model, train_dataloader, optimizer, scheduler, device)
    # evaluate() returns (accuracy, classification report text) for the validation split.
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

Epoch 1/4


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Validation Accuracy: 0.9434
              precision    recall  f1-score   support

           0       0.90      0.99      0.95      4030
           1       0.99      0.89      0.94      4056

    accuracy                           0.94      8086
   macro avg       0.95      0.94      0.94      8086
weighted avg       0.95      0.94      0.94      8086

Epoch 2/4
Validation Accuracy: 0.9686
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      4030
           1       0.99      0.95      0.97      4056

    accuracy                           0.97      8086
   macro avg       0.97      0.97      0.97      8086
weighted avg       0.97      0.97      0.97      8086

Epoch 3/4
Validation Accuracy: 0.9531
              precision    recall  f1-score   support

           0       0.92      0.99      0.95      4030
           1       0.99      0.91      0.95      4056

    accuracy                           0.95      8086
   macro avg       0.96 

In [19]:
# Final evaluation on the held-out test split, built the same way as the validation loader
# (no shuffling, same collate_fn).
test_dataset = TextClassificationDataset(test_texts, test_labels, tokenizer, max_length)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)
# evaluate() returns (accuracy, classification report text).
test_accuracy, test_report = evaluate(model, test_dataloader, device)
print(f"Test Accuracy: {test_accuracy:.4f}")
print(test_report)

Test Accuracy: 0.9604
              precision    recall  f1-score   support

           0       0.93      0.99      0.96      4120
           1       0.99      0.93      0.96      3967

    accuracy                           0.96      8087
   macro avg       0.96      0.96      0.96      8087
weighted avg       0.96      0.96      0.96      8087



In [20]:
# Try the trained model on one hand-written review.
test_text = "I recently bought this smartwatch, and it's been a game-changer for my daily routine. The fitness tracking features are accurate and comprehensive, covering everything from steps to heart rate monitoring. The battery life is impressive, lasting several days on a single charge. Plus, the sleek design makes it a stylish accessory for any outfit. Highly recommend!"
# prediction() uses its default max_length of 128 here.
predict = prediction(test_text, model, tokenizer, device)
print(test_text)
# NOTE(review): the message says "sentiment", but prediction() returns an
# authenticity label ("Authentic Review" / "Computer Generated Review").
print(f"Predicted sentiment: {predict}")

I recently bought this smartwatch, and it's been a game-changer for my daily routine. The fitness tracking features are accurate and comprehensive, covering everything from steps to heart rate monitoring. The battery life is impressive, lasting several days on a single charge. Plus, the sleek design makes it a stylish accessory for any outfit. Highly recommend!
Predicted sentiment: Authentic Review


In [21]:
# Save the model: only the state_dict (parameters and buffers) is written, not the
# class itself, so loading requires constructing a BERTClassifier first.
torch.save(model.state_dict(), 'bert_classifier_model.pth')