In [1]:
import pandas as pd
import torch
from transformers import BertForSequenceClassification, BertTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [2]:
df = pd.read_csv('final_dataset.csv')
df_sampled = df  # no down-sampling is applied: this is the full frame under another name

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X, y = df_sampled['text'], df_sampled['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

MAX_LENGTH = 128  # Reduced max_length for efficiency


def encode_texts(texts, max_length=MAX_LENGTH):
    """Tokenise a batch of strings into fixed-length PyTorch tensors.

    Uses the tokenizer's ``__call__`` entry point rather than
    ``batch_encode_plus``, which the Transformers documentation marks as
    deprecated. Keeping the settings in one place guarantees the train and
    test splits are encoded identically.

    Args:
        texts: Iterable of strings to encode.
        max_length: Every sequence is padded or truncated to this many tokens.

    Returns:
        The tokenizer's batch encoding, including ``input_ids`` and
        ``attention_mask`` tensors.
    """
    return tokenizer(
        list(texts),
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )


train_encodings = encode_texts(X_train.tolist())
test_encodings = encode_texts(X_test.tolist())


In [4]:
# String class names -> integer class ids expected by the classifier.
label_mapping = {'fake': 0, 'genuine': 1}

# Direct dictionary lookup: a label missing from the mapping raises KeyError.
y_train_num = list(map(label_mapping.__getitem__, y_train))
y_test_num = list(map(label_mapping.__getitem__, y_test))


In [5]:
class YelpDataset(Dataset):
    """Map-style dataset that pairs pre-computed encoding tensors with labels.

    Args:
        encodings: Mapping from field name to a tensor indexed by sample along
            its first dimension.
        labels: Sequence of labels, one per sample; it also defines the
            dataset length.
    """

    def __init__(self, encodings, labels):
        # Store detached copies so the dataset does not share storage with
        # the tensors handed in by the caller.
        copied = {}
        for name, tensor in encodings.items():
            copied[name] = tensor.clone().detach()
        self.encodings = copied
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One row from every encoding field, plus the label as a tensor.
        sample = {}
        for name, tensor in self.encodings.items():
            sample[name] = tensor[idx]
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


In [6]:
# Wrap each tokenised split together with its numeric labels.
train_dataset = YelpDataset(encodings=train_encodings, labels=y_train_num)
test_dataset = YelpDataset(encodings=test_encodings, labels=y_test_num)

# Small batches of 16; only the training loader shuffles, the test loader
# keeps the dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)


In [7]:
# Pretrained BERT body with a two-class classification head.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def train(model, device, loader, optimizer, epoch):
    """Run one optimisation pass over ``loader`` and report loss and accuracy.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        device: Device the batch tensors are moved to before the forward pass.
        loader: Iterable of dict batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors. It does not need to
            support ``len()``; batches are counted while iterating.
        optimizer: Optimizer stepped once per batch.
        epoch: Zero-based epoch index, used only in the printed summary.

    Returns:
        Tuple ``(avg_loss, accuracy)``: the mean of the per-batch losses and
        the fraction of samples whose arg-max prediction equals its label.

    Raises:
        ValueError: If ``loader`` yields no samples, instead of failing later
            with an unexplained ``ZeroDivisionError``.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    correct_predictions = 0
    total_predictions = 0
    for batch in loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

        # Training accuracy uses the logits from before this step's update;
        # detach so the bookkeeping never touches the autograd graph.
        predictions = torch.argmax(outputs.logits.detach(), dim=1)
        correct_predictions += (predictions == labels).sum().item()
        total_predictions += labels.size(0)

    if total_predictions == 0:
        raise ValueError('train() received a loader with no samples')

    avg_loss = total_loss / num_batches
    accuracy = correct_predictions / total_predictions
    print(f'Epoch {epoch+1}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')
    return avg_loss, accuracy


In [9]:
def evaluate(model, device, loader):
    """Gather arg-max predictions and reference labels over every batch.

    The model is switched to evaluation mode and all forward passes run with
    gradient tracking disabled.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        device: Device the batch tensors are moved to.
        loader: Iterable of dict batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.

    Returns:
        Tuple ``(predictions, true_labels)`` of flat lists, in the order the
        loader produced the samples.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            batch_preds = model(ids, attention_mask=mask).logits.argmax(dim=1)

            # Move back to host memory before accumulating per-sample values.
            predicted += list(batch_preds.cpu().numpy())
            expected += list(targets.cpu().numpy())
    return predicted, expected

In [10]:

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Fine-tuning a pretrained BERT is sensitive to the step size: the published
# recipe searches learning rates of 2e-5 to 5e-5, and a value as large as 1e-4
# risks destabilising the pretrained weights. AdamW (Adam with decoupled
# weight decay) is the optimizer commonly used for this setup.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

In [11]:
# Per-epoch training history; two independent lists, one entry appended per epoch.
train_losses, train_accuracies = [], []


In [None]:
# Three passes over the training data (kept low for the smaller dataset),
# recording the metrics returned for each epoch.
for epoch in range(3):
    avg_loss, accuracy = train(
        model=model,
        device=device,
        loader=train_loader,
        optimizer=optimizer,
        epoch=epoch,
    )
    train_losses += [avg_loss]
    train_accuracies += [accuracy]


In [None]:
predictions, true_labels = evaluate(model, device, test_loader)

# Score against the labels collected batch-by-batch alongside the predictions,
# so each prediction is always compared with the label of the same sample even
# if the loader's ordering is changed later.
test_accuracy = accuracy_score(true_labels, predictions)
print(f'Final Test Accuracy: {test_accuracy:.4f}')