In [1]:
pip install torch transformers datasets scikit-learn pandas


Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install --upgrade pip

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv('final_dataset.csv')

# Assuming the dataset has 'review' and 'label' columns
# Convert string labels to numeric
label_mapping = {'real': 1, 'fake': 0}
df['target'] = df['target'].map(label_mapping)

# Split the dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


In [4]:
pip install sentencepiece




In [5]:
from transformers import AlbertTokenizer, AlbertForSequenceClassification

# Load the tokenizer
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

# Load the model
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2')


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def tokenize_reviews(reviews):
    return tokenizer(reviews, padding=True, truncation=True, return_tensors='pt')

train_encodings = tokenize_reviews(train_df['text'].tolist())
test_encodings = tokenize_reviews(test_df['text'].tolist())


In [7]:
import torch
from torch.utils.data import DataLoader, Dataset

class YelpDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = YelpDataset(train_encodings, train_df['target'].tolist())
test_dataset = YelpDataset(test_encodings, test_df['target'].tolist())

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)


In [8]:
import torch
from torch.optim import AdamW

# Optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)


In [10]:
from transformers import get_scheduler

def train(model, train_loader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in train_loader:
        # Move batch to the specified device
        batch = {k: v.to(device) for k, v in batch.items()}
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
    avg_loss = total_loss / len(train_loader)
    return avg_loss


In [11]:
from sklearn.metrics import accuracy_score

def evaluate(model, test_loader, device):
    model.eval()
    preds = []
    labels = []
    total_loss = 0
    
    with torch.no_grad():
        for batch in test_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()
            
            logits = outputs.logits
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            labels.extend(batch['labels'].cpu().numpy())
    
    avg_loss = total_loss / len(test_loader)
    accuracy = accuracy_score(labels, preds)
    return avg_loss, accuracy


In [12]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

num_epochs = 3
num_training_steps = num_epochs * len(train_loader)
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)


In [13]:
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    
    train_loss = train(model, train_loader, optimizer, device)
    print(f"Train Loss: {train_loss}")
    
    eval_loss, eval_accuracy = evaluate(model, test_loader, device)
    print(f"Eval Loss: {eval_loss}, Eval Accuracy: {eval_accuracy}")
    
    scheduler.step()


Epoch 1/3


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


ValueError: Target size (torch.Size([8])) must be the same as input size (torch.Size([8, 2]))

In [9]:
from transformers import AlbertForSequenceClassification, AlbertTokenizer

# Load the tokenizer
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

# Load the model with the correct number of labels for binary classification
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2)

# Optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Training loop
model.train()
for epoch in range(3):  # Adjust the number of epochs as necessary
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch + 1}, Loss: {loss.item()}")


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


ValueError: Target size (torch.Size([8])) must be the same as input size (torch.Size([8, 2]))

In [None]:
from transformers import AdamW

# Optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Training loop
model.train()
for epoch in range(3):  # Adjust the number of epochs as necessary
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch + 1}, Loss: {loss.item()}")


In [None]:
def create_datasets(train_encodings, train_labels, test_encodings, test_labels):
    train_dataset = YelpDataset(train_encodings, train_labels)
    test_dataset = YelpDataset(test_encodings, test_labels)
    return train_dataset, test_dataset

# Example data (replace with your actual data)
train_encodings = {'input_ids': [[101, 2057, 2031, 1037, 2307, 2154, 102], [101, 1045, 2293, 2023, 3185, 102]],
                   'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}
train_labels = [1, 0]

test_encodings = {'input_ids': [[101, 2057, 2031, 1037, 2307, 2154, 102], [101, 1045, 2293, 2023, 3185, 102]],
                  'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}
test_labels = [1, 0]

# Create datasets
train_dataset, test_dataset = create_datasets(train_encodings, train_labels, test_encodings, test_labels)

# Print types of labels in train and test datasets
print(f"Type of train labels: {type(train_labels[0])}")
print(f"Type of test labels: {type(test_labels[0])}")

# Optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Training loop
model.train()
for epoch in range(3):  # Adjust the number of epochs as necessary
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

In [None]:
from transformers import AdamW

# Optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)
# Training loop
model.train()
for epoch in range(3):  # Adjust the number of epochs as necessary
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch + 1}, Loss: {loss.item()}")


In [None]:
from sklearn.metrics import accuracy_score, classification_report

model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=1).tolist())
        true_labels.extend(labels.tolist())

accuracy = accuracy_score(true_labels, predictions)
report = classification_report(true_labels, predictions)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")


In [None]:
model.save_pretrained('path_to_save_model')
tokenizer.save_pretrained('path_to_save_tokenizer')
