In [16]:
# Step 2: Prepare the dataset
import pandas as pd

# Load the annotated comments from a tab-separated file (per the file name,
# a ~40k-row Wikipedia Detox sample). The columns this notebook uses are
# 'comment' (the raw text) and 'Label' (the integer class id).
# Rows with a missing value in any column are dropped. Chaining .dropna()
# rather than mutating with inplace=True keeps the cell idempotent on re-run.
data = pd.read_csv("wikiDetoxAnnotated40kRows.tsv", sep='\t').dropna()

# A bare `data.isnull().sum()` in the middle of a cell is evaluated and thrown
# away (only the last expression is displayed), so make the check explicit.
assert data.isnull().sum().sum() == 0, "unexpected missing values after dropna()"
data.head()

Unnamed: 0,Label,rev_id,comment,year,logged_in,ns,sample,split
0,0,666674821.0,==He is a Rapist!!!!!== Please edit the arti...,2015,True,article,blocked,train
1,0,24297552.0,The other two films Hitch and Magnolia are als...,2005,False,article,random,train
2,0,329801523.0,== blocking people == how can i block lambs...,2009,True,user,random,test
3,1,391141109.0,== Are you a female????? == mootmootmootmoo...,2010,True,user,blocked,train
4,0,426027506.0,:::::Good enough for me; thanks for the info....,2011,True,article,random,train


In [17]:

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split
train_data, val_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

# Step 3: Tokenize and encode the text data
from transformers import AutoTokenizer

# Pre-trained tokenizer. Another checkpoint name can be substituted here;
# the model is loaded from the same checkpoint name, so change both together.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased",
)

def encode_data(tokenizer, text, max_length):
    """Tokenize a batch of texts into fixed-length PyTorch tensors.

    Args:
        tokenizer: A callable tokenizer (e.g. one returned by
            ``AutoTokenizer.from_pretrained``).
        text: The batch of strings to encode.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Whatever the tokenizer returns for the batch; it is requested with
        ``return_tensors="pt"``, ``padding="max_length"`` and
        ``truncation=True``.
    """
    # Call the tokenizer directly: `batch_encode_plus` is deprecated in
    # favour of `__call__`, which accepts the same arguments for a batch.
    return tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

# Every comment is padded or truncated to this many tokens.
max_length = 256

# Encode the training texts and show the first encoded example as a spot check.
train_encodings = encode_data(
    tokenizer=tokenizer,
    text=train_data["comment"].tolist(),
    max_length=max_length,
)
print(train_encodings[0])

# Encode the validation texts with the same settings.
val_encodings = encode_data(
    tokenizer=tokenizer,
    text=val_data["comment"].tolist(),
    max_length=max_length,
)

# Step 4: Create DataLoader
import torch
from torch.utils.data import Dataset, DataLoader

class YelpReviewsDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping from field name (e.g. "input_ids") to a per-example
            indexable container: either a tensor or a nested list.
        labels: Sequence of labels, one per example.

    Each item is a dict holding one tensor per encoding field plus a
    "labels" tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return an independent tensor copy of ``value``."""
        # torch.tensor(<Tensor>) emits a copy-construct UserWarning on every
        # call; clone().detach() is the recommended way to copy a tensor.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        # The label sequence defines the dataset size.
        return len(self.labels)

# Pair each split's encodings with its 'Label' column.
train_dataset = YelpReviewsDataset(
    encodings=train_encodings,
    labels=train_data["Label"].tolist(),
)
val_dataset = YelpReviewsDataset(
    encodings=val_encodings,
    labels=val_data["Label"].tolist(),
)

# Batches of 16 for both splits; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)

# Step 5: Define the model
from transformers import AutoModelForSequenceClassification

# Load the pre-trained checkpoint with a sequence-classification head
# configured for two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

# Step 6: Train the model
# `transformers.AdamW` is deprecated and no longer exported by recent
# transformers releases; the PyTorch optimizer is the supported replacement.
from torch.optim import AdamW
from tqdm import tqdm

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# eps and weight_decay are set explicitly to the defaults of the transformers
# optimizer this replaces (torch's own defaults are eps=1e-8, weight_decay=0.01),
# so the optimisation behaviour is unchanged.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

epochs = 3
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        # Passing labels makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Report the mean batch loss so progress is visible across epochs.
    mean_loss = running_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch + 1}/{epochs} - mean training loss: {mean_loss:.4f}")



Encoding(num_tokens=256, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifi

KeyboardInterrupt: 

In [None]:

# Step 7: Evaluate the model
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

model.eval()
predictions, true_labels = [], []

# Inference only, so gradient tracking is disabled for the whole pass.
with torch.no_grad():
    for batch in tqdm(val_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Everything below must stay inside the loop: when the forward pass and
        # the extend() calls sit after it, only the last batch is scored.
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # The predicted class is the index of the largest logit.
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        labels = labels.cpu().numpy()

        predictions.extend(preds)
        true_labels.extend(labels)


accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy: {accuracy:.4f}")

report = classification_report(true_labels, predictions, target_names=["Negative", "Positive"])
print(report)