In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


In [3]:
# Read the Sentiment140 training CSV, supplying the six column names explicitly
data = pd.read_csv(
    'sentiment140/training.1600000.processed.noemoticon.csv',
    names=['target', 'ids', 'date', 'flag', 'user', 'text'],
    encoding='latin-1',
)
# Use a smaller subset of the data for testing
# data = data.sample(n=1000, random_state=42)

In [4]:
# Pull the tweet texts and their raw targets out of the frame as plain lists
texts, labels = (data[column].tolist() for column in ('text', 'target'))

In [5]:
# Convert labels to binary (positive or negative sentiment): target 4 -> 1, else 0.
# The labels are derived from `data` rather than from the previous value of
# `labels`, so re-running this cell gives the same result instead of mapping
# the already-binarised labels to all zeros.
labels = (data['target'] == 4).astype(int).tolist()

# Split the data into training and test sets (20% held out, fixed seed)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [6]:

def tokenize_in_batches(texts, labels, batch_size, max_length=512, text_tokenizer=None):
    """Tokenize ``texts`` chunk by chunk and return tensors padded to one width.

    Each chunk is padded by the tokenizer to its own longest sequence; after
    all chunks are processed they are right-padded to the longest width seen
    so they can be concatenated into single tensors.

    Args:
        texts: Sequence of strings to tokenize.
        labels: Sequence of integer labels, one per entry in ``texts``.
        batch_size: Number of texts handed to the tokenizer per call; must be
            positive.
        max_length: Truncation limit forwarded to the tokenizer. Defaults to
            512, the value that used to be hard-coded.
        text_tokenizer: Tokenizer to use. When ``None`` the notebook-level
            ``tokenizer`` is used. It must be callable with
            ``(texts, return_tensors=, padding=, truncation=, max_length=)``,
            return a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors, and expose ``pad_token_id``.

    Returns:
        A tuple ``(input_ids, attention_masks, labels)`` of tensors whose
        first dimension equals ``len(texts)``. For empty input the first two
        have shape ``(0, 0)`` and the last has shape ``(0,)``.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``texts`` and
            ``labels`` have different lengths.
    """
    if batch_size <= 0:
        raise ValueError(f'batch_size must be positive, got {batch_size}')
    if len(texts) != len(labels):
        raise ValueError(
            f'texts and labels must have the same length, got {len(texts)} and {len(labels)}'
        )
    if len(texts) == 0:
        # torch.cat refuses an empty list, so return explicit empty tensors.
        return (
            torch.empty((0, 0), dtype=torch.long),
            torch.empty((0, 0), dtype=torch.long),
            torch.empty((0,), dtype=torch.long),
        )

    tok = tokenizer if text_tokenizer is None else text_tokenizer
    num_batches = (len(texts) + batch_size - 1) // batch_size

    id_chunks = []
    mask_chunks = []
    label_chunks = []

    # Tokenize the data in batches
    for batch_no, start in enumerate(range(0, len(texts), batch_size), start=1):
        chunk_texts = texts[start:start + batch_size]
        chunk_labels = labels[start:start + batch_size]

        encoded = tok(chunk_texts, return_tensors='pt', padding=True,
                      truncation=True, max_length=max_length)
        id_chunks.append(encoded['input_ids'])
        mask_chunks.append(encoded['attention_mask'])
        # as_tensor accepts both plain lists and tensors that were already converted.
        label_chunks.append(torch.as_tensor(chunk_labels))

        print(f'Tokenized batch {batch_no}/{num_batches}')

    # Widest chunk decides the common sequence length.
    max_len = max(chunk.size(1) for chunk in id_chunks)

    def _pad_to_width(chunk, fill_value):
        # Right-pad one chunk to max_len, keeping the chunk's own dtype.
        filler = torch.full(
            (chunk.size(0), max_len - chunk.size(1)), fill_value, dtype=chunk.dtype
        )
        return torch.cat([chunk, filler], dim=1)

    input_ids = torch.cat([_pad_to_width(c, tok.pad_token_id) for c in id_chunks], dim=0)
    attention_masks = torch.cat([_pad_to_width(c, 0) for c in mask_chunks], dim=0)
    label_tensor = torch.cat(label_chunks, dim=0)

    return input_ids, attention_masks, label_tensor


In [None]:
# Tokenize train and test data
# NOTE: `train_labels` and `test_labels` are passed in as the label sequences
# produced by the split and are rebound here to the label tensors returned by
# tokenize_in_batches; later cells rely on the tensor versions.
batch_size = 1000  # Adjust based on your memory capacity
train_input_ids, train_attention_masks, train_labels = tokenize_in_batches(train_texts, train_labels, batch_size)
test_input_ids, test_attention_masks, test_labels = tokenize_in_batches(test_texts, test_labels, batch_size)


In [8]:
# Bundle ids, attention masks and labels of each split into a TensorDataset
train_dataset, test_dataset = (
    TensorDataset(ids, masks, targets)
    for ids, masks, targets in (
        (train_input_ids, train_attention_masks, train_labels),
        (test_input_ids, test_attention_masks, test_labels),
    )
)

In [19]:
# Wrap both datasets in DataLoaders; only the training loader shuffles
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=16,  # Adjust based on your GPU memory
)
test_dataloader = DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=16,
)


In [20]:
# Instantiate BERT with a two-class classification head and place it on `device`
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
).to(device)
model  # trailing expression so the cell still displays the module summary

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [22]:
# Sanity check: show where the model's first parameter lives, then the chosen device
param = next(model.parameters(), None)
if param is not None:
    print(param.device)
print(f'Using device: {device}')

cuda:0
Using device: cuda


In [None]:
# Define the optimizer.
# torch.optim.AdamW is used instead of the AdamW re-exported by transformers,
# which that library deprecated in favour of the PyTorch implementation.
# eps and weight_decay are set explicitly to the deprecated class's defaults
# so the optimisation behaviour stays as close as possible to the original.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Training loop
epochs = 3
log_every = 1000  # batches between progress lines; avoids one print per step
num_train_batches = len(train_dataloader)

for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for step, batch in enumerate(train_dataloader, start=1):
        # Batch-local names keep the notebook-level `labels` / `input_ids`
        # from being overwritten by the last training batch.
        batch_input_ids, batch_attention_masks, batch_labels = [t.to(device) for t in batch]

        optimizer.zero_grad()

        # Passing `labels` makes the model return the classification loss.
        outputs = model(batch_input_ids, attention_mask=batch_attention_masks, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        if step % log_every == 0 or step == num_train_batches:
            print(f'Epoch {epoch + 1}/{epochs}, batch {step}/{num_train_batches}, '
                  f'running loss: {total_loss / step:.4f}')

    avg_train_loss = total_loss / num_train_batches
    print(f'Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss:.4f}')

In [29]:
# Persist the fine-tuned model and its tokenizer side by side in one directory
model_save_path = 'trained_model'
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_save_path)
print(f'Model saved to {model_save_path}')

Model saved to trained_model


In [30]:
# Evaluation: gather predicted and true class ids over the whole test loader
model.eval()
predictions, true_labels = [], []

with torch.no_grad():  # inference only, so gradient tracking is switched off
    for input_ids, attention_masks, labels in test_dataloader:
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        labels = labels.to(device)

        logits = model(input_ids, attention_mask=attention_masks).logits

        # Index of the largest logit per row is the predicted class
        predictions.extend(logits.argmax(dim=1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

In [31]:
# Overall accuracy on the held-out split
accuracy = accuracy_score(true_labels, predictions)
print(f'Test Accuracy: {accuracy:.4f}')

# Per-class precision, recall and F1
report = classification_report(
    true_labels,
    predictions,
    target_names=['Negative', 'Positive'],
)
print(report)

Test Accuracy: 0.8626
              precision    recall  f1-score   support

    Negative       0.87      0.85      0.86    159494
    Positive       0.86      0.87      0.86    160506

    accuracy                           0.86    320000
   macro avg       0.86      0.86      0.86    320000
weighted avg       0.86      0.86      0.86    320000



In [32]:
# Reload the saved tokenizer and model from disk for single-sentence inference
tokenizer = BertTokenizer.from_pretrained(model_save_path)
model = BertForSequenceClassification.from_pretrained(model_save_path)


In [33]:
# Move the reloaded model to `device` and switch it to evaluation mode;
# both calls return the module, so the cell still displays its summary.
model.to(device).eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [62]:
from bertviz import head_view

def predict_sentiment(sentence):
    """Classify ``sentence`` and render its attention heads with bertviz.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.
    The sentence is tokenized with truncation at 240 tokens, run through the
    model without gradient tracking, and the attention weights are passed to
    ``head_view`` together with the tokens of the first sequence.

    Args:
        sentence: Text to classify.

    Returns:
        The index of the largest logit as a Python int.
    """
    encoded = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=240)

    # The model's inputs must live on the same device as the model
    ids_on_device = encoded['input_ids'].to(device)
    mask_on_device = encoded['attention_mask'].to(device)

    # Forward pass only; attentions are requested for the visualisation below
    with torch.no_grad():
        result = model(ids_on_device, attention_mask=mask_on_device, output_attentions = True)
        class_scores = result.logits
        attention_weights = result.attentions

    predicted_class = class_scores.argmax(dim=1).item()

    # Map the ids of the first sequence back to token strings for display
    token_strings = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
    head_view(attention_weights, token_strings)

    return predicted_class

In [63]:
# Example usage
sentence = "India won the worldcup, Virat Kohli and Rohit Sharma retired after winning"
predicted_class = predict_sentiment(sentence)

<IPython.core.display.Javascript object>

In [42]:
# Translate the class index into a readable label (1 is positive, anything else negative)
if predicted_class == 1:
    sentiment_label = "Positive"
else:
    sentiment_label = "Negative"
print(f'Sentiment: {sentiment_label}')

Sentiment: Negative
