In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount recursively and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sentimentanalysis/task3/train3.csv
/kaggle/input/sentimentanalysis/task3/test3.csv


In [2]:
# CUDA / torch.distributed environment switches, set before torch is imported below.
# NOTE(review): these look like debugging aids rather than optimizations --
# CUDA_LAUNCH_BLOCKING=1 is documented to make CUDA kernel launches synchronous,
# which usually slows training. Confirm they are still wanted for full runs.
import os
os.environ.update({
    'CUDA_LAUNCH_BLOCKING': '1',
    'TORCH_DISTRIBUTED_DEBUG': 'INFO',
})

In [3]:
# Check GPU availability and type
import torch
print("CUDA Available:", torch.cuda.is_available())
# get_device_name(0) raises when no CUDA device is present, so only query it
# when CUDA is actually available; the notebook otherwise falls back to CPU.
if torch.cuda.is_available():
    print("GPU Device:", torch.cuda.get_device_name(0))
else:
    print("GPU Device:", "none (CPU only)")
print("Number of GPUs:", torch.cuda.device_count())

CUDA Available: True
GPU Device: Tesla T4
Number of GPUs: 2


In [4]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch.nn.functional as F
from transformers import AdamW
from tqdm import tqdm

In [5]:
# Fix the NumPy and PyTorch RNG seeds so repeated runs start from the same state
np.random.seed(42)
torch.manual_seed(42)

# Prefer the GPU when one is visible, otherwise run on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [6]:
def load_data(filepath):
    """Load the labelled CSV and remap sentiment labels to class indices.

    Rows containing any missing value are dropped, then the ``category``
    column is translated from {-1, 0, 1} to {0, 1, 2}.

    Args:
        filepath: Path (or any other source accepted by ``pd.read_csv``) of a
            CSV file that contains a ``category`` column.

    Returns:
        pandas.DataFrame with a fresh 0..n-1 index and an integer
        ``category`` column whose values are in {0, 1, 2}.

    Raises:
        ValueError: If ``category`` holds a value other than -1, 0 or 1.
    """
    # Map sentiment labels to appropriate format (-1 -> 0, 0 -> 1, 1 -> 2)
    label_map = {-1: 0, 0: 1, 1: 2}

    df = pd.read_csv(filepath)
    # Remove any NaN values; reset the index so rows stay addressable by position
    df = df.dropna().reset_index(drop=True)

    mapped = df['category'].map(label_map)
    # Series.map yields NaN for keys missing from the dict; fail loudly here
    # instead of letting NaN labels leak into training.
    unmapped = mapped.isna()
    if unmapped.any():
        unexpected = sorted(
            df.loc[unmapped, 'category'].unique().tolist(), key=str
        )
        raise ValueError(
            f"Unexpected values in 'category' column: {unexpected}; "
            f"expected one of {sorted(label_map)}"
        )

    # Guarantee an integer dtype so labels convert cleanly to torch.long
    df['category'] = mapped.astype('int64')
    return df


In [10]:
# Custom Dataset class for our Twitter data
class TwitterDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable collection of raw texts (each is passed through ``str``).
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked with the text and the keyword arguments
            used in ``__getitem__``; must return a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every sequence is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of samples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return flattened ``input_ids`` / ``attention_mask`` and a long ``labels`` tensor."""
        encoded = self.tokenizer(
            str(self.texts[idx]),
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # The tokenizer output is flattened so each field is a 1-D tensor per sample.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample


In [11]:
# Training function
def train_epoch(model, data_loader, optimizer, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss``.
        data_loader: Iterable of batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer stepped once per batch.
        device: Device each batch tensor is moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(data_loader)``.
    """
    model.train()
    running_loss = 0

    batches = tqdm(data_loader, desc='Training')
    for batch in batches:
        optimizer.zero_grad()

        # Move the three tensors the model consumes onto the target device
        model_inputs = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }

        # The model computes the loss itself because labels are supplied
        loss = model(**model_inputs).loss
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        batches.set_postfix({'loss': batch_loss})

    return running_loss / len(data_loader)


In [12]:
# Evaluation function
def evaluate(model, data_loader, device):
    """Collect argmax predictions and reference labels over ``data_loader``.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask``; its
            output must expose ``.logits``.
        data_loader: Iterable of batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device each batch tensor is moved to.

    Returns:
        Tuple ``(predictions, actual_labels)`` of two lists, filled batch by
        batch in iteration order.
    """
    model.eval()
    predictions, actual_labels = [], []

    # Gradients are not needed for inference
    with torch.no_grad():
        for batch in tqdm(data_loader, desc='Evaluating'):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            logits = model(input_ids=ids, attention_mask=mask).logits

            # Highest-scoring class along dim 1 is the prediction
            predictions.extend(logits.argmax(dim=1).cpu().numpy())
            actual_labels.extend(targets.cpu().numpy())

    return predictions, actual_labels

In [13]:
# Main execution
def main():
    """Fine-tune DistilBERT on the task-3 training CSV and keep the best checkpoint.

    Steps: load the CSV through ``load_data``, make a stratified 90/10
    train/validation split, wrap both parts in ``TwitterDataset`` loaders,
    train for three epochs, print a classification report after each epoch,
    and save the model's ``state_dict`` to ``best_model.pt`` whenever the
    validation macro-F1 improves.
    """
    # Load the data
    df = load_data('/kaggle/input/sentimentanalysis/task3/train3.csv')

    # Initialize tokenizer
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    # Split the data (stratified so both parts keep the class proportions)
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df['Text'].values,
        df['category'].values,
        test_size=0.1,
        random_state=42,
        stratify=df['category'].values
    )

    # Create datasets
    train_dataset = TwitterDataset(train_texts, train_labels, tokenizer)
    val_dataset = TwitterDataset(val_texts, val_labels, tokenizer)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32)

    # Initialize model
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=3
    ).to(device)

    # NOTE: a class-weight tensor used to be built here but was never passed to
    # anything; train_epoch optimises the loss returned by the model itself, so
    # the dead computation (and its GPU allocation) has been removed.

    # Initialize optimizer with weight decay.
    # torch.optim.AdamW replaces the deprecated transformers.AdamW; eps=1e-6
    # keeps the epsilon that the transformers implementation used by default.
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.01
    )

    # Training loop
    num_epochs = 3
    best_val_f1 = 0

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")

        # Train
        train_loss = train_epoch(model, train_loader, optimizer, device)
        print(f"Average training loss: {train_loss:.4f}")

        # Evaluate (distinct name so the split's val_labels array is not shadowed)
        val_preds, val_true = evaluate(model, val_loader, device)

        # Print metrics
        print("\nValidation Results:")
        print(classification_report(val_true, val_preds))

        # Save best model, judged by macro-averaged F1 on the validation split
        val_f1 = classification_report(val_true, val_preds, output_dict=True)['macro avg']['f1-score']
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            torch.save(model.state_dict(), 'best_model.pt')
            print("New best model saved!")

# Entry-point guard: start the training pipeline only when this code is executed
# directly (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 8679/8679 [29:11<00:00,  4.96it/s, loss=0.00688]


Average training loss: 0.1953


Evaluating: 100%|██████████| 483/483 [00:59<00:00,  8.15it/s]



Validation Results:
              precision    recall  f1-score   support

           0       0.95      0.96      0.96      3373
           1       0.99      0.98      0.98      5232
           2       0.97      0.98      0.98      6823

    accuracy                           0.97     15428
   macro avg       0.97      0.97      0.97     15428
weighted avg       0.97      0.97      0.97     15428

New best model saved!

Epoch 2/3


Training: 100%|██████████| 8679/8679 [29:16<00:00,  4.94it/s, loss=0.145]   


Average training loss: 0.0679


Evaluating: 100%|██████████| 483/483 [00:58<00:00,  8.23it/s]



Validation Results:
              precision    recall  f1-score   support

           0       0.95      0.98      0.97      3373
           1       0.99      0.98      0.99      5232
           2       0.99      0.98      0.98      6823

    accuracy                           0.98     15428
   macro avg       0.98      0.98      0.98     15428
weighted avg       0.98      0.98      0.98     15428

New best model saved!

Epoch 3/3


Training: 100%|██████████| 8679/8679 [29:16<00:00,  4.94it/s, loss=0.00178] 


Average training loss: 0.0452


Evaluating: 100%|██████████| 483/483 [00:58<00:00,  8.22it/s]


Validation Results:
              precision    recall  f1-score   support

           0       0.92      0.99      0.95      3373
           1       0.99      0.98      0.99      5232
           2       0.99      0.96      0.98      6823

    accuracy                           0.97     15428
   macro avg       0.97      0.98      0.97     15428
weighted avg       0.98      0.97      0.97     15428




