In [1]:
!pip install torch pandas numpy transformers scikit-learn tqdm torchvision

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import re
import random
from tqdm import tqdm

# Set seed for reproducibility
def set_seed(seed_value=42):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

set_seed(42)

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [3]:
# Load data
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')

# Quick data exploration
print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"Label distribution: {train_df['label'].value_counts()}")

# Text cleaning function
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@user', '', text)  # Remove user mentions
    text = re.sub(r'[^\w\s#@]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

# Apply cleaning
train_df['cleaned_tweet'] = train_df['tweet'].apply(clean_text)
test_df['cleaned_tweet'] = test_df['tweet'].apply(clean_text)

# Split into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df['cleaned_tweet'].values,
    train_df['label'].values,
    test_size=0.1,
    random_state=42,
    stratify=train_df['label'].values
)

print(f"Training set size: {len(train_texts)}")
print(f"Validation set size: {len(val_texts)}")


Training data shape: (31962, 3)
Test data shape: (17197, 2)
Label distribution: label
0    29720
1     2242
Name: count, dtype: int64
Training set size: 28765
Validation set size: 3197


In [4]:
# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Custom dataset class
class TweetDataset(Dataset):
    def __init__(self, texts, labels=None, max_length=128):
        self.texts = texts
        self.labels = labels
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)

        return item

# Create datasets
train_dataset = TweetDataset(train_texts, train_labels)
val_dataset = TweetDataset(val_texts, val_labels)
test_dataset = TweetDataset(test_df['cleaned_tweet'].values)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [5]:
# Function to calculate batch size based on available GPU memory
def get_batch_size():
    if torch.cuda.is_available():
        gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9  # Convert to GB
        if gpu_mem > 16:  # High-end GPU
            return 32
        elif gpu_mem > 8:  # Mid-range GPU
            return 16
        else:  # Low-end GPU
            return 8
    else:
        return 8  # Default for CPU

batch_size = get_batch_size()
print(f"Using batch size: {batch_size}")

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


Using batch size: 16


In [6]:
# Model training function
def train_eval_model(model, train_loader, val_loader, config):
    optimizer = AdamW(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])
    total_steps = len(train_loader) * config['epochs']
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    best_val_loss = float('inf')
    best_model_state = None

    for epoch in range(config['epochs']):
        print(f"\nEpoch {epoch+1}/{config['epochs']}")

        # Training phase
        model.train()
        total_train_loss = 0
        progress_bar = tqdm(train_loader, desc="Training")

        for batch in progress_bar:
            input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
            model.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_train_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            progress_bar.set_postfix({'loss': loss.item()})

        avg_train_loss = total_train_loss / len(train_loader)
        print(f"Average training loss: {avg_train_loss:.4f}")

        # Save best model
        if avg_train_loss < best_val_loss:
            best_val_loss = avg_train_loss
            best_model_state = model.state_dict().copy()
            print("New best model saved!")

    return best_model_state

# Hyperparameter configuration
config = {'learning_rate': 2e-5, 'weight_decay': 0.01, 'epochs': 3}


In [7]:
# Initialize model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2).to(device)

# Train and save best model
best_model_state = train_eval_model(model, train_loader, val_loader, config)

# Save best model
torch.save(best_model_state, 'best_distilbert_model.pt')
print("Best model saved to 'best_distilbert_model.pt'")


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 1798/1798 [05:29<00:00,  5.46it/s, loss=0.0967]


Average training loss: 0.1338
New best model saved!

Epoch 2/3


Training: 100%|██████████| 1798/1798 [05:30<00:00,  5.44it/s, loss=0.00206]


Average training loss: 0.0739
New best model saved!

Epoch 3/3


Training: 100%|██████████| 1798/1798 [05:30<00:00,  5.44it/s, loss=0.000361]


Average training loss: 0.0342
New best model saved!
Best model saved to 'best_distilbert_model.pt'


In [9]:
# Load trained model
best_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2).to(device)
best_model.load_state_dict(torch.load('best_distilbert_model.pt'))

# Make predictions on test set
test_predictions = []
test_ids = test_df['id'].values
tweet=test_df['tweet'].values

best_model.eval()
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting on test set"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = best_model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        test_predictions.extend(preds)

# Create submission DataFrame
submission_df = pd.DataFrame({'id': test_ids, 'Tweet':tweet,'label': test_predictions})
submission_df.to_csv('distilbert_predictions.csv', index=False)
print("Predictions saved to 'distilbert_predictions.csv'")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting on test set: 100%|██████████| 1075/1075 [01:07<00:00, 15.99it/s]


Predictions saved to 'distilbert_predictions_1.csv'
