In [1]:
#!pip install -q pyyaml h5py
#!pip install scikit-plot
#!pip install comet_ml
#!pip install tensorflow
#!pip install pytorch

## libraries

In [2]:
# Import libraries
import numpy as np
import os
import pandas as pd
import random
import torch
import torch.nn as nn
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertModel, DistilBertConfig
from transformers import DistilBertModel, DistilBertTokenizer
from sklearn.model_selection import train_test_split
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Import matplotlib
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [3]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


## load and split dataset

In [4]:
# Load dataset from Hugging Face
dataset = load_dataset("turne292/tldr_sentiment_data_small")
data = dataset["train"]
# Split the dataset into train and validation sets
train_dataset, val_dataset = train_test_split(data, test_size=0.2)

Downloading and preparing dataset csv/turne292--tldr_sentiment_data_small to C:/Users/danda/.cache/huggingface/datasets/turne292___csv/turne292--tldr_sentiment_data_small-e23c9f0425001bbe/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/24.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.81M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to C:/Users/danda/.cache/huggingface/datasets/turne292___csv/turne292--tldr_sentiment_data_small-e23c9f0425001bbe/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
#train data load
X_train = train_dataset['text']
X_valid = val_dataset['text']
y_train = train_dataset['label']
y_valid = val_dataset['label']

In [6]:
# Load test data
test = dataset["test"]
X_test = test['text']
y_test = test['label']


In [7]:
X_train[55]

" r/AskReddit\nTITLE: So much love..... but..... reddit help?\nPOST: Alright,\n\nSo here is the thing. I am an over-emotional guy who has a lot of love to give. I'm just an idiot. I enjoy relationships, but generally only have a relationship once a year (approx.) I really love being with girls, I love everything about the ones that mean something to me. I am very optimistic and very open minded. I have just... hit a snag.\n\nI have NO idea what to do here.\n\nAbout 6 months ago I contracted an STI. Now it'"

In [8]:
y_train[55]

'positive'

In [9]:
X_valid[55]

' r/relationships\nTITLE: Do you tell your best guy/girl friend that you love them? F(21)\nPOST: Last night when my friends and I went out to the bar. Me and my girl friend dressed up. My best guy friend (friends 4 years) was acting kind of awkward like he always does. He kept teasing me about what I was wearing. He kept pushing into me in a flirty way. Before we left he said "I could never get sick of you, ever" and while we were out he put his arm around me and said "I love you. Really, I do." He t'

In [10]:
y_test[55]

'negative'

## set parameters used for training

In [11]:
# Set parameters:
params = {'MAX_LENGTH': 128,
          'EPOCHS': 6,
          'LEARNING_RATE': 5e-5,
          'FT_EPOCHS': 2,
          'OPTIMIZER': 'adam',
          'FL_GAMMA': 2.0,
          'FL_ALPHA': 0.2,
          'BATCH_SIZE': 16,
          'NUM_STEPS': len(train_dataset) // 64,
          'DISTILBERT_DROPOUT': 0.2,
          'DISTILBERT_ATT_DROPOUT': 0.2,
          'LAYER_DROPOUT': 0.2,
          'KERNEL_INITIALIZER': 'GlorotNormal',
          'BIAS_INITIALIZER': 'zeros',
          'POS_PROBA_THRESHOLD': 0.5,          
          'ADDED_LAYERS': 'Dense 256, Dense 32, Dropout 0.2',
          'LR_SCHEDULE': '5e-5 for 6 epochs, Fine-tune w/ adam for 2 epochs @2e-5',
          'FREEZING': 'All DistilBERT layers frozen for 6 epochs, then unfrozen for 2',
          'CALLBACKS': '[early_stopping w/ patience=0]',
          'RANDOM_STATE':42
          }


## Tokenizing

In [13]:
# Initialize DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize and encode the training, validation, and test sequences
X_train_encoded = tokenizer(X_train, padding=True, truncation=True, return_tensors='pt')
X_valid_encoded = tokenizer(X_valid, padding=True, truncation=True, return_tensors='pt')
X_test_encoded = tokenizer(X_test, padding=True, truncation=True, return_tensors='pt')

# Convert labels to numeric format
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_valid_encoded = label_encoder.transform(y_valid)
y_test_encoded = label_encoder.transform(y_test)

# Convert encoded text data and labels to tensors
X_train_ids_tensor = X_train_encoded['input_ids']
X_train_attention_tensor = X_train_encoded['attention_mask']
X_valid_ids_tensor = X_valid_encoded['input_ids']
X_valid_attention_tensor = X_valid_encoded['attention_mask']
X_test_ids_tensor = X_test_encoded['input_ids']
X_test_attention_tensor = X_test_encoded['attention_mask']

y_train_tensor = torch.tensor(y_train_encoded).float()
y_valid_tensor = torch.tensor(y_valid_encoded).float()
y_test_tensor = torch.tensor(y_test_encoded).float()

# Print shapes to verify
print("Shapes:")
print("X_train_ids_tensor:", X_train_ids_tensor.shape)
print("X_train_attention_tensor:", X_train_attention_tensor.shape)
print("X_valid_ids_tensor:", X_valid_ids_tensor.shape)
print("X_valid_attention_tensor:", X_valid_attention_tensor.shape)
print("X_test_ids_tensor:", X_test_ids_tensor.shape)
print("X_test_attention_tensor:", X_test_attention_tensor.shape)
print("y_train_tensor:", y_train_tensor.shape)
print("y_valid_tensor:", y_valid_tensor.shape)
print("y_test_tensor:", y_test_tensor.shape)

Shapes:
X_train_ids_tensor: torch.Size([37881, 212])
X_train_attention_tensor: torch.Size([37881, 212])
X_valid_ids_tensor: torch.Size([9471, 205])
X_valid_attention_tensor: torch.Size([9471, 205])
X_test_ids_tensor: torch.Size([13255, 172])
X_test_attention_tensor: torch.Size([13255, 172])
y_train_tensor: torch.Size([37881])
y_valid_tensor: torch.Size([9471])
y_test_tensor: torch.Size([13255])


## define model

In [14]:
class DistilBERTClassifier(nn.Module):
    def __init__(self, num_labels=1, hidden_size=768, dropout_prob=0.1):
        super(DistilBERTClassifier, self).__init__()
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(dropout_prob),
            nn.Linear(256, 32),
            nn.ReLU(),
            nn.Dropout(dropout_prob),
            nn.Linear(32, num_labels),
            nn.Sigmoid()  # For binary classification
        )

    def forward(self, input_ids, attention_mask=None):
        with torch.no_grad():
            outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        
        # Free up memory by removing unnecessary tensors
        del input_ids
        del attention_mask

        pooled_output = outputs.last_hidden_state[:, 0, :]  # CLS token
        pooled_output = self.dropout(pooled_output)
        
        # Free up memory by removing unnecessary tensors
        del outputs

        logits = self.classifier(pooled_output)

        # Free up memory by removing unnecessary tensors
        del pooled_output

        return logits

def build_model(max_length=params['MAX_LENGTH'], num_labels=1):
    model = DistilBERTClassifier(num_labels=num_labels)
    return model

## build model

In [15]:
# Define configuration for DistilBERT model
config = DistilBertConfig(dropout=params['DISTILBERT_DROPOUT'],
                          attention_dropout=params['DISTILBERT_ATT_DROPOUT'],
                          output_hidden_states=True)

# Load the pre-trained DistilBERT model
distilBERT = DistilBertModel.from_pretrained('distilbert-base-uncased', config=config)

# Freeze DistilBERT layers to preserve pre-trained weights 
for param in distilBERT.parameters():
    param.requires_grad = False

# Build model
model = build_model(distilBERT)

## train

In [16]:

# Unfreeze DistilBERT weights to enable fine-tuning (optional)
for param in model.distilbert.parameters():
    param.requires_grad = True

# Lower the learning rate to prevent destruction of pre-trained weights
optimizer = optim.Adam(model.parameters(), lr=2e-5)

# Define loss function (replace focal_loss() with your desired loss function)
# Make sure to define your focal_loss() function or replace it with another loss function
loss_function = nn.BCELoss()

# Combine input IDs and attention masks into a tuple for easier handling
train_data = TensorDataset(X_train_ids_tensor, X_train_attention_tensor, y_train_tensor)
train_loader = DataLoader(train_data, batch_size=params['BATCH_SIZE'], shuffle=True)

# Validation data
valid_data = TensorDataset(X_valid_ids_tensor, X_valid_attention_tensor, y_valid_tensor)
valid_loader = DataLoader(valid_data, batch_size=params['BATCH_SIZE'])

# Define the training loop
def train_model(model, optimizer, loss_function, train_loader, valid_loader, epochs, device):
    model.train()
    for epoch in range(epochs):
        train_loss = 0.0
        valid_loss = 0.0
        for batch in train_loader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = loss_function(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * input_ids.size(0)
        
        # Validate the model
        model.eval()
        with torch.no_grad():
            for batch in valid_loader:
                input_ids, attention_mask, labels = batch
                input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
                outputs = model(input_ids, attention_mask)
                loss = loss_function(outputs.squeeze(), labels)
                valid_loss += loss.item() * input_ids.size(0)
        
        train_loss /= len(train_loader.dataset)
        valid_loss /= len(valid_loader.dataset)
        
        print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Valid Loss: {valid_loss:.4f}')

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to device
model.to(device)

# Train the model
train_model(model, optimizer, loss_function, train_loader, valid_loader, params['FT_EPOCHS'], device)


Epoch 1/2, Train Loss: 0.6244, Valid Loss: 0.5616
Epoch 2/2, Train Loss: 0.5156, Valid Loss: 0.4974


In [19]:
!PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

'PYTORCH_CUDA_ALLOC_CONF' is not recognized as an internal or external command,
operable program or batch file.


In [17]:


# Move model to evaluation mode
model.eval()

# Generate predictions
with torch.no_grad():
    y_pred_tensor = model(X_test_ids_tensor.to(device), X_test_attention_tensor.to(device))
    y_pred = y_pred_tensor.cpu().numpy().flatten()
    y_pred_thresh = np.where(y_pred >= params['POS_PROBA_THRESHOLD'], 1, 0)

# Convert y_test_tensor to numpy array
y_test_numpy = y_test_tensor.cpu().numpy()

# Get evaluation results
accuracy = accuracy_score(y_test_numpy, y_pred_thresh)
auc_roc = roc_auc_score(y_test_numpy, y_pred)

# Log evaluation metrics (assuming you have an experiment object for logging)
experiment.log_metrics({'Accuracy': accuracy, 'AUC-ROC': auc_roc})

# Log the ROC curve
fpr, tpr, thresholds = roc_curve(y_test_numpy, y_pred)
experiment.log_curve('ROC curve', fpr.tolist(), tpr.tolist())

print('Accuracy:  ', accuracy)
print('ROC-AUC:   ', auc_roc)


OutOfMemoryError: CUDA out of memory. Tried to allocate 6.52 GiB. GPU 0 has a total capacity of 15.99 GiB of which 0 bytes is free. Of the allocated memory 26.39 GiB is allocated by PyTorch, and 25.59 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [21]:
# Save model's state dictionary
torch.save(model.state_dict(), "distilbert-tldr-fine-small/pytorch_model.bin")

# Save model's configuration
config.save_pretrained("distilbert-tldr-fine-small")

# Save tokenizer
tokenizer.save_pretrained("distilbert-tldr-fine-small")

('distilbert-tldr-fine-small\\tokenizer_config.json',
 'distilbert-tldr-fine-small\\special_tokens_map.json',
 'distilbert-tldr-fine-small\\vocab.txt',
 'distilbert-tldr-fine-small\\added_tokens.json')