In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os
import torch
from torch import nn
# The run is treated as a Kaggle run when the working directory is exactly
# "/kaggle/working"; this flag selects the input (and later output) locations.
cwd = os.getcwd()
kaggle = (cwd == "/kaggle/working")

if kaggle:
    input_path = "/kaggle/input/bitcointalk/"
else:
    input_path = "datasets/"

# Base name of the parquet file to load; also reused for the output directory.
feature = "hardware_name"

dataset = pd.read_parquet(f"{input_path}{feature}.parquet")

# 80/20 split with a fixed seed so the partition is reproducible.
# NOTE: the name `train` is rebound to the training function further down the
# notebook, so cells that read this DataFrame must run before that cell.
train, test = train_test_split(dataset, random_state=42, test_size=0.2)

# Small dataset for testing code
# train = train[:100]
# test = test[:100]

# Prefer the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [11]:
# for row in train.itertuples():
#     text = row.text
#     label = row.label
#     print(label)
#     print(text)

In [12]:
# Sum of the label column divided by the number of training rows
# (the positive-class share, assuming labels are 0/1 -- TODO confirm).
train["label"].sum() / train.shape[0]

0.5771658578887582

In [13]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint used as the starting point for fine-tuning.
model_name = 'distilbert-base-cased-distilled-squad'

# Load the checkpoint with a single-output classification head
# (num_labels=1); size mismatches against the checkpoint are ignored.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
    ignore_mismatched_sizes=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Align the tokenizer's length limit with the model's position-embedding count.
config = model.config
tokenizer.model_max_length = config.max_position_embeddings


print(f"model parameters:{sum(weights.numel() for weights in model.parameters())}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased-distilled-squad and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model parameters:65782273


In [14]:
from torch.utils.data import Dataset
import torch
import numpy as np

# Fixed sequence length every text is padded/truncated to.
max_length = 128


def encode_texts(tokenizer, texts):
    """Tokenize ``texts`` into fixed-length id and attention-mask tensors.

    Each text is encoded on its own via ``tokenizer.encode_plus`` with
    truncation and ``padding='max_length'`` requested, using the module-level
    ``max_length``.

    Args:
        tokenizer: Object exposing ``encode_plus(...)`` that returns a mapping
            with ``'input_ids'`` and ``'attention_mask'`` entries.
        texts: Iterable of strings (a list or a pandas Series both work).

    Returns:
        Tuple ``(input_ids, attention_masks)`` of ``torch.long`` tensors with
        one row per text. For empty ``texts`` both tensors have shape
        ``(0, max_length)`` so downstream code still sees 2-D integer tensors.
    """
    input_ids = []
    attention_masks = []
    for text in texts:
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids.append(encoding['input_ids'])
        attention_masks.append(encoding['attention_mask'])

    if not input_ids:
        # torch.tensor([]) would yield a 1-D float32 tensor, which is the
        # wrong rank and dtype for token ids; build the empty result explicitly.
        return (
            torch.empty((0, max_length), dtype=torch.long),
            torch.empty((0, max_length), dtype=torch.long),
        )

    return (
        torch.tensor(input_ids, dtype=torch.long),
        torch.tensor(attention_masks, dtype=torch.long),
    )

# Tokenize each split once and convert the labels to float32, the dtype
# BCEWithLogitsLoss expects for its targets.
train_x, train_attention_mask = encode_texts(tokenizer, train['text'])
train_y = torch.tensor(np.array(train['label'].tolist()), dtype=torch.float32)

test_x, test_attention_mask = encode_texts(tokenizer, test['text'])
test_y = torch.tensor(np.array(test['label'].tolist()), dtype=torch.float32)

# The warm-up subset is the first 100 training rows. Every row is encoded
# independently and padded to the same length, so slicing the tensors computed
# above gives the same values as tokenizing those texts a second time.
# clone() keeps the warm-up tensors independent of the training tensors.
warmup_x = train_x[:100].clone()
warmup_attention_mask = train_attention_mask[:100].clone()
warmup_y = train_y[:100].clone()



class CustomDataset(Dataset):
    """Map-style dataset that serves token ids, attention mask and label by index.

    The three containers are stored as given and indexed with the same ``idx``;
    the dataset length is the length of ``input_ids``.
    """

    def __init__(self, input_ids, attention_mask, label):
        self.input_ids, self.attention_mask, self.label = (
            input_ids,
            attention_mask,
            label,
        )

    def __len__(self):
        # Size is driven by the token-id container alone.
        return len(self.input_ids)

    def __getitem__(self, idx):
        # Build one sample; key order matches 'input_ids', 'attention_mask', 'label'.
        sample = dict(
            input_ids=self.input_ids[idx],
            attention_mask=self.attention_mask[idx],
            label=self.label[idx],
        )
        return sample


# Wrap the encoded tensors of each split; the held-out test split is used as
# the validation set during training.
train_dataset = CustomDataset(
    input_ids=train_x,
    attention_mask=train_attention_mask,
    label=train_y,
)
val_dataset = CustomDataset(
    input_ids=test_x,
    attention_mask=test_attention_mask,
    label=test_y,
)
warmup_dataset = CustomDataset(
    input_ids=warmup_x,
    attention_mask=warmup_attention_mask,
    label=warmup_y,
)





In [15]:
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
import torch.nn.functional as F

# Module-level state read by train(): the loss, the optimizer and the model.
# Binary cross-entropy on raw logits (the sigmoid is applied inside the loss).
criterion = nn.BCEWithLogitsLoss()
# Placeholder; the optimizer is created later, once the trainable parameters
# have been selected.
optimizer = None
# Move the model's parameters to the selected device.
model = model.to(device)

# Training function
def train(model, lr_per_epoch, train_dataset, val_dataset):
    """Train ``model`` for ``len(lr_per_epoch)`` epochs, validating after each.

    Reads the module-level ``optimizer``, ``criterion`` and ``device``.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``; the last logits dimension is squeezed,
            so a single output per sample is expected.
        lr_per_epoch: Sequence of learning rates; entry ``k`` is applied to
            every optimizer parameter group for epoch ``k``.
        train_dataset: Dataset yielding dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'``.
        val_dataset: Dataset in the same format, or ``None`` to skip
            validation and only report the training loss.

    Raises:
        RuntimeError: If the module-level ``optimizer`` has not been created.
    """
    if optimizer is None:
        # Fail early with a clear message instead of an AttributeError on None.
        raise RuntimeError("optimizer must be created before calling train()")

    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
    # Only build a validation loader when there is something to validate on.
    val_loader = None
    if val_dataset is not None:
        val_loader = DataLoader(val_dataset, batch_size=4)

    for epoch, lr in enumerate(lr_per_epoch):
        model.train()
        # Apply this epoch's learning rate to every parameter group.
        for group in optimizer.param_groups:
            group['lr'] = lr
        train_loss = 0.0

        # Training loop with tqdm
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
            inputs = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(inputs, attention_mask=attention_mask)
            logits = outputs.logits.squeeze(-1)  # Remove the last dimension

            loss = criterion(logits, labels)
            loss.backward()

            optimizer.step()
            optimizer.zero_grad()

            train_loss += loss.item()

        # Mean of the per-batch losses.
        avg_train_loss = train_loss / len(train_loader)

        if val_loader is None:
            print(f"Train Loss: {avg_train_loss}")
            continue

        # Validation loop
        model.eval()
        val_loss = 0.0
        all_predictions_raw = []
        all_labels = []

        with torch.no_grad():
            # Validation loop with tqdm
            for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}"):
                inputs = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                # Labels are not passed to the model: the loss is computed
                # with `criterion`, so a model-internal loss would be discarded.
                outputs = model(inputs, attention_mask=attention_mask)
                logits = outputs.logits.squeeze(-1)
                val_loss += criterion(logits, labels).item()
                pred = torch.sigmoid(logits)

                all_predictions_raw.append(pred.cpu())
                all_labels.append(labels.cpu())

        avg_val_loss = val_loss / len(val_loader)

        all_predictions_raw = torch.cat(all_predictions_raw)
        all_labels = torch.cat(all_labels)
        # Threshold both labels and sigmoid outputs at 0.5 to get boolean classes.
        true_classes = all_labels.numpy() >= 0.5
        predicted_classes = all_predictions_raw.numpy() >= 0.5
        accuracy = accuracy_score(true_classes, predicted_classes)
        f1 = f1_score(true_classes, predicted_classes, average='macro')
        print(f"Train Loss: {avg_train_loss}, Val Loss: {avg_val_loss}, Accuracy: {accuracy}, f1: {f1}")

        # print(f"First predictions:")
        # i = 0
        # for y_pred, y in zip(all_predictions_raw, all_labels):
        #     print(f"y_pred: {y_pred.item()}, y: {y}")
        #     i += 1
        #     if i > 5:
        #         break

        print("\n")

# Train the model
# Freeze every parameter, then re-enable gradients only for the two
# classification-head modules and the selected transformer layers.
for param in model.parameters():
    param.requires_grad = False
for param in model.classifier.parameters():
    param.requires_grad = True
for param in model.pre_classifier.parameters():
    param.requires_grad = True
# range(-1, 0) yields only -1, i.e. the last transformer layer; widen the
# range (e.g. range(-2, 0)) to unfreeze more layers from the end.
for i in range(-1, 0):
    # Equivalent layer lists for other architectures:
    # for param in model.roberta.encoder.layer[i].parameters():
    # for param in model.bert.encoder.layer[i].parameters():
    # for param in model.deberta.encoder.layer[i].parameters():
    for param in model.distilbert.transformer.layer[i].parameters():
        param.requires_grad = True

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are passed explicitly because the PyTorch defaults (1e-8, 0.01)
# differ from the values the transformers implementation used by default.
optimizer = torch.optim.AdamW(
    [param for param in model.parameters() if param.requires_grad],
    lr=1e-5,
    eps=1e-6,
    weight_decay=0.0,
)

# One pass over the warm-up subset at a near-zero learning rate, followed by
# the real schedule (one learning rate per epoch) on the full training set.
train(model, [1e-9], warmup_dataset, val_dataset)
train(model, [1e-5, 1e-5, 1e-5, 1e-5, 2e-6, 1e-6], train_dataset, val_dataset)

Training Epoch 1: 100%|██████████| 4/4 [00:00<00:00,  6.50it/s]
Validation Epoch 1: 100%|██████████| 1/1 [00:00<00:00, 10.20it/s]


Train Loss: 0.6844824850559235, Val Loss: 0.7311649918556213, Accuracy: 0.25, f1: 0.2




Training Epoch 1: 100%|██████████| 4/4 [00:00<00:00,  6.75it/s]
Validation Epoch 1: 100%|██████████| 1/1 [00:00<00:00, 10.53it/s]


Train Loss: 0.6855549961328506, Val Loss: 0.73581463098526, Accuracy: 0.25, f1: 0.2




Training Epoch 2: 100%|██████████| 4/4 [00:00<00:00,  6.74it/s]
Validation Epoch 2: 100%|██████████| 1/1 [00:00<00:00, 10.64it/s]


Train Loss: 0.6719101965427399, Val Loss: 0.7384967803955078, Accuracy: 0.25, f1: 0.2




Training Epoch 3: 100%|██████████| 4/4 [00:00<00:00,  6.61it/s]
Validation Epoch 3: 100%|██████████| 1/1 [00:00<00:00, 10.64it/s]


Train Loss: 0.680213212966919, Val Loss: 0.7415855526924133, Accuracy: 0.25, f1: 0.2




Training Epoch 4: 100%|██████████| 4/4 [00:00<00:00,  6.79it/s]
Validation Epoch 4: 100%|██████████| 1/1 [00:00<00:00, 10.64it/s]


Train Loss: 0.6760940700769424, Val Loss: 0.7472208738327026, Accuracy: 0.25, f1: 0.2




Training Epoch 5: 100%|██████████| 4/4 [00:00<00:00,  6.78it/s]
Validation Epoch 5: 100%|██████████| 1/1 [00:00<00:00, 10.87it/s]


Train Loss: 0.6705255657434464, Val Loss: 0.7470955848693848, Accuracy: 0.25, f1: 0.2




Training Epoch 6: 100%|██████████| 4/4 [00:00<00:00,  6.78it/s]
Validation Epoch 6: 100%|██████████| 1/1 [00:00<00:00, 10.53it/s]

Train Loss: 0.6599228382110596, Val Loss: 0.7472692728042603, Accuracy: 0.25, f1: 0.2







In [16]:
# Save the model and tokenizer under a per-feature directory.
# NOTE(review): torch.save pickles the complete Python objects, so loading
# them back needs torch.load on a trusted file with compatible library versions.
if kaggle:
    output_path = "/kaggle/working/"
else:
    output_path = "models/"
output_path = output_path + feature + "/"

# Create the target directory on first use.
if not os.path.exists(output_path):
    os.makedirs(output_path)

torch.save(model, f"{output_path}model.pt")
torch.save(tokenizer, f"{output_path}tokenizer.pt")
print("saved")

saved
