In [None]:

from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import pandas as pd
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); the dataset CSV is later
# read from '/content/drive/My Drive/'.
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
class CodeDataset(Dataset):
    """Map-style dataset that tokenizes one code snippet per item.

    Args:
        data: Indexable sequence of dict-like records. Each record must hold
            a ``'label'`` entry and the text field selected by
            ``use_normalized``.
        use_normalized: When True the snippet is read from
            ``'normalized_code'``, otherwise from ``'code'``.
        tokenizer: Callable accepting ``(text, padding=..., truncation=...,
            max_length=..., return_tensors=...)`` and returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors that carry a
            leading batch dimension of 1. When None (the default), the
            module-level ``tokenizer`` is looked up each time an item is
            fetched, so the dataset may be built before that global exists.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, data, use_normalized=True, tokenizer=None, max_length=256):
        self.data = data
        self.use_normalized = use_normalized
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize record ``idx`` and return its tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (batch
            dimension squeezed away) and ``'labels'`` as a ``torch.long``
            scalar tensor.
        """
        record = self.data[idx]
        code_field = 'normalized_code' if self.use_normalized else 'code'
        code = record[code_field]
        label = record['label']

        # Prefer the injected tokenizer; otherwise fall back to the notebook
        # global, resolved lazily at access time.
        tokenize = self.tokenizer if self.tokenizer is not None else tokenizer

        encoded = tokenize(
            code,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        return {
            # return_tensors="pt" yields a leading batch axis of size 1; drop
            # it so the DataLoader can stack items into a batch.
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
# Read the balanced dataset from the mounted Drive folder.
df = pd.read_csv('/content/drive/My Drive/balanced_dataset.csv')

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# CodeDataset indexes its data by position, so hand it a list of row dicts.
train_data_raw, val_data_raw = (
    frame.to_dict(orient='records') for frame in (train_df, val_df)
)

train_dataset, val_dataset = (
    CodeDataset(records, use_normalized=True)
    for records in (train_data_raw, val_data_raw)
)

# Only the training batches are shuffled; validation keeps row order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=16)
# train_data_raw = df.to_dict(orient='records')
# train_data = CodeDataset(train_data_raw, use_normalized=True)
# train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
# train_data_raw

In [None]:
# Tokenizer used by CodeDataset when it encodes a snippet. It is resolved from
# the Hugging Face Hub; for an offline run pass a local snapshot directory to
# from_pretrained instead of the hub id.
#
# The bare encoder is deliberately not loaded here: CodeBERTBinaryClassifier
# instantiates its own encoder, and `model` is bound when the classifier is
# built, so an extra AutoModel.from_pretrained call would only cost a second
# ~499 MB load that is thrown away.
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [None]:
class CodeBERTBinaryClassifier(nn.Module):
    """Pretrained encoder with all weights frozen, plus a trainable linear head.

    Args:
        model_path: Hub id or local directory handed to
            ``AutoModel.from_pretrained``.
        num_labels: Number of output classes of the linear head (2 by default,
            i.e. binary classification).
    """

    def __init__(self, model_path, num_labels=2):
        super().__init__()
        # Load the checkpoint the caller asked for (previously the argument
        # was ignored and a fixed hub id was always used).
        self.bert = AutoModel.from_pretrained(model_path)

        # Freeze every encoder parameter: only the linear head below receives
        # gradient updates. To fine-tune part of the encoder instead, leave
        # requires_grad enabled on the selected `encoder.layer.<i>.` weights.
        for param in self.bert.parameters():
            param.requires_grad = False

        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class scores, one row per input sequence.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Attention mask passed to the encoder.

        Returns:
            Logits produced by the linear head from the hidden state at the
            first token position of each sequence.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the representation of the first ([CLS]) token as the summary of
        # the whole sequence.
        pooled_output = outputs.last_hidden_state[:, 0]
        logits = self.classifier(pooled_output)
        return logits


In [None]:
# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the classifier and move its weights to the chosen device.
model = CodeBERTBinaryClassifier("microsoft/codebert-base")
model = model.to(device)

# AdamW over the model's parameters, with cross-entropy on the two-class logits.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [None]:
# def train_one_epoch(model, data_loader, optimizer, loss_fn, device):
#     model.train()
#     total_loss = 0
#     all_preds, all_labels = [], []

#     for batch in data_loader:
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['labels'].to(device)

#         optimizer.zero_grad()
#         outputs = model(input_ids=input_ids, attention_mask=attention_mask)
#         loss = loss_fn(outputs, labels)
#         loss.backward()
#         optimizer.step()

#         total_loss += loss.item()
#         preds = torch.argmax(outputs, dim=1)
#         all_preds.extend(preds.cpu().tolist())
#         all_labels.extend(labels.cpu().tolist())

#     acc = accuracy_score(all_labels, all_preds)
#     avg_loss = total_loss / len(data_loader)

#     return avg_loss, acc


In [None]:
from sklearn.metrics import f1_score, accuracy_score
import torch
from tqdm import tqdm

def train_model(
    model,
    train_loader,
    val_loader,
    optimizer,
    criterion,
    device="cpu",
    epochs=5,
    early_stopping=2,
    checkpoint_path="best_model.pth",
):
    """Train ``model`` with per-epoch validation, checkpointing and early stopping.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning per-class logits.
        train_loader: Iterable of batches (dicts with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors) that supports
            ``len()``.
        val_loader: Same batch format, used for validation.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the model and batches are moved to.
        epochs: Maximum number of epochs.
        early_stopping: Stop after this many consecutive epochs without a
            strictly better validation F1.
        checkpoint_path: File the best ``state_dict`` is written to and
            restored from.

    Returns:
        ``(model, history)`` where ``model`` holds the weights of the best
        validation-F1 epoch and ``history`` maps ``'train_loss'``,
        ``'val_loss'``, ``'train_f1'`` and ``'val_f1'`` to per-epoch lists.
    """
    # Start below any reachable F1 so the first epoch always writes a
    # checkpoint. With a start value of 0, a run whose validation F1 stayed
    # at 0 never saved anything and the final load either failed or silently
    # restored a stale file from an earlier run.
    best_f1 = float("-inf")
    no_improve = 0
    history = {'train_loss': [], 'val_loss': [], 'train_f1': [], 'val_f1': []}
    model = model.to(device)

    for epoch in range(epochs):
        train_loss_avg, all_preds, all_labels = _run_epoch(
            model, train_loader, criterion, device,
            optimizer=optimizer, desc=f"Epoch {epoch+1}/{epochs}",
        )
        val_loss_avg, val_preds, val_labels = _run_epoch(
            model, val_loader, criterion, device,
        )

        # Metrics
        train_f1 = f1_score(all_labels, all_preds, average='binary')
        val_f1 = f1_score(val_labels, val_preds, average='binary')
        val_acc = accuracy_score(val_labels, val_preds)

        # Record
        history['train_loss'].append(train_loss_avg)
        history['val_loss'].append(val_loss_avg)
        history['train_f1'].append(train_f1)
        history['val_f1'].append(val_f1)

        print(f"\nEpoch {epoch+1}:")
        print(f"  Train Loss: {train_loss_avg:.4f} | Train F1: {train_f1:.4f}")
        print(f"  Val Loss:   {val_loss_avg:.4f} | Val F1:   {val_f1:.4f} | Val Acc: {val_acc:.4f}")

        # Early stopping logic
        if val_f1 > best_f1:
            best_f1 = val_f1
            no_improve = 0
            torch.save(model.state_dict(), checkpoint_path)
            print("  ↳ Saved new best model!")
        else:
            no_improve += 1
            if no_improve >= early_stopping:
                print(f"Early stopping at epoch {epoch+1}")
                break

    # Restore the best weights, but only if this call actually wrote a
    # checkpoint (epochs == 0 leaves the model untouched).
    if best_f1 > float("-inf"):
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    return model, history


def _run_epoch(model, loader, criterion, device, optimizer=None, desc=None):
    """Run one pass over ``loader``; train if ``optimizer`` is given, else evaluate.

    Returns:
        ``(mean batch loss, predicted class ids, true labels)`` with the two
        sequences as plain Python lists.
    """
    training = optimizer is not None
    model.train(training)  # train(False) is equivalent to eval()
    # Only the training pass shows a progress bar.
    batches = tqdm(loader, desc=desc) if training else loader

    total_loss = 0
    preds_all = []
    labels_all = []

    # Gradients are tracked only while training.
    with torch.set_grad_enabled(training):
        for batch in batches:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            if training:
                optimizer.zero_grad()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)

            if training:
                loss.backward()
                # Cap the global gradient norm at 1.0 before the update.
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                batches.set_postfix({'loss': loss.item()})

            total_loss += loss.item()
            preds_all.extend(torch.argmax(outputs, dim=1).cpu().tolist())
            labels_all.extend(labels.cpu().tolist())

    return total_loss / len(loader), preds_all, labels_all


In [None]:
# EPOCHS = 3
# for epoch in range(EPOCHS):
#     loss, acc = train_model(model, train_loader, optimizer, loss_fn, device)
#     print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {loss:.4f} - Accuracy: {acc:.4f}")


In [None]:
# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.CrossEntropyLoss()

# Up to 10 epochs; stop after 2 consecutive epochs without a better validation F1.
trained_model, history = train_model(
    model, train_loader, val_loader, optimizer, criterion,
    device=device, epochs=10, early_stopping=2,
)


Epoch 1/10:   0%|          | 0/95 [00:00<?, ?it/s][A
Epoch 1/10:   0%|          | 0/95 [00:01<?, ?it/s, loss=0.697][A
Epoch 1/10:   1%|          | 1/95 [00:01<02:56,  1.88s/it, loss=0.697][A
Epoch 1/10:   1%|          | 1/95 [00:02<02:56,  1.88s/it, loss=0.672][A
Epoch 1/10:   2%|▏         | 2/95 [00:02<01:22,  1.13it/s, loss=0.672][A
Epoch 1/10:   2%|▏         | 2/95 [00:02<01:22,  1.13it/s, loss=0.677][A
Epoch 1/10:   3%|▎         | 3/95 [00:02<00:52,  1.76it/s, loss=0.677][A
Epoch 1/10:   3%|▎         | 3/95 [00:02<00:52,  1.76it/s, loss=0.71] [A
Epoch 1/10:   4%|▍         | 4/95 [00:02<00:38,  2.38it/s, loss=0.71][A
Epoch 1/10:   4%|▍         | 4/95 [00:02<00:38,  2.38it/s, loss=0.666][A
Epoch 1/10:   5%|▌         | 5/95 [00:02<00:30,  2.95it/s, loss=0.666][A
Epoch 1/10:   5%|▌         | 5/95 [00:02<00:30,  2.95it/s, loss=0.634][A
Epoch 1/10:   6%|▋         | 6/95 [00:02<00:25,  3.43it/s, loss=0.634][A
Epoch 1/10:   6%|▋         | 6/95 [00:03<00:25,  3.43it/s, loss=0.


Epoch 1:
  Train Loss: 0.6757 | Train F1: 0.5405
  Val Loss:   0.6677 | Val F1:   0.5751 | Val Acc: 0.6500
  ↳ Saved new best model!


Epoch 2/10: 100%|██████████| 95/95 [00:19<00:00,  4.90it/s, loss=0.694]



Epoch 2:
  Train Loss: 0.6705 | Train F1: 0.6290
  Val Loss:   0.6578 | Val F1:   0.6589 | Val Acc: 0.6921
  ↳ Saved new best model!


Epoch 3/10: 100%|██████████| 95/95 [00:20<00:00,  4.71it/s, loss=0.641]



Epoch 3:
  Train Loss: 0.6558 | Train F1: 0.6830
  Val Loss:   0.6496 | Val F1:   0.6821 | Val Acc: 0.7105
  ↳ Saved new best model!


Epoch 4/10: 100%|██████████| 95/95 [00:20<00:00,  4.53it/s, loss=0.591]



Epoch 4:
  Train Loss: 0.6538 | Train F1: 0.6824
  Val Loss:   0.6426 | Val F1:   0.6825 | Val Acc: 0.7184
  ↳ Saved new best model!


Epoch 5/10: 100%|██████████| 95/95 [00:22<00:00,  4.29it/s, loss=0.655]



Epoch 5:
  Train Loss: 0.6473 | Train F1: 0.6901
  Val Loss:   0.6348 | Val F1:   0.7021 | Val Acc: 0.7342
  ↳ Saved new best model!


Epoch 6/10: 100%|██████████| 95/95 [00:21<00:00,  4.46it/s, loss=0.585]



Epoch 6:
  Train Loss: 0.6393 | Train F1: 0.7099
  Val Loss:   0.6272 | Val F1:   0.7122 | Val Acc: 0.7447
  ↳ Saved new best model!


Epoch 7/10: 100%|██████████| 95/95 [00:21<00:00,  4.47it/s, loss=0.628]



Epoch 7:
  Train Loss: 0.6311 | Train F1: 0.7358
  Val Loss:   0.6199 | Val F1:   0.7104 | Val Acc: 0.7447


Epoch 8/10: 100%|██████████| 95/95 [00:21<00:00,  4.39it/s, loss=0.658]



Epoch 8:
  Train Loss: 0.6262 | Train F1: 0.7404
  Val Loss:   0.6114 | Val F1:   0.7356 | Val Acc: 0.7579
  ↳ Saved new best model!


Epoch 9/10: 100%|██████████| 95/95 [00:21<00:00,  4.45it/s, loss=0.588]



Epoch 9:
  Train Loss: 0.6197 | Train F1: 0.7368
  Val Loss:   0.6046 | Val F1:   0.7378 | Val Acc: 0.7605
  ↳ Saved new best model!


Epoch 10/10: 100%|██████████| 95/95 [00:21<00:00,  4.45it/s, loss=0.59]



Epoch 10:
  Train Loss: 0.6174 | Train F1: 0.7463
  Val Loss:   0.6000 | Val F1:   0.7108 | Val Acc: 0.7474


In [None]:
# Training hyperparameters.
EPOCHS = 3
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

# Train and validate. This cell needs `train_loader` / `val_loader` from the
# data-loading cell to exist in the current kernel session.
# `device` and `epochs` are passed by keyword: train_model's positional order
# is (..., criterion, device, epochs), so the earlier positional call bound
# EPOCHS to `device` and `device` to `epochs`.
train_model(model, train_loader, val_loader, optimizer, loss_fn, device=device, epochs=EPOCHS)


NameError: name 'val_loader' is not defined