In [1]:
# 1. Imports
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from transformers import RobertaModel, RobertaTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW
import re
from tqdm.auto import tqdm
import random
import os

# Set seeds for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True

set_seed(42)

# 2. Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 3. Load Data
train_file = "/kaggle/input/c-ours/C_Ours/data_C_Ours_train.csv"
test_file = "/kaggle/input/c-ours/C_Ours/data_C_Ours_test.csv"

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

# 4. Clean Code
def clean_code(code):
    code = re.sub(r'//.*?(\n|$)', ' ', code)  # Single-line comments
    code = re.sub(r'/\*.*?\*/', ' ', code, flags=re.DOTALL)  # Multi-line comments
    code = re.sub(r'\s+', ' ', code.strip())  # Normalize whitespace
    return code

train_df['code'] = train_df['code'].apply(clean_code)
test_df['code'] = test_df['code'].apply(clean_code)

# 5. CodeBERT Tokenizer - CodeBERT is based on RoBERTa architecture
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')
max_len = 512  # Maximum length for input

# 6. Dataset with BERT tokenization 
class CodeDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.codes = dataframe['code'].tolist()
        self.labels = dataframe['target'].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.codes)

    def __getitem__(self, idx):
        code = self.codes[idx]
        label = self.labels[idx]
        
        # Tokenize the code with RoBERTa/CodeBERT tokenizer
        encoding = self.tokenizer.encode_plus(
            code,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.float)
        }

# 7. Create Dataset and DataLoader
train_dataset = CodeDataset(train_df, tokenizer, max_len)
test_dataset = CodeDataset(test_df, tokenizer, max_len)

batch_size = 16  # Reduced batch size due to larger model

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# 8. LSTM+CodeBERT Model
class BERT_LSTM_Classifier(nn.Module):
    def __init__(self, model_name='microsoft/codebert-base', hidden_dim=256, num_layers=2, dropout=0.3):
        super(BERT_LSTM_Classifier, self).__init__()
        self.bert = RobertaModel.from_pretrained(model_name)
        self.bert_dim = self.bert.config.hidden_size
        
        # Bidirectional LSTM on top of BERT
        self.lstm = nn.LSTM(
            input_size=self.bert_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if num_layers > 1 else 0
        )
        
        # MLP head for classification
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)  # * 2 for bidirectional
        self.activation = nn.ReLU()
        self.layernorm = nn.LayerNorm(hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)
    
    def forward(self, input_ids, attention_mask):
        # Get RoBERTa/CodeBERT embeddings
        bert_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        
        # Get sequence output from BERT
        sequence_output = bert_output.last_hidden_state  # [batch_size, seq_len, bert_dim]
        
        # Pass through LSTM
        lstm_output, (hidden, _) = self.lstm(sequence_output)
        
        # Concatenate the final hidden states from both directions
        # hidden shape: [num_layers * num_directions, batch_size, hidden_dim]
        # We want the last layer's hidden state from both directions
        hidden_fwd = hidden[-2, :, :]  # Forward direction from last layer
        hidden_bwd = hidden[-1, :, :]  # Backward direction from last layer
        hidden_cat = torch.cat((hidden_fwd, hidden_bwd), dim=1)  # [batch_size, hidden_dim*2]
        
        # MLP classifier
        out = self.dropout(hidden_cat)
        out = self.fc1(out)
        out = self.activation(out)
        out = self.layernorm(out)
        out = self.dropout(out)
        out = self.fc2(out)
        
        return out

# Initialize model
model = BERT_LSTM_Classifier().to(device)

# 9. Loss and Optimizer
# Use class weights to handle imbalanced data
def calculate_class_weights(labels):
    class_counts = np.bincount(labels.astype(int))
    total = len(labels)
    weights = total / (len(class_counts) * class_counts)
    return torch.tensor(weights, dtype=torch.float)

class_weights = calculate_class_weights(train_df['target'].values)
print(f"Class weights: {class_weights}")

criterion = nn.BCEWithLogitsLoss(pos_weight=class_weights[1].to(device))

# Different learning rates for BERT and other layers
bert_params = list(model.bert.parameters())
other_params = list(model.lstm.parameters()) + list(model.fc1.parameters()) + list(model.fc2.parameters())

optimizer = AdamW([
    {'params': bert_params, 'lr': 2e-5},  # Lower learning rate for BERT parameters
    {'params': other_params, 'lr': 1e-4}   # Higher learning rate for other parameters
])

# Learning rate scheduler
total_steps = len(train_loader) * 10  # epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0.1 * total_steps,  # 10% of total steps for warmup
    num_training_steps=total_steps
)

# 10. Training Function
def train(model, loader, optimizer, criterion, scheduler):
    model.train()
    running_loss = 0.0
    all_preds = []
    all_labels = []
    
    for batch in tqdm(loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask).squeeze(1)
        loss = criterion(outputs, labels)
        loss.backward()
        
        # Gradient clipping to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        optimizer.step()
        scheduler.step()
        
        running_loss += loss.item()
        
        # Save predictions and labels for epoch-level metrics
        preds = torch.sigmoid(outputs).detach().cpu().numpy() >= 0.5
        all_preds.extend(preds.astype(int))
        all_labels.extend(labels.cpu().numpy().astype(int))
    
    epoch_acc = accuracy_score(all_labels, all_preds)
    epoch_report = classification_report(all_labels, all_preds, digits=4)
    
    return running_loss / len(loader), epoch_acc, epoch_report

# 11. Evaluation Function
def evaluate(model, loader):
    model.eval()
    all_preds = []
    all_labels = []
    running_loss = 0.0
    
    with torch.no_grad():
        for batch in tqdm(loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(input_ids, attention_mask).squeeze(1)
            loss = criterion(outputs, labels)
            running_loss += loss.item()
            
            preds = torch.sigmoid(outputs).cpu().numpy() >= 0.5
            all_preds.extend(preds.astype(int))
            all_labels.extend(labels.cpu().numpy().astype(int))
    
    acc = accuracy_score(all_labels, all_preds)
    report = classification_report(all_labels, all_preds, digits=4)
    cm = confusion_matrix(all_labels, all_preds)
    
    return running_loss / len(loader), acc, report, cm

# 12. Training Loop with Early Stopping
epochs = 7
best_acc = 0
patience = 3
no_improve_epochs = 0

# Create directory for saving models
os.makedirs('models', exist_ok=True)

print("Starting training...")
for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    
    train_loss, train_acc, train_report = train(model, train_loader, optimizer, criterion, scheduler)
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")
    print(f"Train Classification Report:\n{train_report}")
    
    val_loss, val_acc, val_report, val_cm = evaluate(model, test_loader)
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")
    print(f"Validation Classification Report:\n{val_report}")
    print(f"Confusion Matrix:\n{val_cm}")
    
    # Save the best model
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), 'models/best_model.pt')
        print(f"New best model saved with accuracy: {best_acc:.4f}")
        no_improve_epochs = 0
    else:
        no_improve_epochs += 1
        print(f"No improvement for {no_improve_epochs} epochs")
    
    # Early stopping
    if no_improve_epochs >= patience:
        print(f"Early stopping triggered after {epoch+1} epochs")
        break

print(f"Training completed. Best validation accuracy: {best_acc:.4f}")

# 13. Load the best model and evaluate on test set
model.load_state_dict(torch.load('models/best_model.pt'))
test_loss, test_acc, test_report, test_cm = evaluate(model, test_loader)
print(f"\nFinal Test Results:")
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}")
print(f"Test Classification Report:\n{test_report}")
print(f"Test Confusion Matrix:\n{test_cm}")

# 14. Feature Importance Analysis (Optional)
def get_attention_visualization(model, dataset, sample_idx=0):
    """
    Get attention scores for a specific code sample to see what the model focuses on
    """
    model.eval()
    sample = dataset[sample_idx]
    
    input_ids = sample['input_ids'].unsqueeze(0).to(device)
    attention_mask = sample['attention_mask'].unsqueeze(0).to(device)
    
    with torch.no_grad():
        outputs = model.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=True
        )
        
    # Get attention weights from the last layer
    attentions = outputs.attentions[-1].cpu().numpy()
    
    # Get the tokens for visualization
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu().numpy())
    
    # Return tokens and attention matrix
    return tokens, attentions

# Example of how to use the feature importance analysis
sample_idx = 0  # Choose a sample
tokens, attentions = get_attention_visualization(model, test_dataset, sample_idx)
print(f"Sample code has {len(tokens)} tokens")
print(f"First 20 tokens: {tokens[:20]}")
print(f"Attention matrix shape: {attentions.shape}")

2025-04-30 04:29:15.769770: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745987356.235325      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745987356.361969      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using device: cuda


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Class weights: tensor([1., 1.])
Starting training...

Epoch 1/7


  0%|          | 0/677 [00:00<?, ?it/s]

Train Loss: 0.5454, Train Accuracy: 0.7194
Train Classification Report:
              precision    recall  f1-score   support

           0     0.7087    0.7449    0.7264      5413
           1     0.7312    0.6939    0.7120      5413

    accuracy                         0.7194     10826
   macro avg     0.7200    0.7194    0.7192     10826
weighted avg     0.7200    0.7194    0.7192     10826



  0%|          | 0/170 [00:00<?, ?it/s]

Validation Loss: 0.4049, Validation Accuracy: 0.8356
Validation Classification Report:
              precision    recall  f1-score   support

           0     0.7671    0.9638    0.8542      1353
           1     0.9513    0.7073    0.8114      1353

    accuracy                         0.8356      2706
   macro avg     0.8592    0.8356    0.8328      2706
weighted avg     0.8592    0.8356    0.8328      2706

Confusion Matrix:
[[1304   49]
 [ 396  957]]
New best model saved with accuracy: 0.8356

Epoch 2/7


  0%|          | 0/677 [00:00<?, ?it/s]

Train Loss: 0.2861, Train Accuracy: 0.8979
Train Classification Report:
              precision    recall  f1-score   support

           0     0.8812    0.9198    0.9001      5413
           1     0.9162    0.8760    0.8956      5413

    accuracy                         0.8979     10826
   macro avg     0.8987    0.8979    0.8979     10826
weighted avg     0.8987    0.8979    0.8979     10826



  0%|          | 0/170 [00:00<?, ?it/s]

Validation Loss: 0.2695, Validation Accuracy: 0.9043
Validation Classification Report:
              precision    recall  f1-score   support

           0     0.9404    0.8633    0.9002      1353
           1     0.8736    0.9453    0.9081      1353

    accuracy                         0.9043      2706
   macro avg     0.9070    0.9043    0.9041      2706
weighted avg     0.9070    0.9043    0.9041      2706

Confusion Matrix:
[[1168  185]
 [  74 1279]]
New best model saved with accuracy: 0.9043

Epoch 3/7


  0%|          | 0/677 [00:00<?, ?it/s]

Train Loss: 0.1822, Train Accuracy: 0.9458
Train Classification Report:
              precision    recall  f1-score   support

           0     0.9332    0.9603    0.9466      5413
           1     0.9591    0.9313    0.9450      5413

    accuracy                         0.9458     10826
   macro avg     0.9462    0.9458    0.9458     10826
weighted avg     0.9462    0.9458    0.9458     10826



  0%|          | 0/170 [00:00<?, ?it/s]

Validation Loss: 0.2356, Validation Accuracy: 0.9276
Validation Classification Report:
              precision    recall  f1-score   support

           0     0.8849    0.9830    0.9314      1353
           1     0.9809    0.8721    0.9233      1353

    accuracy                         0.9276      2706
   macro avg     0.9329    0.9276    0.9273      2706
weighted avg     0.9329    0.9276    0.9273      2706

Confusion Matrix:
[[1330   23]
 [ 173 1180]]
New best model saved with accuracy: 0.9276

Epoch 4/7


  0%|          | 0/677 [00:00<?, ?it/s]

Train Loss: 0.1192, Train Accuracy: 0.9678
Train Classification Report:
              precision    recall  f1-score   support

           0     0.9575    0.9789    0.9681      5413
           1     0.9785    0.9566    0.9674      5413

    accuracy                         0.9678     10826
   macro avg     0.9680    0.9678    0.9678     10826
weighted avg     0.9680    0.9678    0.9678     10826



  0%|          | 0/170 [00:00<?, ?it/s]

Validation Loss: 0.2325, Validation Accuracy: 0.9464
Validation Classification Report:
              precision    recall  f1-score   support

           0     0.9370    0.9571    0.9470      1353
           1     0.9562    0.9357    0.9458      1353

    accuracy                         0.9464      2706
   macro avg     0.9466    0.9464    0.9464      2706
weighted avg     0.9466    0.9464    0.9464      2706

Confusion Matrix:
[[1295   58]
 [  87 1266]]
New best model saved with accuracy: 0.9464

Epoch 5/7


  0%|          | 0/677 [00:00<?, ?it/s]

Train Loss: 0.0796, Train Accuracy: 0.9800
Train Classification Report:
              precision    recall  f1-score   support

           0     0.9743    0.9861    0.9802      5413
           1     0.9860    0.9740    0.9799      5413

    accuracy                         0.9800     10826
   macro avg     0.9801    0.9800    0.9800     10826
weighted avg     0.9801    0.9800    0.9800     10826



  0%|          | 0/170 [00:00<?, ?it/s]

Validation Loss: 0.2188, Validation Accuracy: 0.9446
Validation Classification Report:
              precision    recall  f1-score   support

           0     0.9337    0.9571    0.9453      1353
           1     0.9560    0.9320    0.9439      1353

    accuracy                         0.9446      2706
   macro avg     0.9448    0.9446    0.9446      2706
weighted avg     0.9448    0.9446    0.9446      2706

Confusion Matrix:
[[1295   58]
 [  92 1261]]
No improvement for 1 epochs

Epoch 6/7


  0%|          | 0/677 [00:00<?, ?it/s]

Train Loss: 0.0591, Train Accuracy: 0.9859
Train Classification Report:
              precision    recall  f1-score   support

           0     0.9840    0.9878    0.9859      5413
           1     0.9878    0.9839    0.9858      5413

    accuracy                         0.9859     10826
   macro avg     0.9859    0.9859    0.9859     10826
weighted avg     0.9859    0.9859    0.9859     10826



  0%|          | 0/170 [00:00<?, ?it/s]

Validation Loss: 0.2432, Validation Accuracy: 0.9501
Validation Classification Report:
              precision    recall  f1-score   support

           0     0.9381    0.9638    0.9508      1353
           1     0.9628    0.9364    0.9494      1353

    accuracy                         0.9501      2706
   macro avg     0.9504    0.9501    0.9501      2706
weighted avg     0.9504    0.9501    0.9501      2706

Confusion Matrix:
[[1304   49]
 [  86 1267]]
New best model saved with accuracy: 0.9501

Epoch 7/7


  0%|          | 0/677 [00:00<?, ?it/s]

Train Loss: 0.0364, Train Accuracy: 0.9913
Train Classification Report:
              precision    recall  f1-score   support

           0     0.9902    0.9924    0.9913      5413
           1     0.9924    0.9902    0.9913      5413

    accuracy                         0.9913     10826
   macro avg     0.9913    0.9913    0.9913     10826
weighted avg     0.9913    0.9913    0.9913     10826



  0%|          | 0/170 [00:00<?, ?it/s]

Validation Loss: 0.3253, Validation Accuracy: 0.9464
Validation Classification Report:
              precision    recall  f1-score   support

           0     0.9597    0.9320    0.9456      1353
           1     0.9339    0.9608    0.9472      1353

    accuracy                         0.9464      2706
   macro avg     0.9468    0.9464    0.9464      2706
weighted avg     0.9468    0.9464    0.9464      2706

Confusion Matrix:
[[1261   92]
 [  53 1300]]
No improvement for 1 epochs
Training completed. Best validation accuracy: 0.9501


  model.load_state_dict(torch.load('models/best_model.pt'))


  0%|          | 0/170 [00:00<?, ?it/s]




Final Test Results:
Test Loss: 0.2432, Test Accuracy: 0.9501
Test Classification Report:
              precision    recall  f1-score   support

           0     0.9381    0.9638    0.9508      1353
           1     0.9628    0.9364    0.9494      1353

    accuracy                         0.9501      2706
   macro avg     0.9504    0.9501    0.9501      2706
weighted avg     0.9504    0.9501    0.9501      2706

Test Confusion Matrix:
[[1304   49]
 [  86 1267]]
Sample code has 512 tokens
First 20 tokens: ['<s>', 'SY', 'SC', 'ALL', '_', 'DE', 'FINE', '3', '(', 'os', 'f', '_', 'sys', 'info', ',', 'Ġint', ',', 'Ġcommand', ',', 'Ġchar']
Attention matrix shape: (1, 12, 512, 512)
