In [5]:
# 1. Imports
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import re
from tqdm.auto import tqdm
from collections import Counter
import os

# 2. Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 3. Load Data
train_file = "/kaggle/input/c-ours/C_Ours/data_C_Ours_train.csv"
test_file = "/kaggle/input/c-ours/C_Ours/data_C_Ours_test.csv"

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

# 4. Clean Code
def clean_code(code):
    code = re.sub(r'//.*?(\n|$)', ' ', code)  # Single-line comments
    code = re.sub(r'/\*.*?\*/', ' ', code, flags=re.DOTALL)  # Multi-line comments
    code = re.sub(r'\s+', ' ', code.strip())  # Normalize whitespace
    return code

train_df['code'] = train_df['code'].apply(clean_code)
test_df['code'] = test_df['code'].apply(clean_code)

# 5. Split data for validation
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42, stratify=train_df['target'])

# 6. Character-level tokenizer for LSTM
def build_vocab(codes):
    vocab = set()
    for code in codes:
        vocab.update(list(code))
    vocab = sorted(list(vocab))
    vocab_dict = {char: idx+1 for idx, char in enumerate(vocab)}  # 0 is reserved for padding
    vocab_dict['<PAD>'] = 0
    return vocab_dict

vocab = build_vocab(train_df['code'])

def encode_code(code, vocab, max_len=512):
    encoded = [vocab.get(c, 0) for c in code]
    if len(encoded) > max_len:
        encoded = encoded[:max_len]
    else:
        encoded += [0] * (max_len - len(encoded))
    return encoded

# 7. Word-level tokenizer for transformer approach
def get_word_vocab(codes, max_size=10000):
    """Build a word-level vocabulary from code samples"""
    # Tokenize code into words (considering code tokens)
    all_words = []
    for code in codes:
        # Simple tokenization: split by spaces and separate symbols
        tokens = re.findall(r'\w+|[^\w\s]', code)
        all_words.extend(tokens)
    
    # Count word frequencies
    word_counts = Counter(all_words)
    
    # Create vocabulary dict with most common words
    common_words = word_counts.most_common(max_size-4)  # Leave room for special tokens
    word_vocab = {
        '<PAD>': 0,
        '<UNK>': 1,
        '<SOS>': 2,
        '<EOS>': 3,
    }
    for i, (word, _) in enumerate(common_words):
        word_vocab[word] = i + 4
    
    return word_vocab

def tokenize_code(code, word_vocab, max_len=256):
    """Tokenize code into words and convert to indices"""
    tokens = re.findall(r'\w+|[^\w\s]', code)
    # Convert tokens to indices
    indices = [word_vocab.get(token, word_vocab['<UNK>']) for token in tokens]
    # Truncate or pad to max_len
    if len(indices) > max_len - 2:  # Account for SOS and EOS
        indices = indices[:max_len-2]
    
    # Add SOS and EOS tokens
    indices = [word_vocab['<SOS>']] + indices + [word_vocab['<EOS>']]
    # Pad to max_len
    padding_length = max_len - len(indices)
    if padding_length > 0:
        indices += [word_vocab['<PAD>']] * padding_length
    
    return indices

# Create word vocabulary
word_vocab = get_word_vocab(train_df['code'])
print(f"Word vocabulary size: {len(word_vocab)}")

# 8. Dataset
class CodeDataset(Dataset):
    def __init__(self, dataframe, char_vocab, word_vocab, max_char_len=512, max_word_len=256):
        self.codes = dataframe['code'].tolist()
        self.labels = dataframe['target'].tolist()
        self.char_vocab = char_vocab
        self.word_vocab = word_vocab
        self.max_char_len = max_char_len
        self.max_word_len = max_word_len

    def __len__(self):
        return len(self.codes)

    def __getitem__(self, idx):
        code = self.codes[idx]
        label = self.labels[idx]
        
        # Character encoding for LSTM
        char_encoded = encode_code(code, self.char_vocab, self.max_char_len)
        
        # Word encoding for transformer
        word_encoded = tokenize_code(code, self.word_vocab, self.max_word_len)
        
        # Create attention mask (1 for tokens, 0 for padding)
        attention_mask = [1 if token != self.word_vocab['<PAD>'] else 0 for token in word_encoded]
        
        return {
            'char_encoded': torch.tensor(char_encoded, dtype=torch.long),
            'word_encoded': torch.tensor(word_encoded, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.float)
        }

# 9. Dataloaders
train_dataset = CodeDataset(train_df, vocab, word_vocab)
val_dataset = CodeDataset(val_df, vocab, word_vocab)
test_dataset = CodeDataset(test_df, vocab, word_vocab)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# 10. Transformer Encoder Layer
class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=True)
        
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        
        self.activation = nn.GELU()
        
    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        # Self-attention block
        src2, _ = self.self_attn(src, src, src, 
                                key_padding_mask=src_key_padding_mask,
                                attn_mask=src_mask)
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        
        # Feedforward block
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        
        return src

# 11. Hybrid LSTM+Transformer Model
class HybridCodeClassifier(nn.Module):
    def __init__(self, char_vocab_size, word_vocab_size, 
                 char_embedding_dim=64, word_embedding_dim=128,
                 lstm_hidden_dim=128, transformer_dim=256, nhead=4, 
                 num_transformer_layers=2, dropout=0.3):
        super(HybridCodeClassifier, self).__init__()
        
        # LSTM part
        self.char_embedding = nn.Embedding(char_vocab_size, char_embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(char_embedding_dim, lstm_hidden_dim, num_layers=2, 
                           batch_first=True, dropout=dropout, bidirectional=True)
        self.lstm_dropout = nn.Dropout(dropout)
        
        # Transformer part
        self.word_embedding = nn.Embedding(word_vocab_size, word_embedding_dim, padding_idx=0)
        self.pos_encoder = nn.Embedding(512, word_embedding_dim)  # Position encoding
        self.input_proj = nn.Linear(word_embedding_dim, transformer_dim)
        
        # Transformer layers
        self.transformer_layers = nn.ModuleList([
            TransformerEncoderLayer(transformer_dim, nhead, dim_feedforward=transformer_dim*4, dropout=dropout)
            for _ in range(num_transformer_layers)
        ])
        self.transformer_dropout = nn.Dropout(dropout)
        
        # For feature fusion
        lstm_output_dim = lstm_hidden_dim * 2  # bidirectional
        self.transformer_pooler = nn.Linear(transformer_dim, transformer_dim)
        self.transformer_pooler_activation = nn.Tanh()
        
        # Fully connected layers
        self.fc1 = nn.Linear(transformer_dim + lstm_output_dim, 256)
        self.fc2 = nn.Linear(256, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, char_input, word_input, attention_mask):
        batch_size = char_input.size(0)
        
        # LSTM forward pass
        char_embedded = self.char_embedding(char_input)
        lstm_out, (hidden, _) = self.lstm(char_embedded)
        # Concatenate the final forward and backward hidden states
        lstm_feature = torch.cat((hidden[-2], hidden[-1]), dim=1)
        lstm_feature = self.lstm_dropout(lstm_feature)
        
        # Transformer forward pass
        word_embedded = self.word_embedding(word_input)
        
        # Add positional encoding
        positions = torch.arange(0, word_input.size(1), device=device).unsqueeze(0).expand(batch_size, -1)
        pos_encoded = self.pos_encoder(positions)
        
        # Combine word embeddings with position encodings
        word_embedded = word_embedded + pos_encoded
        
        # Project to transformer dimension
        transformer_input = self.input_proj(word_embedded)
        
        # Create padding mask (1 means padding position)
        padding_mask = (attention_mask == 0)
        
        # Apply transformer layers
        transformer_output = transformer_input
        for layer in self.transformer_layers:
            transformer_output = layer(transformer_output, src_key_padding_mask=padding_mask)
        
        # Pool transformer output (use [CLS] token or average)
        # Using first token as [CLS] equivalent
        transformer_pooled = self.transformer_pooler_activation(
            self.transformer_pooler(transformer_output[:, 0])
        )
        transformer_feature = self.transformer_dropout(transformer_pooled)
        
        # Concatenate features from both models
        combined_features = torch.cat((lstm_feature, transformer_feature), dim=1)
        
        # Final classification
        x = self.relu(self.fc1(combined_features))
        x = self.dropout(x)
        x = self.fc2(x)
        
        return x

# 12. Initialize model
model = HybridCodeClassifier(
    char_vocab_size=len(vocab),
    word_vocab_size=len(word_vocab)
).to(device)

# 13. Loss and Optimizer
# Use weighted loss to handle class imbalance
pos_weight = torch.tensor(5.0).to(device)  # Adjust weight based on class distribution
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

# Different learning rates for different components
params = [
    {'params': model.char_embedding.parameters(), 'lr': 1e-3},
    {'params': model.lstm.parameters(), 'lr': 1e-3},
    {'params': model.word_embedding.parameters(), 'lr': 1e-3},
    {'params': model.pos_encoder.parameters(), 'lr': 1e-3},
    {'params': model.transformer_layers.parameters(), 'lr': 5e-4},
    {'params': model.fc1.parameters(), 'lr': 1e-3},
    {'params': model.fc2.parameters(), 'lr': 1e-3}
]
optimizer = torch.optim.AdamW(params, lr=1e-3, weight_decay=0.01)

# Learning rate scheduler
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

# 14. Training Function
def train(model, loader, optimizer, criterion, scheduler):
    model.train()
    running_loss = 0.0
    
    for batch in tqdm(loader):
        char_encoded = batch['char_encoded'].to(device)
        word_encoded = batch['word_encoded'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        
        outputs = model(char_encoded, word_encoded, attention_mask).squeeze(1)
        loss = criterion(outputs, labels)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        
        running_loss += loss.item()
    
    scheduler.step()
    return running_loss / len(loader)

# 15. Evaluation Function
def evaluate(model, loader):
    model.eval()
    preds = []
    true = []
    
    with torch.no_grad():
        for batch in loader:
            char_encoded = batch['char_encoded'].to(device)
            word_encoded = batch['word_encoded'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(char_encoded, word_encoded, attention_mask).squeeze(1)
            preds.extend(torch.sigmoid(outputs).cpu().numpy())
            true.extend(labels.cpu().numpy())
    
    preds = np.array(preds) >= 0.5
    true = np.array(true)
    
    accuracy = accuracy_score(true, preds)
    report = classification_report(true, preds, digits=4)
    cm = confusion_matrix(true, preds)
    
    return accuracy, report, cm

# 16. Training Loop
epochs = 10
best_val_acc = 0
patience = 3
counter = 0
best_model_state = None

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    
    # Training
    train_loss = train(model, train_loader, optimizer, criterion, scheduler)
    print(f"Train Loss: {train_loss:.4f}")
    
    # Validation
    val_acc, val_report, val_cm = evaluate(model, val_loader)
    print(f"Validation Accuracy: {val_acc:.4f}")
    print(val_report)
    print(f"Validation Confusion Matrix:\n{val_cm}")
    
    # Save best model
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        counter = 0
        best_model_state = model.state_dict().copy()
        print(f"Model saved with validation accuracy: {val_acc:.4f}")
    else:
        counter += 1
        if counter >= patience:
            print(f"Early stopping triggered after {epoch+1} epochs")
            break

# 17. Load best model and evaluate on test set
if best_model_state:
    model.load_state_dict(best_model_state)
    print("Successfully loaded the best model")

test_acc, test_report, test_cm = evaluate(model, test_loader)
print(f"Test Accuracy: {test_acc:.4f}")
print(test_report)
print(f"Test Confusion Matrix:\n{test_cm}")

# 18. Ensemble predictions with confidence threshold tuning
def predict_with_confidence(model, loader, threshold=0.5):
    model.eval()
    predictions = []
    confidences = []
    true_labels = []
    
    with torch.no_grad():
        for batch in loader:
            char_encoded = batch['char_encoded'].to(device)
            word_encoded = batch['word_encoded'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(char_encoded, word_encoded, attention_mask).squeeze(1)
            probs = torch.sigmoid(outputs).cpu().numpy()
            
            predictions.extend(probs >= threshold)
            confidences.extend(probs)
            true_labels.extend(labels.cpu().numpy())
    
    return np.array(predictions), np.array(confidences), np.array(true_labels)

# Find optimal threshold
thresholds = np.arange(0.3, 0.7, 0.05)
best_threshold = 0.5
best_acc = 0

for threshold in thresholds:
    preds, _, true = predict_with_confidence(model, val_loader, threshold)
    acc = accuracy_score(true, preds)
    if acc > best_acc:
        best_acc = acc
        best_threshold = threshold
        
print(f"Optimal threshold: {best_threshold:.2f} with validation accuracy: {best_acc:.4f}")

# Final evaluation with optimal threshold
final_preds, confidences, true_labels = predict_with_confidence(model, test_loader, best_threshold)
final_acc = accuracy_score(true_labels, final_preds)
final_report = classification_report(true_labels, final_preds, digits=4)
final_cm = confusion_matrix(true_labels, final_preds)

print(f"Final Test Accuracy: {final_acc:.4f}")
print(final_report)
print(f"Final Confusion Matrix:\n{final_cm}")

Using device: cuda
Word vocabulary size: 10000
Epoch 1/10


  0%|          | 0/609 [00:00<?, ?it/s]

Train Loss: 1.0935
Validation Accuracy: 0.7248
              precision    recall  f1-score   support

         0.0     0.9178    0.4945    0.6427       542
         1.0     0.6536    0.9556    0.7763       541

    accuracy                         0.7248      1083
   macro avg     0.7857    0.7251    0.7095      1083
weighted avg     0.7858    0.7248    0.7094      1083

Validation Confusion Matrix:
[[268 274]
 [ 24 517]]
Model saved with validation accuracy: 0.7248
Epoch 2/10


  0%|          | 0/609 [00:00<?, ?it/s]

Train Loss: 0.7996
Validation Accuracy: 0.8680
              precision    recall  f1-score   support

         0.0     0.8771    0.8561    0.8665       542
         1.0     0.8592    0.8799    0.8694       541

    accuracy                         0.8680      1083
   macro avg     0.8682    0.8680    0.8679      1083
weighted avg     0.8682    0.8680    0.8679      1083

Validation Confusion Matrix:
[[464  78]
 [ 65 476]]
Model saved with validation accuracy: 0.8680
Epoch 3/10


  0%|          | 0/609 [00:00<?, ?it/s]

Train Loss: 0.6060
Validation Accuracy: 0.8190
              precision    recall  f1-score   support

         0.0     0.9529    0.6716    0.7879       542
         1.0     0.7461    0.9667    0.8422       541

    accuracy                         0.8190      1083
   macro avg     0.8495    0.8192    0.8150      1083
weighted avg     0.8496    0.8190    0.8150      1083

Validation Confusion Matrix:
[[364 178]
 [ 18 523]]
Epoch 4/10


  0%|          | 0/609 [00:00<?, ?it/s]

Train Loss: 0.5149
Validation Accuracy: 0.8984
              precision    recall  f1-score   support

         0.0     0.8956    0.9022    0.8989       542
         1.0     0.9013    0.8946    0.8980       541

    accuracy                         0.8984      1083
   macro avg     0.8985    0.8984    0.8984      1083
weighted avg     0.8985    0.8984    0.8984      1083

Validation Confusion Matrix:
[[489  53]
 [ 57 484]]
Model saved with validation accuracy: 0.8984
Epoch 5/10


  0%|          | 0/609 [00:00<?, ?it/s]

Train Loss: 0.4071
Validation Accuracy: 0.8957
              precision    recall  f1-score   support

         0.0     0.9459    0.8395    0.8895       542
         1.0     0.8555    0.9519    0.9011       541

    accuracy                         0.8957      1083
   macro avg     0.9007    0.8957    0.8953      1083
weighted avg     0.9008    0.8957    0.8953      1083

Validation Confusion Matrix:
[[455  87]
 [ 26 515]]
Epoch 6/10


  0%|          | 0/609 [00:00<?, ?it/s]

Train Loss: 0.3466
Validation Accuracy: 0.9077
              precision    recall  f1-score   support

         0.0     0.9492    0.8616    0.9033       542
         1.0     0.8731    0.9538    0.9117       541

    accuracy                         0.9077      1083
   macro avg     0.9111    0.9077    0.9075      1083
weighted avg     0.9112    0.9077    0.9075      1083

Validation Confusion Matrix:
[[467  75]
 [ 25 516]]
Model saved with validation accuracy: 0.9077
Epoch 7/10


  0%|          | 0/609 [00:00<?, ?it/s]

Train Loss: 0.2637
Validation Accuracy: 0.8975
              precision    recall  f1-score   support

         0.0     0.9615    0.8284    0.8900       542
         1.0     0.8490    0.9667    0.9041       541

    accuracy                         0.8975      1083
   macro avg     0.9052    0.8976    0.8970      1083
weighted avg     0.9053    0.8975    0.8970      1083

Validation Confusion Matrix:
[[449  93]
 [ 18 523]]
Epoch 8/10


  0%|          | 0/609 [00:00<?, ?it/s]

Train Loss: 0.1801
Validation Accuracy: 0.9086
              precision    recall  f1-score   support

         0.0     0.9511    0.8616    0.9042       542
         1.0     0.8733    0.9556    0.9126       541

    accuracy                         0.9086      1083
   macro avg     0.9122    0.9086    0.9084      1083
weighted avg     0.9123    0.9086    0.9084      1083

Validation Confusion Matrix:
[[467  75]
 [ 24 517]]
Model saved with validation accuracy: 0.9086
Epoch 9/10


  0%|          | 0/609 [00:00<?, ?it/s]

Train Loss: 0.1411
Validation Accuracy: 0.9151
              precision    recall  f1-score   support

         0.0     0.9377    0.8893    0.9129       542
         1.0     0.8946    0.9409    0.9171       541

    accuracy                         0.9151      1083
   macro avg     0.9161    0.9151    0.9150      1083
weighted avg     0.9162    0.9151    0.9150      1083

Validation Confusion Matrix:
[[482  60]
 [ 32 509]]
Model saved with validation accuracy: 0.9151
Epoch 10/10


  0%|          | 0/609 [00:00<?, ?it/s]

Train Loss: 0.1144
Validation Accuracy: 0.9178
              precision    recall  f1-score   support

         0.0     0.9381    0.8948    0.9160       542
         1.0     0.8993    0.9409    0.9196       541

    accuracy                         0.9178      1083
   macro avg     0.9187    0.9178    0.9178      1083
weighted avg     0.9187    0.9178    0.9178      1083

Validation Confusion Matrix:
[[485  57]
 [ 32 509]]
Model saved with validation accuracy: 0.9178
Successfully loaded the best model
Test Accuracy: 0.9087
              precision    recall  f1-score   support

         0.0     0.9183    0.8973    0.9077      1353
         1.0     0.8996    0.9202    0.9098      1353

    accuracy                         0.9087      2706
   macro avg     0.9089    0.9087    0.9087      2706
weighted avg     0.9089    0.9087    0.9087      2706

Test Confusion Matrix:
[[1214  139]
 [ 108 1245]]
Optimal threshold: 0.65 with validation accuracy: 0.9215
Final Test Accuracy: 0.9087
          