### Defect prediction using Random Forest

In [2]:
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load the dataset
dataset = load_dataset("petersa2/CodeNet", split="train")

# Prepare data: Get source code and correctness label
# Use the 'text' column for code and 'label' for the status
codes = [sample["text"] for sample in dataset]
labels = [sample["label"] for sample in dataset]

# Text vectorization
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(codes)
y = labels

# Split and train model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Evaluation
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.78      0.78       119
           1       0.48      0.48      0.48        50

    accuracy                           0.69       169
   macro avg       0.63      0.63      0.63       169
weighted avg       0.69      0.69      0.69       169



### Code Classification using LSTM(Long Short Term Memory)+PyTorch

In [18]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Step 1: Load dataset
dataset = load_dataset("petersa2/CodeNet", split="train")
codes = [sample["text"] for sample in dataset]
labels = [sample["label"] for sample in dataset]

# Step 2: Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Step 3: Tokenize code manually
def simple_tokenizer(code):
    return code.lower().split()

tokenized_codes = [simple_tokenizer(code) for code in codes]

# Step 4: Build vocabulary
from collections import Counter
vocab = Counter(token for code in tokenized_codes for token in code)
vocab = {token: idx+2 for idx, (token, _) in enumerate(vocab.most_common(10000))}
vocab["<PAD>"] = 0
vocab["<UNK>"] = 1

# Step 5: Convert tokens to indices
def encode(code, vocab, max_len=300):
    tokens = [vocab.get(token, vocab["<UNK>"]) for token in code]
    if len(tokens) < max_len:
        tokens += [vocab["<PAD>"]] * (max_len - len(tokens))
    else:
        tokens = tokens[:max_len]
    return tokens

X = np.array([encode(code, vocab) for code in tokenized_codes])
y = np.array(y)

# Step 6: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

# Step 8: Define LSTM model
class CodeClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(CodeClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        _, (hidden, _) = self.lstm(x)
        x = self.relu(self.fc1(hidden.squeeze(0)))
        return self.fc2(x)

# Step 9: Initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CodeClassifier(vocab_size=len(vocab), embed_dim=128, hidden_dim=64, output_dim=len(set(y)))
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Step 10: Train model
for epoch in range(5):
    model.train()
    total_loss = 0
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# Step 11: Evaluate
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch_x, batch_y in test_loader:
        batch_x = batch_x.to(device)
        outputs = model(batch_x)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(batch_y.numpy())

print("\n📊 Classification Report:")
print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_.astype(str)))

Epoch 1, Loss: 7.5401
Epoch 2, Loss: 7.1646
Epoch 3, Loss: 6.8648
Epoch 4, Loss: 6.6971
Epoch 5, Loss: 6.4959

📊 Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.97      0.82       119
           1       0.43      0.06      0.11        50

    accuracy                           0.70       169
   macro avg       0.57      0.51      0.46       169
weighted avg       0.63      0.70      0.61       169

