In [None]:
import pandas as pd 
import torch
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class BinaryClassifier(nn.Module):
    """Small feed-forward network that scores embedding vectors.

    The layers map ``embedding_dim -> 128 -> 64 -> 1``. ReLU follows each of
    the two hidden layers and a sigmoid is applied to the single output, so
    every returned value lies in the range [0, 1].
    """

    def __init__(self, embedding_dim):
        """Create the three linear layers.

        Args:
            embedding_dim: Size of the last dimension of the input tensors.
        """
        super().__init__()
        # Hidden layers.
        self.fc1 = nn.Linear(embedding_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        # Output layer: one value per sample for binary classification.
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        """Return the sigmoid-activated output of the network for ``x``.

        Args:
            x: Tensor whose last dimension has size ``embedding_dim``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 1.
        """
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        score = self.fc3(hidden)
        return torch.sigmoid(score)



In [None]:
from sklearn.metrics import accuracy_score
def train_model(model, train_loader, criterion, optimizer, num_epochs, verbose=False):
    """Train ``model`` in place for ``num_epochs`` passes over ``train_loader``.

    Args:
        model: Module called as ``model(embeddings)``. Its output is expected
            to hold one score per sample, optionally with a trailing
            dimension of size 1.
        train_loader: Iterable yielding ``(embeddings, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels.float())``.
        optimizer: Optimizer that updates the model's parameters.
        num_epochs: Number of passes over ``train_loader``.
        verbose: When true, print the mean batch loss on every 10th epoch
            (epochs 1, 11, 21, ...). Defaults to ``False``, i.e. silent.

    Returns:
        None. The model's parameters are updated as a side effect.
    """
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        num_batches = 0
        for embeddings, labels in train_loader:
            optimizer.zero_grad()
            # Drop only the trailing size-1 dimension. A bare squeeze() would
            # also collapse a batch of one sample to a 0-d tensor, which no
            # longer matches the shape of `labels` and makes the loss raise.
            outputs = model(embeddings).squeeze(-1)
            loss = criterion(outputs, labels.float())
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        # Batches are counted by hand so the average also works for loaders
        # without __len__ and never divides by zero on an empty loader.
        if verbose and epoch % 10 == 0 and num_batches:
            print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss / num_batches:.4f}")

# Testing function
def test_model(model, test_loader):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for embeddings, labels in test_loader:
            outputs = model(embeddings).squeeze()
            preds = (outputs > 0.5).float()  # Convert probabilities to binary predictions
            predictions.extend(preds.tolist())
            true_labels.extend(labels.tolist())
    
    accuracy = accuracy_score(true_labels, predictions)
    print(f"Test Accuracy: {accuracy * 100:.2f}%")
    return predictions, true_labels



In [None]:
def get_data_for_training(data,field):
    """Turn one embedding column and the ``label`` column into tensors.

    Args:
        data: DataFrame with a ``label`` column and a ``field`` column whose
            cells each hold one embedding vector (all of the same length).
        field: Name of the embedding column to read.

    Returns:
        ``(embeddings, labels)``, both placed on the module-level ``device``.
        ``embeddings`` is a float32 tensor with one row per DataFrame row;
        ``labels`` is a tensor whose dtype torch infers from the label values.
    """
    import numpy as np  # local import: numpy is not imported at notebook level

    # Read whole columns instead of iterating row by row, and stack the
    # per-row vectors into a single ndarray before handing them to torch:
    # building a tensor straight from a list of ndarrays is very slow.
    vectors = np.asarray(data[field].tolist())
    embeddings = torch.as_tensor(vectors, device=device).to(torch.float)
    labels = torch.tensor(data['label'].tolist(), device=device)
    print(embeddings.shape,labels.shape)
    return embeddings, labels

In [None]:
from sklearn.metrics import classification_report, recall_score, precision_score, f1_score, accuracy_score


def training_classify(train_df, test_df,filed, num_epochs=50, batch_size=32,
                      learning_rate=0.001, num_runs=5):
    """Train and evaluate a fresh ``BinaryClassifier`` several times and print the metrics.

    Each run builds a new model, trains it with ``BCELoss`` and Adam on the
    training frame, evaluates it on the test frame and records weighted
    precision, recall and F1 plus accuracy. After all runs the function prints
    every classification report, the per-run metric lists and a final
    tab-separated line with the mean accuracy, precision, recall and F1.

    Args:
        train_df: DataFrame passed to ``get_data_for_training`` for training.
        test_df: DataFrame passed to ``get_data_for_training`` for evaluation.
        filed: Name of the embedding column to use as model input.
        num_epochs: Training epochs per run.
        batch_size: Batch size of both data loaders.
        learning_rate: Learning rate given to Adam.
        num_runs: Number of independent train/evaluate repetitions.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1 (nothing to average).
    """
    if num_runs < 1:
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")

    X_train, y_train = get_data_for_training(train_df,filed)
    X_test, y_test = get_data_for_training(test_df,filed)
    embedding_dim = X_train.shape[-1]

    # Data loaders (only the training data is shuffled)
    train_data = torch.utils.data.TensorDataset(X_train, y_train)
    test_data = torch.utils.data.TensorDataset(X_test, y_test)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)

    results = list()
    precs = list()
    recalls = list()
    f1s = list()
    accs = list()
    for _ in range(num_runs):
        # Start every run from a newly initialised model and optimizer.
        model = BinaryClassifier(embedding_dim)
        model = model.to(device)
        criterion = nn.BCELoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        # Training the model
        train_model(model, train_loader, criterion, optimizer, num_epochs)

        # Testing the model
        ps,refs = test_model(model, test_loader)
        recalls.append(recall_score(refs, ps, average='weighted'))
        precs.append(precision_score(refs, ps, average='weighted'))
        f1s.append(f1_score(refs, ps, average='weighted'))
        accs.append(accuracy_score(refs, ps))
        results.append(classification_report(refs, ps))
    for el in results:
        print(el)

    print('recall',recalls)
    print('precision',precs)
    print('f1',f1s)
    print('acc',accs)
    # Means over all runs, in the order: accuracy, precision, recall, F1.
    print(f'{sum(accs)/num_runs}\t{sum(precs)/num_runs}\t{sum(recalls)/num_runs}\t{sum(f1s)/num_runs}')


In [None]:
# Sub-directories of the dataset to process, one train/test pair each.
languages = [
    'cpp', 'cs', 'java', 'js', 'php', 'python', 'sh', 'ts', 'cross_task',
]
# Embedding columns tried as classifier input, one at a time.
fileds = [
    'first_token_embedding',
    'last_token_embedding',
    'first_token_code_embedding',
    'last_token_code_embedding',
]
for lang in languages:
    # Load each split once per sub-directory and reuse it for every column.
    test_df = pd.read_parquet(f'/kaggle/input/data-for-classify-ds67/data_for_classify_ds67/{lang}/test.parquet')
    train_df = pd.read_parquet(f'/kaggle/input/data-for-classify-ds67/data_for_classify_ds67/{lang}/train.parquet')
    for filed in fileds:
        print(f'processing: {lang} with {filed}')
        training_classify(train_df,test_df,filed)