In [3]:
!pip install --no-deps /kaggle/input/bnb-pip/bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl

Processing /kaggle/input/bnb-pip/bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.47.0


In [4]:
import bitsandbytes as bnb
print("bitsandbytes", bnb.__version__)

bitsandbytes 0.47.0


In [5]:
import torch
torch.cuda.is_available()

True

# Prepare Training Data

In [1]:
import pandas as pd

train = pd.read_parquet('/kaggle/input/sem-eval-2026-task-13-subtask-a/Task_A/train.parquet')
# test = pd.read_parquet('/kaggle/input/sem-eval-2026-task-13-subtask-a/Task_A/test.parquet')

langs = train['language'].unique()
# gens = train['generator'].unique()

df_trains = []
fold = 0
max_len = 300
for lang in langs:
    fold += 1
    df = train[train['language'] == lang].reset_index(drop=True)
    data = []
    for _, row in df.iterrows():
        code = row['code'].strip()

        if len(code) > max_len:
            continue
            
        target = row['label']

        text = f"""<|im_start|>user
You are a professional code analyst. Your task is to decide if the following {lang} code snippet was written by an AI model. Respond only with "Yes" or "No".

Here is the code snippet to classify:
"{code}"

Answer "Yes" if it is AI-generated, otherwise "No".<|im_end|>
<|im_start|>assistant
<think>
</think>
Answer:"""

        data.append([code, lang, text, target])
            
    df_train = pd.DataFrame(data, columns=['code', 'lang', 'text', 'target'])
    df_trains.append(df_train)

df_train = pd.concat(df_trains, axis=0, ignore_index=True)
df_train.to_csv(f'train_full.csv', index=False)

In [2]:
pd.set_option('display.max_colwidth', None)
print(df_train['text'][1])

<|im_start|>user
You are a professional code analyst. Your task is to decide if the following Python code snippet was written by an AI model. Respond only with "Yes" or "No".

Here is the code snippet to classify:
"T = int(input())
for t in range(T):
	color = input().split()
	s1 = color[:2]
	s2 = color[2:4]
	s3 = color[4:6]
	bo = False
	for i in range(2):
		for j in range(2):
			for k in range(2):
				if s1[i] == s2[j] == s3[k]:
					bo = True
	if bo:
		print('YES')
	else:
		print('NO')"

Answer "Yes" if it is AI-generated, otherwise "No".<|im_end|>
<|im_start|>assistant
<think>
</think>
Answer:


# Deep Mutual Learning
Paper: https://arxiv.org/pdf/1706.00384

In [6]:
%%writefile train.py
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import time
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.amp import GradScaler

from transformers import AutoModel, AutoTokenizer, AutoConfig, get_cosine_schedule_with_warmup
from peft import get_peft_model, LoraConfig, TaskType
from transformers import BitsAndBytesConfig

os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# ----------------------
# Config
# ----------------------

model_path_a = '/kaggle/input/qwen2.5-coder/transformers/7b/1'
model_path_b = '/kaggle/input/qwen2.5-coder/transformers/3b/1'
model_path_c = '/kaggle/input/qwen2.5-coder/transformers/1.5b/1'
    
lambda_dml = 1           # weight for KL mutual distillation, follows https://arxiv.org/pdf/1706.00384
num_epochs = 3
batch_size = 2
gradient_accumulation_steps = 4
seed = 252

device_a = torch.device("cuda:0")
device_b = torch.device("cuda:1")
device_c = torch.device('cuda:1')

# ----------------------
# Tokenizers (separate for each model)
# ----------------------
tokenizer_a = AutoTokenizer.from_pretrained(model_path_a)
tokenizer_b = AutoTokenizer.from_pretrained(model_path_b)
tokenizer_c = AutoTokenizer.from_pretrained(model_path_c)

tokenizer_a.padding_side = 'left'
tokenizer_b.padding_side = 'left'
tokenizer_c.padding_side = 'left'

# ----------------------
# Utilities
# ----------------------
def set_seed(seed=318):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

class SemEvalDataset(Dataset):
    def __init__(self, prompts, targets):
        self.prompts = prompts
        self.targets = targets

    def __getitem__(self, idx):
        return self.prompts[idx], self.targets[idx]

    def __len__(self):
        return len(self.targets)

# ----------------------
# Model wrapper
# ----------------------
class Net(nn.Module):
    def __init__(self, model_path, device_index):
        super(Net, self).__init__()
        self.config = AutoConfig.from_pretrained(model_path)

        # 4-bit quantization config
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16
        )

        # Backbone loaded directly onto device_index
        self.backbone = AutoModel.from_pretrained(
            model_path,
            use_cache=False,
            torch_dtype=torch.float16,
            quantization_config=bnb_config,
            device_map=device_index
        )

        # LoRA for feature extraction
        peft_config = LoraConfig(
            task_type=TaskType.FEATURE_EXTRACTION,
            target_modules='all-linear',
            bias='none',
            inference_mode=False,
            r=8,
            lora_alpha=16,
            lora_dropout=0.
        )
        self.backbone = get_peft_model(self.backbone, peft_config)

        # simple classification head
        self.head = nn.Linear(self.config.hidden_size, 2, bias=False)

    def forward(self, encodings):
        out = self.backbone(**encodings).last_hidden_state  # (B, L, H)
        x = out[:, -1, :]  # use last token
        return self.head(x)  # (B, 2)

# ----------------------
# Optimizer helper
# ----------------------
def get_optimizer(model, learning_rate=2e-4, differential_lr=2e-4, weight_decay=0.01):
    no_decay = ['bias', 'LayerNorm.weight']
    differential_layers = ['backbone']

    optimizer_grouped_parameters = [
        {
            "params": [
                param for name, param in model.named_parameters()
                if (not any(layer in name for layer in differential_layers))
                and (not any(nd in name for nd in no_decay))
            ],
            "lr": learning_rate,
            "weight_decay": weight_decay,
        },
        {
            "params": [
                param for name, param in model.named_parameters()
                if (not any(layer in name for layer in differential_layers))
                and (any(nd in name for nd in no_decay))
            ],
            "lr": learning_rate,
            "weight_decay": 0,
        },
        {
            "params": [
                param for name, param in model.named_parameters()
                if (any(layer in name for layer in differential_layers))
                and (not any(nd in name for nd in no_decay))
            ],
            "lr": differential_lr,
            "weight_decay": weight_decay,
        },
        {
            "params": [
                param for name, param in model.named_parameters()
                if (any(layer in name for layer in differential_layers))
                and (any(nd in name for nd in no_decay))
            ],
            "lr": differential_lr,
            "weight_decay": 0,
        },
    ]
    return torch.optim.AdamW(optimizer_grouped_parameters, lr=learning_rate, weight_decay=weight_decay)

def kl_div_symmetric(logits_src, logits_t1, logits_t2, device_src):
    target = 0.5 * F.softmax(logits_t1.detach().to(device_src, non_blocking=True), dim=-1) + \
             0.5 * F.softmax(logits_t2.detach().to(device_src, non_blocking=True), dim=-1)
    return F.kl_div(
        F.log_softmax(logits_src, dim=-1),
        target,
        reduction='batchmean'
    )
    
# ----------------------
# Training (single process, two GPUs, full dataset, no validation)
# ----------------------
def train_singleprocess(num_epochs):
    # Load full training data
    train = pd.read_csv('/kaggle/working/train_full.csv')
    train = train.drop_duplicates(['text']).reset_index(drop=True)
    
    # if len(pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/test.csv')) < 100:
    #     train = train.head(10)

    train = train.head(100)
    
    train_prompts = train['text'].tolist()
    train_targets = train['target'].astype(int).tolist()

    print('Train size:', len(train_prompts))

    train_dataset = SemEvalDataset(train_prompts, train_targets)
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=True,
        pin_memory=True,
        drop_last=True
    )

    # Compute MAX_LEN from training prompts for each tokenizer (99th percentile) and pick max
    max_len_a = int(np.quantile([len(tokenizer_a(x).input_ids) for x in train_prompts], q=0.99))
    max_len_b = int(np.quantile([len(tokenizer_b(x).input_ids) for x in train_prompts], q=0.99))
    max_len_c = int(np.quantile([len(tokenizer_c(x).input_ids) for x in train_prompts], q=0.99))

    MAX_LEN = max(max_len_a, max_len_b, max_len_c)
    print('Max Len A/B/C', max_len_a, max_len_b, max_len_c, '=> use', MAX_LEN)

    set_seed(seed)

    # Instantiate two models on separate devices
    model_a = Net(model_path_a, 0)   # device_map=0 -> cuda:0
    model_b = Net(model_path_b, 1)   # device_map=1 -> cuda:1
    model_c = Net(model_path_c, 1)   # device_map=1 -> cuda:1

    # Move classification heads to devices explicitly
    model_a.head = model_a.head.to(device_a)
    model_b.head = model_b.head.to(device_b)
    model_c.head = model_c.head.to(device_c)

    optimizer_a = get_optimizer(model_a, learning_rate=2e-4, differential_lr=2e-4, weight_decay=0.01)
    optimizer_b = get_optimizer(model_b, learning_rate=2e-4, differential_lr=2e-4, weight_decay=0.01)
    optimizer_c = get_optimizer(model_c, learning_rate=2e-4, differential_lr=2e-4, weight_decay=0.01)

    # Training steps calculation
    num_update_steps_per_epoch = max(1, len(train_loader) // gradient_accumulation_steps)
    max_train_steps = num_update_steps_per_epoch * num_epochs

    scheduler_a = get_cosine_schedule_with_warmup(optimizer_a, num_warmup_steps=0, num_training_steps=max_train_steps)
    scheduler_b = get_cosine_schedule_with_warmup(optimizer_b, num_warmup_steps=0, num_training_steps=max_train_steps)
    scheduler_c = get_cosine_schedule_with_warmup(optimizer_c, num_warmup_steps=0, num_training_steps=max_train_steps)

    scaler = GradScaler() # single scaler is enought (https://discuss.pytorch.org/t/gradient-scaling-with-multiple-scalers/175976/2)


    for epoch in range(num_epochs):
        model_a.train()
        model_b.train()
        model_c.train()

        pbar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f'Epoch {epoch+1}')
        for step, (batch_prompts, batch_targets) in pbar:
            # Tokenize separately for each model
            enc_a = tokenizer_a(
                batch_prompts,
                return_tensors='pt',
                padding='longest',
                truncation=True,
                max_length=MAX_LEN
            )
            enc_b = tokenizer_b(
                batch_prompts,
                return_tensors='pt',
                padding='longest',
                truncation=True,
                max_length=MAX_LEN
            )

            enc_c = tokenizer_c(
                batch_prompts,
                return_tensors='pt',
                padding='longest',
                truncation=True,
                max_length=MAX_LEN
            )
            
            # Move to respective devices
            enc_a = {k: v.to(device_a, non_blocking=True) for k, v in enc_a.items()}
            enc_b = {k: v.to(device_b, non_blocking=True) for k, v in enc_b.items()}
            enc_c = {k: v.to(device_c, non_blocking=True) for k, v in enc_c.items()}

            # labels = torch.tensor(batch_targets, dtype=torch.long)
            labels = torch.as_tensor(batch_targets, dtype=torch.long)
            labels_a = labels.to(device_a, non_blocking=True)
            labels_b = labels.to(device_b, non_blocking=True)
            labels_c = labels.to(device_c, non_blocking=True)

            with torch.autocast(device_type='cuda', dtype=torch.float16):
                # Forward passes
                logits_a = model_a(enc_a)
                logits_b = model_b(enc_b)
                logits_c = model_c(enc_c)
                
                # Supervised cross-entropy losses
                ce_a = F.cross_entropy(logits_a, labels_a)
                ce_b = F.cross_entropy(logits_b, labels_b)
                ce_c = F.cross_entropy(logits_c, labels_c)


                kl_a_bc = kl_div_symmetric(logits_a, logits_b, logits_c, device_a)
                kl_b_ac = kl_div_symmetric(logits_b, logits_a, logits_c, device_b)
                kl_c_ab = kl_div_symmetric(logits_c, logits_a, logits_b, device_c)

                # Total losses per model
                loss_a = ce_a + lambda_dml * kl_a_bc
                loss_b = ce_b + lambda_dml * kl_b_ac
                loss_c = ce_c + lambda_dml * kl_c_ab

                # Scale for gradient accumulation
                loss_a = loss_a / gradient_accumulation_steps
                loss_b = loss_b / gradient_accumulation_steps
                loss_c = loss_c / gradient_accumulation_steps

            # Backward passes — one per model on its own device
            scaler.scale(loss_a).backward()
            scaler.scale(loss_b).backward()
            scaler.scale(loss_c).backward()

            # Optimizer step
            if (step + 1) % gradient_accumulation_steps == 0:
                # Unscale before clipping
                scaler.unscale_(optimizer_a)
                scaler.unscale_(optimizer_b)
                scaler.unscale_(optimizer_c)

                # Gradient clipping
                torch.nn.utils.clip_grad_norm_(model_a.parameters(), max_norm=10.0)
                torch.nn.utils.clip_grad_norm_(model_b.parameters(), max_norm=10.0)
                torch.nn.utils.clip_grad_norm_(model_c.parameters(), max_norm=10.0)

                # ---- Step for Model A ----
                scaler.step(optimizer_a)
                scheduler_a.step()
                optimizer_a.zero_grad(set_to_none=True)
            
                # ---- Step for Model B ----
                scaler.step(optimizer_b)
                scheduler_b.step()
                optimizer_b.zero_grad(set_to_none=True)

             # ---- Step for Model C ----
                scaler.step(optimizer_c)
                scheduler_c.step()
                optimizer_c.zero_grad(set_to_none=True)
                
                # ---- Update scaler (once) ----
                scaler.update()


            # Logging
            if step % 10 == 0 or step == len(train_loader) - 1:
                pbar.set_postfix({
                    'loss_a': float(loss_a.item() * gradient_accumulation_steps),
                    'loss_b': float(loss_b.item() * gradient_accumulation_steps),
                    'loss_c': float(loss_c.item() * gradient_accumulation_steps),

                    'ce_a': float(ce_a.item()),
                    'ce_b': float(ce_b.item()),
                    'ce_c': float(ce_c.item()),

                    'kl_a_bc': float(kl_a_bc.item()),
                    'kl_b_ac': float(kl_b_ac.item()),
                    'kl_c_ab': float(kl_c_ab.item())
                })

    # Save final models
    model_a.backbone.save_pretrained('backbone_a')
    torch.save(model_a.head.state_dict(), 'head_a.pt')
    
    model_b.backbone.save_pretrained('backbone_b')
    torch.save(model_b.head.state_dict(), 'head_b.pt')

    model_c.backbone.save_pretrained('backbone_c')
    torch.save(model_c.head.state_dict(), 'head_c.pt')
    
    print('Training finished')


# ----------------------
# Main
# ----------------------
if __name__ == '__main__':
    print("PyTorch version:", torch.__version__)
    print("CUDA available:", torch.cuda.is_available())
    print("Number of GPUs available:", torch.cuda.device_count())

    if torch.cuda.device_count() < 2:
        raise RuntimeError("This script expects at least 2 GPUs (cuda:0 and cuda:1).")

    train_singleprocess(num_epochs)

Writing train.py


In [7]:
!python train.py

2025-11-07 02:50:08.810164: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762483808.980523     108 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762483809.029759     108 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
PyTorch version: 2.6.0+cu124
CUDA available: True
Number of GPUs available: 2
Train size: 100
Max Len A/B/C 195 195 195 => use 195
Loading checkpoint shards: 100%|██████████████████| 4/4 [02:07<00:00, 31.99s/it]
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:55<00:00, 27.95s/it]
Epoch 1: 100%|█| 50/50 [01:13<00:00,  1.48s/it, loss_a=0.844, loss_b=1.07, loss_
Epoch 2: 100%|█| 50/50 [01:14<00:00,  1.49s/it, loss_a=0.509, l

# Multi-GPU Inference

In [9]:
test = pd.read_parquet('/kaggle/input/sem-eval-2026-task-13-subtask-a/Task_A/test.parquet')
test.columns

Index(['ID', 'code'], dtype='object')

In [12]:
%%writefile inference.py

import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import gc
import ctypes
import math
import numpy as np
import pandas as pd
from threading import Thread
from tqdm import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import torch.nn.functional as F
from torch.amp import autocast

from transformers import AutoTokenizer, AutoConfig, BitsAndBytesConfig
from peft import AutoPeftModelForFeatureExtraction

# -----------------------
# Basic environment setup
# -----------------------
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

def clean_memory(deep=True):
    gc.collect()
    if deep:
        try:
            ctypes.CDLL("libc.so.6").malloc_trim(0)
        except Exception:
            pass
    torch.cuda.empty_cache()


# =====================
# 1) Read + prepare text
# =====================
test = pd.read_parquet('/kaggle/input/sem-eval-2026-task-13-subtask-a/Task_A/test.parquet')

test_texts = []
for code in test['code']:
    code_snippet = code.strip()
    prompt = f"""<|im_start|>user
You are a professional code analyst. Your task is to decide if the following code snippet was written by an AI model. Respond only with "Yes" or "No".

Now, here is the code snippet to classify:
"{code_snippet}"

Answer "Yes" if it is AI-generated, otherwise "No".<|im_end|>
<|im_start|>assistant
<think>

</think>

Answer:"""
    test_texts.append(prompt)

test['text'] = test_texts
test['target'] = -100

# =====================
# 2) Tokenizer + lengths
# =====================

model_path = '/kaggle/input/qwen2.5-coder/transformers/7b/1'
    
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.padding_side = 'left'

# compute tokenized lengths (this may be slow but necessary for binning)
test['len'] = test['text'].apply(lambda x: len(tokenizer(x).input_ids))

# =====================
# 3) Bin-balanced sampling + sort within split
# =====================
# Create quantile bins by length (e.g. 10 bins). Adjust q if you want finer/coarser bins.
n_bins = 10
# qcut can produce fewer bins if duplicates exist; handle duplicates='drop'
test['len_bin'] = pd.qcut(test['len'], q=n_bins, labels=False, duplicates='drop')

# Prepare lists collecting halves from each bin
test_0_list = []
test_1_list = []

# For reproducibility
RANDOM_STATE = 42

for b in sorted(test['len_bin'].unique()):
    bin_df = test[test['len_bin'] == b].copy()
    # shuffle within bin to avoid positional bias
    bin_df = bin_df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
    mid = len(bin_df) // 2
    test_0_list.append(bin_df.iloc[:mid])
    test_1_list.append(bin_df.iloc[mid:])

# Concatenate per-GPU datasets and sort within each split ascending by length to optimize padding
test_0 = pd.concat(test_0_list, ignore_index=True).sort_values('len', ascending=True).reset_index(drop=True)
test_1 = pd.concat(test_1_list, ignore_index=True).sort_values('len', ascending=True).reset_index(drop=True)

# If one side is missing some remainder because of odd counts, ensure everyone covered
# Combine leftovers (rare) - but our split above already assigns all rows.
# Just sanity-check lengths & tokens
print("GPU0 rows:", len(test_0), "mean_len:", test_0['len'].mean(), "total_tokens:", int(test_0['len'].sum()))
print("GPU1 rows:", len(test_1), "mean_len:", test_1['len'].mean(), "total_tokens:", int(test_1['len'].sum()))
print("Total rows covered:", len(test_0) + len(test_1), "original:", len(test))

# =====================
# 4) Dataset + DataLoader (include original index so we can place preds correctly)
# =====================
class SemEvalDataset(Dataset):
    def __init__(self, df):
        # df must contain columns: 'text', 'target', 'ID' (or original index)
        self.texts = df['text'].to_numpy()
        self.targets = df['target'].to_numpy()
        self.row_ids = df['ID'].to_numpy()

    def __getitem__(self, idx):
        # return prompt (string), target (int), original row id
        return self.texts[idx], int(self.targets[idx]), int(self.row_ids[idx])

    def __len__(self):
        return len(self.targets)


batch_size = 16

dataset_0 = SemEvalDataset(test_0)
dataset_1 = SemEvalDataset(test_1)

dataloader_0 = DataLoader(dataset_0, batch_size=batch_size, shuffle=False, drop_last=False)
dataloader_1 = DataLoader(dataset_1, batch_size=batch_size, shuffle=False, drop_last=False)

test_dataloaders = [dataloader_0, dataloader_1]

# =====================
# 5) Model definition
# =====================
class Net(nn.Module):
    def __init__(self, base_model_path, trained_backbone_path, load_in_device):
        super(Net, self).__init__()

        self.config = AutoConfig.from_pretrained(base_model_path)

        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16
        )

        self.backbone = AutoPeftModelForFeatureExtraction.from_pretrained(
            trained_backbone_path,
            use_cache=False,
            torch_dtype=torch.float16,
            quantization_config=bnb_config,
            device_map=load_in_device
        )

        self.head = nn.Linear(self.config.hidden_size, 2, bias=False)

    def forward(self, x):
        # x is tokenized dict sent directly to backbone
        x = self.backbone(**x).last_hidden_state[:, -1, :]
        logits = self.head(x)
        return logits


# =====================
# 6) Load models on each GPU
# =====================
trained_backbone_path = '/kaggle/working/backbone_a'
trained_head_path = '/kaggle/working/head_a.pt'

model_1 = Net(base_model_path=model_path,
              trained_backbone_path=trained_backbone_path,
              load_in_device='cuda:0')

model_2 = Net(base_model_path=model_path,
              trained_backbone_path=trained_backbone_path,
              load_in_device='cuda:1')

# load head states (weights_only=True used originally; keep same behavior)
model_1.head.load_state_dict(torch.load(trained_head_path, weights_only=True))
model_2.head.load_state_dict(torch.load(trained_head_path, weights_only=True))

# ensure heads on correct device
model_1.head.to('cuda:0')
model_2.head.to('cuda:1')

# Put models into eval mode (backbone's weights are already on the correct device via device_map)
model_1.eval()
model_2.eval()

# =====================
# 7) Inference function (collect logits + row_ids to reconstruct order)
# =====================
def get_preds(model, tokenizer, dataloader, device, results_dict):
    """
    Runs inference for the provided dataloader on `device`.
    Writes to results_dict a list of (row_id_array, logits_array) tuples in order processed.
    """
    device_short = device  # e.g., 'cuda:0'
    collected_row_ids = []
    collected_logits = []

    with torch.no_grad():
        for batch in tqdm(dataloader, total=len(dataloader)):
            batch_prompts, batch_targets, batch_row_ids = batch  # prompts: list[str]
            # Tokenize batch (returns tensors on CPU)
            encodings = tokenizer(list(batch_prompts), return_tensors='pt', padding='longest', truncation=False)
            # Move to device with pin_memory + non_blocking
            encodings = {k: v.to(device_short, non_blocking=True) for k, v in encodings.items()}

            with autocast(device_type='cuda'):
                logits = model(encodings)  # should be (B, 2)

            # bring logits back to CPU and store along with row_ids
            collected_logits.append(logits.detach().cpu())
            collected_row_ids.append(torch.as_tensor(batch_row_ids))

    if len(collected_logits) > 0:
        results_dict[device_short] = (torch.cat(collected_row_ids).numpy(), torch.cat(collected_logits).numpy())
    else:
        results_dict[device_short] = (np.array([], dtype=int), np.zeros((0, 2), dtype=np.float32))


# =====================
# 8) Run threaded inference on both GPUs
# =====================
results = {}

t0 = Thread(target=get_preds, args=(model_1, tokenizer, test_dataloaders[0], 'cuda:0', results))
t1 = Thread(target=get_preds, args=(model_2, tokenizer, test_dataloaders[1], 'cuda:1', results))

t0.start()
t1.start()

t0.join()
t1.join()

# =====================
# 9) Reconstruct full logits array in original row order
# =====================
n_total = len(test)
# We'll create a logits array indexed by original row_id values.
# If row_id values are not 0..N-1, map them via a lookup.
row_ids_all = np.concatenate([results['cuda:0'][0], results['cuda:1'][0]])
logits_all = np.concatenate([results['cuda:0'][1], results['cuda:1'][1]], axis=0)

# create a mapping row_id -> logits
# If row_id is unique, we can use a dict or numpy indexing
# We'll construct a DataFrame and then join with original test rows to ensure ordering
pred_df = pd.DataFrame({
    'ID': row_ids_all,
    'logit_0': logits_all[:, 0],
    'logit_1': logits_all[:, 1],
})

# Merge predictions back to the original test dataframe (which still has row_id col)
# Use left join on row_id to preserve original input order, then sort by row_id if needed
merged = test[['ID']].merge(pred_df, on='ID', how='left')

# Some sanity checks
if merged[['logit_0', 'logit_1']].isnull().any().any():
    # if any missing predictions, fill with zeros (shouldn't happen)
    merged[['logit_0', 'logit_1']] = merged[['logit_0', 'logit_1']].fillna(0.0)
    print("Warning: some rows missing predictions and were filled with zeros.")

logits_tensor = torch.as_tensor(merged[['logit_0', 'logit_1']].values, dtype=torch.float32)
pred_probs = F.softmax(logits_tensor, dim=-1)
pred_labels = torch.argmax(pred_probs, dim=-1).numpy()

# =====================
# 10) Cleanup + write submission
# =====================
del model_1, model_2
clean_memory()

sub = test[['ID']].copy()
# sub['prediction'] = pred_probs[:, 1].numpy()
sub['prediction'] = pred_labels
sub = sub.sort_values('ID')
sub.to_csv('sub_a.csv', index=False)

Overwriting inference.py


In [13]:
!python inference.py

2025-11-07 03:18:25.267593: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762485505.291582     172 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762485505.298669     172 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
GPU0 rows: 498 mean_len: 395.67670682730926 total_tokens: 197047
GPU1 rows: 502 mean_len: 404.5278884462151 total_tokens: 203073
Total rows covered: 1000 original: 1000
Loading checkpoint shards: 100%|██████████████████| 4/4 [00:20<00:00,  5.19s/it]
Loading checkpoint shards: 100%|██████████████████| 4/4 [00:21<00:00,  5.26s/it]
  0%|                                                    | 0/32 [00:00<?, ?it/s]
  0%|                    

In [19]:
%%writefile inference.py

import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import gc
import ctypes
import math
import numpy as np
import pandas as pd
from threading import Thread
from tqdm import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import torch.nn.functional as F
from torch.amp import autocast

from transformers import AutoTokenizer, AutoConfig, BitsAndBytesConfig
from peft import AutoPeftModelForFeatureExtraction

# -----------------------
# Basic environment setup
# -----------------------
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

def clean_memory(deep=True):
    gc.collect()
    if deep:
        try:
            ctypes.CDLL("libc.so.6").malloc_trim(0)
        except Exception:
            pass
    torch.cuda.empty_cache()


# =====================
# 1) Read + prepare text
# =====================
test = pd.read_parquet('/kaggle/input/sem-eval-2026-task-13-subtask-a/Task_A/test.parquet')

test_texts = []
for code in test['code']:
    code_snippet = code.strip()
    prompt = f"""<|im_start|>user
You are a professional code analyst. Your task is to decide if the following code snippet was written by an AI model. Respond only with "Yes" or "No".

Now, here is the code snippet to classify:
"{code_snippet}"

Answer "Yes" if it is AI-generated, otherwise "No".<|im_end|>
<|im_start|>assistant
<think>

</think>

Answer:"""
    test_texts.append(prompt)

test['text'] = test_texts
test['target'] = -100

# =====================
# 2) Tokenizer + lengths
# =====================

model_path = '/kaggle/input/qwen2.5-coder/transformers/3b/1'
    
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.padding_side = 'left'

# compute tokenized lengths (this may be slow but necessary for binning)
test['len'] = test['text'].apply(lambda x: len(tokenizer(x).input_ids))

# =====================
# 3) Bin-balanced sampling + sort within split
# =====================
# Create quantile bins by length (e.g. 10 bins). Adjust q if you want finer/coarser bins.
n_bins = 10
# qcut can produce fewer bins if duplicates exist; handle duplicates='drop'
test['len_bin'] = pd.qcut(test['len'], q=n_bins, labels=False, duplicates='drop')

# Prepare lists collecting halves from each bin
test_0_list = []
test_1_list = []

# For reproducibility
RANDOM_STATE = 42

for b in sorted(test['len_bin'].unique()):
    bin_df = test[test['len_bin'] == b].copy()
    # shuffle within bin to avoid positional bias
    bin_df = bin_df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
    mid = len(bin_df) // 2
    test_0_list.append(bin_df.iloc[:mid])
    test_1_list.append(bin_df.iloc[mid:])

# Concatenate per-GPU datasets and sort within each split ascending by length to optimize padding
test_0 = pd.concat(test_0_list, ignore_index=True).sort_values('len', ascending=True).reset_index(drop=True)
test_1 = pd.concat(test_1_list, ignore_index=True).sort_values('len', ascending=True).reset_index(drop=True)

# If one side is missing some remainder because of odd counts, ensure everyone covered
# Combine leftovers (rare) - but our split above already assigns all rows.
# Just sanity-check lengths & tokens
print("GPU0 rows:", len(test_0), "mean_len:", test_0['len'].mean(), "total_tokens:", int(test_0['len'].sum()))
print("GPU1 rows:", len(test_1), "mean_len:", test_1['len'].mean(), "total_tokens:", int(test_1['len'].sum()))
print("Total rows covered:", len(test_0) + len(test_1), "original:", len(test))

# =====================
# 4) Dataset + DataLoader (include original index so we can place preds correctly)
# =====================
class SemEvalDataset(Dataset):
    def __init__(self, df):
        # df must contain columns: 'text', 'target', 'ID' (or original index)
        self.texts = df['text'].to_numpy()
        self.targets = df['target'].to_numpy()
        self.row_ids = df['ID'].to_numpy()

    def __getitem__(self, idx):
        # return prompt (string), target (int), original row id
        return self.texts[idx], int(self.targets[idx]), int(self.row_ids[idx])

    def __len__(self):
        return len(self.targets)


batch_size = 16

dataset_0 = SemEvalDataset(test_0)
dataset_1 = SemEvalDataset(test_1)

dataloader_0 = DataLoader(dataset_0, batch_size=batch_size, shuffle=False, drop_last=False)
dataloader_1 = DataLoader(dataset_1, batch_size=batch_size, shuffle=False, drop_last=False)

test_dataloaders = [dataloader_0, dataloader_1]

# =====================
# 5) Model definition
# =====================
class Net(nn.Module):
    def __init__(self, base_model_path, trained_backbone_path, load_in_device):
        super(Net, self).__init__()

        self.config = AutoConfig.from_pretrained(base_model_path)

        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16
        )

        self.backbone = AutoPeftModelForFeatureExtraction.from_pretrained(
            trained_backbone_path,
            use_cache=False,
            torch_dtype=torch.float16,
            quantization_config=bnb_config,
            device_map=load_in_device
        )

        self.head = nn.Linear(self.config.hidden_size, 2, bias=False)

    def forward(self, x):
        # x is tokenized dict sent directly to backbone
        x = self.backbone(**x).last_hidden_state[:, -1, :]
        logits = self.head(x)
        return logits


# =====================
# 6) Load models on each GPU
# =====================
trained_backbone_path = '/kaggle/working/backbone_b'
trained_head_path = '/kaggle/working/head_b.pt'

model_1 = Net(base_model_path=model_path,
              trained_backbone_path=trained_backbone_path,
              load_in_device='cuda:0')

model_2 = Net(base_model_path=model_path,
              trained_backbone_path=trained_backbone_path,
              load_in_device='cuda:1')

# load head states (weights_only=True used originally; keep same behavior)
model_1.head.load_state_dict(torch.load(trained_head_path, weights_only=True))
model_2.head.load_state_dict(torch.load(trained_head_path, weights_only=True))

# ensure heads on correct device
model_1.head.to('cuda:0')
model_2.head.to('cuda:1')

# Put models into eval mode (backbone's weights are already on the correct device via device_map)
model_1.eval()
model_2.eval()

# =====================
# 7) Inference function (collect logits + row_ids to reconstruct order)
# =====================
def get_preds(model, tokenizer, dataloader, device, results_dict):
    """
    Runs inference for the provided dataloader on `device`.
    Writes to results_dict a list of (row_id_array, logits_array) tuples in order processed.
    """
    device_short = device  # e.g., 'cuda:0'
    collected_row_ids = []
    collected_logits = []

    with torch.no_grad():
        for batch in tqdm(dataloader, total=len(dataloader)):
            batch_prompts, batch_targets, batch_row_ids = batch  # prompts: list[str]
            # Tokenize batch (returns tensors on CPU)
            encodings = tokenizer(list(batch_prompts), return_tensors='pt', padding='longest', truncation=False)
            # Move to device with pin_memory + non_blocking
            encodings = {k: v.to(device_short, non_blocking=True) for k, v in encodings.items()}

            with autocast(device_type='cuda'):
                logits = model(encodings)  # should be (B, 2)

            # bring logits back to CPU and store along with row_ids
            collected_logits.append(logits.detach().cpu())
            collected_row_ids.append(torch.as_tensor(batch_row_ids))

    if len(collected_logits) > 0:
        results_dict[device_short] = (torch.cat(collected_row_ids).numpy(), torch.cat(collected_logits).numpy())
    else:
        results_dict[device_short] = (np.array([], dtype=int), np.zeros((0, 2), dtype=np.float32))


# =====================
# 8) Run threaded inference on both GPUs
# =====================
results = {}

t0 = Thread(target=get_preds, args=(model_1, tokenizer, test_dataloaders[0], 'cuda:0', results))
t1 = Thread(target=get_preds, args=(model_2, tokenizer, test_dataloaders[1], 'cuda:1', results))

t0.start()
t1.start()

t0.join()
t1.join()

# =====================
# 9) Reconstruct full logits array in original row order
# =====================
n_total = len(test)
# We'll create a logits array indexed by original row_id values.
# If row_id values are not 0..N-1, map them via a lookup.
row_ids_all = np.concatenate([results['cuda:0'][0], results['cuda:1'][0]])
logits_all = np.concatenate([results['cuda:0'][1], results['cuda:1'][1]], axis=0)

# create a mapping row_id -> logits
# If row_id is unique, we can use a dict or numpy indexing
# We'll construct a DataFrame and then join with original test rows to ensure ordering
pred_df = pd.DataFrame({
    'ID': row_ids_all,
    'logit_0': logits_all[:, 0],
    'logit_1': logits_all[:, 1],
})

# Merge predictions back to the original test dataframe (which still has row_id col)
# Use left join on row_id to preserve original input order, then sort by row_id if needed
merged = test[['ID']].merge(pred_df, on='ID', how='left')

# Some sanity checks
if merged[['logit_0', 'logit_1']].isnull().any().any():
    # if any missing predictions, fill with zeros (shouldn't happen)
    merged[['logit_0', 'logit_1']] = merged[['logit_0', 'logit_1']].fillna(0.0)
    print("Warning: some rows missing predictions and were filled with zeros.")

logits_tensor = torch.as_tensor(merged[['logit_0', 'logit_1']].values, dtype=torch.float32)
pred_probs = F.softmax(logits_tensor, dim=-1)
pred_labels = torch.argmax(pred_probs, dim=-1).numpy()

# =====================
# 10) Cleanup + write submission
# =====================
del model_1, model_2
clean_memory()

sub = test[['ID']].copy()
# sub['prediction'] = pred_probs[:, 1].numpy()
sub['prediction'] = pred_labels
sub = sub.sort_values('ID')
sub.to_csv('sub_b.csv', index=False)

Overwriting inference.py


In [20]:
!python inference.py

2025-11-07 03:26:10.444332: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762485970.466440     260 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762485970.473132     260 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
GPU0 rows: 498 mean_len: 395.67670682730926 total_tokens: 197047
GPU1 rows: 502 mean_len: 404.5278884462151 total_tokens: 203073
Total rows covered: 1000 original: 1000
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:09<00:00,  4.52s/it]
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:08<00:00,  4.42s/it]
  0%|                                                    | 0/32 [00:00<?, ?it/s]
  0%|                    

In [21]:
%%writefile inference.py

import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import gc
import ctypes
import math
import numpy as np
import pandas as pd
from threading import Thread
from tqdm import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import torch.nn.functional as F
from torch.amp import autocast

from transformers import AutoTokenizer, AutoConfig, BitsAndBytesConfig
from peft import AutoPeftModelForFeatureExtraction

# -----------------------
# Basic environment setup
# -----------------------
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

def clean_memory(deep=True):
    gc.collect()
    if deep:
        try:
            ctypes.CDLL("libc.so.6").malloc_trim(0)
        except Exception:
            pass
    torch.cuda.empty_cache()


# =====================
# 1) Read + prepare text
# =====================
test = pd.read_parquet('/kaggle/input/sem-eval-2026-task-13-subtask-a/Task_A/test.parquet')

test_texts = []
for code in test['code']:
    code_snippet = code.strip()
    prompt = f"""<|im_start|>user
You are a professional code analyst. Your task is to decide if the following code snippet was written by an AI model. Respond only with "Yes" or "No".

Now, here is the code snippet to classify:
"{code_snippet}"

Answer "Yes" if it is AI-generated, otherwise "No".<|im_end|>
<|im_start|>assistant
<think>

</think>

Answer:"""
    test_texts.append(prompt)

test['text'] = test_texts
test['target'] = -100

# =====================
# 2) Tokenizer + lengths
# =====================

model_path = '/kaggle/input/qwen2.5-coder/transformers/1.5b/1'
    
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.padding_side = 'left'

# compute tokenized lengths (this may be slow but necessary for binning)
test['len'] = test['text'].apply(lambda x: len(tokenizer(x).input_ids))

# =====================
# 3) Bin-balanced sampling + sort within split
# =====================
# Create quantile bins by length (e.g. 10 bins). Adjust q if you want finer/coarser bins.
n_bins = 10
# qcut can produce fewer bins if duplicates exist; handle duplicates='drop'
test['len_bin'] = pd.qcut(test['len'], q=n_bins, labels=False, duplicates='drop')

# Prepare lists collecting halves from each bin
test_0_list = []
test_1_list = []

# For reproducibility
RANDOM_STATE = 42

for b in sorted(test['len_bin'].unique()):
    bin_df = test[test['len_bin'] == b].copy()
    # shuffle within bin to avoid positional bias
    bin_df = bin_df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
    mid = len(bin_df) // 2
    test_0_list.append(bin_df.iloc[:mid])
    test_1_list.append(bin_df.iloc[mid:])

# Concatenate per-GPU datasets and sort within each split ascending by length to optimize padding
test_0 = pd.concat(test_0_list, ignore_index=True).sort_values('len', ascending=True).reset_index(drop=True)
test_1 = pd.concat(test_1_list, ignore_index=True).sort_values('len', ascending=True).reset_index(drop=True)

# If one side is missing some remainder because of odd counts, ensure everyone covered
# Combine leftovers (rare) - but our split above already assigns all rows.
# Just sanity-check lengths & tokens
print("GPU0 rows:", len(test_0), "mean_len:", test_0['len'].mean(), "total_tokens:", int(test_0['len'].sum()))
print("GPU1 rows:", len(test_1), "mean_len:", test_1['len'].mean(), "total_tokens:", int(test_1['len'].sum()))
print("Total rows covered:", len(test_0) + len(test_1), "original:", len(test))

# =====================
# 4) Dataset + DataLoader (include original index so we can place preds correctly)
# =====================
class SemEvalDataset(Dataset):
    def __init__(self, df):
        # df must contain columns: 'text', 'target', 'ID' (or original index)
        self.texts = df['text'].to_numpy()
        self.targets = df['target'].to_numpy()
        self.row_ids = df['ID'].to_numpy()

    def __getitem__(self, idx):
        # return prompt (string), target (int), original row id
        return self.texts[idx], int(self.targets[idx]), int(self.row_ids[idx])

    def __len__(self):
        return len(self.targets)


batch_size = 16

dataset_0 = SemEvalDataset(test_0)
dataset_1 = SemEvalDataset(test_1)

dataloader_0 = DataLoader(dataset_0, batch_size=batch_size, shuffle=False, drop_last=False)
dataloader_1 = DataLoader(dataset_1, batch_size=batch_size, shuffle=False, drop_last=False)

test_dataloaders = [dataloader_0, dataloader_1]

# =====================
# 5) Model definition
# =====================
class Net(nn.Module):
    def __init__(self, base_model_path, trained_backbone_path, load_in_device):
        super(Net, self).__init__()

        self.config = AutoConfig.from_pretrained(base_model_path)

        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16
        )

        self.backbone = AutoPeftModelForFeatureExtraction.from_pretrained(
            trained_backbone_path,
            use_cache=False,
            torch_dtype=torch.float16,
            quantization_config=bnb_config,
            device_map=load_in_device
        )

        self.head = nn.Linear(self.config.hidden_size, 2, bias=False)

    def forward(self, x):
        # x is tokenized dict sent directly to backbone
        x = self.backbone(**x).last_hidden_state[:, -1, :]
        logits = self.head(x)
        return logits


# =====================
# 6) Load models on each GPU
# =====================
trained_backbone_path = '/kaggle/working/backbone_c'
trained_head_path = '/kaggle/working/head_c.pt'

model_1 = Net(base_model_path=model_path,
              trained_backbone_path=trained_backbone_path,
              load_in_device='cuda:0')

model_2 = Net(base_model_path=model_path,
              trained_backbone_path=trained_backbone_path,
              load_in_device='cuda:1')

# load head states (weights_only=True used originally; keep same behavior)
model_1.head.load_state_dict(torch.load(trained_head_path, weights_only=True))
model_2.head.load_state_dict(torch.load(trained_head_path, weights_only=True))

# ensure heads on correct device
model_1.head.to('cuda:0')
model_2.head.to('cuda:1')

# Put models into eval mode (backbone's weights are already on the correct device via device_map)
model_1.eval()
model_2.eval()

# =====================
# 7) Inference function (collect logits + row_ids to reconstruct order)
# =====================
def get_preds(model, tokenizer, dataloader, device, results_dict):
    """
    Runs inference for the provided dataloader on `device`.
    Writes to results_dict a list of (row_id_array, logits_array) tuples in order processed.
    """
    device_short = device  # e.g., 'cuda:0'
    collected_row_ids = []
    collected_logits = []

    with torch.no_grad():
        for batch in tqdm(dataloader, total=len(dataloader)):
            batch_prompts, batch_targets, batch_row_ids = batch  # prompts: list[str]
            # Tokenize batch (returns tensors on CPU)
            encodings = tokenizer(list(batch_prompts), return_tensors='pt', padding='longest', truncation=False)
            # Move to device with pin_memory + non_blocking
            encodings = {k: v.to(device_short, non_blocking=True) for k, v in encodings.items()}

            with autocast(device_type='cuda'):
                logits = model(encodings)  # should be (B, 2)

            # bring logits back to CPU and store along with row_ids
            collected_logits.append(logits.detach().cpu())
            collected_row_ids.append(torch.as_tensor(batch_row_ids))

    if len(collected_logits) > 0:
        results_dict[device_short] = (torch.cat(collected_row_ids).numpy(), torch.cat(collected_logits).numpy())
    else:
        results_dict[device_short] = (np.array([], dtype=int), np.zeros((0, 2), dtype=np.float32))


# =====================
# 8) Run threaded inference on both GPUs
# =====================
results = {}

t0 = Thread(target=get_preds, args=(model_1, tokenizer, test_dataloaders[0], 'cuda:0', results))
t1 = Thread(target=get_preds, args=(model_2, tokenizer, test_dataloaders[1], 'cuda:1', results))

t0.start()
t1.start()

t0.join()
t1.join()

# =====================
# 9) Reconstruct full logits array in original row order
# =====================
n_total = len(test)
# We'll create a logits array indexed by original row_id values.
# If row_id values are not 0..N-1, map them via a lookup.
row_ids_all = np.concatenate([results['cuda:0'][0], results['cuda:1'][0]])
logits_all = np.concatenate([results['cuda:0'][1], results['cuda:1'][1]], axis=0)

# create a mapping row_id -> logits
# If row_id is unique, we can use a dict or numpy indexing
# We'll construct a DataFrame and then join with original test rows to ensure ordering
pred_df = pd.DataFrame({
    'ID': row_ids_all,
    'logit_0': logits_all[:, 0],
    'logit_1': logits_all[:, 1],
})

# Merge predictions back to the original test dataframe (which still has row_id col)
# Use left join on row_id to preserve original input order, then sort by row_id if needed
merged = test[['ID']].merge(pred_df, on='ID', how='left')

# Some sanity checks
if merged[['logit_0', 'logit_1']].isnull().any().any():
    # if any missing predictions, fill with zeros (shouldn't happen)
    merged[['logit_0', 'logit_1']] = merged[['logit_0', 'logit_1']].fillna(0.0)
    print("Warning: some rows missing predictions and were filled with zeros.")

logits_tensor = torch.as_tensor(merged[['logit_0', 'logit_1']].values, dtype=torch.float32)
pred_probs = F.softmax(logits_tensor, dim=-1)
pred_labels = torch.argmax(pred_probs, dim=-1).numpy()

# =====================
# 10) Cleanup + write submission
# =====================
del model_1, model_2
clean_memory()

sub = test[['ID']].copy()
# sub['prediction'] = pred_probs[:, 1].numpy()
sub['prediction'] = pred_labels
sub = sub.sort_values('ID')
sub.to_csv('sub_c.csv', index=False)

Overwriting inference.py


In [22]:
!python inference.py

2025-11-07 03:28:49.980100: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762486130.002718     291 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762486130.009476     291 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
GPU0 rows: 498 mean_len: 395.67670682730926 total_tokens: 197047
GPU1 rows: 502 mean_len: 404.5278884462151 total_tokens: 203073
Total rows covered: 1000 original: 1000
  0%|                                                    | 0/32 [00:00<?, ?it/s]
  0%|                                                    | 0/32 [00:00<?, ?it/s][A
  3%|█▍                                          | 1/32 [00:00<00:28,  1.10it/s][A
  6%|██▊           

# Ensemble

In [25]:
sub_a = pd.read_csv('sub_a.csv')
sub_b = pd.read_csv('sub_b.csv')
sub_c = pd.read_csv('sub_c.csv')

sub = sub_a[['ID']].copy()

sub['prediction'] = (
    0.5*sub_a['prediction'] + 0.3*sub_b['prediction'] + 0.2*sub_c['prediction']
)
sub['prediction'] = (sub['prediction'] >= 0.5).astype(int)
sub.to_csv('submission.csv', index=False)

In [27]:
pred = pd.read_csv('submission.csv')
pred['prediction'].value_counts()

prediction
0    641
1    359
Name: count, dtype: int64