# Packages

In [1]:
%pip install /kaggle/input/lmsys-packages/triton-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

Processing /kaggle/input/lmsys-packages/triton-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: triton
Successfully installed triton-2.2.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install /kaggle/input/lmsys-packages/xformers-0.0.24042abc8.d20240802-cp310-cp310-linux_x86_64.whl

Processing /kaggle/input/lmsys-packages/xformers-0.0.24042abc8.d20240802-cp310-cp310-linux_x86_64.whl
Installing collected packages: xformers
Successfully installed xformers-0.0.24+042abc8.d20240802
Note: you may need to restart the kernel to use updated packages.


In [3]:
!cp -r /kaggle/input/wsdm-modules-0102/wsdm-modules-0102 human_pref

# Prepare test file

In [4]:
%%writefile prepare_test_file.py
import pandas as pd


# Build the local copy of the competition test set that the inference scripts read.
SOURCE_PATH = "/kaggle/input/wsdm-cup-multilingual-chatbot-arena/test.parquet"
MAX_PROMPT_CHARS = 6000

test_df = pd.read_parquet(SOURCE_PATH)

# Add constant label columns and replace missing values with the string "none".
# NOTE(review): the labels are presumably placeholders required by the dataset
# class used downstream -- confirm against human_pref.data.dataset.
test_df = test_df.assign(winner_model_a=1, winner_model_b=0).fillna("none")

# Keep only the first MAX_PROMPT_CHARS characters of every prompt.
test_df["prompt"] = test_df["prompt"].map(lambda text: text[:MAX_PROMPT_CHARS])

test_df.to_parquet("test.parquet", index=False)

Writing prepare_test_file.py


In [5]:
!python prepare_test_file.py

# Inference: gemma2-9b

In [6]:
%%writefile predict_m0.py
import torch
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer

# from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
from human_pref.models.modeling_gemma2 import Gemma2ForSequenceClassification
from human_pref.data.processors import ProcessorPAB, MIDProcessorPAB
from human_pref.data.dataset import LMSYSDataset
from human_pref.data.collators import VarlenCollator, ShardedMaxTokensCollator
from human_pref.utils import to_device

import argparse

# Command-line interface: the input table and the output file are configurable.
parser = argparse.ArgumentParser(description='这是一个示例程序。')
parser.add_argument('--csv_path', default='test.parquet', help='path to the input parquet file')
parser.add_argument('--save_path', default='prob_m0.npy', help='path of the .npy file the probabilities are written to')
args = parser.parse_args()

model_name_or_path = "/kaggle/input/lmsys-pretrain-mid-pseudo-v3"
csv_path = args.csv_path

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
processor = MIDProcessorPAB(
    tokenizer=tokenizer,
    max_length=4096,
    support_system_role=False,
)
dataset = LMSYSDataset(
    csv_file=csv_path,
    query=None,
    processor=processor,
    include_swap=False,
    is_parquet=True,
)
# Every item produced by this loader is iterated below as a sequence of
# micro-batches (see the nested loop).
# NOTE(review): max_tokens presumably caps the token count per micro-batch -- confirm.
dataloader = DataLoader(
    dataset,
    batch_size=80,
    num_workers=4,
    collate_fn=ShardedMaxTokensCollator(
        max_tokens=8192, base_collator=VarlenCollator()
    ),
)

# Model placement for pipelined inference: embeddings and the first half of the
# decoder layers go to cuda:0; the second half, the final norm and the score
# head go to cuda:1.
num_hidden_layers = 42
device_map = {
    "model.embed_tokens": "cuda:0",
    "model.norm": "cuda:1",
    "score": "cuda:1",
}
for i in range(num_hidden_layers // 2):
    device_map[f"model.layers.{i}"] = "cuda:0"
for i in range(num_hidden_layers // 2, num_hidden_layers):
    device_map[f"model.layers.{i}"] = "cuda:1"

model = Gemma2ForSequenceClassification.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.float16,
    device_map=device_map,
)

# Rotary inverse frequencies, computed once and cloned onto each device.
config = model.config
dim = config.head_dim
inv_freq = 1.0 / (
    config.rope_theta ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)
)
inv_freq0 = inv_freq.to("cuda:0")
inv_freq1 = inv_freq.to("cuda:1")

# Debug aid: print `.device` for model.named_parameters() and
# model.model.named_buffers() to verify the placement above.

# Pipeline parallelism with two GPUs: while forward_part1 handles micro-batch k
# with cuda:0 inputs, forward_part2 finishes micro-batch k-1 with cuda:1 inputs.
# Unlike the llama script, no xformers attn_bias is passed in seq_info here.
# `prev_*` hold the part-1 output (already moved to cuda:1) that still needs
# part 2; they stay None until the first micro-batch has been seen.
prev_seq_info = None
prev_hidden_states = None
outs = []
for batch in tqdm(dataloader):
    for micro_batch in batch:
        input_ids = to_device(micro_batch["input_ids"], "cuda:0")
        seq_info = dict(
            cu_seqlens=micro_batch["cu_seqlens"],
            position_ids=micro_batch["position_ids"],
            max_seq_len=micro_batch["max_seq_len"],
        )
        seq_info = to_device(seq_info, "cuda:0")

        with torch.no_grad(), torch.cuda.amp.autocast():
            logits = None
            if prev_hidden_states is not None:
                # Second stage for the previous micro-batch is issued first ...
                logits = model.forward_part2(prev_hidden_states, prev_seq_info, inv_freq1)
            # ... followed by the first stage for the current one.
            hidden_states = model.forward_part1(input_ids, seq_info, inv_freq0)

        prev_seq_info, prev_hidden_states = to_device(
            [seq_info, hidden_states], "cuda:1"
        )
        if logits is not None:
            outs.append(logits.cpu())

# Drain the pipeline: the last micro-batch still needs its second stage.
# Guarded so that an empty input does not crash on unbound `prev_*` variables.
if prev_hidden_states is not None:
    with torch.no_grad(), torch.cuda.amp.autocast():
        logits = model.forward_part2(prev_hidden_states, prev_seq_info, inv_freq1)
    outs.append(logits.cpu())

if outs:
    pred = torch.cat(outs, dim=0)
    prob = pred.softmax(-1).numpy()
    print(dataset.evaluate(prob))
else:
    # Nothing was scored: emit a (0, num_labels) array so downstream steps can
    # still load and index the file.
    print(f"No rows found in {csv_path}; writing an empty probability array.")
    prob = np.zeros((0, config.num_labels), dtype=np.float32)

np.save(args.save_path, prob)

Writing predict_m0.py


In [7]:
# !python predict_m0.py

In [8]:
!python predict_m0.py

Loading checkpoint shards: 100%|██████████████████| 4/4 [04:55<00:00, 73.84s/it]
  0%|                                                     | 0/1 [00:00<?, ?it/s]2025-01-30 16:11:13.509549: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-30 16:11:13.509756: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-30 16:11:13.698615: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
100%|█████████████████████████████████████████████| 1/1 [00:18<00:00, 18.72s/it]
{'log_loss': 1.1673806825666764}


In [9]:
%%writefile make_submission_step1.py
import pandas as pd
import numpy as np

# Rows whose |P(a) - P(b)| is below this margin are re-scored by a second model.
UNCERTAIN_MARGIN = 0.8


def attach_predictions(df, preds):
    """Return a copy of ``df`` with the first-pass probabilities attached.

    Args:
        df: test table, one row per comparison.
        preds: array indexed as ``preds[:, 0]`` / ``preds[:, 1]``, row-aligned
            with ``df``.

    Returns:
        A new DataFrame with the extra columns ``winer_a``, ``winer_b`` and
        ``abs`` (absolute difference of the two probabilities). The column
        names are kept as-is because later steps read them by name.
    """
    out = df.copy()
    out["winer_a"] = preds[:, 0]
    out["winer_b"] = preds[:, 1]
    out["abs"] = np.abs(preds[:, 0] - preds[:, 1])
    return out


def build_swapped_subset(df, margin=UNCERTAIN_MARGIN):
    """Select low-margin rows and exchange ``response_a`` / ``response_b``.

    The swap is done by relabelling columns on a fresh frame, so it neither
    writes into a slice of ``df`` (no SettingWithCopyWarning) nor depends on
    whether two column Series alias the same memory.

    Args:
        df: table containing ``abs``, ``response_a`` and ``response_b``.
        margin: rows with ``abs`` strictly below this value are selected.

    Returns:
        A new DataFrame with the selected rows, the original column order, and
        the two response columns exchanged. ``df`` itself is not modified.
    """
    uncertain = df.loc[df["abs"] < margin]
    relabelled = uncertain.rename(
        columns={"response_a": "response_b", "response_b": "response_a"}
    )
    # rename() swaps the labels in place positionally; restore the column order.
    return relabelled[list(df.columns)]


def main():
    """Attach first-pass predictions to test.parquet and write test_swap.parquet."""
    preds = np.load("prob_m0.npy")
    df = attach_predictions(pd.read_parquet("test.parquet"), preds)
    df.to_parquet("test.parquet", index=False)

    build_swapped_subset(df).to_parquet("test_swap.parquet", index=False)


if __name__ == "__main__":
    main()

Writing make_submission_step1.py


In [10]:
!python make_submission_step1.py

# Inference: llama3-8b

In [11]:
%%writefile predict_m3.py
import torch
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer

from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
from human_pref.models.modeling_llama import LlamaForSequenceClassification
from human_pref.data.processors import ProcessorPAB, MIDProcessorPAB
from human_pref.data.dataset import LMSYSDataset
from human_pref.data.collators import VarlenCollator, ShardedMaxTokensCollator
from human_pref.utils import to_device


model_name_or_path = "/kaggle/input/llama-lmsys-pretrain-mid-pseudo-4096"
# Input: the low-margin rows with response_a / response_b exchanged, as written
# by make_submission_step1.py. This file can legitimately contain zero rows.
csv_path = "test_swap.parquet"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# Mark the "sequence too long" tokenizer warning as already emitted.
tokenizer.deprecation_warnings[
    "sequence-length-is-longer-than-the-specified-maximum"
] = True
processor = MIDProcessorPAB(
    tokenizer=tokenizer,
    max_length=4096,
    support_system_role=True,
)
dataset = LMSYSDataset(
    csv_file=csv_path,
    query=None,
    processor=processor,
    include_swap=False,
    is_parquet=True,
)
# Every item produced by this loader is iterated below as a sequence of
# micro-batches (see the nested loop).
# NOTE(review): max_tokens presumably caps the token count per micro-batch -- confirm.
dataloader = DataLoader(
    dataset,
    batch_size=80,
    num_workers=4,
    collate_fn=ShardedMaxTokensCollator(
        max_tokens=8192, base_collator=VarlenCollator()
    ),
)

# Model placement for pipelined inference: embeddings and the first half of the
# decoder layers go to cuda:0; the second half, the final norm and the score
# head go to cuda:1.
num_hidden_layers = 32
device_map = {
    "model.embed_tokens": "cuda:0",
    "model.norm": "cuda:1",
    "score": "cuda:1",
}
for i in range(num_hidden_layers // 2):
    device_map[f"model.layers.{i}"] = "cuda:0"
for i in range(num_hidden_layers // 2, num_hidden_layers):
    device_map[f"model.layers.{i}"] = "cuda:1"

model = LlamaForSequenceClassification.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.float16,
    device_map=device_map,
)

# Rotary inverse frequencies, computed once and cloned onto each device.
config = model.config
dim = config.hidden_size // config.num_attention_heads
inv_freq = 1.0 / (
    config.rope_theta ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)
)
inv_freq0 = inv_freq.to("cuda:0")
inv_freq1 = inv_freq.to("cuda:1")

# Debug aid: print `.device` for model.named_parameters() and
# model.model.named_buffers() to verify the placement above.

# Pipeline parallelism with two GPUs: while forward_part1 handles micro-batch k
# with cuda:0 inputs, forward_part2 finishes micro-batch k-1 with cuda:1 inputs.
# `prev_*` hold the part-1 output (already moved to cuda:1) that still needs
# part 2; they stay None until the first micro-batch has been seen.
prev_seq_info = None
prev_hidden_states = None
outs = []
for batch in tqdm(dataloader):
    for micro_batch in batch:
        input_ids = to_device(micro_batch["input_ids"], "cuda:0")
        seq_info = dict(
            cu_seqlens=micro_batch["cu_seqlens"],
            position_ids=micro_batch["position_ids"],
            max_seq_len=micro_batch["max_seq_len"],
            attn_bias=BlockDiagonalCausalMask.from_seqlens(micro_batch["seq_lens"]),
        )
        seq_info = to_device(seq_info, "cuda:0")

        with torch.no_grad(), torch.cuda.amp.autocast():
            logits = None
            if prev_hidden_states is not None:
                # Second stage for the previous micro-batch is issued first ...
                logits = model.forward_part2(prev_hidden_states, prev_seq_info, inv_freq1)
            # ... followed by the first stage for the current one.
            hidden_states = model.forward_part1(input_ids, seq_info, inv_freq0)

        prev_seq_info, prev_hidden_states = to_device(
            [seq_info, hidden_states], "cuda:1"
        )
        if logits is not None:
            outs.append(logits.cpu())

# Drain the pipeline: the last micro-batch still needs its second stage.
# Guarded so that an empty swap set does not crash on unbound `prev_*` variables.
if prev_hidden_states is not None:
    with torch.no_grad(), torch.cuda.amp.autocast():
        logits = model.forward_part2(prev_hidden_states, prev_seq_info, inv_freq1)
    outs.append(logits.cpu())

if outs:
    pred = torch.cat(outs, dim=0)
    prob = pred.softmax(-1).numpy()
    print(dataset.evaluate(prob))
else:
    # Nothing was scored: emit a (0, num_labels) array so the submission step
    # can still load and column-index the file.
    print(f"No rows found in {csv_path}; writing an empty probability array.")
    prob = np.zeros((0, config.num_labels), dtype=np.float32)

np.save('prob_m3.npy', prob)

Writing predict_m3.py


In [12]:
!python predict_m3.py

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████████████| 4/4 [03:56<00:00, 59.04s/it]
  0%|                                                     | 0/1 [00:00<?, ?it/s]2025-01-30 16:15:37.684742: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-30 16:15:37.684818: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-30 16:15:37.686591: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
100%|█████████████████████████████████████████████| 1/1 [00:05<00:00,  5.73s/it]

In [13]:
%%writefile make_submission_step2.py
import pandas as pd
import numpy as np

# Low-margin rows that were re-scored with response_a / response_b exchanged.
swap_rows = pd.read_parquet("test_swap.parquet")

# First-pass probabilities stored alongside those rows.
first_pass = swap_rows[['winer_a', 'winer_b']].values
# Second-pass probabilities; columns are flipped because that model was fed the
# responses in swapped order.
second_pass = np.load("prob_m3.npy")[:, [1, 0]]

# Weighted blend of the two passes (first pass weighted 3.2 : 1).
preds = np.average([first_pass, second_pass], axis=0, weights=[3.2, 1])

df = pd.read_parquet("test.parquet")
low_margin = df['abs'] < 0.8

# Default decision comes from the first pass; low-margin rows are overridden
# with the blended decision.
df['winner'] = df['winer_a'] >= df['winer_b']
df.loc[low_margin, 'winner'] = preds[:, 0] > preds[:, 1]
df['winner'] = df['winner'].map({True: 'model_a', False: 'model_b'})

sub = df[['id', 'winner']]
sub.to_csv("submission.csv", index=False)
print(sub.head())

Writing make_submission_step2.py


In [14]:
!python make_submission_step2.py

        id   winner
0   327228  model_b
1  1139415  model_b
2  1235630  model_a
