In [None]:
import json
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [None]:
# Load the WSDM Cup training split from the Kaggle input mount, then show its
# shape and first five rows.
# NOTE(review): df_wsdm is not referenced again in the visible cells - confirm
# whether this load is still needed.
df_wsdm = pd.read_parquet("/kaggle/input/wsdm-cup-multilingual-chatbot-arena/train.parquet")
print(df_wsdm.shape)
df_wsdm.head(5)

### LMSYS 57k

In [None]:
# Load the LMSYS Chatbot Arena training CSV and report its raw (rows, columns) shape.
df_lmsys = pd.read_csv("/kaggle/input/lmsys-chatbot-arena/train.csv")
print(df_lmsys.shape)
import pandas as pd
import json

def process_lmsys_df(df_lmsys):
    """Flatten an LMSYS-style arena frame into prompt/response text plus a winner label.

    The input frame is left untouched: all work happens on a copy, so the
    function can be called again on the same raw frame.

    Args:
        df_lmsys: DataFrame whose ``prompt``, ``response_a`` and ``response_b``
            cells are JSON-encoded lists of strings (entries may be ``null``)
            and which carries the ``winner_model_a``, ``winner_model_b`` and
            ``winner_tie`` indicator columns.

    Returns:
        A new DataFrame without the three indicator columns, where

        * ``prompt`` / ``response_a`` / ``response_b`` hold the non-null list
          entries joined with a single space,
        * ``turn`` is the number of entries in the decoded prompt list,
        * ``winner`` is ``"model_a"`` or ``"model_b"``,
        * rows whose largest indicator is ``winner_tie`` are removed.

    Raises:
        json.JSONDecodeError: if a text cell is not valid JSON.
        KeyError: if one of the expected columns is missing.
    """
    text_cols = ["prompt", "response_a", "response_b"]
    label_cols = ["winner_model_a", "winner_model_b", "winner_tie"]

    # Copy first: the original implementation overwrote and dropped columns on
    # the caller's frame, which made a second call on the same frame fail.
    df = df_lmsys.copy()

    # Decode every text column once; the decoded prompt list is also needed
    # for the turn count before it is flattened to a string.
    decoded = {col: df[col].apply(json.loads) for col in text_cols}

    # Position of the largest indicator: 0 -> model_a, 1 -> model_b, 2 -> tie.
    df["winner"] = df[label_cols].values.argmax(axis=1)
    df["turn"] = decoded["prompt"].apply(len)
    df = df.drop(columns=label_cols)

    for col in text_cols:
        df[col] = decoded[col].apply(
            lambda parts: " ".join(part for part in parts if part is not None)
        )

    # Ties carry no preference signal for a two-way label, so drop them.
    df = df[df["winner"] != 2].copy()
    df["winner"] = df["winner"].apply(lambda x: "model_a" if x == 0 else "model_b")
    return df
# Normalize the LMSYS frame (process_lmsys_df drops tie rows) and inspect the result.
# NOTE: df_lmsys is rebound to the processed frame, whose text columns are plain
# joined strings; re-running this line without re-running the read_csv above
# feeds those strings back into json.loads and will generally fail.
df_lmsys = process_lmsys_df(df_lmsys)
print(df_lmsys.shape)
df_lmsys.head()

### UT 157k

In [None]:
# Load the UltraFeedback preference parquet from the Kaggle input mount and preview it.
df_ut = pd.read_parquet("/kaggle/input/llm-human-preference-data-ultrafeedback/ultrafeedback.parquet")
print(df_ut.shape)
df_ut.head()

In [None]:
# How often each rating value occurs on the chosen and on the rejected side.
print("Chosen-rating distribution:", df_ut["chosen-rating"].value_counts(), sep="\n")
print("\nRejected-rating distribution:", df_ut["rejected-rating"].value_counts(), sep="\n")

# Per-row margin (chosen minus rejected), followed by its distribution and mean.
df_ut["chosen-rating-rejected-rating"] = df_ut["chosen-rating"].sub(df_ut["rejected-rating"])
print(
    df_ut["chosen-rating-rejected-rating"].value_counts(),
    df_ut["chosen-rating-rejected-rating"].mean(),
    sep="\n",
)

In [None]:
# Reload the raw UltraFeedback frame; this discards the helper margin column
# that the rating-distribution cell added to df_ut.
df_ut = pd.read_parquet("/kaggle/input/llm-human-preference-data-ultrafeedback/ultrafeedback.parquet")
print(df_ut.shape)

# Seed Python's global RNG: process_ut_df draws from it to decide the A/B order.
# process_dpo_df later draws from the same global stream without reseeding.
random.seed(0)
def process_ut_df(df_ut, min_rating_gap=1.50):
    """Turn UltraFeedback chosen/rejected pairs into randomly ordered A/B rows.

    Only pairs whose ``chosen-rating`` exceeds ``rejected-rating`` by at least
    ``min_rating_gap`` are kept. For every kept pair a draw from Python's
    global ``random`` generator decides whether the chosen reply is placed in
    slot A (``winner == "model_a"``) or slot B (``winner == "model_b"``), so
    seed ``random`` beforehand for reproducible output.

    Args:
        df_ut: DataFrame with the columns ``chosen`` and ``rejected`` (each a
            two-message conversation: the prompt message followed by the
            reply, every message indexable by ``"content"``),
            ``chosen-rating``, ``rejected-rating``, ``chosen-model`` and
            ``rejected-model``.
        min_rating_gap: Minimum ``chosen-rating - rejected-rating`` margin
            (inclusive) a pair needs to be kept. Defaults to 1.50, the value
            that used to be hard-coded.

    Returns:
        DataFrame with string columns ``model_a``, ``model_b``, ``prompt``,
        ``response_a``, ``response_b``, plus ``winner``, a sequential ``id``
        (``ultrachat_00000``, ...) and ``turn`` fixed to 1.

    Raises:
        AssertionError: if a kept pair is not exactly two messages per side
            or the two sides do not share the same first message.
        KeyError: if an expected column is missing.
    """
    text_columns = ["model_a", "model_b", "prompt", "response_a", "response_b"]
    records = []

    # Iterate the needed columns in lockstep instead of iterrows(), which
    # builds a full Series object for every row.
    pairs = zip(
        df_ut["chosen"],
        df_ut["rejected"],
        df_ut["chosen-rating"],
        df_ut["rejected-rating"],
        df_ut["chosen-model"],
        df_ut["rejected-model"],
    )
    for chosen, rejected, chosen_rating, rejected_rating, chosen_model, rejected_model in tqdm(
        pairs, total=len(df_ut)
    ):
        # Written as `not (gap >= threshold)` so that NaN ratings are skipped too.
        if not (chosen_rating - rejected_rating >= min_rating_gap):
            continue

        assert len(chosen) == 2
        assert len(rejected) == 2
        assert rejected[0] == chosen[0]

        prompt = chosen[0]["content"]
        response_a, response_b = chosen[1]["content"], rejected[1]["content"]
        model_a, model_b = chosen_model, rejected_model

        # Randomize which slot holds the preferred reply so the label is not
        # always "model_a".
        if random.random() > 0.5:
            winner = "model_a"
        else:
            winner = "model_b"
            response_a, response_b = response_b, response_a
            model_a, model_b = model_b, model_a

        records.append((model_a, model_b, prompt, response_a, response_b, winner))

    merge_df = pd.DataFrame(records, columns=text_columns + ["winner"])
    merge_df = merge_df.astype({col: str for col in text_columns})
    merge_df["id"] = [f"ultrachat_{i:05}" for i in range(len(merge_df))]
    merge_df["turn"] = 1
    return merge_df
    
# Build the pairwise A/B frame from the UltraFeedback pairs and inspect it.
df_ut = process_ut_df(df_ut)
print(df_ut.shape)
df_ut.head()

### additional-33k-labelled-conversations

In [None]:
# Load the additional labelled LMSYS conversations and run them through the
# same preprocessing as the main LMSYS set; the two prints show the shape
# before and after processing.
df_add = pd.read_csv("/kaggle/input/lmsys-additional-33k-labelled-conversations/lmsys-33k.csv")
print(df_add.shape)
df_add = process_lmsys_df(df_add)
print(df_add.shape)
df_add.head()

### orpo-dpo-mix-40k

In [None]:
from datasets import load_dataset
# Fetch the train split of mlabonne/orpo-dpo-mix-40k with datasets.load_dataset
# and convert it to a pandas DataFrame for the processing below.
dataset = load_dataset('mlabonne/orpo-dpo-mix-40k', split='train')
df_dpo = dataset.to_pandas()
print(df_dpo.shape)
df_dpo.head()

In [None]:
def process_dpo_df(df_ut):
    """Convert chosen/rejected chat transcripts into randomly ordered A/B rows.

    Args:
        df_ut: DataFrame with ``chosen`` and ``rejected`` columns. Each cell is
            a sequence of messages indexable by ``"role"`` and ``"content"``,
            and both sides of a row must contain the same number of messages.

    Returns:
        DataFrame with ``model_a`` / ``model_b`` (always ``"unknown"``),
        ``prompt`` (user messages of the chosen transcript joined by a space),
        ``response_a`` / ``response_b`` (assistant messages joined by a space),
        ``winner`` (which slot holds the chosen transcript's replies), a
        sequential ``id`` and ``turn`` (the number of user messages).

    The A/B order is drawn from Python's global ``random`` generator, one draw
    per row.
    """

    def _contents(messages, role):
        # Message bodies spoken by `role`, in transcript order.
        return [message["content"] for message in messages if message["role"] == role]

    def _flatten(parts):
        # Space-join the parts, leaving out missing (None) entries.
        return " ".join(part for part in parts if part is not None)

    records = []
    for _, record in tqdm(df_ut.iterrows(), total=len(df_ut)):
        chosen_msgs = record["chosen"]
        rejected_msgs = record["rejected"]

        assert len(chosen_msgs) == len(rejected_msgs)

        user_turns = _contents(chosen_msgs, "user")
        preferred = _contents(chosen_msgs, "assistant")
        dispreferred = _contents(rejected_msgs, "assistant")

        # One draw per row decides which slot receives the preferred replies.
        preferred_first = random.random() > 0.5
        if preferred_first:
            slot_a, slot_b, label = preferred, dispreferred, "model_a"
        else:
            slot_a, slot_b, label = dispreferred, preferred, "model_b"

        records.append(("unknown", "unknown", user_turns, slot_a, slot_b, label))

    result = pd.DataFrame(
        records,
        columns=["model_a", "model_b", "prompt", "response_a", "response_b", "winner"],
    )
    result["id"] = [f"orpo-dpo-mix_{position:05}" for position in range(len(result))]
    # Count turns while `prompt` is still a list, then flatten the text columns.
    result["turn"] = result["prompt"].apply(len)
    for column in ("prompt", "response_a", "response_b"):
        result[column] = result[column].apply(_flatten)
    return result
    
# Convert the orpo-dpo-mix transcripts to the shared A/B layout and inspect them.
df_dpo = process_dpo_df(df_dpo)
# Report only the shape, like every other dataset cell; printing the frame
# itself dumped a truncated table into the cell output.
print(df_dpo.shape)
df_dpo.head()

## Filtering Data

In [None]:
# df_ut.to_parquet("ut_157k.parquet", index=False)
# df_lmsys.to_parquet("lmsys_39k.parquet", index=False)
# df_add.to_parquet("add_23k.parquet", index=False)
# df_dpo.to_parquet("dpo_44k.parquet", index=False)

In [None]:
# Give every source fresh, 1-based ids of the form "<source>_<n>". assign()
# returns a new frame, so the previously bound objects are not modified.
df_lmsys = df_lmsys.assign(id=[f"lmsys_{n}" for n in range(1, len(df_lmsys) + 1)])
df_add = df_add.assign(id=[f"add_{n}" for n in range(1, len(df_add) + 1)])
df_ut = df_ut.assign(id=[f"ut_{n}" for n in range(1, len(df_ut) + 1)])
df_dpo = df_dpo.assign(id=[f"dpo_{n}" for n in range(1, len(df_dpo) + 1)])

# Stack the four sources; the first print is the row count before de-duplication.
superset = pd.concat([df_lmsys, df_add, df_ut, df_dpo], ignore_index=True)
print(len(superset))

# Rows with identical prompt and both responses are duplicates; keep='last'
# retains the copy from the source stacked latest.
superset = (
    superset
    .drop_duplicates(subset=['prompt', 'response_a', 'response_b'], keep='last')
    .reset_index(drop=True)
)
print(len(superset))
superset.head()

In [None]:
import pandas as pd
import numpy as np

# 1. Compute each row's length (combined length of prompt, response_a and response_b)
superset['total_length'] = superset['prompt'].apply(len) + superset['response_a'].apply(len) + superset['response_b'].apply(len)

# 2. Shuffle the rows randomly
# NOTE(review): the sort_values call below reorders every row by total_length,
# so this shuffle can at most influence the relative order of equal-length rows.
superset = superset.sample(frac=1, random_state=42).reset_index(drop=True)

# 3. Assign batches, making sure no rows are lost
batch_size = 32  # batch size

# Sort the rows by total_length
superset = superset.sort_values(by='total_length').reset_index(drop=True)

# 4. Build the batches (consecutive slices of the length-sorted frame)
batches = []
num_batches = len(superset) // batch_size

for i in range(num_batches):
    batch = superset.iloc[i * batch_size: (i + 1) * batch_size]
    batches.append(batch)

# Handle the leftover rows (appended as one final, smaller batch)
remaining_data = superset.iloc[num_batches * batch_size:]
if not remaining_data.empty:
    print(f"Adding {len(remaining_data)} remaining rows to the final dataset.")
    batches.append(remaining_data)

# 5. Concatenate all batches and shuffle
# NOTE(review): sample(frac=1) permutes individual rows, not whole batches, so
# rows of one length-sorted batch do not stay contiguous in balanced_superset.
# If length-homogeneous batches are intended, shuffle the order of `batches`
# before concatenating instead - confirm the intent.
balanced_superset = pd.concat(batches, ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)

# 6. Print the result (both counts should match: no rows are dropped above)
print(f"Original length: {len(superset)}")
print(f"Balanced length: {len(balanced_superset)}")
balanced_superset.head()

In [None]:
import matplotlib.pyplot as plt

# Line plot of total_length against row position in balanced_superset, drawn
# through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(balanced_superset.index, balanced_superset['total_length'], color='blue', linewidth=1)

# Title and axis labels
ax.set_title("Trend of Total Length in Balanced Superset", fontsize=14)
ax.set_xlabel("Index", fontsize=12)
ax.set_ylabel("Total Length", fontsize=12)

# Grid lines, then render the figure
ax.grid(True)
plt.show()

In [None]:
# Persist the merged dataset. ids are forced to str so the column has a single type.
balanced_superset['id'] = balanced_superset['id'].astype(str)
# total_length was only a helper for the length batching. errors='ignore' keeps
# this cell re-runnable: balanced_superset is rebound here, so on a second run
# the column is already gone and a plain drop would raise KeyError.
balanced_superset = balanced_superset.drop(columns=['total_length'], errors='ignore')
balanced_superset.to_parquet("all_extra_161k.parquet", index=None)
balanced_superset

In [None]:
# Label balance check: number of rows won by slot A, then by slot B.
print(superset['winner'].eq('model_a').sum())
print(superset['winner'].eq('model_b').sum())

In [None]:
# Shell escape: list the files in the current working directory.
!ls