In [1]:
import argparse
from datasets import load_dataset, Dataset, load_from_disk

  from .autonotebook import tqdm as notebook_tqdm


In [35]:
parser = argparse.ArgumentParser(description="Preprocess PokerBench for GRPO training")
parser.add_argument("--output_dataset", type=str, default="processed-poker-gto", 
                    help="Name of the output dataset on the Hub")
args = parser.parse_args([])

# Load the dataset

dataset = load_from_disk("/workdir/saved-datasets/PokerBench-Modified")

# # Process the dataset
# processed_dataset = dataset.map(
#     format_poker_data,
#     num_proc=8,
#     remove_columns=dataset["train"].column_names  # Remove original columns
# )

# # Push to the Hub
# processed_dataset.push_to_hub(args.output_dataset)
# print(f"Preprocessed dataset pushed to: {args.output_dataset}")

In [38]:
# Display first few examples from the dataset
print("Dataset sample:")
for i, example in enumerate(dataset['train'].select(range(3))):
    print(f"\nExample {i+1}:")
    print("Instruction:", example['instruction'])
    print("Output:", example['output'])

# Display dataset info
print("\nDataset info:")
print(dataset['train'])
print(dataset['test'])



Dataset sample:

Example 1:
Instruction: 

You are a specialist in playing 6-handed No Limit Texas Holdem. The following will be a game scenario and you need to make the optimal decision.

Here is a game summary:

The small blind is 0.5 chips and the big blind is 1 chips. Everyone started with 100 chips.
The player positions involved in this game are UTG, HJ, CO, BTN, SB, BB.
In this hand, your position is HJ, and your holding is [King of Diamond and Jack of Spade].
Before the flop, HJ raise 2.0 chips, and BB call. Assume that all other players that is not mentioned folded.
The flop comes King Of Spade, Seven Of Heart, and Two Of Diamond, then BB check, and HJ check.
The turn comes Jack Of Club, then BB check, HJ bet 3 chips, BB raise 10 chips, and HJ call.
The river comes Seven Of Club, then BB check.


Now it is your turn to make a move.
To remind you, the current pot size is 24.0 chips, and your holding is [King of Diamond and Jack of Spade].

Decide on an action based on the streng

In [12]:
from datasets import load_from_disk, DatasetDict
import os

# Load the dataset from local path
dataset_path = "/workdir/saved-datasets/PokerBench"
dataset = load_from_disk(dataset_path)

# Function to remove the phrase from each instruction
def remove_phrase(example):
    # Replace "Do not explain your answer." with an empty string
    modified_instruction = example["instruction"].replace("Do not explain your answer.", "")
    # Return the modified example
    return {"instruction": modified_instruction, "output": example["output"]}

# Apply the transformation to all splits in the dataset
modified_dataset = DatasetDict()
for split in dataset:
    modified_dataset[split] = dataset[split].map(
        remove_phrase,
        desc=f"Processing {split} split"
    )

# Save the entire DatasetDict to disk
output_path = "/workdir/saved-datasets/PokerBench-Modified"
modified_dataset.save_to_disk(output_path)

print(f"Modified dataset saved to {output_path}")

# Optionally, verify the changes
print("\nVerifying changes:")
for split in modified_dataset:
    original_example = dataset[split][0]["instruction"]
    modified_example = modified_dataset[split][0]["instruction"]
    
    print(f"\nOriginal instruction ends with: '{original_example[-50:]}'")
    print(f"Modified instruction ends with: '{modified_example[-50:]}'")

Saving the dataset (0/2 shards):  48%|████▊     | 270000/563200 [00:00<00:00, 1210901.94 examples/s]

Saving the dataset (2/2 shards): 100%|██████████| 563200/563200 [00:03<00:00, 165194.40 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 11000/11000 [00:00<00:00, 101254.99 examples/s]

Modified dataset saved to /workdir/saved-datasets/PokerBench-Modified

Verifying changes:

Original instruction ends with: 'o not explain your answer.
Your optimal action is:'
Modified instruction ends with: ', and actions before you. 
Your optimal action is:'

Original instruction ends with: 'o not explain your answer.
Your optimal action is:'
Modified instruction ends with: ', and actions before you. 
Your optimal action is:'





In [34]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# For models - downloads and caches automatically
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-14B")
model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-14B")


# Save model and tokenizer to a specific local path
local_model_path = "/workdir/pretrained-models/DeepSeek-R1-Distill-Qwen-14B"
model.save_pretrained(local_model_path)
tokenizer.save_pretrained(local_model_path)


Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.27s/it]


('/workdir/pretrained-models/DeepSeek-R1-Distill-Qwen-14B/tokenizer_config.json',
 '/workdir/pretrained-models/DeepSeek-R1-Distill-Qwen-14B/special_tokens_map.json',
 '/workdir/pretrained-models/DeepSeek-R1-Distill-Qwen-14B/tokenizer.json')

In [31]:
from transformers import AutoConfig
import os

# 模型路径
model_path = os.path.abspath("/workdir/pretrained-models/DeepSeek-R1-Distill-Qwen-7B")


# 加载配置
config = AutoConfig.from_pretrained(model_path)

In [32]:
config

Qwen2Config {
  "_name_or_path": "/workdir/pretrained-models/DeepSeek-R1-Distill-Qwen-7B",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 3584,
  "initializer_range": 0.02,
  "intermediate_size": 18944,
  "max_position_embeddings": 131072,
  "max_window_layers": 28,
  "model_type": "qwen2",
  "num_attention_heads": 28,
  "num_hidden_layers": 28,
  "num_key_value_heads": 4,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.49.0",
  "use_cache": true,
  "use_mrope": false,
  "use_sliding_window": false,
  "vocab_size": 152064
}

In [39]:
from datasets import load_from_disk, DatasetDict
import os
import random

# 设置随机种子以确保可重复性
random.seed(42)

# 加载原始数据集
dataset_path = "/workdir/saved-datasets/PokerBench-Modified"
full_dataset = load_from_disk(dataset_path)

# 创建一个新的 DatasetDict 来存储精简版数据集
small_dataset = DatasetDict()

# 计算训练集的 10%
train_size = len(full_dataset['train'])
sample_size = int(train_size * 0.1)
print(f"原始训练集大小: {train_size}")
print(f"10% 采样大小: {sample_size}")

# 随机选择 10% 的训练数据
train_indices = random.sample(range(train_size), sample_size)
small_dataset['train'] = full_dataset['train'].select(train_indices)

# 保留完整的测试集
small_dataset['test'] = full_dataset['test']

# 保存新数据集
output_path = "/workdir/saved-datasets/PokerBench-Small"
small_dataset.save_to_disk(output_path)

# 验证新数据集
print("\n新数据集信息:")
print(f"训练集大小: {len(small_dataset['train'])}")
print(f"测试集大小: {len(small_dataset['test'])}")

print(f"\n新数据集已保存到: {output_path}")

# 随机抽样几个示例进行验证
print("\n随机样本验证:")
for i, example in enumerate(small_dataset['train'].select(range(3))):
    print(f"\n示例 {i+1}:")
    print("指令: ", example['instruction'][:100] + "..." if len(example['instruction']) > 100 else example['instruction'])
    print("输出: ", example['output'])

原始训练集大小: 563200
10% 采样大小: 56320


Saving the dataset (0/1 shards):  37%|███▋      | 21000/56320 [00:00<00:00, 200297.41 examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 56320/56320 [00:00<00:00, 230623.88 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 11000/11000 [00:00<00:00, 1054977.80 examples/s]


新数据集信息:
训练集大小: 56320
测试集大小: 11000

新数据集已保存到: /workdir/saved-datasets/PokerBench-Small

随机样本验证:

示例 1:
指令:  

You are a specialist in playing 6-handed No Limit Texas Holdem. The following will be a game scena...
输出:  fold

示例 2:
指令:  

You are a specialist in playing 6-handed No Limit Texas Holdem. The following will be a game scena...
输出:  bet 22

示例 3:
指令:  

You are a specialist in playing 6-handed No Limit Texas Holdem. The following will be a game scena...
输出:  raise 10





In [None]:
poker_answer_pattern = r"<answer>\s*(check|call|fold|bet\s+\d+(\.\d+)?|raise\s+\d+(\.\d+)?)\s*</answer>"
