# Training a causal language model from scratch (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [4]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
# To run the training on TPU, you will need to uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!apt install git-lfs
!cd /workspace/matmulfreellm
!pip install -e .

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
[1;31mE: [0mUnable to locate package git-lfs[0m


You will need to setup git, adapt your email and name in the following cell.

In [10]:
!git config --global user.email "zhongwei.xie@hotmail.com"
!git config --global user.name "zhongweixie"


You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

In [8]:
!pip install ipywidgets

Collecting ipywidgets
  Downloading ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting widgetsnbextension~=4.0.12 (from ipywidgets)
  Downloading widgetsnbextension-4.0.13-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.12 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.13-py3-none-any.whl.metadata (4.1 kB)
Downloading ipywidgets-8.1.5-py3-none-any.whl (139 kB)
Downloading jupyterlab_widgets-3.0.13-py3-none-any.whl (214 kB)
Downloading widgetsnbextension-4.0.13-py3-none-any.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets
Successfully installed ipywidgets-8.1.5 jupyterlab-widgets-3.0.13 widgetsnbextension-4.0.13
[0m

In [1]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
def any_keyword_in_string(string, keywords):
    for keyword in keywords:
        if keyword in string:
            return True
    return False

In [3]:
filters = ["pandas", "sklearn", "matplotlib", "seaborn"]
example_1 = "import numpy as np"
example_2 = "import pandas as pd"

print(
    any_keyword_in_string(example_1, filters), any_keyword_in_string(example_2, filters)
)

False True


In [13]:
!pip install datasets

[0m

In [4]:
from collections import defaultdict
from tqdm import tqdm
from datasets import Dataset


def filter_streaming_dataset(dataset, filters):
    filtered_dict = defaultdict(list)
    total = 0
    for sample in tqdm(iter(dataset)):
        total += 1
        if any_keyword_in_string(sample["content"], filters):
            for k, v in sample.items():
                filtered_dict[k].append(v)
    print(f"{len(filtered_dict['content'])/total:.2%} of data after filtering.")
    return Dataset.from_dict(filtered_dict)

In [5]:
# This cell will take a very long time to execute, so you should skip it and go to
# the next one!
from datasets import load_dataset

split = "train"  # "valid"
filters = ["text", "meta"]
#data = load_dataset(f"/siflow/cerebras/SlimPajama-627B/", split=split, streaming=True)

data = load_dataset(f"/siflow/cerebras/SlimPajama-627B/{split}/chunk1/", split=split, streaming=True)
print(data)

Resolving data files:   0%|          | 0/5912 [00:00<?, ?it/s]

IterableDataset({
    features: Unknown,
    n_shards: 5912
})


In [6]:
from functools import partial
from data_loader import *

device = 'cuda:0'
device_type = 'cuda'
batch_size =64
max_seq_len = 350
DATA_CACHE_DIR = "/siflow/cerebras/SlimPajama-627B/train/chunk1/"

In [7]:
iter_batches = partial(
    iter_batch_func,
    device=device,
    batch_size=batch_size,
    max_seq_len=max_seq_len,
    data_cache_dir=DATA_CACHE_DIR
)

In [10]:
iter_batches

functools.partial(<function iter_batch_func at 0x148982b8f640>, device='cuda:0', batch_size=64, max_seq_len=350, data_cache_dir='/siflow/cerebras/SlimPajama-627B/train/chunk1/')

In [202]:
# This cell will take a very long time to execute, so you should skip it and go to
# the next one!
from datasets import load_dataset, DatasetDict

split0 = "train"  # "valid"
split1 = "validation"
filters = ["text", "meta"]

ds_train = load_dataset(f"/siflow/cerebras/SlimPajama-627B/{split0}/chunk1/", split=split,streaming=True)
ds_valid = load_dataset(f"/siflow/cerebras/SlimPajama-627B/{split1}/chunk1/", split=split,streaming=True)
raw_datasets = DatasetDict(
    {
        "train": ds_train,  # .shuffle().select(range(50000)),
        "valid": ds_valid,  # .shuffle().select(range(500))
    }
)

print(type(raw_datasets["train"]))
raw_datasets

Resolving data files:   0%|          | 0/5913 [00:00<?, ?it/s]

DatasetDict({
    train: IterableDataset({
        features: Unknown,
        n_shards: 5913
    })
    valid: IterableDataset({
        features: Unknown,
        n_shards: 2
    })
})

In [15]:
from datasets import load_dataset, DatasetDict

print(dir(raw_datasets["train"]))

['__abstractmethods__', '__add__', '__annotations__', '__class__', '__class_getitem__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__orig_bases__', '__parameters__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_distributed', '_effective_generator', '_epoch', '_ex_iterable', '_formatting', '_head', '_info', '_is_main_process', '_is_protocol', '_iter_pytorch', '_prepare_ex_iterable_for_iteration', '_prepared_ex_iterable', '_resolve_features', '_shuffling', '_split', '_starting_state_dict', '_state_dict', '_step', '_token_per_repo_id', 'add_column', 'batch', 'builder_name', 'cast', 'cast_column', 'citation', 'column_names', 'config_name', 'dataset_size', 'descript

In [20]:
print(raw_datasets["train"])

IterableDataset({
    features: Unknown,
    n_shards: 5912
})


In [21]:
!pip install pandas zstandard

Collecting zstandard
  Downloading zstandard-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Downloading zstandard-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: zstandard
Successfully installed zstandard-0.23.0
[0m

In [214]:
import zstandard
import json
import io

def decompress_zst_to_json(zst_file_path):
    dctx = zstandard.ZstdDecompressor()
    with open(zst_file_path, 'rb') as compressed:
        with dctx.stream_reader(compressed) as reader:
            text_stream = io.TextIOWrapper(reader, encoding='utf-8')
            data_list = [json.loads(line) for line in text_stream]
    return data_list

In [209]:
import torch
from torch.utils.data import IterableDataset

class JsonIterableDataset(IterableDataset):
    def __init__(self, data_list):
        self.data_list = data_list

    def __iter__(self):
        for data in self.data_list:
            yield data

    def __getitem__(self, idx):
        return self.data[idx]
    
    def __len__(self):
        # 返回数据集的大小，如果未知，可以返回 None 或者一个估计值
        return len(self.data_list)


In [28]:
def process_folder(folder_path):
    all_data = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.zst'):
            file_path = os.path.join(folder_path, filename)
            data = decompress_zst_to_json(file_path)
            all_data.extend(data)
    return all_data

In [215]:
def process_specific_files(folder_path, num_files):
    all_data = []
    for i in range(num_files):
        #example_train_0.jsonl.zst
        filename = f"example_train_{i}.jsonl.zst"
        #filename = f"example_holdout_{i}.jsonl.zst"
        file_path = os.path.join(folder_path, filename)
        if os.path.exists(file_path):
            data = decompress_zst_to_json(file_path)
            all_data.extend(data)
        else:
            print(f"File not found: {file_path}")
    return all_data

In [217]:
from torch.utils.data import DataLoader
from datasets import Dataset

# 假设 `data_list` 是从 `.zst` 文件解压并解析后得到的 JSON 数据列表
all_data = process_specific_files(f"/siflow/cerebras/SlimPajama-627B/{split}/chunk1/",1)
valid_data = process_specific_files(f"/siflow/cerebras/SlimPajama-627B/validation/chunk1/",1)

    # 创建数据集和数据加载器
dataset = JsonIterableDataset(all_data)
valid_dataset =JsonIterableDataset(valid_data)
# data_list = decompress_zst_to_json('/siflow/cerebras/SlimPajama-627B/validation/chunk1/example_holdout_0.jsonl.zst')
# dataset = JsonIterableDataset(data_list)
dataloader = DataLoader(dataset, batch_size=32)
print(type(dataloader))


In [None]:
from datasets import load_dataset, DatasetDict

ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train")
ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation")

raw_datasets = DatasetDict(
    {
        "train": ds_train,  # .shuffle().select(range(50000)),
        "valid": ds_valid,  # .shuffle().select(range(500))
    }
)

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 606720
    })
    valid: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 3322
    })
})

In [66]:
!pip install chardet

Collecting chardet
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Downloading chardet-5.2.0-py3-none-any.whl (199 kB)
Installing collected packages: chardet
Successfully installed chardet-5.2.0
[0m

In [267]:
jsonl_path = '/siflow/cerebras/SlimPajama-627B/validation/chunk1/example_train_0.jsonl'

# zst_path ='/siflow/cerebras/SlimPajama-627B/train/chunk1/example_train_0.jsonl.zst'
# decompress_zst_to_json(zst_path, jsonl_path)
encoding = 'big5'
common_encodings = ['utf-8', 'ascii', 'latin1', 'iso-8859-1', 'utf-16', 'utf-32', 'gbk', 'gb2312', 'big5']
# for encoding in common_encodings:
#     with open(jsonl_path, 'r',encoding=encoding) as f:
#         data = [json.loads(line) for line in f]
#         print(data)

In [73]:
!pip install cchardet
import cchardet
def detect_encoding_large_file(file_path, sample_size=4096):
    with open(file_path, 'rb') as f:
        raw_data = f.read(sample_size)
    result = cchardet.detect(raw_data)
    encoding = result.get('encoding')
    return encoding

file_path = '/siflow/cerebras/SlimPajama-627B/train/chunk1/example_train_0.jsonl.zst'
encoding = detect_encoding_large_file(file_path)
print(f'The encoding of the file is: {encoding}')

[0mThe encoding of the file is: None


In [79]:
def try_encodings(file_path, encodings):
    with open(file_path, 'rb') as f:
        raw_data = f.read()
    for encoding in encodings:
        try:
            decoded_data = raw_data.decode(encoding)
            print(f'Success: {file_path} can be decoded with {encoding}')
            print(decoded_data[:500])  # 打印前500个字符进行检查
            return encoding
        except UnicodeDecodeError:
            print(f'Failed: {file_path} cannot be decoded with {encoding}')
    return None

# 常见的编码列表
common_encodings = ['utf-8', 'ascii', 'latin1', 'iso-8859-1', 'utf-16', 'utf-32', 'gbk', 'gb2312', 'big5']

# 文件路径
file_path = '/siflow/cerebras/SlimPajama-627B/train/chunk1/example_train_0.jsonl.zst'

# 尝试不同的编码
encoding = try_encodings(file_path, common_encodings)
if encoding:
    print(f'The encoding of the file is likely: {encoding}')
else:
    print('No encoding worked. The file may not be text, or it may be encoded in an uncommon format.')

Failed: /siflow/cerebras/SlimPajama-627B/train/chunk1/example_train_0.jsonl.zst cannot be decoded with utf-8
Failed: /siflow/cerebras/SlimPajama-627B/train/chunk1/example_train_0.jsonl.zst cannot be decoded with ascii
Success: /siflow/cerebras/SlimPajama-627B/train/chunk1/example_train_0.jsonl.zst can be decoded with latin1
èsCÆåCÚí¿ÇêÖ2ÛÔ)dúüû¥\ù;Þªß¯hÁQá>tsðò*Å`§ÑT?+ó÷à¿6÷Ï{dÚ·ÜÈýÉµTçYiR^¡íYwuÏZO¯>¹ÖuÂPÇ­tª©f¿ú¾S¨!We
ZxxX@`Â ZUó_ÖîXbw¤u[:Ö
dÚ6ÿ¹á
È;ì@¡Ôd§q@¦AÂ  
Õ±½¾r¯v.ºj5´Ñºäxñyö-7µ}ZêO+/Åpp4ÈÐ(¼
The encoding of the file is likely: latin1


In [130]:
dataloader = DataLoader(dataset, batch_size=32)
print(type(dataloader))
flag = True
for batch in dataloader:
    # 处理你的数据
    if(flag):
        #print(batch)
        print(type(batch))
#print(batch)

<class 'torch.utils.data.dataloader.DataLoader'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class

In [270]:
from transformers import AutoTokenizer

context_length = 32000
tokenizer = AutoTokenizer.from_pretrained("/workspace/mistral-7B-PoSE-32k",trust_remote_host=True)

outputs = tokenizer(
    #raw_datasets["train"][:2]["content
    batch['text'][0],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)

print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

OSError: Incorrect path_or_model_id: '/workspace/mistral-7B-PoSE-32k'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [256]:
from transformers import AutoTokenizer

context_length = 128
tokenizer = AutoTokenizer.from_pretrained("/workspace/code-search-net-tokenizer",trust_remote_host=True)

outputs = tokenizer(
    #raw_datasets["train"][:2]["content
    batch['text'][0],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)

print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")



In [257]:
from transformers import AutoTokenizer
from torch.utils.data import IterableDataset

# 加载分词器
tokenizer = AutoTokenizer.from_pretrained("/workspace/code-search-net-tokenizer",trust_remote_host=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# 定义分词函数
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

# 创建分词后的 IterableDataset
class TokenizedIterableDataset(IterableDataset):
    def __init__(self, iterable_dataset):
        self.iterable_dataset = iterable_dataset

    def __iter__(self):
        for sample in self.iterable_dataset:
            tokenized_sample = tokenize_function(sample)
            yield tokenized_sample
    def __len__(self):
        # 你可以返回一个固定的大小，或者 None
        # 如果数据集的大小是未知的，返回 None
        return len(self.iterable_dataset)


# 创建分词后的数据集实例
tokenized_datasets = TokenizedIterableDataset(iterable_dataset=dataset)
tokenized_valid_datasets = TokenizedIterableDataset(iterable_dataset=valid_dataset)
# 迭代分词后的数据集
# for tokenized_sample in tokenized_datasets:
#     print(tokenized_sample)

In [174]:
flag = True
for tokenized_sample in tokenized_datasets:
    if flag:
        print(tokenized_sample)
    else:
        continue
    flag =False

{'input_ids': [42, 14, 42, 14, 393, 6869, 516, 734, 2578, 5335, 8898, 18190, 269, 20473, 755, 4321, 26, 25890, 7766, 410, 56, 7, 173, 6651, 15, 1580, 15, 12237, 6374, 26, 2137, 6896, 2162, 17295, 438, 979, 3086, 12, 11991, 173, 2096, 302, 311, 256, 1367, 14, 42, 14, 173, 3061, 46431, 31316, 4568, 89, 173, 11643, 26, 1163, 26, 543, 249, 14, 77, 14, 2162, 41229, 928, 8966, 292, 256, 42294, 26503, 12, 333, 20473, 755, 4321, 2, 309, 8826, 2652, 542, 256, 656, 1241, 69, 221, 1170, 296, 333, 40718, 410, 56, 2, 664, 371, 7372, 14, 4489, 12, 13088, 14, 173, 20473, 755, 4321, 26, 25890, 7766, 410, 56, 300, 11312, 296, 4110, 517, 1872, 36486, 4489, 12, 13088, 14, 22381, 14, 14493, 14, 1026, 15, 82, 2850, 81, 3794, 40, 56, 7288, 173, 49268, 41419, 755, 4321, 308, 32, 5900, 87, 4321, 9, 1427, 39678, 3086, 12, 11991, 173, 2096, 11322, 1442, 461, 1367, 14, 42, 14, 393, 6869, 516, 1184, 633, 20660, 256, 643, 562, 311, 333, 20473, 755, 4321, 2, 802, 516, 461, 333, 2096, 11322, 393, 910, 731, 83, 7204,

In [240]:
import sys
sys.path.append('/workspace/matmulfreellm/')    #先加入绝对路径，否则会报错，注意__file__表示的是当前执行文件的路径
print(sys.path)

In [258]:
from mmfreelm.models import HGRNBitConfig
from transformers import AutoModelForCausalLM

# Config for the 370M model
# Reference: https://huggingface.co/ridger/MMfreeLM-370M/blob/main/config.json
config_params = {
    "attn_mode": "fused_recurrent",
    "bos_token_id": 1,
    "conv_size": 4,
    "eos_token_id": 2,
    "expand_ratio": 1,
    "fuse_cross_entropy": True,
    "hidden_act": "swish",
    "hidden_ratio": 4,
    "hidden_size": 1024,
    "initializer_range": 0.02,
    "intermediate_size": None,
    "max_position_embeddings": 2048,
    "model_type": "hgrn_bit",
    "num_heads": 1,
    "num_hidden_layers": 24,
    "rms_norm_eps": 1e-06,
    "share_conv_kernel": True,
    "tie_word_embeddings": False,
    "torch_dtype": "bfloat16",
    "transformers_version": "4.40.2",
    "use_cache": True,
    "use_lower_bound": True,
    "use_short_conv": False,
    "vocab_size": 32000,
}

config = HGRNBitConfig(**config_params)
model = AutoModelForCausalLM.from_config(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"Matmul-free size: {model_size/1000**2:.1f}M parameters")

In [259]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [260]:
# out = data_collator([tokenized_datasets[i] for i in range(5)])
# for key in out:
#     print(f"{key} shape: {out[key].shape}")

NotImplementedError: Subclasses of Dataset should implement __getitem__.

In [194]:
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

In [254]:
import torch._dynamo
torch._dynamo.config.suppress_errors = True


In [269]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="matmul_checkpoints",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=2,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=4e-3,
    save_steps=5_000,
    fp16=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_valid_datasets,
    #optimizers=
    # train_dataset=tokenized_datasets["train"],
    # eval_dataset=tokenized_datasets["valid"],
)



RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [245]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TORCH_USE_CUDA_DSA'] = '1'


In [246]:
trainer.train()

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
trainer.push_to_hub()

In [None]:
import torch
from transformers import pipeline

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
pipe = pipeline(
    "text-generation", model="huggingface-course/codeparrot-ds", device=device
)

In [None]:
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
plt.scatter(x, y)

# create scatter

In [None]:
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create dataframe from x and y
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create dataframe from x and y
df = pd.DataFrame({'x': x, 'y': y})
df.insert(0,'x', x)
for

In [None]:
txt = """\
# dataframe with profession, income and name
df = pd.DataFrame({'profession': x, 'income':y, 'name': z})

# calculate the mean income per profession
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

# dataframe with profession, income and name
df = pd.DataFrame({'profession': x, 'income':y, 'name': z})

# calculate the mean income per profession
profession = df.groupby(['profession']).mean()

# compute the

In [None]:
txt = """
# import random forest regressor from scikit-learn
from sklearn.ensemble import RandomForestRegressor

# fit random forest model with 300 estimators on X, y:
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

# import random forest regressor from scikit-learn
from sklearn.ensemble import RandomForestRegressor

# fit random forest model with 300 estimators on X, y:
rf = RandomForestRegressor(n_estimators=300, random_state=random_state, max_depth=3)
rf.fit(X, y)
rf

In [None]:
keytoken_ids = []
for keyword in [
    "plt",
    "pd",
    "sk",
    "fit",
    "predict",
    " plt",
    " pd",
    " sk",
    " fit",
    " predict",
    "testtest",
]:
    ids = tokenizer([keyword]).input_ids[0]
    if len(ids) == 1:
        keytoken_ids.append(ids[0])
    else:
        print(f"Keyword has not single token: {keyword}")

'Keyword has not single token: testtest'

In [None]:
from torch.nn import CrossEntropyLoss
import torch


def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):
    # Shift so that tokens < n predict n
    shift_labels = inputs[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()
    # Calculate per-token loss
    loss_fct = CrossEntropyLoss(reduce=False)
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    # Resize and average loss per sample
    loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(axis=1)
    # Calculate and scale weighting
    weights = torch.stack([(inputs == kt).float() for kt in keytoken_ids]).sum(
        axis=[0, 2]
    )
    weights = alpha * (1.0 + weights)
    # Calculate weighted average
    weighted_loss = (loss_per_sample * weights).mean()
    return weighted_loss

In [None]:
from torch.utils.data.dataloader import DataLoader

tokenized_dataset.set_format("torch")
train_dataloader = DataLoader(tokenized_dataset["train"], batch_size=32, shuffle=True)
eval_dataloader = DataLoader(tokenized_dataset["valid"], batch_size=32)

In [None]:
weight_decay = 0.1


def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    params_with_wd, params_without_wd = [], []
    for n, p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
    return [
        {"params": params_with_wd, "weight_decay": weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]

In [None]:
def evaluate():
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(batch["input_ids"], labels=batch["input_ids"])

        losses.append(accelerator.gather(outputs.loss))
    loss = torch.mean(torch.cat(losses))
    try:
        perplexity = torch.exp(loss)
    except OverflowError:
        perplexity = float("inf")
    return loss.item(), perplexity.item()

In [None]:
model = GPT2LMHeadModel(config)

In [262]:
from transformers import STEOptimizer

optimizer = STEOptimizer(
    model.parameters(),
    lr=1e-5,
    weight_decay=1e-2,
    beta1=0.9,  # 类似于Adam优化器的超参数
    beta2=0.999,  # 类似于Adam优化器的超参数
    epsilon=1e-8,  # 避免分母为零
    num_epochs=3,  # 训练的总轮数
    dataloader=train_dataloader,  # 训练数据加载器
    anneal_fn=lambda x: x  # 退火函数，控制学习率随时间的变化
)


ImportError: cannot import name 'STEOptimizer' from 'transformers' (/usr/local/lib/python3.10/dist-packages/transformers/__init__.py)

In [None]:
from torch.optim import AdamW

optimizer = AdamW(get_grouped_params(model), lr=5e-4)

In [None]:
from accelerate import Accelerator

accelerator = Accelerator(fp16=True)

model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
from transformers import get_scheduler

num_train_epochs = 1
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=1_000,
    num_training_steps=num_training_steps,
)

In [None]:
from huggingface_hub import Repository, get_full_repo_name

model_name = "codeparrot-ds-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name

'sgugger/codeparrot-ds-accelerate'

In [None]:
output_dir = "codeparrot-ds-accelerate"
repo = Repository(output_dir, clone_from=repo_name)

In [None]:
evaluate()

(10.934126853942871, 56057.14453125)

In [None]:
from tqdm.notebook import tqdm

gradient_accumulation_steps = 8
eval_steps = 5_000

model.train()
completed_steps = 0
for epoch in range(num_train_epochs):
    for step, batch in tqdm(
        enumerate(train_dataloader, start=1), total=num_training_steps
    ):
        logits = model(batch["input_ids"]).logits
        loss = keytoken_weighted_loss(batch["input_ids"], logits, keytoken_ids)
        if step % 100 == 0:
            accelerator.print(
                {
                    "lr": get_lr(),
                    "samples": step * samples_per_step,
                    "steps": completed_steps,
                    "loss/train": loss.item() * gradient_accumulation_steps,
                }
            )
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        if step % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
        if (step % (eval_steps * gradient_accumulation_steps)) == 0:
            eval_loss, perplexity = evaluate()
            accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
            model.train()
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
            if accelerator.is_main_process:
                tokenizer.save_pretrained(output_dir)
                repo.push_to_hub(
                    commit_message=f"Training in progress step {step}", blocking=False
                )