### **安装包**

In [None]:
!pip install transformers
!pip install datasets
!pip install accelerate -U
!pip install transformers[torch]
!pip install sentencepiece

### **导入需要的包**

In [2]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset, Dataset
from transformers import DataCollatorForSeq2Seq
from transformers import T5Tokenizer
from torch.utils.data import TensorDataset

### **加载数据**

In [3]:
import json

# Folder that holds the CSpider JSON files
dataset_path = "/content/drive/MyDrive/CSpider"


def _read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return its contents."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


# Training split
train_data = _read_json(f"{dataset_path}/train.json")

# Validation split (stored on disk as dev.json)
validation_data = _read_json(f"{dataset_path}/dev.json")

# train_data and validation_data are now available for further processing.


### **数据集处理**

In [4]:
# Initialise the T5 tokenizer of the checkpoint that is being fine-tuned.
tokenizer = T5Tokenizer.from_pretrained("tscholak/2jrayxos")

# Token budgets shared by BOTH splits, so train and validation are encoded identically.
MAX_SOURCE_LENGTH = 64
MAX_TARGET_LENGTH = 64
# Label value that marks padded positions so they can be excluded from the loss.
IGNORE_INDEX = -100


def _target_sql(example):
    """Return the gold SQL text of one example.

    NOTE(review): another cell of this notebook describes ``example["sql"]`` as a
    dict; joining a dict yields only its keys, not the query. The plain-text
    ``query`` field is therefore preferred, then the ``query_toks`` token list
    (used elsewhere in this notebook). The original join over ``example["sql"]``
    is kept only as a last resort for files that have neither field.
    """
    if "query" in example:
        return example["query"]
    if "query_toks" in example:
        return " ".join(example["query_toks"])
    return " ".join(example["sql"])


def encode_example(example, tokenizer,
                   max_source_length=MAX_SOURCE_LENGTH,
                   max_target_length=MAX_TARGET_LENGTH):
    """Tokenise one example into ``input_ids``, ``attention_mask`` and ``labels``.

    Args:
        example: mapping with a ``question`` entry and a SQL target (see ``_target_sql``).
        tokenizer: tokenizer used for both the question and the SQL target.
        max_source_length: fixed length the question is truncated/padded to.
        max_target_length: fixed length the SQL target is truncated/padded to.

    Returns:
        dict of three plain Python lists of ints.
    """
    # Only the question goes into the encoder input. The target must never be
    # passed as a second segment, otherwise the answer leaks into the input.
    source = tokenizer(
        example["question"],
        truncation=True,
        padding="max_length",
        max_length=max_source_length,
    )
    target = tokenizer(
        _target_sql(example),
        truncation=True,
        padding="max_length",
        max_length=max_target_length,
    )
    pad_id = tokenizer.pad_token_id
    # Replace padding in the labels so the model is not trained to predict pad tokens.
    labels = [
        IGNORE_INDEX if token_id == pad_id else token_id
        for token_id in target["input_ids"]
    ]
    return {
        "input_ids": source["input_ids"],
        "attention_mask": source["attention_mask"],
        "labels": labels,
    }


def _to_dataset(records, split):
    """Turn a list of encoded examples into a column-oriented ``Dataset``."""
    columns = ("input_ids", "attention_mask", "labels")
    return Dataset.from_dict(
        {column: [record[column] for record in records] for column in columns},
        split=split,
    )


# Encode both splits with the same function so their formats cannot drift apart.
formatted_data = [encode_example(example, tokenizer) for example in train_data]
formatted_validation_data = [encode_example(example, tokenizer) for example in validation_data]

train_dataset = _to_dataset(formatted_data, "train")
validation_dataset = _to_dataset(formatted_validation_data, "validation")

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# NOTE: every line of this cell is commented out, so running it does nothing.
# # Initialise the T5 tokenizer
# tokenizer = T5Tokenizer.from_pretrained("t5-small")

# # Convert the data into a format suitable for the T5 model
# formatted_data = []
# labels = []  # list that stores the target SQL queries

# formatted_validation_data = []
# validation_labels = []  # stores the target SQL queries of the validation data

# for example in train_data:
#     question = example["question"]
#     query = " ".join(example["query_toks"])
#     formatted_data.append(f"question: {question} context: {query}")
#     # Append the target SQL query to the labels list
#     # label.append(example["sql"])
#     sql_query = example["sql"]
#     # Convert the SQL query dict into a string
#     sql_query_str = ' '.join(map(str, sql_query))
#     labels.append(sql_query_str)

# for example in validation_data:
#     question = example["question"]
#     query = " ".join(example["query_toks"])
#     formatted_validation_data.append(f"question: {question} context: {query}")

#     sql_query = example["sql"]
#     sql_query_str = ' '.join(map(str, sql_query))
#     validation_labels.append(sql_query_str)


# print(train_data[:10])
# print(formatted_data[:10])
# print(labels[:10])  # print the first 10 SQL queries used as labels

# print(validation_data[:10])
# print(formatted_validation_data[:10])
# print(validation_labels[:10])  # print the first 10 SQL queries used as validation labels

### **编码数据**

In [6]:
# NOTE: every line of this cell is commented out, so running it does nothing.
# # Encode the data
# input_ids = tokenizer(formatted_data, return_tensors="pt", padding=True, truncation=True)

# # Encode the validation data
# validation_input_ids = tokenizer(formatted_validation_data, return_tensors="pt", padding=True, truncation=True)

# print(input_ids)

# # Encode the SQL queries
# # labels = tokenizer(label, return_tensors="pt", padding=True, truncation=True)
# labels = tokenizer(labels, return_tensors="pt", padding=True, truncation=True)

# validation_labels = tokenizer(validation_labels, return_tensors="pt", padding=True, truncation=True)

# print(labels[:10])

In [7]:
# NOTE: every line of this cell is commented out, so running it does nothing.
# # Make sure the data contains the input_ids and attention_mask fields

# # Build the training dataset
# train_dataset = {
#     "input_ids": input_ids["input_ids"],
#     "attention_mask": input_ids["attention_mask"],
#     "labels": labels,  # (original note) the T5 model requires the prediction labels to be the same as the input
# }
# NOTE(review): the training entry above stores the whole `labels` encoding,
# while the validation entry below stores `validation_labels["input_ids"]`.


# # Build the validation dataset
# validation_dataset = {
#     "input_ids": validation_input_ids["input_ids"],
#     "attention_mask": validation_input_ids["attention_mask"],
#     "labels": validation_labels["input_ids"],
# }

# for i, (key, value) in enumerate(train_dataset.items()):
#     if i >= 10:
#         break
#     print(f"{key}: {value}")
# # train_dataset now holds data suitable for fine-tuning the T5 model

### **加载模型**

In [8]:
# Load the model and the tokenizer used for fine-tuning.
# Both identifiers point at the same checkpoint; replace them with the name of
# your own pretrained model / tokenizer if needed.
model_name = tokenizer_name = "tscholak/2jrayxos"

# Tokenizer first, then the sequence-to-sequence model weights.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading (…)lve/main/config.json:   0%|          | 0.00/784 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

### **数据编码**

### **定义数据收集器**

In [9]:
# Define the data collator that batches examples for the seq2seq trainer.
# Only the tokenizer is handed over; no model is passed to the collator.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)


### **定义微调参数**

In [10]:
# Define the fine-tuning arguments.
# The recorded run of this notebook stopped with an OutOfMemoryError, so the
# memory-related settings are made explicit here; everything else is unchanged.
training_args = Seq2SeqTrainingArguments(
    output_dir="./cspider_finetuned_model",
    per_device_train_batch_size=1,
    # Evaluate one example at a time as well, instead of relying on the library default.
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    evaluation_strategy="steps",
    save_steps=1000,
    eval_steps=1000,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    # NOTE(review): T5 checkpoints are reported to be numerically fragile in fp16;
    # watch the logged loss for NaN/inf and switch to bf16 or fp32 if that happens.
    fp16=True,
    # Recompute activations in the backward pass instead of storing them (less memory, more compute).
    gradient_checkpointing=True,
    # Adafactor keeps far less optimizer state per parameter than the default optimizer.
    # NOTE(review): changing the optimizer changes training dynamics; re-check the learning rate.
    optim="adafactor",
    learning_rate=1e-4,  # tuned learning rate
)


### **初始化微调器**

In [11]:
# Initialise the trainer: collect every constructor argument in one mapping,
# then build the Seq2SeqTrainer from it.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "data_collator": data_collator,
    "train_dataset": train_dataset,
    "eval_dataset": validation_dataset,
    "tokenizer": tokenizer,
}
trainer = Seq2SeqTrainer(**trainer_kwargs)


In [12]:
# Report the number of examples per split: training first, validation second,
# one count per line.
print(len(train_dataset), len(validation_dataset), sep="\n")

8659
1034


In [13]:
# Start fine-tuning; keep the returned result and show it as the cell output.
train_result = trainer.train()
train_result

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


OutOfMemoryError: ignored

In [None]:

# Persist the fine-tuned model to disk.
trainer.save_model(output_dir="./cspider_finetuned_model")