# 微调实战1 - 训练指定格式和字段的输出

## 环境配置
- 1.安装所需的库
- 2.装载谷歌云盘
- 3.引入环境变量

In [None]:
# Install the Python packages used by this notebook (IPython shell escapes, not Python code).
# NOTE(review): both `load_dotenv` and `python-dotenv` are listed; the
# `from dotenv import ...` import further down is presumably provided by
# python-dotenv - confirm whether `load_dotenv` is needed at all.
!pip install load_dotenv ipywidgets pandas sentencepiece transformers datasets python-dotenv
# Upgrade the OpenAI SDK; later cells rely on the `OpenAI()` client class.
!pip install --upgrade openai
# System package installed through apt; which library needs it is not visible in this notebook.
!apt-get install -y libomp-dev

# Mount Google Drive at /content/drive; the .env file and the data files
# referenced later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

import openai
import os

from dotenv import load_dotenv, find_dotenv
# Load environment variables from the .env file on the mounted drive; the
# return value is deliberately discarded.
_ = load_dotenv('/content/drive/MyDrive/Colab/.env')

# Raises KeyError if OPENAI_API_KEY is still missing from the environment.
openai.api_key  = os.environ['OPENAI_API_KEY']

## 定义功能函数
- 1.初始化翻译函数
- 2.构造数据
- 3.将数据处理成jsonl格式

In [2]:
import pandas as pd
import json
import random
from datasets import load_dataset

In [None]:
from transformers import MarianMTModel, MarianTokenizer

# Checkpoint used for translation (English -> Chinese, per the "en-zh" name).
# `model` and `tokenizer` stay at module level because translate() reads them
# as globals.
model_name = "Helsinki-NLP/opus-mt-en-zh"
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

def translate(text):
    """Translate ``text`` with the module-level Marian model.

    The input is tokenized (with truncation) into PyTorch tensors, passed
    through ``model.generate``, and decoded with special tokens removed.

    Args:
        text: The text to translate.

    Returns:
        The first decoded string produced for the input.
    """
    encoded = tokenizer(text, truncation=True, return_tensors="pt")
    generated_ids = model.generate(**encoded)
    # batch_decode returns one string per generated sequence; only the first
    # one is handed back to the caller.
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]


def export_jsonl(list, count):
    """Sample records and render them as chat fine-tuning JSONL.

    Up to ``count`` rows are drawn at random without replacement. The first
    half of the drawn rows (by position within the sample) have their
    sentence passed through ``translate``; the remaining rows keep the
    original sentence. Each row becomes one JSON object with a ``messages``
    list holding a system prompt, the user sentence, and an assistant reply
    that is itself a JSON string of ``aspect``/``sentiment``/``category``.

    Args:
        list: Records accepted by ``pd.DataFrame``; every record needs the
            keys ``sentence``, ``aspect``, ``sentiment`` and ``category``.
            The name shadows the builtin but is kept so existing callers
            keep working.
        count: Maximum number of rows to sample.

    Returns:
        The JSON objects joined by newlines (no trailing newline), or an
        empty string when no rows are sampled.
    """
    df = pd.DataFrame(list)
    sample_size = min(count, len(df))
    if sample_size == 0:
        # Nothing to sample (empty input or count == 0).
        return ""

    sampled_df = df.sample(n=sample_size, replace=False)
    # Rows positioned before this threshold are translated.
    translate_before = sample_size / 2

    system_message = {
        "role": "system",
        "content": "As a content moderator, your role involves assessing user feedback."
    }

    json_lines = []
    # sample() keeps the original index labels, so the label yielded by
    # iterrows() says nothing about where a row sits in the sample. Use the
    # enumerate() position to decide which rows get translated.
    for position, (_, row) in enumerate(sampled_df.iterrows()):
        sentence = row["sentence"]
        if position < translate_before:
            sentence = translate(sentence)

        answer = json.dumps({
            "aspect": row["aspect"],
            "sentiment": row["sentiment"],
            "category": row["category"]
        })
        record = {
            "messages": [
                system_message,
                {"role": "user", "content": sentence},
                {"role": "assistant", "content": answer},
            ]
        }
        json_lines.append(json.dumps(record))

    return "\n".join(json_lines)

def save_jsonl(jsonl_data, file_path, count):
    """Export sampled records as JSONL and write them to ``file_path``.

    The parent directory is created up front so that a missing folder does
    not cause a failure only after the (slow) export has already run.

    Args:
        jsonl_data: Records forwarded to ``export_jsonl``.
        file_path: Destination file; overwritten if it already exists.
        count: Maximum number of rows to sample, forwarded to
            ``export_jsonl``.
    """
    import os  # local import keeps this function self-contained

    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    jsonl_string = export_jsonl(jsonl_data, count)
    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(jsonl_string)

### 按OpenAI的格式要求，存储数据文件
- 1.训练集 **format_train.jsonl**
- 2.测试集（创建微调任务时作为验证集 validation_file 传入）**format_test.jsonl**

In [20]:
# Load the dataset by its identifier and materialise both splits as lists of records.
dataset = load_dataset('eastwind/semeval-2016-absa-reviews-english-translated-stanford-alpaca')
data_train = list(dataset['train'])
data_test = list(dataset['test'])

# Output locations on the mounted drive; reused by the upload cell.
train_file_path = '/content/drive/MyDrive/openai/data/format/format_train.jsonl'
test_file_path = '/content/drive/MyDrive/openai/data/format/format_test.jsonl'

# save_jsonl arguments: source records, output file path, number of rows to sample.
# The number of rows to translate is not a parameter; export_jsonl decides it internally.
save_jsonl(data_train, train_file_path,1000)
save_jsonl(data_test, test_file_path,100)

## 上传数据到OpenAI服务器
- 1.通过sdk，传入本地路径，完成上传
- 2.从sdk接口返回值，获得服务器文件信息，包含file_id等必要信息。

In [None]:
from openai import OpenAI

client = OpenAI()

# Upload both JSONL files for fine-tuning. Each file is opened in a `with`
# block so its handle is closed as soon as the upload call returns, instead
# of being left open for the rest of the session.
with open(train_file_path, "rb") as train_file:
    train_file_info = client.files.create(
        file=train_file,
        purpose='fine-tune'
    )
with open(test_file_path, "rb") as test_file:
    test_file_info = client.files.create(
        file=test_file,
        purpose='fine-tune'
    )

# The returned objects carry the server-side file ids used when creating the job.
print(train_file_info.status)
print(test_file_info.status)


### 通过sdk创建微调任务
- 1.训练集和测试集的file_id由上一步运行sdk获得。
- 2.不要多次运行，每次运行都会创建一个云端任务，一组数据有一个云端任务即可。
- 3.如果创建了多个，也可以在playground中手动删除。

In [None]:
# Create a fine-tuning job for gpt-3.5-turbo from the uploaded file ids; the
# test file is supplied as the validation file. Every execution of this call
# creates a new job, so run the cell once per dataset.
file_tuning_job = client.fine_tuning.jobs.create(
    training_file=train_file_info.id,
    validation_file=test_file_info.id,
    model="gpt-3.5-turbo"
)
print(file_tuning_job.status)

validating_files


### 查看微调信息（进度、模型等信息）
- 训练时长在几分钟到一天之间，通过这个接口可以查询目前的训练进度。
- 训练完成后，开发者邮箱也会收到邮件提醒。

In [None]:
# Fetch the current state of the fine-tuning job. Despite its name,
# `file_info` holds the object returned by jobs.retrieve (the job), not a
# file; its `fine_tuned_model` attribute is read by the test cell.
file_info = client.fine_tuning.jobs.retrieve(file_tuning_job.id)
print(file_info.status)

succeeded


## 测试模型
- 可以直接通过chat.completions接口，正常测试训练好的模型。
- 或者在playground可视化测试

In [None]:
# Query the fine-tuned model with a sample review. The system prompt is kept
# character-for-character identical to the one written into the training
# data by export_jsonl (no trailing space), so inference sees the same
# prompt the model was tuned on. temperature=0 keeps the output as
# repeatable as possible.
completion = client.chat.completions.create(
    model=file_info.fine_tuned_model,
    messages=[
        {"role": "system", "content": "As a content moderator, your role involves assessing user feedback."},
        {"role": "user", "content": "这个酒店环境还行，服务一般"}
    ],
    temperature=0
)
print(completion.choices[0].message.content)

{"aspect": "hotel", "sentiment": "neutral", "category": "HOTEL#GENERAL"}
