# RLHF

### 0. 환경 설정

In [None]:
!pip install transformers torch stable-baselines3 hf_transfer

### 1. LLM 모델 로드 및 텍스트 생성

In [None]:
from transformers import pipeline

# Build a Hugging Face text-generation pipeline backed by the GPT-2 checkpoint.
generator = pipeline(task='text-generation', model='gpt2')

def generate_text(prompt, max_length=150):
    """Generate a continuation of ``prompt`` using the module-level ``generator``.

    Args:
        prompt: Text handed to the pipeline as the start of the generation.
        max_length: Forwarded unchanged to the pipeline as ``max_length``.

    Returns:
        The ``'generated_text'`` field of the single sequence that is requested.
    """
    sequences = generator(prompt, max_length=max_length, num_return_sequences=1)
    first_sequence = sequences[0]
    return first_sequence['generated_text']

In [None]:
# Try the generator on a sample prompt and print the generated text.
prompt = "This is sunny day, and"
print(generate_text(prompt=prompt))

### 2. 강화학습을 위한 Feedback 환경 생성

In [None]:
!pip install 'shimmy>=2.0'

In [None]:
import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO

class ContentFeedbackEnv(gym.Env):
    """Minimal Gymnasium environment that maps user feedback to a reward.

    Actions (``Discrete(3)``):
        0: dislike, 1: like, 2: report harmful content.

    The observation is the constant one-element vector ``[0.5]``; the
    environment itself never terminates or truncates an episode. Every
    feedback label produced by ``step`` is appended to ``self.history``.
    """

    def __init__(self):
        super().__init__()
        # 0: dislike, 1: like, 2: report harmful content
        self.action_space = gym.spaces.Discrete(3)
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)
        # Feedback labels ("Like" / "Danger" / "Hate") in the order they were received.
        self.history = []

    def _observation(self):
        """Return the constant observation in the dtype declared by ``observation_space``."""
        # A bare np.array([0.5]) is float64 and would not belong to the
        # float32 Box declared above, so the dtype is given explicitly.
        return np.array([0.5], dtype=np.float32)

    def step(self, action):
        """Record one piece of feedback and return the resulting transition.

        Args:
            action: 1 for like, 2 for a harmful-content report; any other
                value is treated as a dislike.

        Returns:
            ``(obs, reward, terminated, truncated, info)`` where ``reward`` is
            1 for like, -2 for a report and -1 otherwise, ``terminated`` and
            ``truncated`` are always ``False`` and ``info`` is an empty dict.
        """
        if action == 1:
            reward = 1
            feedback = "Like"
        elif action == 2:
            reward = -2
            feedback = "Danger"
        else:
            reward = -1
            feedback = "Hate"

        self.history.append(feedback)

        terminated = False
        truncated = False
        info = {}

        return self._observation(), reward, terminated, truncated, info

    def reset(self, seed=None, options=None):
        """Seed the base environment and return the initial ``(obs, info)`` pair.

        ``self.history`` is intentionally left untouched so that recorded
        feedback survives resets.
        """
        super().reset(seed=seed)
        return self._observation(), {}

### 3. PPO 모델 생성 및 학습

In [None]:
# Instantiate the feedback environment and a PPO agent with an MLP policy on top of it.
env = ContentFeedbackEnv()
model = PPO(policy="MlpPolicy", env=env, verbose=1)

In [None]:
# Replay previously collected user feedback through the environment.
# The Gymnasium API requires reset() to be called before the first step().
# NOTE: these steps only record labels in env.history; they do not update the PPO policy.
past_feedback = [1, 0, 2, 1, 1, 0, 2, 1, 0, 1]
env.reset()
for action in past_feedback:
    env.step(action)

In [None]:
# Train the PPO agent for 10,000 timesteps, then persist it to disk.
model.learn(total_timesteps=10_000)
model.save(path="rlhf_content_model")

In [None]:
# Create a fresh environment, restore the saved agent, and attach the environment to it.
env = ContentFeedbackEnv()
model = PPO.load("rlhf_content_model")
model.set_env(env)

In [None]:
# Generate text for a new prompt and display it so it can be rated.
prompt = "This is windy day, so"
response = generate_text(prompt=prompt)
print(response)

In [None]:
# Feedback for the text shown above: 1 = like (use 0 for dislike, 2 to report harmful content).
action = 1

# The Gymnasium API requires reset() before the first step() on a newly created environment.
env.reset()
# NOTE: this step only records the label in env.history; it does not by itself update the policy.
env.step(action)
# Continue training the loaded agent without restarting its timestep counter at zero.
# NOTE(review): SB3's PPO collects whole rollouts of n_steps (2048 by default), so
# total_timesteps=10 presumably still runs one full rollout — confirm this is intended.
model.learn(total_timesteps=10, reset_num_timesteps=False)