# GPT-2 Fine-Tuning

#### I originally wrote this code at work, and I am posting it here because I think it is worth sharing.

#### With this data, we will fine-tune GPT-2 to build a sentence generation model.

#### This code is for AI beginners.

## Step 1. Data preprocessing

#### The data contains unnecessary newlines, tags, and URLs, so they must be removed before preprocessing.

In [5]:
import polars as pl
import pandas as pd
import numpy as np
import re
import itertools
from tqdm import tqdm
import pickle
import time
import copy

#### 読み込むデータパスの指定

In [2]:
# Root directory of the HuMob Challenge 2024 data; the input files are read from "<base_path>input/".
base_path = "/kaggle/s3storage/01_public/humob-challenge-2024/"

In [3]:
# Only the city B file is loaded; the loaders for the other cities are kept
# commented out for reference.
# cityA_df = pl.read_csv(base_path + "input/cityA_groundtruthdata.csv.gz")
cityB_csv_path = f"{base_path}input/cityB_challengedata.csv.gz"
cityB_df = pl.read_csv(cityB_csv_path)
# cityC_df = pl.read_csv(base_path + "input/cityC_challengedata.csv.gz")
# cityD_df = pl.read_csv(base_path + "input/cityD_challengedata.csv.gz")

#### 学習データと検証データのsplit

- 検証用のため、cityBでuid<=2000のユーザのみ抽出 

In [4]:
# Split by user id and day:
#   - train: every record of users with uid < 20000, plus days 0-60 of all
#     users with uid >= 20000
#   - valid: days after 60 of users with 20000 <= uid < 22000
_uid_col = pl.col('uid')
_day_col = pl.col('d')
_is_late_user = _uid_col >= 20000

train_cityB = cityB_df.filter(
    (_uid_col < 20000) | (_is_late_user & (_day_col <= 60))
)
valid_cityB = cityB_df.filter(
    _is_late_user & ((_uid_col < 22000) & (_day_col > 60))
)

### データの前処理

In [5]:
def preprocess_data(df):
    """Encode the ``x`` and ``y`` columns as fixed-width string tokens.

    Each value is cast to a string, left-padded with zeros to three
    characters and prefixed with its axis letter, e.g. ``80 -> "x080"`` for
    ``x`` and ``99 -> "y099"`` for ``y``.

    Args:
        df: polars DataFrame containing ``x`` and ``y`` columns.

    Returns:
        A new DataFrame in which ``x`` and ``y`` hold the string tokens; all
        other columns are left as they are.
    """
    # Built from native polars expressions rather than ``map_elements`` with
    # a Python lambda: the lambda is invoked once per value from Python,
    # whereas ``concat_str`` stays inside the polars engine and has a known
    # string output dtype.
    return df.with_columns([
        pl.concat_str([
            pl.lit(axis),
            pl.col(axis).cast(pl.Utf8).str.zfill(3),
        ]).alias(axis)
        for axis in ("x", "y")
    ])

In [6]:
# Tokenise the coordinates of both splits, then preview the training split.
train_cityB, valid_cityB = (
    preprocess_data(split_df) for split_df in (train_cityB, valid_cityB)
)
train_cityB.head()

uid,d,t,x,y
i64,i64,i64,str,str
0,0,20,"""x080""","""y099"""
0,0,21,"""x081""","""y097"""
0,0,25,"""x083""","""y102"""
0,0,26,"""x080""","""y101"""
0,0,27,"""x080""","""y101"""


In [7]:
def get_timedelta(df):
    """Return, per user, the time gap between consecutive records.

    Time is expressed as a slot index ``d * 48 + t``. For each user the first
    record gets a delta of 0, every following record gets the difference to
    the previous record, and deltas larger than 47 are capped at 47.

    Args:
        df: polars DataFrame with ``uid``, ``d`` and ``t`` columns.
            Assumes each user's rows are already in chronological order;
            rows are not re-sorted by time here — TODO confirm with caller.

    Returns:
        A list of numpy arrays, one per user in ascending ``uid`` order, each
        as long as that user's number of rows.
    """
    uid_list = []

    # ``maintain_order=True`` on both calls is what makes the output usable:
    # without it polars guarantees neither the relative order of rows sharing
    # a uid after the sort, nor the order in which groups are yielded, so the
    # arrays could not be reliably lined up with the frame's rows.
    ordered = df.sort('uid', maintain_order=True)

    for _uid, traj in tqdm(ordered.group_by('uid', maintain_order=True)):
        slot = traj['d'].to_numpy() * 48 + traj['t'].to_numpy()
        # Prepending the first slot makes the first delta 0.
        time_delta = np.diff(slot, prepend=slot[:1])
        uid_list.append(np.minimum(time_delta, 47))

    return uid_list

In [None]:
# Per-user time-delta arrays for the training and validation splits.
train_delta_list, valid_delta_list = (
    get_timedelta(split_df) for split_df in (train_cityB, valid_cityB)
)

In [None]:
# Add the deltas as a new 'timedelta' column.
# NOTE(review): get_timedelta sorts by uid before grouping, so the concatenated
# array lines up with the rows only if each frame is already ordered by uid —
# confirm.
train_cityB = train_cityB.with_columns(pl.Series('timedelta', np.concatenate(train_delta_list)))
valid_cityB = valid_cityB.with_columns(pl.Series('timedelta', np.concatenate(valid_delta_list)))

# Write the file only after the column exists, so its contents match its name
# (previously the CSV was written before 'timedelta' had been added).
train_cityB.write_csv("train_cityB_timedelta.csv")

: 

#### 欠損値の補完
- 欠損しているデータは'N'として文字列補完する

In [7]:
# NOTE(review): `filtered_cityB_df` is not defined in the visible cells; it has
# to exist before this cell is run.
target_df = filtered_cityB_df
# All user ids present in the data
uids = target_df['uid'].unique()

# Every (d, t) combination: d in 0-59, t in 0-47
all_days = list(range(60))
all_times = list(range(48))
all_combinations = list(itertools.product(all_days, all_times))

# Build the (d, t) grid column by column so polars does not have to guess
# whether a list of tuples is row- or column-oriented.
slot_grid = pl.DataFrame({
    'd': [day for day, _ in all_combinations],
    't': [slot for _, slot in all_combinations],
})

# One row per (uid, d, t). A single cross join replaces the per-user loop,
# which filtered the whole frame once per uid.
full_index = (
    pl.DataFrame({'uid': uids.cast(pl.Int64)})  # same Int64 cast as the join key needs
    .join(slot_grid, how='cross')
    .select(['d', 't', 'uid'])
)

# Left-join the observed records onto the full grid; slots without a record
# come back as nulls and are filled with the string 'N'. The explicit sort
# makes the row order deterministic (uid, then d, then t) instead of relying
# on the order returned by `unique()` or by the join.
full_df = (
    full_index
    .join(target_df, on=['uid', 'd', 't'], how='left')
    .fill_null('N')
    .sort(['uid', 'd', 't'])
)

# Save the result (optional)
# full_df.write_csv('full_data.csv')

# Inspect the result
print(full_df)

shape: (5_760_000, 5)
┌─────┬─────┬──────┬──────┬──────┐
│ d   ┆ t   ┆ uid  ┆ x    ┆ y    │
│ --- ┆ --- ┆ ---  ┆ ---  ┆ ---  │
│ i64 ┆ i64 ┆ i64  ┆ str  ┆ str  │
╞═════╪═════╪══════╪══════╪══════╡
│ 0   ┆ 0   ┆ 0    ┆ N    ┆ N    │
│ 0   ┆ 1   ┆ 0    ┆ N    ┆ N    │
│ 0   ┆ 2   ┆ 0    ┆ N    ┆ N    │
│ 0   ┆ 3   ┆ 0    ┆ N    ┆ N    │
│ 0   ┆ 4   ┆ 0    ┆ N    ┆ N    │
│ …   ┆ …   ┆ …    ┆ …    ┆ …    │
│ 59  ┆ 43  ┆ 1999 ┆ N    ┆ N    │
│ 59  ┆ 44  ┆ 1999 ┆ N    ┆ N    │
│ 59  ┆ 45  ┆ 1999 ┆ x075 ┆ y073 │
│ 59  ┆ 46  ┆ 1999 ┆ x074 ┆ y074 │
│ 59  ┆ 47  ┆ 1999 ┆ x076 ┆ y074 │
└─────┴─────┴──────┴──────┴──────┘


In [8]:
# Fuse the x and y tokens into a single location token stored in "xy"
# (e.g. "x075" and "y073" become "x075y073"), then preview the last rows.
location_token = pl.concat_str([pl.col("x"), pl.col("y")]).alias("xy")
full_df = full_df.with_columns([location_token])
full_df.tail()

d,t,uid,x,y,xy
i64,i64,i64,str,str,str
59,43,1999,"""N""","""N""","""NN"""
59,44,1999,"""N""","""N""","""NN"""
59,45,1999,"""x075""","""y073""","""x075y073"""
59,46,1999,"""x074""","""y074""","""x074y074"""
59,47,1999,"""x076""","""y074""","""x076y074"""


In [None]:
#["x01y02,x02y03,,,,,|13,2,4......|"]
#["x01y02_1_,x02y03,,,,,|13,2,4......|"]

In [9]:
# 日付ごとにValue値を結合
df_aggregated = full_df.group_by('uid', 'd').agg([
    pl.concat_str("xy", separator=",").alias('xy')
]).sort("uid", "d")
df_aggregated.tail()

uid,d,xy
i64,i64,list[str]
1999,55,"[""NN"", ""NN"", … ""NN""]"
1999,56,"[""NN"", ""NN"", … ""x075y073""]"
1999,57,"[""x074y074"", ""NN"", … ""x075y073""]"
1999,58,"[""NN"", ""NN"", … ""x075y073""]"
1999,59,"[""NN"", ""NN"", … ""x076y074""]"


## Step 2. Model Training
GPT2モデルによるFine-Tuningを実施する。  

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, GPT2LMHeadModel, GPT2Tokenizer

In [None]:
# Basic implementation example using Trainer


# Load the pretrained 'gpt2' tokenizer and model. The end-of-sequence token is
# reused as the padding token.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2')


In [None]:
# カスタムデータクラス


In [None]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """Build a ``TextDataset`` from a text file.

    Args:
        file_path: path of the text file to read.
        tokenizer: tokenizer handed to ``TextDataset``.
        block_size: value forwarded as ``TextDataset``'s ``block_size``
            (128 by default).

    Returns:
        The constructed ``TextDataset``.
    """
    return TextDataset(
        file_path=file_path,
        tokenizer=tokenizer,
        block_size=block_size,
    )


def load_data_collator(tokenizer, mlm = False):
    """Create a ``DataCollatorForLanguageModeling`` for the given tokenizer.

    Args:
        tokenizer: tokenizer handed to the collator.
        mlm: forwarded as the collator's ``mlm`` flag (``False`` by default).

    Returns:
        The constructed ``DataCollatorForLanguageModeling``.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)


def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """Fine-tune a pretrained GPT-2 checkpoint on a text file and save it.

    Args:
        train_file_path: path of the text file used as training data.
        model_name: name or path given to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: directory that receives the tokenizer, the model and the
            trainer output.
        overwrite_output_dir: forwarded to ``TrainingArguments``.
        per_device_train_batch_size: forwarded to ``TrainingArguments``.
        num_train_epochs: forwarded to ``TrainingArguments``.
        save_steps: forwarded to ``TrainingArguments``.

    Returns:
        None. The trained model is written via ``trainer.save_model()``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # The tokenizer and the untrained model are written to output_dir before
    # training starts; the model files are written again by save_model() below.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        # Previously this argument was accepted by train() but never passed
        # on, so the caller's value had no effect.
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [None]:
# Training configuration: adjust these values to your own environment before
# running the training cell. Each one is passed to train() under the same name.
train_file_path = "/content/drive/MyDrive/Articles.txt"  # text file used as training data
model_name = 'gpt2'  # checkpoint name given to from_pretrained
output_dir = '/content/drive/MyDrive/result'  # where the tokenizer and model are saved
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 5.0
save_steps = 500

In [None]:
# It takes about 30 minutes to train in colab.
# Collect the settings defined above and hand them to train() by keyword.
train_kwargs = {
    "train_file_path": train_file_path,
    "model_name": model_name,
    "output_dir": output_dir,
    "overwrite_output_dir": overwrite_output_dir,
    "per_device_train_batch_size": per_device_train_batch_size,
    "num_train_epochs": num_train_epochs,
    "save_steps": save_steps,
}
train(**train_kwargs)

## Step 3. Inference

In [None]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [None]:
def load_model(model_path):
    """Load a ``GPT2LMHeadModel`` from ``model_path`` and return it."""
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Load a ``GPT2Tokenizer`` from ``tokenizer_path`` and return it."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)


def generate_text(sequence, max_length, model_path="/content/drive/MyDrive/result"):
    """Generate a continuation of ``sequence`` and print it.

    The model and tokenizer are loaded from ``model_path`` on every call, and
    the text is sampled with ``top_k=50`` and ``top_p=0.95``.

    Args:
        sequence: prompt; it is formatted to a string before tokenisation.
        max_length: forwarded to ``model.generate`` as ``max_length``.
        model_path: directory holding the fine-tuned model and tokenizer.
            Defaults to the location that was previously hard-coded, so
            existing two-argument calls behave as before.

    Returns:
        None. The decoded text is printed.
    """
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)

    # Calling the tokenizer (instead of ``encode``) also yields the attention
    # mask, which is passed to ``generate`` explicitly rather than left for it
    # to infer while the pad token is set to the EOS token.
    encoded = tokenizer(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

In [None]:
# Read the prompt and the length limit from standard input, then print a
# generated continuation.
sequence = input() # prompt text, e.g.: oil price
max_len = int(input()) # value passed to generate_text as max_length, e.g.: 20
generate_text(sequence, max_len) # sample output: oil price for July June which had been low at as low as was originally stated Prices have since resumed

The process above can be complicated or tedious because you have to write all of the code yourself, and training takes a long time if you don't have your own GPU.

In that case, how about using Ainize's Teachable NLP? Once you upload your data, Teachable NLP trains a model on it automatically and quickly, and it provides an API for using the trained model.

Teachable NLP : [https://ainize.ai/teachable-nlp](https://link.ainize.ai/3tJVRD1)

Teachable NLP Tutorial : [https://forum.ainetwork.ai/t/teachable-nlp-how-to-use-teachable-nlp/65](https://link.ainize.ai/3tATaUh)