In [1]:
import polars as pl
import pandas as pd
import numpy as np
import re
import itertools
from tqdm import tqdm
import pickle
import time
import copy
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, GPT2LMHeadModel, GPT2Tokenizer
import pickle

2024-09-07 10:24:31.432259: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-07 10:24:31.432310: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-07 10:24:31.433203: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


#### 読み込むデータパスの指定

In [2]:
# Root directory of the competition data; later cells build file paths by
# appending to this string, so the trailing slash is required.
base_path = "/kaggle/s3storage/01_public/humob-challenge-2024/"

## Step 2. Model Training
GPT2モデルによるFine-Tuningを実施する。  

In [34]:
class HumobDataset(Dataset):
    """Per-user text corpus built from a mobility CSV.

    The CSV is read with ``pl.read_csv`` and must provide the columns ``uid``,
    ``d``, ``timedelta``, ``x`` and ``y``. Every user becomes one string::

        {uid}<data><dowK>{t},{x}{y}.{t},{x}{y}. ... </dowK> ... </data>

    where ``K`` is ``d % 7`` and days appear in ascending ``d`` order.

    Items returned by ``__getitem__`` are raw, untokenized strings; whoever
    consumes the dataset is responsible for tokenizing them.

    Args:
        humob_dataset_path: Path of the CSV file to load.

    Attributes:
        uid_list: The ``uid`` of each item, aligned with ``processed_data``.
        processed_data: List with one formatted string per user.
    """

    def __init__(self, humob_dataset_path):
        super().__init__()

        self.uid_list = []
        self.data_token = "<data>"
        self.end_of_text_token = "</data>"

        # Load the raw table.
        self.df = pl.read_csv(humob_dataset_path)

        # Turn the table into one text sequence per uid.
        self.processed_data = self._process_data()

    def _process_data(self):
        """Format ``self.df`` into one text sequence per ``uid``.

        Returns:
            list[str]: One string per user, in order of first appearance of
            the ``uid`` in ``self.df``. ``self.uid_list`` is replaced with the
            matching uids as a side effect.
        """
        texts = []
        uids = []

        # partition_by(maintain_order=True) is used instead of iterating a
        # group-by: group iteration order is not guaranteed, and newer polars
        # versions yield the group key as a tuple, which breaks both the
        # uid formatting and ``day % 7``.
        for user_df in self.df.partition_by('uid', maintain_order=True):
            uid = user_df['uid'][0]
            parts = [f"{uid}", self.data_token]

            # Split per day, then order the days explicitly. Rows inside a
            # day keep the order they have in the file.
            day_frames = user_df.partition_by('d', maintain_order=True)
            day_frames.sort(key=lambda frame: frame['d'][0])

            for day_df in day_frames:
                dow = day_df['d'][0] % 7
                parts.append(f"<dow{dow}>")

                # NOTE(review): x and y are written back to back with no
                # separator, so e.g. (1, 23) and (12, 3) produce the same
                # text. Kept as-is to preserve the training format; confirm
                # this is intended.
                parts.append(''.join(
                    f"{t},{x}{y}."
                    for t, x, y in zip(day_df['timedelta'], day_df['x'], day_df['y'])
                ))
                parts.append(f"</dow{dow}>")

            # Close the user's sequence.
            parts.append(self.end_of_text_token)
            texts.append("".join(parts))
            uids.append(uid)

        self.uid_list = uids
        return texts

    def __len__(self):
        """Return the number of users (one item per uid)."""
        return len(self.processed_data)

    def __getitem__(self, idx):
        """Return the formatted text of the user at position ``idx``."""
        return self.processed_data[idx]

In [38]:
def load_data_collator(tokenizer, mlm = False):
    """Create the language-modeling collator for ``tokenizer``.

    Args:
        tokenizer: Tokenizer handed to ``DataCollatorForLanguageModeling``.
        mlm: Forwarded unchanged as the collator's ``mlm`` flag
            (``False`` by default).

    Returns:
        The configured ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)


def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """Fine-tune a GPT-2 language model on the HuMob text sequences.

    Args:
        train_file_path: CSV path handed to ``HumobDataset``.
        model_name: Name or path accepted by ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory that receives the tokenizer, the checkpoints
            and the final model.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Checkpoint interval, forwarded to ``TrainingArguments``.

    Returns:
        None. The trained model is written to ``output_dir``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    # GPT-2 ships without a padding token, but the collator pads every batch.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    train_dataset = HumobDataset(train_file_path)
    lm_collator = load_data_collator(tokenizer)

    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Writes the *pretrained* weights; trainer.save_model() below replaces
    # them once training finishes.
    model.save_pretrained(output_dir)

    # Longest sequence the model can attend to. Longer user sequences are
    # truncated, so their tail is not seen during training.
    max_tokens = min(tokenizer.model_max_length, model.config.n_positions)

    def collate_texts(texts):
        """Tokenize a batch of raw strings and build the LM batch."""
        # HumobDataset yields plain strings, which the LM collator cannot
        # batch on its own; tokenize here before delegating to it.
        encoded = tokenizer(list(texts), truncation=True, max_length=max_tokens)
        features = [{"input_ids": ids} for ids in encoded["input_ids"]]
        return lm_collator(features)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        # Previously accepted by train() but never passed on.
        save_steps=save_steps,
        # Items are strings, not column dicts; skip the column filtering.
        remove_unused_columns=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=collate_texts,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()
In [39]:
# Training configuration: adjust these values before running the training
# cell. They are passed one-to-one to train(...).
model_name = 'gpt2'
train_file_path = f"{base_path}feature/train_cityB_timedelta.csv"
output_dir = './'
overwrite_output_dir = False
num_train_epochs = 1
per_device_train_batch_size = 1
save_steps = 500

In [40]:
# Run the fine-tuning with the parameters defined in the previous cell.
# train() writes the tokenizer and the model into output_dir.
# It takes about 30 minutes to train in colab.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

## Step 3. Inference

In [None]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [None]:
def load_model(model_path):
    """Load a ``GPT2LMHeadModel`` from ``model_path``.

    Args:
        model_path: Name or directory accepted by ``from_pretrained``.

    Returns:
        The loaded ``GPT2LMHeadModel``.
    """
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Load a ``GPT2Tokenizer`` from ``tokenizer_path``.

    Args:
        tokenizer_path: Name or directory accepted by ``from_pretrained``.

    Returns:
        The loaded ``GPT2Tokenizer``.
    """
    return GPT2Tokenizer.from_pretrained(tokenizer_path)


def generate_text(sequence, max_length, model_path="/content/drive/MyDrive/result"):
    """Sample a continuation of ``sequence`` and print it.

    Args:
        sequence: Prompt; it is formatted to a string before tokenization.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        model_path: Directory holding both the model and the tokenizer.
            The default is the previously hard-coded location.
            NOTE(review): the training cell writes to ``./``, so callers in
            this notebook most likely need to pass that directory here.

    Returns:
        None. The decoded text is printed.
    """
    # NOTE(review): model and tokenizer are reloaded on every call; load them
    # once outside if this is called repeatedly.
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)

    # Calling the tokenizer (rather than .encode) also yields the attention
    # mask, which generate() needs because pad and EOS share one id here.
    encoded = tokenizer(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

In [None]:
# Interactive driver: reads the prompt and the length limit from stdin.
# NOTE(review): the example values below look carried over from a generic
# GPT-2 text demo rather than the mobility format used for training — confirm.
sequence = input() # prompt text, e.g. "oil price"
max_len = int(input()) # passed as max_length, e.g. 20; int() raises ValueError on non-integer input
# Example printed output: "oil price for July June which had been low at as low as was originally stated Prices have since resumed"
generate_text(sequence, max_len)

The process above can be somewhat complicated and tedious because you have to write all of the code yourself, and training takes a long time if you don't have a personal GPU.

In that case, how about using Ainize's Teachable NLP? Teachable NLP provides an API for training and using a model: once you upload your data, it fine-tunes the model automatically.

Teachable NLP : [https://ainize.ai/teachable-nlp](https://link.ainize.ai/3tJVRD1)

Teachable NLP Tutorial : [https://forum.ainetwork.ai/t/teachable-nlp-how-to-use-teachable-nlp/65](https://link.ainize.ai/3tATaUh)