In [5]:
%%capture
%pip install transformers huggingface_hub datasets==2.16.0

In [6]:
import torch
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForQuestionAnswering,GPT2LMHeadModel,
    TrainingArguments,
    pipeline,
    logging,
    Trainer,
    DataCollatorForLanguageModeling
)
from huggingface_hub import notebook_login, login

from sklearn.model_selection import train_test_split

import pandas as pd


In [7]:
token="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"

In [8]:
dataset = pd.read_csv('train.csv')
dataset = dataset.drop('qtype', axis=1)
dataset = dataset.rename(columns={'Question': 'question', 'Answer': 'answer'})

In [9]:
df_full_train, df_test = train_test_split(dataset, test_size=0.2, random_state=56)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=56)

In [10]:
df_train = df_train.reset_index(drop=True)
df_val = df_train.reset_index(drop=True)
df_test = df_train.reset_index(drop=True)

train_dataset = Dataset.from_pandas(df_train)
val_dataset = Dataset.from_pandas(df_val)
test_dataset = Dataset.from_pandas(df_test)

In [11]:
health_dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

In [12]:
MODEL_NAME = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


In [13]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [14]:
def preprocess_function(examples):
    inputs = [q + " [SEP] " + a for q, a in zip(examples["question"], examples["answer"])]
   # The "inputs" are the tokenized answer:
#    inputs = [doc for doc in examples["question"] + " [SEP] " + doc for doc in examples["answer"]]
    
    model_inputs = tokenizer(inputs, max_length=200, truncation=True, padding=True, return_tensors="pt")
  
   # The "labels" are the tokenized outputs:
    return model_inputs

In [15]:
tokenized_dataset = health_dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/9843 [00:00<?, ? examples/s]

Map:   0%|          | 0/9843 [00:00<?, ? examples/s]

Map:   0%|          | 0/9843 [00:00<?, ? examples/s]

In [18]:

def load_model(model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    return model


def load_tokenizer(tokenizer_path):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    return tokenizer

def generate_text(model_path, sequence, max_length):
    
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

In [21]:
model2_path = "la-min/gpt-2-health-faq"
sequence2 = "[Q] What is bacteria?"
max_len = 50
generate_text(model2_path, sequence2, max_len) 

[Q] What is bacteria? The bacteria that cause Q fever are called cutaneous enterococci. These bacteria are found throughout the body and in the lungs. They can cause anemia, which is a condition that affects red blood cells that your


In [20]:
model2_path = "la-min/gpt-2-health-faq"
sequence2 = "[Q] Who is at risk for Lymphocytic Choriomeningitis (LCM)? ?"
max_len = 50
generate_text(model2_path, sequence2, max_len) 

[Q] Who is at risk for Lymphocytic Choriomeningitis (LCM)?? [SEP] The disease is one of the most serious illnesses affecting the lymph system. For many people, the disease is linked to a
