In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments


In [10]:
from peft import LoraConfig, get_peft_model

def fine_tune_gpt2(model_name, train_file, output_dir):
    # Load GPT-2 model and tokenizer
    model = GPT2LMHeadModel.from_pretrained(model_name)

    config = LoraConfig(
        r=32,
        lora_alpha=32,
        target_modules=["c_proj"],
        lora_dropout=0.1,
        bias="lora_only",
        modules_to_save=["decode_head"],
    )
    lora_model = get_peft_model(model, config)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Load training dataset
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_file,
        block_size=128)
    # Create data collator for language modeling
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    # Set training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=4,
        save_steps=500,
    )
    # Train the model
    trainer = Trainer(
        model=lora_model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        tokenizer= tokenizer
    )
    trainer.train()
    # Save the fine-tuned model
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

In [15]:
from datasets import load_dataset


In [23]:
import sys
sys.path.insert(0, '../data')

import data_funcs as df
import importlib
importlib.reload(df)

<module 'data_funcs' from '/home/webs8328/cs182_proj/gpt_code/../data/data_funcs.py'>

In [34]:
ds_test = df.get_dataset()

In [25]:
print(ds)

Dataset({
    features: ['source', 'term', 'quantifier_frequency', 'quantifier_number', 'generic_sentence', 'score'],
    num_rows: 3433000
})


In [27]:
df.format_data_yoda(ds, [1, 3, 5])

['User: Tell me about a.net income.\nYoda: By net sales, a.net income is divided. ',
 'User: Tell me about a.sound wave.\nYoda: Reflect, a.sound waves can. ',
 'User: Tell me about a.this chapter.\nYoda: With some wonderful images, a.this chapter begins. ']

In [51]:
import inspect

signature = inspect.signature(ds.__getattribute__).parameters
for name, parameter in signature.items():
    print(name, parameter.default, parameter.annotation, parameter.kind)


name <class 'inspect._empty'> <class 'inspect._empty'> POSITIONAL_ONLY


In [35]:
print(str(ds_test.__class__.__name__))

Dataset


In [65]:
for i in df.format_data_yoda(ds, [1, 2, 434,3, 2,23000, 5, 4,2, 29384, 1298]):
    print(i)

User: Tell me about a.net income.
Yoda: By net sales, a.net income is divided. 
User: Tell me about a.p.e motorcycle.
Yoda: In bondage videos, a.p.e motorcycle babes specializes. 
User: Tell me about aba.
Yoda: On data to make decisions about behavior, aba relies. 
User: Tell me about a.sound wave.
Yoda: Reflect, a.sound waves can. 
User: Tell me about a.p.e motorcycle.
Yoda: In bondage videos, a.p.e motorcycle babes specializes. 
User: Tell me about action.
Yoda: Into account immediately, every action is taken. 
User: Tell me about a.this chapter.
Yoda: With some wonderful images, a.this chapter begins. 
User: Tell me about a.their image.
Yoda: A.their images are inverted. 
User: Tell me about a.p.e motorcycle.
Yoda: In bondage videos, a.p.e motorcycle babes specializes. 
User: Tell me about actual fishing.
Yoda: After class, actual fishing is encouraged. 
User: Tell me about abiotic factor.
Yoda: A physical or nonliving factor that shapes an ecosystem, abiotic factor is. 


In [44]:
ds_test.__delattr__('source')

AttributeError: source

In [45]:
dir(ds_test)

['_TF_DATASET_REFS',
 '__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getitems__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_local_temp_path',
 '_check_index_is_initialized',
 '_data',
 '_estimate_nbytes',
 '_fingerprint',
 '_format_columns',
 '_format_kwargs',
 '_format_type',
 '_generate_tables_from_cache_file',
 '_generate_tables_from_shards',
 '_get_cache_file_path',
 '_get_output_signature',
 '_getitem',
 '_indexes',
 '_indices',
 '_info',
 '_map_single',
 '_new_dataset_with_indices',
 '_output_all_columns',
 '_push_parquet_shards_to_hub',
 '_save_to_disk_single',
 '_select_contiguous',
 '_select_with_indices_mappin

In [48]:
ds.__getattribute__('term')

AttributeError: 'Dataset' object has no attribute 'term'

In [58]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path="/home/webs8328/cs182_proj/data/untitled.txt",
        block_size=128)

In [63]:
print(train_dataset)

<transformers.data.datasets.language_modeling.TextDataset object at 0x7fa2c9742760>


In [None]:
fine_tune_gpt2("gpt2", "/home/webs8328/cs182_proj/data/train_text.txt", "test_output2")



Step,Training Loss
500,2.7646
1000,2.2172
1500,2.1716
2000,2.1295
2500,2.1061


In [6]:
from transformers import GPT2Model

model = GPT2Model.from_pretrained("gpt2")
for i in model.named_modules():
    print(i)


('', GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
))
('wte', Embedding(50257, 768))
('wpe', Embedding(1024, 768))
('drop', Dropout(p=0.1, inplace=False))
('h', ModuleList(
  (0-11): 12 x GPT2Block(
    (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (attn): GPT2Attention(
      (c_attn): Con

In [9]:
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")

print(model)

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)
