# Starbot Transformers
Final project for CSE 498 G1.

## Importing HuggingFace Transformers
Follows the causal language modeling tutorial from [HuggingFace Transformers](https://huggingface.co/docs/transformers/main/en/tasks/language_modeling).

In [None]:
!pip install transformers datasets evaluate
from huggingface_hub import notebook_login

notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


## Import Dataset for Fine-Tuning

In [None]:
from datasets import load_dataset
from google.colab import drive
import os

# Load dataset from Google Drive
# Mount the user's Drive at /gdrive (Colab only) and list the mount point as a
# quick sanity check that the mount succeeded.
drive.mount("/gdrive")
!ls /gdrive
# Directory that holds the project's CSV data; later cells append file names
# to it, so the trailing slash is required.
# NOTE(review): the listing above shows `MyDrive` while this path uses
# `My Drive`; the CSV load in the next cell succeeded with it, presumably via a
# Colab compatibility alias — confirm before relying on it.
BASE_PATH = "/gdrive/My Drive/colab_files/starbot-transformers/"

Mounted at /gdrive
MyDrive  Shareddrives


In [None]:
# Load dataset from CSV
dataset = load_dataset("csv", data_files=BASE_PATH + "general.csv", split="train[:]")
# Drop empty rows and rows containing links in a single pass over the data.
# The None check must come first so the substring test never sees None.
dataset = dataset.filter(
    lambda e: e["content"] is not None and "http" not in e["content"]
)
# Fixed seed so the train/test partition is identical on every
# "Restart & Run All"; without it the split (and every metric) changes per run.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
print(f'{len(dataset["train"])} training examples, {len(dataset["test"])} test examples')
# Preview the first ten messages. Slicing rows first avoids materialising the
# whole "content" column just to look at ten entries.
# NOTE(review): this displays raw chat messages — clear the output before
# sharing the notebook if the data is private.
dataset["train"][:10]["content"]



  0%|          | 0/257 [00:00<?, ?ba/s]

  0%|          | 0/249 [00:00<?, ?ba/s]

196888 training examples, 49222 test examples


['si :(',
 "And it's a duet",
 'Get this NTR filth out of here',
 'gdi jarrett',
 'thanks',
 'well',
 'ok sure thing. lemme try that',
 'Precious',
 'I care way more about sound, composition, and production especially than lyrics',
 "i just know it's very delicious and highly rated"]

# Data Preprocessing
## Import Tokenizer

In [None]:
from transformers import AutoTokenizer

# Load the pretrained tokenizer for the "distilgpt2" checkpoint.
# NOTE(review): the same checkpoint name is repeated as `model_id` in the
# model cell further down — keep the two in sync when changing the base model.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

Downloading:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

## Tokenize Input

In [None]:
# Probe how an angle-bracket tag is tokenized. The saved output shows four
# input ids, i.e. the tag is split into several ordinary tokens rather than
# being treated as one dedicated token.
tokenizer('<startmsg>')

{'input_ids': [27, 9688, 19662, 29], 'attention_mask': [1, 1, 1, 1]}

In [None]:
# Plain-text markers that preprocess_function wraps around every message so
# message boundaries remain visible after examples are concatenated.
msg_start_token = '<msg>'
msg_end_token = '</msg>'

def preprocess_function(example):
    """Tokenize one dataset row.

    The row's "content" string is lower-cased, wrapped between the
    module-level ``msg_start_token`` and ``msg_end_token`` markers, and passed
    to the module-level ``tokenizer``; the tokenizer's output is returned
    unchanged.
    """
    wrapped = "".join((msg_start_token, example["content"].lower(), msg_end_token))
    return tokenizer(wrapped)
    
# Tokenize every example of both splits. The original CSV columns are removed
# so that only the tokenizer's output columns remain.
tokenized_dataset = dataset.map(
    preprocess_function,
    remove_columns=dataset["train"].column_names,
)

# Inspect the first training example as ids and as decoded text. Indexing the
# row first reads a single record instead of loading the entire "input_ids"
# column just to take its first element.
sample_id = tokenized_dataset["train"][0]["input_ids"]
print(sample_id)
print(tokenizer.decode(sample_id))

  0%|          | 0/7162 [00:00<?, ?ex/s]

  0%|          | 0/1791 [00:00<?, ?ex/s]

[27, 19662, 29, 2188, 354, 3754, 64, 13674, 616, 6621, 318, 2279, 257, 467, 354, 3754, 64, 3807, 815, 1053, 587, 290, 326, 338, 477, 340, 2476, 284, 307, 3556, 19662, 29]
<msg>gochiusa dear my sister is everything a gochiusa movie should've been and that's all it needs to be</msg>


## Block Input

In [None]:
from itertools import chain

# Number of tokens per training block produced by group_texts.
block_size = 128

def group_texts(examples):
    """Concatenate a batch of tokenized examples and cut it into fixed blocks.

    Args:
        examples: mapping of column name -> list of per-example lists
            (a batch as passed by ``Dataset.map(..., batched=True)``).

    Returns:
        A dict with the same columns, each a list of ``block_size``-long
        chunks, plus a "labels" column that is a copy of the "input_ids"
        chunk list. Tokens left over after the last full block are dropped.
    """
    # chain.from_iterable flattens in linear time; the previous
    # sum(list_of_lists, []) re-copied the accumulator for every example.
    concatenated = {
        column: list(chain.from_iterable(rows)) for column, rows in examples.items()
    }
    # All columns are flattened from the same examples; the first one sets the length.
    first_column = next(iter(concatenated.values()))
    # Round down to a whole number of blocks.
    total_length = (len(first_column) // block_size) * block_size
    result = {
        column: [
            tokens[start : start + block_size]
            for start in range(0, total_length, block_size)
        ]
        for column, tokens in concatenated.items()
    }
    # The labels are the inputs themselves; they are not shifted here.
    result["labels"] = result["input_ids"].copy()
    return result

# Regroup the tokenized examples into fixed-size blocks, batch by batch,
# using four worker processes.
lm_dataset = tokenized_dataset.map(
    group_texts,
    batched=True,
    num_proc=4,
)

# Decode the first training block to check the grouping. Indexing the row
# first avoids loading the whole "input_ids" column for one element.
tokenizer.decode(lm_dataset["train"][0]["input_ids"])

       

#1:   0%|          | 0/2 [00:00<?, ?ba/s]

#0:   0%|          | 0/2 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/2 [00:00<?, ?ba/s]

#3:   0%|          | 0/2 [00:00<?, ?ba/s]

        

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

"<msg>gochiusa dear my sister is everything a gochiusa movie should've been and that's all it needs to be</msg><msg><@503787499605458954> hows that tony essay coming along?</msg><msg>like you'd have to be <@472541429130854421>'s frame just to start</msg><msg>additional stone-less timelines right</msg><msg>i understand now, and i would agree</msg><msg>i invite you to share why as well</msg><msg>"

# Padding for Causal Language Modeling

In [None]:
from transformers import DataCollatorForLanguageModeling

# Reuse the end-of-sequence token as the padding token
# (presumably because this tokenizer defines no pad token of its own — confirm).
tokenizer.pad_token = tokenizer.eos_token
# mlm=False: build batches for causal, not masked, language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

## Import Model
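The base model can be changed by editing `model_id` below; the tokenizer checkpoint loaded earlier should be changed to match.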

In [None]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

# Hub checkpoint to fine-tune.
# NOTE(review): the tokenizer cell above hard-codes the same name; keep the
# two in sync when changing the base model.
model_id = "distilgpt2"
# Load the pretrained weights with a causal language modeling head.
model = AutoModelForCausalLM.from_pretrained(model_id)

Downloading:   0%|          | 0.00/353M [00:00<?, ?B/s]

## Define Fine-tuning Training Parameters

In [None]:
# Fine-tuning hyperparameters. Evaluation runs once per epoch and the model is
# pushed to the Hub repository named after output_dir.
training_args = TrainingArguments(
    output_dir="starbot-transformers",
    evaluation_strategy="epoch",
    num_train_epochs=5,
    # Was 0.005, which is orders of magnitude above the usual fine-tuning
    # range: the recorded run shows validation loss climbing from 3.89 to 4.56
    # after epoch 2. 2e-5 is the value used by the Hugging Face language
    # modeling guide this notebook follows.
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True,
)

# Trainer wired to the blocked datasets and the causal-LM data collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
/content/starbot-transformers is already a clone of https://huggingface.co/wenjalan/starbot-transformers. Make sure you pull the latest changes with `repo.git_pull()`.


# Train Model

In [None]:
# Run fine-tuning with the arguments defined above; the returned TrainOutput
# is displayed as the cell result.
trainer.train()
# Explicit upload left disabled; TrainingArguments already sets push_to_hub=True.
# trainer.push_to_hub()

***** Running training *****
  Num examples = 920
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 575
  Number of trainable parameters = 81912576


Epoch,Training Loss,Validation Loss
1,No log,3.924803
2,No log,3.892054
3,No log,3.993219
4,No log,4.216694
5,3.067600,4.55771


***** Running Evaluation *****
  Num examples = 230
  Batch size = 8
***** Running Evaluation *****
  Num examples = 230
  Batch size = 8
***** Running Evaluation *****
  Num examples = 230
  Batch size = 8
***** Running Evaluation *****
  Num examples = 230
  Batch size = 8
Saving model checkpoint to starbot-transformers/checkpoint-500
Configuration saved in starbot-transformers/checkpoint-500/config.json
Model weights saved in starbot-transformers/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 230
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=575, training_loss=2.9759375, metrics={'train_runtime': 138.7029, 'train_samples_per_second': 33.164, 'train_steps_per_second': 4.146, 'total_flos': 150245631590400.0, 'train_loss': 2.9759375, 'epoch': 5.0})

# Evaluate Model

In [None]:
import math

# Evaluate on the held-out split and report perplexity, i.e. the exponential
# of the evaluation loss.
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print(f"Perplexity: {perplexity:.2f}")

***** Running Evaluation *****
  Num examples = 3846
  Batch size = 8


Perplexity: 2476.20


# Inference

In [None]:
# !pip install transformers
from transformers import pipeline
from transformers import AutoTokenizer

# Inference setup: a fresh distilgpt2 tokenizer paired with the fine-tuned
# weights published on the Hub. Note that this rebinds the notebook-level
# `tokenizer` name.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
generator = pipeline(
    task="text-generation",
    model="wenjalan/starbot-transformers",
    tokenizer=tokenizer,
)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Downloading:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/334M [00:00<?, ?B/s]

In [None]:
# Generate a continuation of a message that starts with "what" and display the
# text between the opening marker and the first closing marker.
# NOTE(review): the saved output of this cell is a ValueError whose message
# was not captured — re-run to find the cause.
output = generator("<msg>what")
generated_text = output[0]["generated_text"]
# Keep everything before the first "</msg>", then drop the 5-character "<msg>" prefix.
generated_text.partition("</msg>")[0][5:]

ValueError: ignored

In [None]:
# Generate one message directly with the in-memory model instead of the pipeline.
# NOTE(review): the saved output of this cell is a TypeError whose message was
# not captured — re-run to confirm the fixes below resolve it.
msg_start_token = "<msg>"
# Was "<msg>", identical to the start marker: the decoded text begins with the
# start marker, so splitting on it always produced an empty result.
msg_end_token = "</msg>"

prompt = msg_start_token
# Put the prompt on whatever device the model lives on rather than assuming CUDA.
inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
outputs = model.generate(
    inputs,
    max_length=100,
    pad_token_id=tokenizer.eos_token_id,
    # Sampling must be enabled for `temperature` to have any effect; with
    # greedy decoding the same prompt always yields the same text.
    do_sample=True,
    temperature=0.7,
)
decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
# Keep the text before the first end marker and strip the leading start marker.
decoded.split(msg_end_token)[0][len(msg_start_token):]

TypeError: ignored