In [None]:
# Authors: Nina Koh, Chelsea Kendrick
# Setup: Clone repo & install dependencies

!rm -rf UW-LING-573-Group-5
!git clone https://github.com/chelsk5/UW-LING-573-Group-5.git

!pip install -r UW-LING-573-Group-5/requirements.txt

In [None]:
# Move the kernel's working directory into the cloned repo, then list its contents.
# NOTE(review): the path is relative, so re-running this cell while already
# inside the repo presumably fails — confirm, and restart the kernel before re-running.
%cd UW-LING-573-Group-5/
%ls

In [None]:
# 1. Load data
from datasets import load_dataset

HELD_OUT_FRACTION = 0.2  # follow common split ratio
SPLIT_SEED = 1           # fixed seed for the train/test split

# Fetch the "train" split of Opinosis from the HuggingFace hub
opinosis = load_dataset("kavgan/opinosis", split="train")

# Carve a held-out test portion out of it
dataset = opinosis.train_test_split(test_size=HELD_OUT_FRACTION, seed=SPLIT_SEED)
train_data, test_data = dataset["train"], dataset["test"]

In [None]:
# 2. Preprocessing
def clean_data(sent_to_sum):
  """Build a cleaned record from one raw dataset row.

  Args:
    sent_to_sum: row mapping holding the review text under "review_sents"
      and a sequence of reference summaries under "summaries".

  Returns:
    A dict with two keys: "reviews" (the review text with surrounding
    whitespace stripped) and "gold_summary" (the first reference summary,
    likewise stripped).
  """
  return dict(
      reviews=sent_to_sum["review_sents"].strip(),
      # Only the first gold summary is kept as the target
      gold_summary=sent_to_sum["summaries"][0].strip(),
  )

# Sanity check!
# Clean every row of the hub's "train" split and eyeball the first two results
data = load_dataset("kavgan/opinosis")["train"]
cleaned_data = data.map(clean_data)

# Show each example's input text followed by its target summary
for position in range(2):
  example = cleaned_data[position]
  # Every heading after the very first one is preceded by a blank line
  print("Input reviews:" if position == 0 else "\nInput reviews:")
  print(example["reviews"])
  print("\nTarget summary:")
  print(example["gold_summary"])

In [None]:
# 3. Load tokenizer & model
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# One checkpoint name drives both loads so tokenizer and model stay in sync
model_name = "t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# 4. Tokenize reviews & gold summaries
def tokenize_data(cleaned_data, max_input_length=512, max_target_length=64):
  """Convert one cleaned example into token IDs for model processing.

  Args:
    cleaned_data: mapping with the review text under "reviews" and the
      target summary under "gold_summary" (both plain strings).
    max_input_length: the prefixed review text is truncated and padded to
      exactly this many tokens.
    max_target_length: the summary is truncated to at most this many
      tokens; it is not padded here.

  Returns:
    The tokenizer output for the prefixed reviews, with an extra "labels"
    entry holding the summary's token IDs (format for HuggingFace Trainer).
  """
  # Prepend the task prefix so T5 knows which task to execute
  pref_text = "summarize: " + cleaned_data["reviews"]

  # Tokenize input (reviews); every example comes out with the same length
  tokenized_input = tokenizer(
      pref_text,
      max_length=max_input_length,
      truncation=True,
      padding="max_length",
  )

  # Tokenize target output (gold summary). Passing a single string yields a
  # flat list of IDs, so no list wrapping/unwrapping is needed.
  labels = tokenizer(
      cleaned_data["gold_summary"],
      max_length=max_target_length,
      truncation=True,
  )

  # Add target labels to the model input
  tokenized_input["labels"] = labels["input_ids"]
  return tokenized_input

# Clean and tokenize the training split only. Building this from the full,
# unsplit dataset would put the held-out test examples into the training set.
tokenized_train = train_data.map(clean_data).map(tokenize_data, batched=False) # one row at a time

# Check tokenization
print("Tokenized input example (input_ids):")
print(tokenized_train[0]["input_ids"])
print("Tokenized target (labels):")
print(tokenized_train[0]["labels"])



In [None]:
# 5.
