In [None]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
from json import load as jload
from random import shuffle

import pandas as pd
import torch
from google.colab import userdata
from huggingface_hub import login as hf_login
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from rich.console import Console
from rich.table import Table
from rouge import Rouge
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import TrainingArguments
from transformers import Trainer
from wandb import init, login as wandb_login

In [None]:
!tar -xvzf data.tgz
data = jload(open('/content/askubuntu_stream.json', 'r') )
data.extend(jload(open('/content/superuser_stream.json', 'r')))

superuser_stream.json
askubuntu_stream.json


In [None]:
# Wrap the loaded records in a DataFrame so the prompt/target text columns can
# be derived row by row further down in this cell.
df = pd.DataFrame(data)

def create_input_text(row):
    """Build the model prompt for a single record.

    Args:
        row: Mapping-like record providing 'initial_title', 'initial_body' and
            'discussions' (a string whose line breaks are collapsed to spaces).

    Returns:
        The prompt string: the "refine question:" prefix, the initial title and
        body, then the single-line discussion after a "discussion:" marker.
    """
    # splitlines() + join flattens every kind of line break into one space.
    flattened_discussion = " ".join(row['discussions'].splitlines())
    return "refine question: {} {} discussion: {}".format(
        row['initial_title'],
        row['initial_body'],
        flattened_discussion,
    )

def create_target_text(row):
    """Build the training target for a single record.

    Args:
        row: Mapping-like record providing 'final_title' and 'final_body'.

    Returns:
        The final title and final body joined by a single space.
    """
    return "{} {}".format(row['final_title'], row['final_body'])

# Derive the prompt and target strings for every record (row-wise apply).
df['input_text'] = df.apply(create_input_text, axis='columns')
df['target_text'] = df.apply(create_target_text, axis='columns')

# Seeded 80/20 split: sample the training rows, keep the remainder for
# evaluation, then give both frames a fresh 0..n-1 index.
train_df = df.sample(frac=0.8, random_state=42)
eval_df = df.drop(index=train_df.index).reset_index(drop=True)
train_df = train_df.reset_index(drop=True)

# Quick look at the first few prompt/target pairs.
print(train_df.head()[['input_text', 'target_text']])

                                          input_text  \
0  refine question: Windows 7 XP Mode disable tim...   
1  refine question: Simplest way to use graphic a...   
2  refine question: Realtek HD Audio playing weir...   
3  refine question: SSD Installation error: Read-...   
4  refine question: windows media player can't pl...   

                                         target_text  
0  Windows 7 XP Mode disable time sync So I've tr...  
1  Simplest way to use graphic acceleration to wa...  
2  Realtek HD Audio playing weird with certain vi...  
3  SSD Installation error: Read-only file system:...  
4  windows media player can't play DVDs even with...  


In [None]:
# Load the pretrained tokenizer and seq2seq model that the cells below fine-tune.
model_name = 't5-base'  # Checkpoint id; another T5 size (e.g. 't5-small' or 't5-large') can be substituted
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Authenticate against Weights & Biases with the key kept in the Colab secret
# store, then open the run that the Trainer reports to.
wandb_login(key=userdata.get("WANDB_TOKEN"))
run = init(project='t4-qna-moderation', job_type="training", anonymous="allow")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mruturajk2404[0m ([33mruturajk2404-pict[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
class QuestionRefinementDataset(Dataset):
    """Map-style dataset that tokenizes prompt/target pairs on access.

    Each item is built from the 'input_text' and 'target_text' columns of one
    dataframe row. Both texts are truncated and padded to fixed lengths so that
    items can be stacked into batches without a custom collate function.

    Args:
        dataframe: Frame with 'input_text' and 'target_text' columns; rows are
            addressed positionally (``iloc``).
        tokenizer: Callable tokenizer returning 'input_ids' and
            'attention_mask' tensors (called with ``return_tensors='pt'``).
        source_max_len: Fixed token length of the encoded prompt.
        target_max_len: Fixed token length of the encoded target.
        label_pad_token_id: Optional replacement id for padded label
            positions. With the default ``None`` the labels keep the
            tokenizer's own padding id, so padding takes part in the training
            loss. Pass ``-100`` (the ignore index of the Hugging Face T5 loss)
            to exclude padding from the loss; code that later decodes such
            labels must map the id back to a real token id first.
    """

    def __init__(self, dataframe, tokenizer, source_max_len, target_max_len, label_pad_token_id=None):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.source_max_len = source_max_len
        self.target_max_len = target_max_len
        self.label_pad_token_id = label_pad_token_id

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.dataframe)

    def _encode(self, text, max_length):
        """Tokenize ``text`` to exactly ``max_length`` tokens (truncate + pad)."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

    def __getitem__(self, index):
        """Return the encoded tensors for the row at position ``index``.

        Returns:
            Dict with 1-D tensors 'input_ids', 'attention_mask', 'labels' and
            'decoder_attention_mask'.
        """
        row = self.dataframe.iloc[index]
        source_encoding = self._encode(row['input_text'], self.source_max_len)
        target_encoding = self._encode(row['target_text'], self.target_max_len)

        # The tokenizer returns a leading batch axis of size 1; flatten drops it.
        labels = target_encoding['input_ids'].flatten()
        decoder_attention_mask = target_encoding['attention_mask'].flatten()

        if self.label_pad_token_id is not None:
            # Padded positions are exactly those the attention mask marks as 0.
            labels = labels.masked_fill(decoder_attention_mask == 0, self.label_pad_token_id)

        return {
            'input_ids': source_encoding['input_ids'].flatten(),
            'attention_mask': source_encoding['attention_mask'].flatten(),
            'labels': labels,
            'decoder_attention_mask': decoder_attention_mask,
        }

# Fixed token budgets for the encoder input and the decoder target.
SOURCE_MAX_LEN = 512
TARGET_MAX_LEN = 128

# Wrap both splits with the same tokenizer and length limits (train first).
train_dataset, eval_dataset = (
    QuestionRefinementDataset(split_frame, tokenizer, SOURCE_MAX_LEN, TARGET_MAX_LEN)
    for split_frame in (train_df, eval_df)
)

In [None]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./t5_question_refinement',
    logging_dir='./logs',
    # Optimisation schedule.
    num_train_epochs=3,  # Adjust as needed
    learning_rate=1e-4,
    warmup_steps=500,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch; evaluation and save strategies are
    # kept identical because the best checkpoint is reloaded at the end.
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,  # a lower eval_loss is better
    # Metric reporting.
    logging_steps=500,
    report_to="wandb",  # "tensorboard" is an alternative tracking backend
)

In [None]:
# Bind the model, the training configuration and both dataset splits together.
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset)

In [None]:
# Run fine-tuning; as the last expression of the cell, the returned value is
# shown as the cell output.
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,1.0811,0.220075
2,0.2214,0.218758
3,0.1942,0.214885


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=2799, training_loss=0.3660777011229422, metrics={'train_runtime': 4041.7419, 'train_samples_per_second': 5.539, 'train_steps_per_second': 0.693, 'total_flos': 1.363213133807616e+16, 'train_loss': 0.3660777011229422, 'epoch': 3.0})

In [None]:
# Publish the tokenizer and the fine-tuned model to one Hugging Face Hub
# repository. The repo id lives in a single constant so both uploads cannot
# drift apart; `hf_login` is imported in the notebook's import cell.
HUB_REPO_ID = "thedev3301/t5-question-enhancement"

hf_login(userdata.get('HF_TOKEN'))
tokenizer.push_to_hub(HUB_REPO_ID)
model.push_to_hub(HUB_REPO_ID)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/thedev3301/t5-question-enhancement/commit/de307313f6496ec9c388f09f9994036f59b2cdad', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='de307313f6496ec9c388f09f9994036f59b2cdad', pr_url=None, repo_url=RepoUrl('https://huggingface.co/thedev3301/t5-question-enhancement', endpoint='https://huggingface.co', repo_type='model', repo_id='thedev3301/t5-question-enhancement'), pr_revision=None, pr_num=None)

In [None]:
def evaluate_model_batched(model, eval_dataset, tokenizer, max_length=128, batch_size=32, device="cuda" if torch.cuda.is_available() else "cpu"):
    """Generate predictions for a dataset and decode them next to the references.

    Args:
        model: Model exposing ``eval()``, ``to(device)`` and ``generate(...)``.
        eval_dataset: Dataset whose items hold 'input_ids', 'attention_mask'
            and 'labels' tensors.
        tokenizer: Tokenizer used to decode generated ids and label ids.
        max_length: Maximum length passed to ``generate``.
        batch_size: Number of items per generation batch.
        device: Device the model and inputs are moved to; the default is
            chosen once, when the function is defined.

    Returns:
        Tuple ``(predictions, references)`` of decoded strings in dataset order.
    """
    model.eval()
    model.to(device)
    predictions = []
    references = []
    # No shuffling, so predictions stay aligned with the dataset order.
    eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)
    pad_token_id = tokenizer.pad_token_id

    with torch.no_grad():
        for batch in eval_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Labels are only decoded, never fed to the model, so they stay on
            # the CPU. Positions carrying the -100 loss-ignore index are not
            # valid token ids; map them back to padding before decoding.
            labels = batch['labels']
            if pad_token_id is not None:
                labels = labels.masked_fill(labels == -100, pad_token_id)

            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
                num_beams=4,
                early_stopping=True
            )

            predictions.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))
            references.extend(tokenizer.batch_decode(labels, skip_special_tokens=True))

    return predictions, references

In [None]:
# Decode model outputs and reference targets for the whole evaluation split.
predictions, references = evaluate_model_batched(
    model, eval_dataset, tokenizer, max_length=TARGET_MAX_LEN
)

# Corpus BLEU over whitespace tokens: every hypothesis is paired with a
# one-element list holding its single tokenized reference.
smoothing = SmoothingFunction().method4
bleu_score = corpus_bleu(
    [[reference_tokens] for reference_tokens in map(str.split, references)],
    list(map(str.split, predictions)),
    smoothing_function=smoothing,
)

# ROUGE scores averaged over all prediction/reference pairs.
rouge = Rouge()
rouge_scores = rouge.get_scores(predictions, references, avg=True)

print(f"BLEU Score: {bleu_score}")
print(f"ROUGE Scores: {rouge_scores}")

BLEU Score: 0.8867205432476581
ROUGE Scores: {'rouge-1': {'r': 0.9153954630303685, 'p': 0.9323206697442847, 'f': 0.9199009348193602}, 'rouge-2': {'r': 0.8836217038882133, 'p': 0.9025873978232651, 'f': 0.8889337980072143}, 'rouge-l': {'r': 0.9144634609529863, 'p': 0.9313083496891066, 'f': 0.9189383106926892}}


In [None]:
# Hand-written sample prompts, each in the form "<title>: <description>".
# The inference cell below feeds them to the model one at a time, and the table
# cell splits them on ": " for display.
internet_custom_texts_ctxt = [
    "Macbook Pro strange display issues: Last week my Macbook Pro from 2012 suddenly had a display issue. I was working on something using my external monitor (through thunderbolt -> displayPort) when the screen showed some banding and the computer suddenly turned itself off..",
    "Problem with my internet connection: Recently I see that I often lose internet connectivity on my Linux machines. I am starting to think that it may be a router problem, but I have no idea to check if this is indeed the problem",
    "Intermittent Wi-Fi disconnections and slow speeds on Ubuntu 20.04: I just boot a new ubuntu 20.04.3, and when I restart my wifi gets slow, and sometimes it automatically disconnected"
]

In [None]:
# Hand-written titles, presumably the expected refinements for the three sample
# prompts in the same order (TODO confirm). No code visible in this notebook
# reads this list.
final_titles = [
    "2012 Macbook Pro Display Issues with External Monitor (Thunderbolt)",
    "Troubleshooting Intermittent Internet Connectivity on Linux (potentially a Router Issue)",
    "Ubuntu 20.04: Wi-Fi Slows/Disconnects After every Restart"
]

In [None]:
responses = []

# Use the GPU when one is present and fall back to CPU otherwise, matching the
# default device choice of evaluate_model_batched.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.eval()
model.to(device)

# NOTE(review): the training prompts start with "refine question:" while these
# sample prompts do not, and generate() runs with the library's default length
# limit here — confirm both are intended.
with torch.no_grad():
    for text in internet_custom_texts_ctxt:
        # Each prompt is encoded on its own, so no padding is needed; only the
        # truncation limit is kept.
        tokens = tokenizer(
            text,
            max_length=1024,
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        input_ids = tokens["input_ids"].to(device)
        att = tokens["attention_mask"].to(device)
        response = model.generate(input_ids=input_ids, attention_mask=att)

        # generate() returns one sequence per input row; there is a single row.
        responses.append(tokenizer.decode(response[0], skip_special_tokens=True))

In [None]:
# Show every sample prompt next to the text the model generated for it.
table = Table(title="Generated output")
table.add_column("Initial Title", style="bright_white")
table.add_column("Context", style="bright_yellow")
table.add_column("Final Title", style="bright_green")
for initial, final in zip(internet_custom_texts_ctxt, responses):
    # Prompts are written as "<title>: <context>". Split on the first separator
    # only, so a later ": " inside the context stays intact; add_row needs
    # strings (or other rich renderables), not a list.
    title, _, context = initial.partition(": ")
    table.add_row(title, context, final)

# The builtin print would only show the Table object's repr; a rich Console
# renders the actual table. `Console` is imported in the notebook's import cell.
Console().print(table)