In [1]:
import argparse
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm
import evaluate
import os
import pickle
import numpy as np

# Metric
# metric = evaluate.load("rouge")
# Load additional metrics
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")
bleu = evaluate.load("bleu")
meteor = evaluate.load("meteor")
sacrebleu = evaluate.load("sacrebleu")

def preprocess_function(examples, tokenizer, max_input_length, max_output_length):
    inputs = ["summarize: " + doc for doc in examples["dialogue"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, padding="max_length", truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=max_output_length, padding="max_length", truncation=True)

    return {
        "input_ids": model_inputs["input_ids"],
        "attention_mask": model_inputs["attention_mask"],
        "labels": labels["input_ids"],
        "original_texts": examples["dialogue"]
    }

def evaluate_model(model, tokenizer, dataset, max_input_length, max_output_length):
    # wandb_table = wandb.Table(columns=["Index", "Original Text", "Actual Summary", "Predicted Summary"])
    
    predictions, references = [], []
    for index, sample in enumerate(tqdm(dataset)):
        if index > 10:
            break

        preprocessed = preprocess_function({"dialogue": [sample["dialogue"]], "summary": [sample["summary"]]}, tokenizer, max_input_length, max_output_length)
        input_ids = torch.tensor(preprocessed["input_ids"]).to(model.device)
        attention_mask = torch.tensor(preprocessed["attention_mask"]).to(model.device)

        with torch.no_grad():
            # outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=max_output_length)

            outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=100, min_length=30, num_beams=5,
                                 length_penalty=2.0, no_repeat_ngram_size=2, early_stopping=True)
       
        
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(prediction)
        references.append(sample["summary"])

        # if index < 10:
        print(f"Sample {index+1}:")
        print(f"Actual Summary: {sample['summary']}")
        print(f"Predicted Summary: {prediction}\n")
        print("---\n")

        # Add data to wandb table
        # wandb_table.add_data(index, sample["dialogue"], sample["summary"], prediction)

    # Compute metrics
    rouge_scores = rouge.compute(predictions=predictions, references=references, use_stemmer=True)
    bert_scores = bertscore.compute(predictions=predictions, references=references, lang="en")
    bleu_score = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
    meteor_score = meteor.compute(predictions=predictions, references=references)
    sacrebleu_score = sacrebleu.compute(predictions=predictions, references=[[ref] for ref in references])

    # wandb.log({"evaluation_table": wandb_table})
    
    return {
        "rouge": rouge_scores,
        "bertscore": bert_scores,
        "bleu": bleu_score,
        "meteor": meteor_score,
        "sacrebleu": sacrebleu_score
    }


import wandb

def main(args):
    test_dataset = load_dataset("samsum", split="test")
    model = AutoModelForSeq2SeqLM.from_pretrained(args['model_path'])
    tokenizer = AutoTokenizer.from_pretrained("t5-large")

    print(f"\nMODEL: {args['model_path']}\n" )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device: ", device )

    model.to(device)
    # model.cuda()
    model.eval()

    max_input_length = 250
    # max_output_length = 150
    max_output_length = 150

    metrics_results = evaluate_model(model, tokenizer, test_dataset, max_input_length, max_output_length)

    # Initialize dictionary for wandb summary metrics
    wandb_summary_metrics = {}

    # Handle ROUGE metrics
    if 'rouge' in metrics_results:
        rouge_scores = metrics_results['rouge']
        for key, value in rouge_scores.items():
            if isinstance(value, dict) and 'fmeasure' in value:  # Check for the expected structure
                wandb_summary_metrics[f'{key}_fmeasure'] = round(value['fmeasure'] * 100, 2)
                print(f'{key} F-measure: {value["fmeasure"] * 100:.2f}%')
            elif isinstance(value, np.float64):  # Directly dealing with numerical values
                wandb_summary_metrics[key] = round(value * 100, 2)
                print(f'{key}: {value * 100:.2f}%')

    # Handle BERTScore metrics
    if 'bertscore' in metrics_results:
        bert_scores = metrics_results['bertscore']
        for score_type in ['precision', 'recall', 'f1']:
            wandb_summary_metrics[f'bertscore_{score_type}'] = np.mean(bert_scores[score_type])

    # For metrics like BLEU, Meteor, SacreBLEU, ensure they return a 'score'
    for metric_name in ['bleu', 'meteor', 'sacrebleu']:
        if metric_name in metrics_results and 'score' in metrics_results[metric_name]:
            score = metrics_results[metric_name]['score']
            wandb_summary_metrics[metric_name] = score
            # print(f'{metric_name}: {score:.2f}')
    
    print(f'wandb_summary_metrics: \n {wandb_summary_metrics}')
    # Log aggregated metrics to wandb
    # wandb.summary.update(wandb_summary_metrics)
    
    # Log the table as well
    # wandb.log({"evaluation_table": wandb_table})

    # wandb.finish()
    print("Evaluation completed and logged to wandb.")
    # wandb.finish()



[nltk_data] Downloading package wordnet to
[nltk_data]     /users/k/n/kngongiv/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /users/k/n/kngongiv/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /users/k/n/kngongiv/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# evaluate_model.py --model_path='' --run_name='samsum_stage_dp' #EvaluateFinalSamSum_DP

args = {
    "model_path": "/users/k/n/kngongiv/Research/private_llm_generation/dialog/stage2/train/gen_model/samsum_stage_dp",  # Update this to your model's path
    "run_name": "dp",
}

main(args)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.



MODEL: /users/k/n/kngongiv/Research/private_llm_generation/dialog/stage2/train/gen_model/samsum_stage_dp

Device:  cpu


  0%|          | 1/819 [00:24<5:27:54, 24.05s/it]

Sample 1:
Actual Summary: Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.
Predicted Summary: Amanda's number last time we were at the park together Larry called Betty' number Amanda asked Larry to call Bett Bett'.. Amanda was not shy, she was very nice.

---



  0%|          | 2/819 [00:43<4:47:33, 21.12s/it]

Sample 2:
Actual Summary: Eric and Rob are going to watch a stand-up on youtube.
Predicted Summary: I'm watching some of his stand-ups on youtube. Eric: Machine, Rob, and the machine. Rob: I like the train part.

---



  0%|          | 3/819 [01:09<5:18:28, 23.42s/it]

Sample 3:
Actual Summary: Lenny can't decide which trousers to buy. Bob advised Lenny on that topic. Lenny goes with Bob's advice to pick the trousers that are of best quality.
Predicted Summary: Leny: Babe, can you help me with something? Bob: I have purple pants, but I'm not sure which one I should buy. Le's pants are purple, and they're purple.

---



  0%|          | 4/819 [01:28<4:56:27, 21.82s/it]

Sample 4:
Actual Summary: Emma will be home soon and she will let Will know.
Predicted Summary: Will: what do you want for dinner tonight? Emma: I'm not worried about cooking Emma will be home soon Will will pick up Emma Emma.

---



  1%|          | 5/819 [01:45<4:31:54, 20.04s/it]

Sample 5:
Actual Summary: Jane is in Warsaw. Ollie and Jane has a party. Jane lost her calendar. They will get a lunch this week on Friday. Ollie accidentally called Jane and talked about whisky. Jane cancels lunch. They'll meet for a tea at 6 pm.
Predicted Summary: is in Warsaw Jane: are you free for diner the 19th and the 18th? Ollie: we have lunch this week Jane's not having any more whisky.

---



  1%|          | 6/819 [02:05<4:30:00, 19.93s/it]

Sample 6:
Actual Summary: Hilary has the keys to the apartment. Benjamin wants to get them and go take a nap. Hilary is having lunch with some French people at La Cantina. Hilary is meeting them at the entrance to the conference hall at 2 pm. Benjamin and Elliot might join them. They're meeting for the drinks in the evening.
Predicted Summary: I'm going to the conference hall at 2 pm and meet with some French people who are working on the history of food in colonial Mexico. Hilary: I have the keys and will meet them at lunch time.

---



  1%|          | 7/819 [02:27<4:41:56, 20.83s/it]

Sample 7:
Actual Summary: Payton provides Max with websites selling clothes. Payton likes browsing and trying on the clothes but not necessarily buying them. Payton usually buys clothes and books as he loves reading.
Predicted Summary: Payton: I like to shop, but I don't always buy clothes from the same sites. Max: What about clothes? Pa: Max knows some good clothes to buy from. Pa is looking for clothes.

---



  1%|          | 8/819 [02:45<4:25:28, 19.64s/it]

Sample 8:
Actual Summary: Rita and Tina are bored at work and have still 4 hours left.
Predicted Summary: I'm so tired at work Rita is looking at the keyboard hoping that the boss doesn't notice. Tina: She's not cut out for this bore.

---



  1%|          | 9/819 [03:02<4:14:31, 18.85s/it]

Sample 9:
Actual Summary: Beatrice wants to buy Leo a scarf, but he doesn't like scarves. She cares about his health and will buy him a scarf no matter his opinion.
Predicted Summary: I'm in town shopping and I have nice scarfs in the shop next to the church. Leatrice: I don't like them.

---



  1%|          | 10/819 [03:22<4:19:20, 19.23s/it]

Sample 10:
Actual Summary: Eric doesn't know if his parents let him go to Ivan's brother's wedding. Ivan will talk to them.
Predicted Summary: Ivan and Eric are coming to the wedding Eric andvan are going to his brother's wedding.van is not sure about his parents Eric is going Eric to talk to them Eric.

---



  1%|▏         | 11/819 [03:40<4:30:07, 20.06s/it]

Sample 11:
Actual Summary: Wanda wants to throw a party. She asks Gina to borrow her father's car and go do groceries together. They set the date for Friday. 
Predicted Summary: a Wanda: Let's make party, Gina, anda.a want to have some fun. shea to make a list and then go to grocery.

---




Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


rouge1: 34.70%
rouge2: 10.12%
rougeL: 23.13%
rougeLsum: 23.12%
wandb_summary_metrics: 
 {'rouge1': 34.7, 'rouge2': 10.12, 'rougeL': 23.13, 'rougeLsum': 23.12, 'bertscore_precision': 0.8653651204976168, 'bertscore_recall': 0.8848728483373468, 'bertscore_f1': 0.8749148737300526, 'sacrebleu': 7.1901901720004995}
Evaluation completed and logged to wandb.


In [3]:


import argparse
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm
import evaluate
import os
import pickle
import numpy as np

# Metric
# metric = evaluate.load("rouge")
# Load additional metrics
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")
bleu = evaluate.load("bleu")
meteor = evaluate.load("meteor")
sacrebleu = evaluate.load("sacrebleu")

def preprocess_function(examples, tokenizer, max_input_length, max_output_length):
    inputs = ["summarize: " + doc for doc in examples["dialogue"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, padding="max_length", truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=max_output_length, padding="max_length", truncation=True)

    return {
        "input_ids": model_inputs["input_ids"],
        "attention_mask": model_inputs["attention_mask"],
        "labels": labels["input_ids"],
        "original_texts": examples["dialogue"]
    }

def evaluate_model(model, tokenizer, dataset, max_input_length, max_output_length):
    # wandb_table = wandb.Table(columns=["Index", "Original Text", "Actual Summary", "Predicted Summary"])
    
    predictions, references = [], []
    for index, sample in enumerate(tqdm(dataset)):
        if index > 10:
            break

        preprocessed = preprocess_function({"dialogue": [sample["dialogue"]], "summary": [sample["summary"]]}, tokenizer, max_input_length, max_output_length)
        input_ids = torch.tensor(preprocessed["input_ids"]).to(model.device)
        attention_mask = torch.tensor(preprocessed["attention_mask"]).to(model.device)

        #  Set generation parameters
        generation_parameters = {
            "max_length": 150,
            "min_length": 50,
            "no_repeat_ngram_size": 2,
            "num_beams": 5,
            "early_stopping": True,
            "temperature": 0.8,
            "top_k": 50,
            "top_p": 0.95,
            "do_sample": True,  # Enable sampling to use top_k & top_p parameters
        }

        # length_penalty=2.0,  # Length penalty to encourage longer sequences
        # no_repeat_ngram_size=2,  # No repeat ngram size to discourage repetition
        # ------
        # Nucleus Sampling (Top-p sampling)
        # output_sequences = model.generate(
        #     input_ids=input_ids,
        #     max_length=50,
        #     do_sample=True,  # Enable sampling
        #     top_p=0.92,  # Top-p value
        #     top_k=0,  # Disable top-k

        with torch.no_grad():
            # outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=max_output_length)
            outputs = model.generate(input_ids, **generation_parameters)

            # outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=100, min_length=30, num_beams=5,
            #                      length_penalty=2.0, no_repeat_ngram_size=2, early_stopping=True)
       
        
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(prediction)
        references.append(sample["summary"])

        # if index < 10:
        print(f"Sample {index+1}:")
        print(f"Actual Summary: {sample['summary']}")
        print(f"Predicted Summary: {prediction}\n")
        print("---\n")

        # Add data to wandb table
        # wandb_table.add_data(index, sample["dialogue"], sample["summary"], prediction)

    # Compute metrics
    rouge_scores = rouge.compute(predictions=predictions, references=references, use_stemmer=True)
    bert_scores = bertscore.compute(predictions=predictions, references=references, lang="en")
    bleu_score = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
    meteor_score = meteor.compute(predictions=predictions, references=references)
    sacrebleu_score = sacrebleu.compute(predictions=predictions, references=[[ref] for ref in references])

    # wandb.log({"evaluation_table": wandb_table})
    
    return {
        "rouge": rouge_scores,
        "bertscore": bert_scores,
        "bleu": bleu_score,
        "meteor": meteor_score,
        "sacrebleu": sacrebleu_score
    }


import wandb

def main(args):
    test_dataset = load_dataset("samsum", split="test")
    model = AutoModelForSeq2SeqLM.from_pretrained(args['model_path'])
    tokenizer = AutoTokenizer.from_pretrained("t5-large")

    print(f"\nMODEL: {args['model_path']}\n" )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device: ", device )

    model.to(device)
    # model.cuda()
    model.eval()

    max_input_length = 250
    # max_output_length = 150
    max_output_length = 150

    metrics_results = evaluate_model(model, tokenizer, test_dataset, max_input_length, max_output_length)

    # Initialize dictionary for wandb summary metrics
    wandb_summary_metrics = {}

    # Handle ROUGE metrics
    if 'rouge' in metrics_results:
        rouge_scores = metrics_results['rouge']
        for key, value in rouge_scores.items():
            if isinstance(value, dict) and 'fmeasure' in value:  # Check for the expected structure
                wandb_summary_metrics[f'{key}_fmeasure'] = round(value['fmeasure'] * 100, 2)
                print(f'{key} F-measure: {value["fmeasure"] * 100:.2f}%')
            elif isinstance(value, np.float64):  # Directly dealing with numerical values
                wandb_summary_metrics[key] = round(value * 100, 2)
                print(f'{key}: {value * 100:.2f}%')

    # Handle BERTScore metrics
    if 'bertscore' in metrics_results:
        bert_scores = metrics_results['bertscore']
        for score_type in ['precision', 'recall', 'f1']:
            wandb_summary_metrics[f'bertscore_{score_type}'] = np.mean(bert_scores[score_type])

    # For metrics like BLEU, Meteor, SacreBLEU, ensure they return a 'score'
    for metric_name in ['bleu', 'meteor', 'sacrebleu']:
        if metric_name in metrics_results and 'score' in metrics_results[metric_name]:
            score = metrics_results[metric_name]['score']
            wandb_summary_metrics[metric_name] = score
            # print(f'{metric_name}: {score:.2f}')
    
    print(f'wandb_summary_metrics: \n {wandb_summary_metrics}')
    print("Evaluation completed and logged to wandb.")

args = {
    "model_path": "/users/k/n/kngongiv/Research/private_llm_generation/dialog/stage2/train/gen_model/samsum_stage_dp_1",  # Update this to your model's path
    "run_name": "dp",
}

main(args)

[nltk_data] Downloading package wordnet to
[nltk_data]     /users/k/n/kngongiv/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /users/k/n/kngongiv/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /users/k/n/kngongiv/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!



MODEL: /users/k/n/kngongiv/Research/private_llm_generation/dialog/stage2/train/gen_model/samsum_stage_dp_1

Device:  cpu


  0%|          | 1/819 [01:26<19:42:36, 86.74s/it]

Sample 1:
Actual Summary: Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.
Predicted Summary: Amanda Amanda's number last time we were at the park together Larry called Betty' number Amanda asked Larry about his number. Amanda and Larry were talking at park and they were together Amanda: she'd like to text him bye.

---



  0%|          | 2/819 [01:56<12:03:10, 53.11s/it]

Sample 2:
Actual Summary: Eric and Rob are going to watch a stand-up on youtube.
Predicted Summary: I'm watching some of his stand-ups on youtube. Eric: Machine, Rob, and the machine are funny. Rob's stand ups are on YouTube Rob is on Youtube Rob has some standup on the Russian.

---



  0%|          | 3/819 [02:29<9:56:35, 43.87s/it] 

Sample 3:
Actual Summary: Lenny can't decide which trousers to buy. Bob advised Lenny on that topic. Lenny goes with Bob's advice to pick the trousers that are of best quality.
Predicted Summary: Leny: Babe, can you help me with something? Le: I have purple pants, but I'm not sure which one I should buy. Le's: the first pair is purple, and the second is black.

---



  0%|          | 4/819 [02:57<8:33:55, 37.84s/it]

Sample 4:
Actual Summary: Emma will be home soon and she will let Will know.
Predicted Summary: Will: what do you want for dinner tonight? Emma: I don't worry about cooking Emma will be home soon Will will pick up Emma Emma. Emma's not hungry Will Emma and Emma are not worried about the dinner will..

---



  1%|          | 5/819 [03:28<8:00:44, 35.44s/it]

Sample 5:
Actual Summary: Jane is in Warsaw. Ollie and Jane has a party. Jane lost her calendar. They will get a lunch this week on Friday. Ollie accidentally called Jane and talked about whisky. Jane cancels lunch. They'll meet for a tea at 6 pm.
Predicted Summary: Ollie: are you in Warsaw Jane: you are free for diner the 19th and the 18th? Jane's not having any whisky this week Jane has some sun and she'll be there.. Ol has forgotten her calendar.

---



  1%|          | 6/819 [03:58<7:31:44, 33.34s/it]

Sample 6:
Actual Summary: Hilary has the keys to the apartment. Benjamin wants to get them and go take a nap. Hilary is having lunch with some French people at La Cantina. Hilary is meeting them at the entrance to the conference hall at 2 pm. Benjamin and Elliot might join them. They're meeting for the drinks in the evening.
Predicted Summary: I'm going to the conference hall at 2 pm and will meet with French people who are working on the history of food in colonial Mexico. Hilary: I have the keys and we'll meet at the entrance of the hall.

---



  1%|          | 7/819 [04:30<7:27:29, 33.07s/it]

Sample 7:
Actual Summary: Payton provides Max with websites selling clothes. Payton likes browsing and trying on the clothes but not necessarily buying them. Payton usually buys clothes and books as he loves reading.
Predicted Summary: Payton: Know any good sites to buy clothes from? Max: I like to browse and try on clothes, but I don't always buy things from them. Max is looking for good clothes. Pa has a lot of them and they are not all the same.

---



  1%|          | 8/819 [05:03<7:27:48, 33.13s/it]

Sample 8:
Actual Summary: Rita and Tina are bored at work and have still 4 hours left.
Predicted Summary: I'm so tired at work Rita is looking at the keyboard hoping that the boss doesn't notice..a's tired of the bore and she is not fit for the job. Tina: She is bored with work.

---



  1%|          | 9/819 [05:42<7:51:35, 34.93s/it]

Sample 9:
Actual Summary: Beatrice wants to buy Leo a scarf, but he doesn't like scarves. She cares about his health and will buy him a scarf no matter his opinion.
Predicted Summary: Leatrice: I'm in town shopping and they have nice scarfs next to church. Leo: You're complaining about the cold, I don't like them.atrice is going to get a scarf.oo.

---



  1%|          | 10/819 [06:15<7:40:06, 34.12s/it]

Sample 10:
Actual Summary: Eric doesn't know if his parents let him go to Ivan's brother's wedding. Ivan will talk to them.
Predicted Summary: Ivan and Eric are coming to the wedding Eric andvan are going to their brother's wedding.van is to take care of his parents Eric is not going Eric Eric to talk to them Ericvan has to do at home Eric has the guts to speak to him.

---



  1%|▏         | 11/819 [06:44<8:14:39, 36.73s/it]

Sample 11:
Actual Summary: Wanda wants to throw a party. She asks Gina to borrow her father's car and go do groceries together. They set the date for Friday. 
Predicted Summary: a: I want to make party and Gina wants to go with her father to grocery.a's not sure she'll agree. Gin: she will go to the grocery on Friday. she is the best in the world..

---




Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


rouge1: 33.97%
rouge2: 8.79%
rougeL: 23.90%
rougeLsum: 23.93%
wandb_summary_metrics: 
 {'rouge1': 33.97, 'rouge2': 8.79, 'rougeL': 23.9, 'rougeLsum': 23.93, 'bertscore_precision': 0.8624995838512074, 'bertscore_recall': 0.8850834532217546, 'bertscore_f1': 0.8735394857146523, 'sacrebleu': 5.221880180976248}
Evaluation completed and logged to wandb.


In [4]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the dataset
dataset = load_dataset("xsum", split="train")

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-large")

# Initialize counter
num_samples_less_than_200_tokens = 0

# Tokenize and count
for example in dataset:
    document_tokens = tokenizer.tokenize(example["document"])
    summary_tokens = tokenizer.tokenize(example["summary"])
    
    if len(document_tokens) < 200 or len(summary_tokens) < 200:
        num_samples_less_than_200_tokens += 1

print(f"Number of samples with texts or summaries less than 200 tokens: {num_samples_less_than_200_tokens}")
# Number of samples with texts or summaries less than 200 tokens: 204045


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Token indices sequence length is longer than the specified maximum sequence length for this model (538 > 512). Running this sequence through the model will result in indexing errors


Number of samples with texts or summaries less than 200 tokens: 204045


In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the dataset
dataset = load_dataset("xsum", split="test")

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-base")

# Initialize counter
num_samples_less_than_200_tokens = 0

# Tokenize and count
for example in dataset:
    document_tokens = tokenizer.tokenize(example["document"])
    summary_tokens = tokenizer.tokenize(example["summary"])
    
    if len(document_tokens) < 200 or len(summary_tokens) < 200:
        print()
        num_samples_less_than_200_tokens += 1

print(f"Number of samples with texts or summaries less than 200 tokens: {num_samples_less_than_200_tokens}")


Token indices sequence length is longer than the specified maximum sequence length for this model (774 > 512). Running this sequence through the model will result in indexing errors


Number of samples with texts or summaries less than 200 tokens: 11334


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Token indices sequence length is longer than the specified maximum sequence length for this model (774 > 512). Running this sequence through the model will result in indexing errors












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































KeyboardInterrupt: 

In [None]:
# import torch
# from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW, get_linear_schedule_with_warmup
# from datasets import load_dataset
# from torch.utils.data import DataLoader
# from torch.optim import AdamW
# import numpy as np
# import os
# import wandb

# # Initialize Weights & Biases
# # wandb.init(project="Gigaword_Summarization", entity="your_entity", name="gigaword_stage_dp_epochs-8")

# # Check if CUDA is available and set the device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Set seeds for reproducibility
# torch.manual_seed(42)

# # Load the tokenizer
# tokenizer = T5Tokenizer.from_pretrained("t5-large", model_max_length=512)
# model = T5ForConditionalGeneration.from_pretrained("t5-large").to(device)

# # Define the preprocessing function for the Gigaword dataset
# def preprocess_function(examples):
#     # Adjust these lines according to the Gigaword dataset structure
#     inputs = ["summarize: " + doc for doc in examples["document"]]
#     model_inputs = tokenizer(inputs, max_length=512, padding="max_length", truncation=True)
    
#     with tokenizer.as_target_tokenizer():
#         labels = tokenizer(examples["summary"], max_length=150, padding="max_length", truncation=True)
    
#     labels["input_ids"] = [
#         [(label_id if label_id != tokenizer.pad_token_id else -100) for label_id in label]
#         for label in labels["input_ids"]
#     ]

#     model_inputs["input_ids"] = torch.tensor(model_inputs["input_ids"])
#     model_inputs["attention_mask"] = torch.tensor(model_inputs["attention_mask"])
#     model_inputs["labels"] = torch.tensor(labels["input_ids"])
#     return model_inputs

# # Load Gigaword dataset
# dataset = load_dataset("gigaword", split={'train': 'train', 'validation': 'validation'})

# # Preprocess the dataset
# tokenized_datasets = {split: dataset[split].map(preprocess_function, batched=True) for split in dataset}
# tokenized_datasets["train"].set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
# tokenized_datasets["validation"].set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# # # Prepare DataLoader
# # train_loader = DataLoader(tokenized_datasets["train"], batch_size=8, shuffle=True)
# # val_loader = DataLoader(tokenized_datasets["validation"], batch_size=8)

# # # Example Training Loop (simplified)
# # optimizer = AdamW(model.parameters(), lr=5e-5)

# # model.train()
# # for epoch in range(1, epochs + 1):
# #     for batch in train_loader:
# #         input_ids = batch['input_ids'].to(device)
# #         attention_mask = batch['attention_mask'].to(device)
# #         labels = batch['labels'].to(device)

# #         optimizer.zero_grad()
# #         outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
# #         loss = outputs.loss

# #         loss.backward()
# #         optimizer.step()

# #         print(f"Epoch: {epoch}, Loss: {loss.item()}")


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading builder script:   0%|          | 0.00/4.43k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.03k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/578M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3803957 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/189651 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1951 [00:00<?, ? examples/s]

Map:   0%|          | 0/3803957 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [None]:
dataset['train']

In [None]:
from datasets import load_dataset

# Load the Gigaword dataset
dataset = load_dataset("gigaword", split='train')

# Initialize counters
total_text_length = 0
total_summary_length = 0
num_samples = 0

# Loop through the dataset to accumulate lengths
for example in dataset:
    # Tokenize the text and summary to accurately count the number of tokens
    text_tokens = example['document'].split()  # Assuming the field name is 'document'
    summary_tokens = example['summary'].split()  # Assuming the field name is 'summary'
    
    # Accumulate lengths
    total_text_length += len(text_tokens)
    total_summary_length += len(summary_tokens)
    num_samples += 1

# Calculate averages
avg_text_length = total_text_length / num_samples
avg_summary_length = total_summary_length / num_samples

print(f"Average size of texts: {avg_text_length} tokens")
print(f"Average size of summaries: {avg_summary_length} tokens")


In [1]:
# import torch
# import math
# from tqdm import tqdm
# from transformers import PreTrainedTokenizerFast
# from torch.utils.data import DataLoader
# from torch import nn
# from types import SimpleNamespace as Namespace
# import argparse
# import torch
# from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# from datasets import load_dataset
# from tqdm import tqdm
# import evaluate
# import os
# import pickle
# import numpy as np


# def preprocess_function(examples, tokenizer, max_input_length=512, max_output_length=150):
#     inputs = ["summarize: " + doc for doc in examples["dialogue"]]
#     model_inputs = tokenizer(inputs, max_length=max_input_length, padding="max_length", truncation=True)

#     with tokenizer.as_target_tokenizer():
#         labels = tokenizer(examples["summary"], max_length=max_output_length, padding="max_length", truncation=True)

#     return {
#         "input_ids": model_inputs["input_ids"],
#         "attention_mask": model_inputs["attention_mask"],
#         "labels": labels["input_ids"]
#     }

# def evaluate(model: nn.Module, dataloader: DataLoader, tokenizer: PreTrainedTokenizerFast, args: Namespace):
#     model.eval()

#     sample_count = 0
#     # for batch in dataloader:
#     for batch in tqdm(dataloader, desc="Evaluation", total=min(6, len(dataloader))):

#         if sample_count >= 6:  # Stop after processing 11 samples
#             break

#         input_ids = batch['input_ids'].to(args.device)
#         attention_mask = batch['attention_mask'].to(args.device)
#         labels = batch['labels'].to(args.device)

#         # Adjust labels for model configuration
#         labels = labels.masked_fill(labels == model.config.pad_token_id, -100)

#         with torch.no_grad():
#             output = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

#             # Generate summaries for the current batch
#             generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask,
#                                         max_length=50, num_beams=5, early_stopping=True)

#             generated_summaries = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            
#             # Filter out -100 values before decoding
#             # Replace -100 with tokenizer.pad_token_id for decoding purposes
#             labels_decodable = labels.clone()
#             labels_decodable[labels_decodable == -100] = tokenizer.pad_token_id
#             actual_summaries = tokenizer.batch_decode(labels_decodable, skip_special_tokens=True)

#             for actual, predicted in zip(actual_summaries, generated_summaries):
#                 print(f"Actual Summary: {actual}")
#                 print(f"Predicted Summary: {predicted}\n")
#                 print("---\n")


#         sample_count += input_ids.size(0)  # Update sample_count based on the batch size
 

# # Assuming 'args' and model setup
# args = Namespace()
# args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# model_path = "/users/k/n/kngongiv/Research/private_llm_generation/dialog/stage2/train/gen_model/samsum_stage_dp_1" # Update this to your model's path

# # test_dataset = load_dataset("samsum", split="test")
# test_dataset = load_dataset("samsum", split="test")
# # tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
# tokenizer = AutoTokenizer.from_pretrained("t5-large")

# tokenized_test_dataset = test_dataset.map(lambda examples: preprocess_function(examples, tokenizer), batched=True)

# # Since we are working with PyTorch, let's convert our dataset to a PyTorch DataLoader
# def collate_fn(batch):
#     return {
#         'input_ids': torch.stack([torch.tensor(x['input_ids']) for x in batch]),
#         'attention_mask': torch.stack([torch.tensor(x['attention_mask']) for x in batch]),
#         'labels': torch.stack([torch.tensor(x['labels']) for x in batch])
#     }

# dataloader = DataLoader(tokenized_test_dataset, batch_size=8, collate_fn=collate_fn)
# # print(f"\nMODEL: {args['model_path']}\n" )
# print(f"\nMODEL: {model_path}\n" )


# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print("Device: ", device )

# model.to(device)
# # model.cuda()
# model.eval()

# max_input_length = 250
# # max_output_length = 150
# max_output_length = 150

# # metrics_results = evaluate_model(model, tokenizer, test_dataset, max_input_length, max_output_length)

# # Example call to the modified evaluate function
# evaluate(model, dataloader, tokenizer, args)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.



MODEL: /users/k/n/kngongiv/Research/private_llm_generation/dialog/stage2/train/gen_model/samsum_stage_dp

Device:  cpu


Evaluation:  17%|█▋        | 1/6 [12:44<1:03:40, 764.13s/it]

Actual Summary: Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.
Predicted Summary: Amanda's number last time we were at the park together Larry called Betty's number Amanda asked Larry last time they were at the park together.

---

Actual Summary: Eric and Rob are going to watch a stand-up on youtube.
Predicted Summary: I'm watching some of his stand-ups on youtube. Eric: Machine, Rob, and the machine.

---

Actual Summary: Lenny can't decide which trousers to buy. Bob advised Lenny on that topic. Lenny goes with Bob's advice to pick the trousers that are of best quality.
Predicted Summary: Leny: Babe, can you help me with something? Leny: I have purple pants. Leny: I have purple pants.

---

Actual Summary: Emma will be home soon and she will let Will know.
Predicted Summary: Will: what do you want for dinner tonight Emma: what do you want for dinner tonight Emma Emma Emma Emma Emma Emma Emma Emma Emma Emma Emma Emma Emma Emma Emma Emma Emma Emma w




In [None]:
11

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Starting training from the pretrained model


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/204045 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Starting training from the pretrained model


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/204045 [00:00<?, ? examples/s]

ValueError: Columns ['input_ids', 'attention_mask', 'labels'] not in the dataset. Current columns in the dataset: ['document', 'summary', 'id']

In [3]:
# from datasets import load_dataset
# from transformers import AutoTokenizer

# # Load the dataset
# dataset = load_dataset("xsum", split="train")

# # Initialize the tokenizer
# tokenizer = AutoTokenizer.from_pretrained("t5-large")

# # Initialize counter
# num_samples_less_than_200_tokens = 0

# # Tokenize and count
# for example in dataset:
#     document_tokens = tokenizer.tokenize(example["document"])
#     summary_tokens = tokenizer.tokenize(example["summary"])
    
#     if len(document_tokens) < 200 or len(summary_tokens) < 200:
#         num_samples_less_than_200_tokens += 1

# print(f"Number of samples with texts or summaries less than 200 tokens: {num_samples_less_than_200_tokens}")
# # Number of samples with texts or summaries less than 200 tokens: 204045


In [15]:
from datasets import load_dataset
import re  # Regular expression library for more accurate word splitting

def count_words(text):
    # This function counts words in a given text. You can adjust the regex pattern for different criteria of what constitutes a word.
    return len(re.findall(r'\w+', text))

def count_short_samples(batch):
    global num_samples_less_than_200_words
    for doc, summary in zip(batch["document"], batch["summary"]):
        doc_word_count = count_words(doc)
        summary_word_count = count_words(summary)
        
        if doc_word_count < 300 : #or summary_word_count < 100:
            num_samples_less_than_200_words += 1

    return batch

# Load the dataset
dataset = load_dataset("xsum", split="train")

# Initialize counter
num_samples_less_than_200_words = 0

# Apply the counting function
dataset.map(count_short_samples, batched=True, batch_size=100)

print(f"Number of samples with documents or summaries less than 200 words: {num_samples_less_than_200_words}")


Map:   0%|          | 0/204045 [00:00<?, ? examples/s]

Number of samples with documents or summaries less than 200 words: 100823


In [5]:
from datasets import load_dataset
from transformers import AutoTokenizer

def count_short_samples(batch):
    global num_samples_less_than_200_tokens
    
    # Iterate through each example in the batch
    for doc, summary in zip(batch["document"], batch["summary"]):
        # Tokenize without padding and not converting to tensors
        document_tokens = tokenizer(doc, truncation=True, max_length=512, padding=False)
        summary_tokens = tokenizer(summary, truncation=True, max_length=512, padding=False)
        
        # Check the length of the tokenized input_ids directly
        if len(document_tokens["input_ids"]) <= 300 : #or len(summary_tokens["input_ids"]) < 200:
        
            num_samples_less_than_200_tokens += 1

    return batch

# Load the dataset
dataset = load_dataset("xsum", split="train")

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-large")

# Initialize counter
num_samples_less_than_200_tokens = 0

# Apply the counting function
dataset.map(count_short_samples, batched=True, batch_size=100)

print(f"Number of samples with texts or summaries less than 200 tokens: {num_samples_less_than_200_tokens}")


Map:   0%|          | 0/204045 [00:00<?, ? examples/s]

Number of samples with texts or summaries less than 200 tokens: 67470


In [1]:
# from datasets import load_dataset
# from transformers import AutoTokenizer

# def count_short_samples(batch):
#     global num_samples_less_than_200_tokens
#     # document_tokens = tokenizer(batch["document"], truncation=True, padding="max_length", max_length=512, return_tensors="pt")
#     # summary_tokens = tokenizer(batch["summary"], truncation=True, padding="max_length", max_length=512, return_tensors="pt")
#     document_tokens = tokenizer(batch["document"], truncation=True, max_length=512, return_tensors="pt")
#     summary_tokens = tokenizer(batch["summary"], truncation=True, max_length=512, return_tensors="pt")
    
#     for i in range(len(batch["document"])):
#         print(document_tokens.input_ids.shape)
#         print(summary_tokens.input_ids.shape)
#         print("\n")
#         if document_tokens.input_ids.shape[1] < 200 or summary_tokens.input_ids.shape[1] < 200:
#             num_samples_less_than_200_tokens += 1

#     return batch

# # Load the dataset
# dataset = load_dataset("xsum", split="train")

# # Initialize the tokenizer
# tokenizer = AutoTokenizer.from_pretrained("t5-large")

# # Initialize counter
# num_samples_less_than_200_tokens = 0

# # Apply the counting function
# dataset.map(count_short_samples, batched=True, batch_size=100)

# print(f"Number of samples with texts or summaries less than 200 tokens: {num_samples_less_than_200_tokens}")


In [10]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

from datasets import load_dataset
from torch.utils.data import DataLoader
from private_transformers import PrivacyEngine
import torch.nn.functional as F
# from transformers import AdamW, get_linear_schedule_with_warmup
# from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW, get_linear_schedule_with_warmup
from torch.optim import AdamW  
import numpy as np
import os
import wandb
import spacy

# Load the SpaCy model for sentence tokenization
spacy_model = spacy.load("en_core_web_sm")

from torch.nn.utils.rnn import pad_sequence

# Checks if the document in a single example has 300 or fewer words
def word_count_filter(example):
    return len(example["document"].split()) <= 300

def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(examples["summary"], max_length=150, truncation=True, padding="max_length")

    # Ensure labels are prepared for loss calculation
    labels["input_ids"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in label]
        for label in labels["input_ids"]
    ]

    return {
        "input_ids": model_inputs["input_ids"],
        "attention_mask": model_inputs["attention_mask"],
        "labels": labels["input_ids"]
    }


def find_latest_checkpoint(checkpoint_dir):
    checkpoint_subdirs = [d for d in os.listdir(checkpoint_dir) if os.path.isdir(os.path.join(checkpoint_dir, d))]
    if not checkpoint_subdirs:
        return None
    latest_checkpoint = max(checkpoint_subdirs, key=lambda d: int(d.split('-')[-1]))
    return os.path.join(checkpoint_dir, latest_checkpoint)

def compute_average_length(dataset):
    lengths = dataset["lengths"]  
    average_length = np.mean(lengths)
    print(f"Average token length: {average_length}")

# wandb.init(project="XSum_Summarization", entity="ivolinengong", name="xsum_dp")


# Check if CUDA is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set seeds for reproducibility
torch.manual_seed(42)

# Load the tokenizer
max_input_length = 300 #1024
max_output_length = 100 #256 #250
batch_size = 4
epochs = 5
gradient_accumulation_steps = 32 #10
target_epsilon = 8

# Load XSum dataset
train_dataset = load_dataset("xsum", split="train")
tokenizer = T5Tokenizer.from_pretrained("t5-large")
model = T5ForConditionalGeneration.from_pretrained("t5-large").to(device)

# Checkpoint directories
# checkpoint_dir = f"./dp_results/{wandb.run.name}"
checkpoint_dir = "." #f"./{wandb.run.name}"
os.makedirs(checkpoint_dir, exist_ok=True)

model_tokenizer_path = checkpoint_dir
optimizer_checkpoint_path = os.path.join(checkpoint_dir, "optimizer_and_loss.pth")

# Load model and tokenizer, and optimizer state if available
# model_checkpoint = find_latest_checkpoint(checkpoint_dir)
# if model_checkpoint:
#     print(f"Resuming from checkpoint: {model_checkpoint}")

#     # Load model and tokenizer from the checkpoint
#     model = T5ForConditionalGeneration.from_pretrained(model_checkpoint).to(device)
#     tokenizer = T5Tokenizer.from_pretrained(model_checkpoint, model_max_length=max_input_length)

#     # Load optimizer state
#     if os.path.isfile(optimizer_checkpoint_path):
#         print("Loading optimizer state and best loss from checkpoint")
#         checkpoint = torch.load(optimizer_checkpoint_path, map_location=device)
#         optimizer = AdamW(model.parameters(), lr=5e-5)
#         optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
#         best_loss = checkpoint['best_loss']
# else:
print("Starting training from the pretrained model")
model = T5ForConditionalGeneration.from_pretrained("t5-large").to(device)
tokenizer = T5Tokenizer.from_pretrained("t5-large", model_max_length=max_input_length)

# model = AutoModelForSeq2SeqLM.from_pretrained("t5-large").to(device)
# tokenizer = AutoTokenizer.from_pretrained("t5-large", model_max_length=max_input_length)
optimizer = AdamW(model.parameters(), lr=5e-5)
best_loss = float('inf')

# tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
# tokenized_train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


# Apply the filter function to the dataset
train_dataset = load_dataset("xsum", split="train")
filtered_train_dataset = train_dataset.filter(word_count_filter)
# Apply preprocessing to the filtered dataset
tokenized_train_dataset = filtered_train_dataset.map(preprocess_function, batched=True)
tokenized_train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

print("\nTraining dataset size:", len(tokenized_train_dataset))


# # Inspect a sample
sample = tokenized_train_dataset[0]
print(f"Sample keys: {sample.keys()}")
print(f"Input IDs shape: {sample['input_ids'].shape}")
print(f"Attention mask shape: {sample['attention_mask'].shape}")
print(f"Labels shape: {sample['labels'].shape}")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Starting training from the pretrained model


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Filter:   0%|          | 0/204045 [00:00<?, ? examples/s]

Map:   0%|          | 0/104101 [00:00<?, ? examples/s]


Training dataset size: 104101
Sample keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
Input IDs shape: torch.Size([512])
Attention mask shape: torch.Size([512])
Labels shape: torch.Size([150])


In [11]:
len(train_dataset) #104101

204045

In [14]:
# def word_count_filter(examples):
#     # Returns True for examples with 300 or fewer words
#     return [len(doc.split()) <= 300 for doc in examples["document"]]

def word_count_filter(example):
    return len(example["document"].split()) <= 300

train_dataset = load_dataset("xsum", split="test")

filtered_train_dataset = train_dataset.filter(word_count_filter,)
len(train_dataset), len(filtered_train_dataset)

#########

# # 204017 # 104101
# # Checks if the document in a single example has 300 or fewer words
# def word_count_filter(example):
#     return len(example["document"].split()) <= 300



(11334, 5777)

In [None]:

# Calculate and print average token length
# compute_average_length(tokenized_train_dataset)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Attach the privacy engine to the optimizer
privacy_engine = PrivacyEngine(
    model,
    batch_size=batch_size,
    sample_size=len(tokenized_train_dataset),
    epochs=epochs,
    max_grad_norm=1.0,
    target_epsilon=target_epsilon,
    noise_multiplier=0.1,
    # clipping_mode="ghost",
)
privacy_engine.attach(optimizer)

# Learning rate scheduler
total_steps = len(tokenized_train_dataset) // batch_size * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Data loader
data_loader = DataLoader(tokenized_train_dataset, batch_size=batch_size, shuffle=True)

# Training loop
model.train()
for epoch in range(epochs):
    total_loss = 0
    print(f"\nStarting epoch {epoch + 1}/{epochs}", flush=True)
    for i, batch in enumerate(data_loader):
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits

        # Compute per-example loss
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        loss = F.cross_entropy(shift_logits.permute(0, 2, 1), shift_labels, reduction="none").mean(dim=1)

        # Gradient accumulation
        if (i + 1) % gradient_accumulation_steps == 0:
            optimizer.step(loss=loss)  # Perform parameter update
            optimizer.zero_grad()  # Reset gradients
            scheduler.step()
        else:
            optimizer.virtual_step(loss=loss)  # Accumulate gradients


        # Save the model if this is the best loss so far
        avg_loss = torch.mean(loss).item()
        total_loss += avg_loss

        # Check if the current loss is the best
        if avg_loss < best_loss:
            best_loss = avg_loss

            # Save model and tokenizer using save_pretrained
            model.save_pretrained(checkpoint_dir)
            tokenizer.save_pretrained(checkpoint_dir)

            # Save optimizer state and best_loss separately
            torch.save({
                'optimizer_state_dict': optimizer.state_dict(),
                'best_loss': best_loss,
            }, optimizer_checkpoint_path)

            print(f"New best model saved with loss: {best_loss}")

        # # Log progress
        if i % 10 == 0:
            print(f"Epoch: {epoch + 1}, Step: {i}/{len(data_loader)}, Loss: {avg_loss}", flush=True)

    # Calculate the average training loss and perplexity for the epoch
    avg_train_loss = total_loss / len(data_loader)
    train_perplexity = np.exp(avg_train_loss)
#     wandb.log({"epoch": epoch + 1, "train_loss": avg_train_loss, "train_perplexity": train_perplexity})

# wandb.finish()


In [1]:
from transformers import AutoModel

model_name = "/users/k/n/kngongiv/Research/clean/private_llm_generation/stage_dp/models/xsum_stagedp_epsilons_8"
model = AutoModel.from_pretrained(model_name)
model_size = sum(p.numel() for p in model.parameters())
print(f"Model size (number of parameters): {model_size}")


Model size (number of parameters): 737668096


In [2]:
from transformers import AutoModel

model_name = "/users/k/n/kngongiv/Research/private_llm_generation/dialog/stage_dp/models/xsum_stage_dp"
model = AutoModel.from_pretrained(model_name)
model_size = sum(p.numel() for p in model.parameters())
print(f"Model size (number of parameters): {model_size}")

# 737668096
# 737668096

Model size (number of parameters): 737668096


In [8]:
# %cd /users/k/n/kngongiv/Research/clean/private_llm_generation/stage2/generate_and_filter
# !python main_output.py --device_id=0 --con_domain='news' --con_model_name='xsum_stagedp_epsilons_8' --shard_start=0 --shard_size=100031
