In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorWithPadding

from ns_watermark import *

In [2]:
# Checkpoint to load and the local Hugging Face cache directory that holds it.
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
CACHE_DIR = "/data1/takezawa/huggingface/"

# local_files_only=True: load strictly from CACHE_DIR, never from the network.
# NOTE(review): `dtype=torch.float16` is passed to the tokenizer, not to the
# model below; it looks like it was intended for the model. Left as-is because
# moving it would change the precision the model runs in — confirm intent.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    dtype=torch.float16,
    cache_dir=CACHE_DIR,
    local_files_only=True,
    padding_side='left',
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    cache_dir=CACHE_DIR,
    local_files_only=True,
)

# Reuse the model's pad token id for the tokenizer. If the model config does
# not define one, assigning None would leave the tokenizer without a pad token
# and the later `tokenizer(..., padding=True)` call needs one, so fall back to
# the tokenizer's EOS token id in that case.
pad_token_id = model.config.pad_token_id
if pad_token_id is None:
    pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token_id = pad_token_id

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
def classify(z_score, z=4):
    """Print a verdict on whether a text is watermarked (LLM-generated).

    Args:
        z_score: Detection statistic of the text being judged.
        z: Decision threshold. A score greater than or equal to this value
            is reported as LLM-generated; anything else as human-written.

    Returns:
        None. The verdict is written to stdout.
    """
    verdict = (
        "It is a LLMs-generated text."
        if z_score >= z
        else "It is a human-written text."
    )
    print(verdict)

In [4]:
# Batch of prompts fed to the model; this demo uses a single prompt.
prompt = [
    "Write a 50-word article on 'Benefits of Open-Source in AI research'",
]

In [5]:
# Tokenize the prompt batch into PyTorch tensors (padded to a common length)
# and keep only the token ids; the rest of the encoding is not used below.
encoded_prompt = tokenizer(prompt, padding=True, return_tensors="pt")
input_ids = encoded_prompt["input_ids"]

## NS-Watermark

In [6]:
# Generate a continuation of the prompt with the NS-Watermark applied.
# NOTE(review): this call limits length with `max_length=100`, while the
# un-watermarked baseline cell uses `max_new_tokens=100` — confirm both
# bound the same quantity so the two runs are comparable.
watermark = NecessaryAndSufficientWatermark(gamma=0.0001)
prompt_ids = input_ids.to(model.device)  # put the prompt on the model's device
output_ids = watermark.generate(model, prompt_ids, max_length=100, alpha=1)

# Show the prompt followed by the decoded generation.
print(prompt[0])
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_text)
print("-" * 10)

# Score the generated ids with the watermark detector and report the verdict.
z_score = watermark.compute_z_score(output_ids)[0]
print(f"z-score: {z_score}")
classify(z_score)

Write a 50-word article on 'Benefits of Open-Source in AI research'


Open-source AI research has numerous benefits, including increased collaboration and knowledge sharing among researchers, faster development and deployment of AI models, and improved transparency and reproducibility of results. Additionally, open-source AI research can lead to more operator-friendly and user-friendly AI systems, as well as increased innovation and creativity in the field.
----------
z-score: 10.953146427116755
It is a LLMs-generated text.


## Without Watermark

In [7]:
# Baseline: generate from the same prompt with the model's own `generate`,
# i.e. without the watermark.
# NOTE(review): `length_penalty` is documented as a beam-search setting; no
# beam search is requested here, so it presumably has no effect — confirm.
# NOTE(review): if the checkpoint's generation config enables sampling, this
# output is not reproducible without a fixed seed — confirm.
prompt_ids = input_ids.to(model.device)
full_sequences = model.generate(prompt_ids, max_new_tokens=100, length_penalty=0)

# The returned sequence starts with the prompt tokens; drop them and restore
# the leading batch dimension so only the continuation is decoded and scored.
input_length = input_ids.shape[1]
continuation_ids = full_sequences[0][input_length:]
output_ids = continuation_ids.unsqueeze(0)

# Show the prompt followed by the decoded continuation.
print(prompt[0])
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_text)
print("-" * 10)

# Score the un-watermarked text with the same detector for comparison.
z_score = watermark.compute_z_score(output_ids)[0]
print(f"z-score: {z_score}")
classify(z_score)

Write a 50-word article on 'Benefits of Open-Source in AI research'


Open-source AI research has numerous benefits, including increased collaboration and knowledge sharing among researchers, faster development and deployment of AI models, and improved transparency and reproducibility of results. Additionally, open-source AI research can lead to more diverse and innovative solutions, as well as increased accessibility to AI technology for individuals and organizations.
----------
z-score: -0.0877540316852035
It is a human-written text.
