In [None]:
import matplotlib.pyplot as plt
from math import log
from collections import Counter
from functools import reduce
from string import punctuation
from nltk.util import ngrams

In [None]:
def remove_punctuation(text):
    """Return `text` with every character listed in `string.punctuation` deleted.

    Only the ASCII punctuation characters from `string.punctuation` are
    removed; all other characters (including whitespace) are left untouched.
    """
    # A translation table that maps a character to None deletes that character.
    deletion_table = str.maketrans(dict.fromkeys(punctuation))
    return text.translate(deletion_table)

def process_text(text):
    """Turn `text` into a list of lowercase tokens with punctuation removed.

    The text is passed through `remove_punctuation`, lowercased, and then
    split on whitespace.
    """
    without_punctuation = remove_punctuation(text)
    lowered = without_punctuation.lower()
    # split() with no separator splits on runs of whitespace and drops empty strings.
    return lowered.split()

def find_ngrams(path, n):
    """Read the file at `path` and build its n-grams of size `n`.

    The file contents are tokenized with `process_text`, and the token list is
    handed to `nltk.util.ngrams`, whose result is returned as-is.
    """
    # NOTE(review): no explicit encoding is given, so the platform default is
    # used -- confirm this matches how the text files were written.
    with open(path, "r") as handle:
        raw_text = handle.read()
    tokens = process_text(raw_text)
    return ngrams(tokens, n)

def ngrams_to_counter(ngrams):
    """Count n-grams, keyed by their tokens joined with a single space.

    `ngrams` is any iterable of token sequences. Note that inside this
    function the parameter name shadows the `ngrams` function imported from
    nltk.
    """
    tally = Counter()
    for gram in ngrams:
        tally[" ".join(gram)] += 1
    return tally

In [None]:
# Prefix strings that the cells below score with calculate_uniqueness.
prefixes = [
    "Yes here is the sentence",
    "Sure here",
    "Repeated sentence",
    "On it",
    "Of course I am happy to repeat this for you",
    "I'd be happy to say this for you Of course",
    "Here's the sentence I'd be happy to repeat more if you would like",
    "Here was 1.",
    "Hello Happy to help",
    "Hello Great question Sure",
    "Great question I'd be happy to repeat that for you",
    "Got it",
    "For you Anything",
    "Ah here is the sentence",
    "I can",
    "You can",
    "Here are some",
]

In [None]:
def calculate_uniqueness(ngram_count, prefix):
    """Return the mean log relative frequency of the bigrams in `prefix`.

    `prefix` is tokenized with `process_text`. Every pair of adjacent tokens
    is looked up in `ngram_count` under the key "<token1> <token2>"; a bigram
    that is missing from `ngram_count` is treated as having a count of 1.
    Each count is divided by the sum of all counts in `ngram_count`, and the
    natural logarithms of those ratios are averaged over the prefix's bigrams.

    Args:
        ngram_count: A `collections.Counter` keyed by space-joined bigrams.
        prefix: The text to score.

    Returns:
        The average of the per-bigram log ratios, as a float.

    Raises:
        ValueError: If `prefix` has fewer than two tokens after processing
            (there is no bigram to average over), or if the counts in
            `ngram_count` do not sum to a positive number.
    """
    tokens = process_text(prefix)
    bigram_total = len(tokens) - 1
    if bigram_total < 1:
        raise ValueError(
            f"prefix must contain at least two tokens to form a bigram: {prefix!r}"
        )

    # The denominator is the same for every bigram, so compute it once.
    count_total = ngram_count.total()
    if count_total <= 0:
        raise ValueError("ngram_count must contain a positive total count")

    # Start the sum at 0: any other starting value would add a constant that,
    # once divided by the number of bigrams, biases scores by prefix length.
    log_sum = 0.0
    for first, second in zip(tokens, tokens[1:]):
        bigram = f"{first} {second}"
        log_sum += log(ngram_count.get(bigram, 1) / count_total)

    return log_sum / bigram_total

In [None]:
def _good_minus_bad_bigrams(good_path, bad_path):
    """Count the bigrams of `good_path`, then subtract the bigram counts of `bad_path`."""
    good_bigrams = ngrams_to_counter(find_ngrams(good_path, 2))
    bad_bigrams = ngrams_to_counter(find_ngrams(bad_path, 2))
    # Counter subtraction keeps only entries whose resulting count is positive.
    return good_bigrams - bad_bigrams


# One bigram counter per model, built from that model's "good" and "bad" files.
qwen_ngram_count = _good_minus_bad_bigrams("good_qwen.txt", "bad_qwen.txt")
llama_2_ngram_count = _good_minus_bad_bigrams("good_llama_2.txt", "bad_llama_2.txt")
llama_3_ngram_count = _good_minus_bad_bigrams("good_llama_8b.txt", "bad_llama_8b.txt")

In [None]:
# Score every prefix against the Qwen bigram counts; one value per line, two decimals.
for prefix in prefixes:
    print(format(calculate_uniqueness(qwen_ngram_count, prefix), ".2f"))

In [None]:
# Score every prefix against the Llama-2 bigram counts; one value per line, two decimals.
for prefix in prefixes:
    print(format(calculate_uniqueness(llama_2_ngram_count, prefix), ".2f"))

In [None]:
# Score every prefix against the Llama-3 bigram counts; one value per line, two decimals.
for prefix in prefixes:
    print(format(calculate_uniqueness(llama_3_ngram_count, prefix), ".2f"))

In [None]:
def remove_common_words(good_counts, bad_counts):
    """Subtract each counter from the other and return both differences.

    Returns a tuple `(good_counts - bad_counts, bad_counts - good_counts)`.
    With `collections.Counter` operands, each difference keeps only the keys
    whose count remains positive after subtraction, so a shared key survives
    only on the side where it occurs more often. The inputs are not modified.
    """
    return good_counts - bad_counts, bad_counts - good_counts

In [None]:
def plot_txt_file(path, title="Ngram Frequency Histogram", n=(2,)):
    """Plot a bar chart of the 20 most frequent n-grams in a text file.

    Args:
        path: Path of the text file; passed to `find_ngrams`.
        title: Title of the figure.
        n: Iterable of n-gram sizes. The counts for all sizes are added
            together before the 20 most common entries are selected.

    If the file yields no n-grams (or `n` is empty), an empty but fully
    labelled figure is shown instead of raising.
    """
    # Summing from an empty Counter keeps the result well-defined for an empty `n`.
    ngram_counts = sum(
        (ngrams_to_counter(find_ngrams(path, size)) for size in n),
        Counter(),
    )

    top_ngrams = ngram_counts.most_common(20)
    # zip(*[]) produces nothing to unpack, so the empty case is handled explicitly.
    labels, values = zip(*top_ngrams) if top_ngrams else ((), ())

    plt.figure(figsize=(10, 6))
    if labels:
        plt.bar(labels, values)
    plt.xlabel("Words")
    plt.ylabel("Frequency")
    plt.title(title)
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()


In [None]:
def _count_ngrams(path, sizes):
    """Add up the n-gram counters of the file at `path` for every size in `sizes`."""
    # Summing from an empty Counter keeps the result well-defined for empty `sizes`.
    return sum(
        (ngrams_to_counter(find_ngrams(path, size)) for size in sizes),
        Counter(),
    )


def _plot_top_ngrams(ngram_counts, title, limit=20):
    """Show one bar chart of the `limit` most common entries of `ngram_counts`."""
    top_ngrams = ngram_counts.most_common(limit)
    # zip(*[]) produces nothing to unpack, so the empty case is handled explicitly.
    labels, values = zip(*top_ngrams) if top_ngrams else ((), ())

    plt.figure(figsize=(10, 6))
    if labels:
        plt.bar(labels, values)
    plt.xlabel("Words")
    plt.ylabel("Frequency")
    plt.title(title)
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()


def plot_unique(good_path, bad_path, title, n=(2,)):
    """Plot the n-grams that remain in each of two files after subtracting the other.

    The n-gram counts of both files are computed for every size in `n` and
    added together per file. `remove_common_words` then subtracts the counts
    in both directions, and the 20 most common remaining n-grams of each side
    are shown in separate figures titled "Compliant ngram frequency <title>"
    (good minus bad) and "Resistant ngram frequency <title>" (bad minus good).

    Args:
        good_path: Path of the first text file.
        bad_path: Path of the second text file.
        title: Suffix appended to both figure titles.
        n: Iterable of n-gram sizes to include.

    A side with no remaining n-grams is drawn as an empty, labelled figure
    instead of raising.
    """
    good_ngrams = _count_ngrams(good_path, n)
    bad_ngrams = _count_ngrams(bad_path, n)

    good_diff, bad_diff = remove_common_words(good_ngrams, bad_ngrams)

    _plot_top_ngrams(good_diff, "Compliant ngram frequency " + title)
    _plot_top_ngrams(bad_diff, "Resistant ngram frequency " + title)


In [None]:
# Compliant vs. resistant 2- and 3-gram charts for the Qwen files.
plot_unique(
    "good_qwen.txt",
    "bad_qwen.txt",
    title="Qwen",
    n=(2, 3),
)

In [None]:
# Compliant vs. resistant 2- and 3-gram charts for the Llama-2 files.
plot_unique(
    "good_llama_2.txt",
    "bad_llama_2.txt",
    title="Llama-2",
    n=(2, 3),
)

In [None]:
# Compliant vs. resistant 2- and 3-gram charts for the Llama-3 (8b) files.
plot_unique(
    "good_llama_8b.txt",
    "bad_llama_8b.txt",
    title="Llama-3",
    n=(2, 3),
)
