In [None]:
!pip install -r requirements.txt

Collecting fastapi==0.89.1 (from -r requirements.txt (line 1))
  Downloading fastapi-0.89.1-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.8/55.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gradio==3.16.2 (from -r requirements.txt (line 2))
  Downloading gradio-3.16.2-py3-none-any.whl (14.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.2/14.2 MB[0m [31m97.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.26.0 (from -r requirements.txt (line 3))
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m77.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn==0.20.0 (from -r requirements.txt (line 4))
  Downloading uvicorn-0.20.0-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting starlette==0

In [None]:
!pip install sentencepiece



In [None]:
import time
import torch
import itertools
import math
import numpy as np
import random
import re
import transformers
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from transformers import BertTokenizer, BertForMaskedLM
from transformers import RobertaTokenizer, RobertaForMaskedLM
from transformers import pipeline
from transformers import T5Tokenizer
from transformers import AutoTokenizer, BartForConditionalGeneration

from collections import OrderedDict

from scipy.stats import norm
from difflib import SequenceMatcher
from multiprocessing.pool import ThreadPool

In [None]:
def similar(a, b):
    """Return the difflib similarity ratio of sequences ``a`` and ``b``.

    The value is ``SequenceMatcher(None, a, b).ratio()``: a float between 0
    and 1, where 1.0 means the two sequences are identical.
    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

def normCdf(x):
    """Evaluate ``scipy.stats.norm.cdf`` (default loc/scale) at ``x``."""
    cumulative = norm.cdf(x)
    return cumulative

def likelihoodRatio(x, y):
    """Return ``normCdf(x) / normCdf(y)``: the ratio of the two CDF values."""
    numerator = normCdf(x)
    denominator = normCdf(y)
    return numerator / denominator

# Seed PyTorch's and NumPy's global random number generators with a fixed
# value (0) when this cell runs. GPT2PPLV2.mask() re-seeds both again on every call.
torch.manual_seed(0)
np.random.seed(0)

# find a better way to abstract the class
class GPT2PPLV2:
    def __init__(self, device="cuda", model_id="gpt2-medium"):
        """Load the GPT-2 scoring model and the T5 mask-filling model.

        Args:
            device: torch device string that both models are moved to.
            model_id: GPT-2 checkpoint name handed to ``from_pretrained``
                (for example "gpt2-medium", "gpt2-large" or "gpt2-xl").
        """
        self.device = device
        self.model_id = model_id

        # Causal LM (and its tokenizer) used to score text.
        scoring_model = GPT2LMHeadModel.from_pretrained(model_id)
        self.model = scoring_model.to(device)
        self.tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

        # Window size and step used by getLogLikelihood / getPPL_1.
        self.max_length = self.model.config.n_positions
        self.stride = 51
        # Score boundary used by getVerdict / call_1_1.
        self.threshold = 0.7

        # T5-large fills the masked spans; it is moved to `device` and then
        # cast to half precision.
        t5_checkpoint = "t5-large"
        t5_model = transformers.AutoModelForSeq2SeqLM.from_pretrained(t5_checkpoint)
        self.t5_model = t5_model.to(device).half()
        self.t5_tokenizer = T5Tokenizer.from_pretrained(t5_checkpoint, model_max_length=512)

    def apply_extracted_fills(self, masked_texts, extracted_fills):
        texts = []
        for idx, (text, fills) in enumerate(zip(masked_texts, extracted_fills)):
            tokens = list(re.finditer("<extra_id_\d+>", text))
            if len(fills) < len(tokens):
                continue

            offset = 0
            for fill_idx in range(len(tokens)):
                start, end = tokens[fill_idx].span()
                text = text[:start+offset] + fills[fill_idx] + text[end+offset:]
                offset = offset - (end - start) + len(fills[fill_idx])
            texts.append(text)

        return texts

    def unmasker(self, text, num_of_masks):
        num_of_masks = max(num_of_masks)
        stop_id = self.t5_tokenizer.encode(f"<extra_id_{num_of_masks}>")[0]
        tokens = self.t5_tokenizer(text, return_tensors="pt", padding=True)
        for key in tokens:
            tokens[key] = tokens[key].to(self.device)

        output_sequences = self.t5_model.generate(**tokens, max_length=512, do_sample=True, top_p=0.96, num_return_sequences=1, eos_token_id=stop_id)
        results = self.t5_tokenizer.batch_decode(output_sequences, skip_special_tokens=False)

        texts = [x.replace("<pad>", "").replace("</s>", "").strip() for x in results]
        pattern = re.compile("<extra_id_\d+>")
        extracted_fills = [pattern.split(x)[1:-1] for x in texts]
        extracted_fills = [[y.strip() for y in x] for x in extracted_fills]

        perturbed_texts = self.apply_extracted_fills(text, extracted_fills)

        return perturbed_texts


    def __call__(self, *args):
        version = args[-1]
        sentence = args[0]
        if version == "v1.1":
            return self.call_1_1(sentence, args[1])
        elif version == "v1":
            return self.call_1(sentence)
        else:
            return "Model version not defined"

##################################################
#  Version 1.1 apis
###############################################

    def replaceMask(self, text, num_of_masks):
        with torch.no_grad():
            list_generated_texts = self.unmasker(text, num_of_masks)

        return list_generated_texts

    def isSame(self, text1, text2):
        return text1 == text2

    # Adapted from https://github.com/eric-mitchell/detect-gpt
    def maskRandomWord(self, text, ratio):
        span = 2
        tokens = text.split(' ')
        mask_string = '<<<mask>>>'

        n_spans = ratio//(span + 2)

        n_masks = 0
        while n_masks < n_spans:
            start = np.random.randint(0, len(tokens) - span)
            end = start + span
            search_start = max(0, start - 1)
            search_end = min(len(tokens), end + 1)
            if mask_string not in tokens[search_start:search_end]:
                tokens[start:end] = [mask_string]
                n_masks += 1

        # replace each occurrence of mask_string with <extra_id_NUM>, where NUM increments
        num_filled = 0
        for idx, token in enumerate(tokens):
            if token == mask_string:
                tokens[idx] = f'<extra_id_{num_filled}>'
                num_filled += 1
        assert num_filled == n_masks, f"num_filled {num_filled} != n_masks {n_masks}"
        text = ' '.join(tokens)
        return text, n_masks

    def multiMaskRandomWord(self, text, ratio, n):
        mask_texts = []
        list_num_of_masks = []
        for i in range(n):
            mask_text, num_of_masks = self.maskRandomWord(text, ratio)
            mask_texts.append(mask_text)
            list_num_of_masks.append(num_of_masks)
        return mask_texts, list_num_of_masks

    def getGeneratedTexts(self, args):
        original_text = args[0]
        n = args[1]
        texts = list(re.finditer("[^\d\W]+", original_text))
        ratio = int(0.3 * len(texts))

        mask_texts, list_num_of_masks = self.multiMaskRandomWord(original_text, ratio, n)
        list_generated_sentences = self.replaceMask(mask_texts, list_num_of_masks)
        return list_generated_sentences

    def mask(self, original_text, text, n=2, remaining=100):
        """
        text: string representing the sentence
        n: top n mask-filling to be choosen
        remaining: The remaining slots to be fill
        """

        if remaining <= 0:
            return []

        torch.manual_seed(0)
        np.random.seed(0)
        start_time = time.time()
        out_sentences = []
        pool = ThreadPool(remaining//n)
        out_sentences = pool.map(self.getGeneratedTexts, [(original_text, n) for _ in range(remaining//n)])
        out_sentences = list(itertools.chain.from_iterable(out_sentences))
        end_time = time.time()

        return out_sentences

    def getVerdict(self, score):
        if score < self.threshold:
            return "This text is most likely written by an Human"
        else:
            return "This text is most likely generated by an A.I."

    def getScore(self, sentence):
        original_sentence = sentence
        sentence_length = len(list(re.finditer("[^\d\W]+", sentence)))
        # remaining = int(min(max(100, sentence_length * 1/9), 200))
        remaining = 50
        sentences = self.mask(original_sentence, original_sentence, n=50, remaining=remaining)

        real_log_likelihood = self.getLogLikelihood(original_sentence)

        generated_log_likelihoods = []
        for sentence in sentences:
            generated_log_likelihoods.append(self.getLogLikelihood(sentence).cpu().detach().numpy())

        if len(generated_log_likelihoods) == 0:
            return -1

        generated_log_likelihoods = np.asarray(generated_log_likelihoods)
        mean_generated_log_likelihood = np.mean(generated_log_likelihoods)
        std_generated_log_likelihood = np.std(generated_log_likelihoods)

        diff = real_log_likelihood - mean_generated_log_likelihood

        score = diff/(std_generated_log_likelihood)

        return float(score), float(diff), float(std_generated_log_likelihood)

    def call_1_1(self, sentence, chunk_value):
        sentence = re.sub("\[[0-9]+\]", "", sentence) # remove all the [numbers] cause of wiki

        words = re.split("[ \n]", sentence)

        # if len(words) < 100:
        #   return {"status": "Please input more text (min 100 words)"}, "Please input more text (min 100 characters)", None

        groups = len(words) // chunk_value + 1
        lines = []
        stride = len(words) // groups + 1
        for i in range(0, len(words), stride):
            start_pos = i
            end_pos = min(i+stride, len(words))

            selected_text = " ".join(words[start_pos:end_pos])
            selected_text = selected_text.strip()
            if selected_text == "":
                continue

            lines.append(selected_text)

        # sentence by sentence
        offset = ""
        scores = []
        probs = []
        final_lines = []
        labels = []
        for line in lines:
            if re.search("[a-zA-Z0-9]+", line) == None:
                continue
            score, diff, sd = self.getScore(line)
            if score == -1 or math.isnan(score):
                continue
            scores.append(score)

            final_lines.append(line)
            if score > self.threshold:
                labels.append(1)
                prob = "{:.2f}%\n(A.I.)".format(normCdf(abs(self.threshold - score)) * 100)               #score >threshold, label=1, AI
                probs.append(prob)
            else:
                labels.append(0)
                prob = "{:.2f}%\n(Human)".format(normCdf(abs(self.threshold - score)) * 100)              #score <threshold, label=0, Human
                probs.append(prob)

        mean_score = sum(scores)/len(scores)

        mean_prob = normCdf(abs(self.threshold - mean_score)) * 100
        label = 0 if mean_score > self.threshold else 1                                                   #label=0, AI; label =1, human
        print(f"probability for {'A.I.' if label == 0 else 'Human'}:", "{:.2f}%".format(mean_prob)," Mean Score: ", "{:.2f}".format(mean_score))
        return {"prob": "{:.2f}%".format(mean_prob), "label": label}, self.getVerdict(mean_score)


    def getLogLikelihood(self,sentence):
        encodings = self.tokenizer(sentence, return_tensors="pt")
        seq_len = encodings.input_ids.size(1)

        nlls = []
        prev_end_loc = 0
        for begin_loc in range(0, seq_len, self.stride):
            end_loc = min(begin_loc + self.max_length, seq_len)
            trg_len = end_loc - prev_end_loc
            input_ids = encodings.input_ids[:, begin_loc:end_loc].to(self.device)
            target_ids = input_ids.clone()
            target_ids[:, :-trg_len] = -100

            with torch.no_grad():
                outputs = self.model(input_ids, labels=target_ids)

                neg_log_likelihood = outputs.loss * trg_len

            nlls.append(neg_log_likelihood)

            prev_end_loc = end_loc
            if end_loc == seq_len:
                break
        return -1 * torch.stack(nlls).sum() / end_loc

################################################
#  Version 1 apis
###############################################

    def call_1(self, sentence):
        """
        Takes in a sentence split by full stop
p        and print the perplexity of the total sentence
        split the lines based on full stop and find the perplexity of each sentence and print
        average perplexity
        Burstiness is the max perplexity of each sentence
        """
        results = OrderedDict()

        total_valid_char = re.findall("[a-zA-Z0-9]+", sentence)
        total_valid_char = sum([len(x) for x in total_valid_char]) # finds len of all the valid characters a sentence

        # if total_valid_char < 100:
        #    return {"status": "Please input more text (min 100 characters)"}, "Please input more text (min 100 characters)"

        lines = re.split(r'(?<=[.?!][ \[\(])|(?<=\n)\s*',sentence)
        lines = list(filter(lambda x: (x is not None) and (len(x) > 0), lines))

        ppl = self.getPPL_1(sentence)
        print(f"Perplexity {ppl}")
        results["Perplexity"] = ppl

        offset = ""
        Perplexity_per_line = []
        for i, line in enumerate(lines):
            if re.search("[a-zA-Z0-9]+", line) == None:
                continue
            if len(offset) > 0:
                line = offset + line
                offset = ""
            # remove the new line pr space in the first sentence if exists
            if line[0] == "\n" or line[0] == " ":
                line = line[1:]
            if line[-1] == "\n" or line[-1] == " ":
                line = line[:-1]
            elif line[-1] == "[" or line[-1] == "(":
                offset = line[-1]
                line = line[:-1]
            ppl = self.getPPL_1(line)
            Perplexity_per_line.append(ppl)
        print(f"Perplexity per line {sum(Perplexity_per_line)/len(Perplexity_per_line)}")
        results["Perplexity per line"] = sum(Perplexity_per_line)/len(Perplexity_per_line)

        print(f"Burstiness {max(Perplexity_per_line)}")
        results["Burstiness"] = max(Perplexity_per_line)

        out, label = self.getResults_1(results["Perplexity per line"])
        results["label"] = label

        return results, out

    def getPPL_1(self,sentence):
        encodings = self.tokenizer(sentence, return_tensors="pt")
        seq_len = encodings.input_ids.size(1)

        nlls = []
        likelihoods = []
        prev_end_loc = 0
        for begin_loc in range(0, seq_len, self.stride):
            end_loc = min(begin_loc + self.max_length, seq_len)
            trg_len = end_loc - prev_end_loc
            input_ids = encodings.input_ids[:, begin_loc:end_loc].to(self.device)
            target_ids = input_ids.clone()
            target_ids[:, :-trg_len] = -100

            with torch.no_grad():
                outputs = self.model(input_ids, labels=target_ids)
                neg_log_likelihood = outputs.loss * trg_len
                likelihoods.append(neg_log_likelihood)

            nlls.append(neg_log_likelihood)

            prev_end_loc = end_loc
            if end_loc == seq_len:
                break
        ppl = int(torch.exp(torch.stack(nlls).sum() / end_loc))
        return ppl

    def getResults_1(self, threshold):
        if threshold < 60:
            label = 0
            return "The Text is generated by AI.", label
        elif threshold < 80:
            label = 0
            return "The Text is most probably contain parts which are generated by AI. (require more text for better Judgement)", label
        else:
            label = 1
            return "The Text is written by Human.", label

In [None]:
# Build the detector with its defaults (device="cuda", model_id="gpt2-medium").
# The constructor loads the GPT-2 checkpoint and t5-large via from_pretrained.
gpt2_ppl2= GPT2PPLV2()

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

In [None]:

# Sample passages that the cells below feed to the detector.
# First group: sentence1 .. sentence4.
sentence1= ("Tim, I want you to know that I'm here for you, and I see how much stress you've been under lately. It's completely understandable to feel overwhelmed in today's fast-paced world, and it's okay to feel the way you do. Remember, experiencing stress doesn't mean you're not handling things well. It's a natural response, and everyone goes through it at some point. I've noticed that when things start to pile up, it can feel like you're carrying a huge weight on your shoulders. It's important to remember that you don't have to carry it all alone. I'm here to help, whether you need someone to talk to or need help with anything specific. Just talking about what's on your mind can sometimes lighten the load. It's like letting some air out of a balloon that's about to burst. And you know what? It's perfectly fine if you can't articulate everything you're feeling right now. Sometimes stress is like a tangle of thoughts, and untangling them takes time. Have you tried any relaxation techniques? I know it might sound a bit cliché, but activities like deep breathing, meditation, or even a simple walk outside can make a significant difference. If you want, we can try some of these techniques together. It could be a good way to take a break from the stress and focus on the present moment.")

sentence2 = ("Let's start by debunking a common myth: Confidence is not an inherent trait; it's a skill, and like any skill, it can be developed. You weren't born knowing how to ride a bike, were you? No, you learned it. Similarly, confidence is something you cultivate.But why is confidence so crucial? Imagine a world where Thomas Edison wasnt confident enough to pursue his ideas, where Rosa Parks didnt have the confidence to stand up for her rights, where Malala Yousafzai didnt believe in her voice enough to speak out for girls education. Confidence is the fuel for change, both within ourselves and in the world.")

sentence3 = ("Nestled in a picturesque setting where modern architecture seamlessly blends with lush greenery, the campus pulsates with the vibrant energy of student life. The air is infused with a sense of intellectual curiosity and academic rigor, as students from diverse backgrounds converge in this melting pot of cultures and ideas.")

sentence4 =("Welcome to Washington, D.C., a city where history and modernity converge in a landscape dotted with iconic monuments, sprawling museums, and centers of political power. As the capital of the United States, D.C. is more than just the seat of government; it's a living, breathing symbol of the American story.")


# Second group: sentence1_1 .. sentence1_5.
sentence1_1=( "Secondly, another reason in reality is that there are a lot of single lane roads on campus. While gaining the distance of each node via Google Map, the problem was gotten rid of, and we can directly get the distance information considering these real traffic limitations.")

sentence1_2=("We can learn that it might not be more efficient to set more vans for food delivery, although we might think adding more work force or vans would be beneficial. Considering this extreme case, if we send one van to every dot, it is obvious that this is not an efficient way.")

sentence1_3=("As mentioned before, in this scenario, we only take an example of Node 8 and do the experiment for the options of this node 8. We can not conclude that one-van delivery must be more efficient than two-van delivery, and it would take more effort and workload many times, if we would like to figure this out.")

sentence1_4=("I put the car in park. It was a beautiful evening. The sun was slowly decending towards the hilltops around the bay, leaving the sky in a brilliant pink-orange hue. As I looked up through the sunroof at the ocean of sky above me, I took a deep breath and exhaled slowly, turning my gaze towards the horizon. I left my car on the side of the road halfway accross the bridge as other people wizzed past, going about their daily routine. A break in the traffic allowed me to cross to the other side of the bridge, facing the sunset.")

sentence1_5=("Baffled, bowled over, mystified, confounded. There are not enough words in the English language to describe how I felt when I saw the video of a gleaming spaceship float lazily, like a ballon in the sky, and then slingshot at impossible speeds up, up, and away. President Simmons explained to me the situation. An alien envoy had come to earth looking for a fugitive that escaped their highest security prison. They had tracked him to earth. I was being assigned to the case on account of my sterling reputation as a US Marshall. ")

In [None]:
# Character count of every sample passage, printed on one line.
print(*map(len, (
    sentence1, sentence2, sentence3, sentence4,
    sentence1_1, sentence1_2, sentence1_3, sentence1_4, sentence1_5,
)))

672 1291 606 324 271 273 306 529 530


In [None]:
# Run the v1.1 (perturbation based) detector on every sample and collect
# [probability string, label] per sample.
out = []
for s in [sentence1, sentence2,sentence3,sentence4, sentence1_1,sentence1_2,sentence1_3,sentence1_4,sentence1_5]:
    # The character count is passed as chunk_value (a word count); for
    # ordinary prose that exceeds the number of words, so each sample is
    # scored as one chunk.
    res = gpt2_ppl2(s, len(s), "v1.1")
    # res is (summary dict, verdict sentence); only the summary is kept.
    out.append([res[0]['prob'], res[0]['label']])
out

probability for A.I.: 59.28%  Mean Score:  0.93
probability for A.I.: 60.60%  Mean Score:  0.97
probability for A.I.: 56.49%  Mean Score:  0.86
probability for A.I.: 50.94%  Mean Score:  0.72
probability for Human: 87.84%  Mean Score:  -0.47
probability for Human: 94.31%  Mean Score:  -0.88
probability for Human: 72.72%  Mean Score:  0.10
probability for Human: 69.33%  Mean Score:  0.19
probability for Human: 65.62%  Mean Score:  0.30


[['59.28%', 0],
 ['60.60%', 0],
 ['56.49%', 0],
 ['50.94%', 0],
 ['87.84%', 1],
 ['94.31%', 1],
 ['72.72%', 1],
 ['69.33%', 1],
 ['65.62%', 1]]

In [None]:
# Run the v1 (perplexity based) detector on every sample; call_1 prints the
# perplexity, the mean per-line perplexity and the burstiness itself.
for s in [sentence1, sentence2,sentence3,sentence4, sentence1_1,sentence1_2,sentence1_3,sentence1_4,sentence1_5]:
    res = gpt2_ppl2(s, "v1")

Perplexity 17
Perplexity per line 61.57142857142857
Burstiness 229
Perplexity 9
Perplexity per line 17.3125
Burstiness 52
Perplexity 11
Perplexity per line 31.5
Burstiness 85
Perplexity 18
Perplexity per line 23.5
Burstiness 24
Perplexity 78
Perplexity per line 102.0
Burstiness 162
Perplexity 51
Perplexity per line 62.0
Burstiness 79
Perplexity 45
Perplexity per line 48.5
Burstiness 58
Perplexity 16
Perplexity per line 25.0
Burstiness 45
Perplexity 32
Perplexity per line 77.33333333333333
Burstiness 145
