https://github.com/ahans30/Binoculars

In [None]:
!git clone https://github.com/ahans30/Binoculars.git
%cd Binoculars
!pip install -e .
!pip install datasets

In [None]:
from binoculars import Binoculars

# Build the detector once with its default constructor arguments; the scoring
# helpers defined later in this notebook reuse this module-level `bino` object.
bino = Binoculars()

# ChatGPT (GPT-4) output when prompted with "Can you write a few sentences about a capybara that is an astrophysicist?"
sample_string = '''Dr. Capy Cosmos, a capybara unlike any other, astounded the scientific community with his
groundbreaking research in astrophysics. With his keen sense of observation and unparalleled ability to interpret
cosmic data, he uncovered new insights into the mysteries of black holes and the origins of the universe. As he
peered through telescopes with his large, round eyes, fellow researchers often remarked that it seemed as if the
stars themselves whispered their secrets directly to him. Dr. Cosmos not only became a beacon of inspiration to
aspiring scientists but also proved that intellect and innovation can be found in the most unexpected of creatures.'''

# Trailing comments are the outputs recorded by the author for this sample.
print(bino.compute_score(sample_string))  # 0.75661373
print(bino.predict(sample_string))  # 'Most likely AI-Generated'

In [None]:
# Quick smoke test on a short ad-hoc string; the trailing comments show the
# outputs the author recorded for it.
temp = "ok is this working"
print(bino.compute_score(temp))  # 1.1117020845413208
print(bino.predict(temp))  # Most likely human-generated

In [None]:
import os
from datasets import load_dataset, load_from_disk

# Location of the on-disk copy of the Wikipedia dataset.
SAVE_PATH = '/content/data/wiki_dataset'

def get_wikipedia_dataset():
    """Return the "wikipedia" / "20220301.en" dataset, caching it at SAVE_PATH.

    If SAVE_PATH already exists the dataset is read back with
    `load_from_disk`; otherwise it is fetched with `load_dataset`, written to
    SAVE_PATH with `save_to_disk`, and then returned.

    Returns:
        Whatever `load_from_disk` / `load_dataset` returned.
    """
    if os.path.exists(SAVE_PATH):
        print("Loading dataset from disk...")
        dataset = load_from_disk(SAVE_PATH)
    else:
        print("Dataset not found on disk. Downloading from Hugging Face...")
        # Derive the parent directory from SAVE_PATH so the two can never
        # drift apart when SAVE_PATH is changed.
        parent_dir = os.path.dirname(SAVE_PATH)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Download and load the dataset
        dataset = load_dataset("wikipedia", "20220301.en")

        # Persist it so later runs take the fast path above.
        print("Saving dataset to disk...")
        dataset.save_to_disk(SAVE_PATH)

    print("Dataset is ready!")
    return dataset

In [None]:
# Load the Wikipedia dataset from the on-disk cache, or download and cache it
# when no cached copy exists yet.
legacy_dataset = get_wikipedia_dataset()


In [None]:
import json
from datasets import Dataset, DatasetDict

# Read the scraped-article records; the loop below treats `data` as a list of dicts.
with open("/content/scraped_wiki_articles_2_2025.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Rename each record's "content" key to "text", the field that
# extract_title_text_and_url_legacy() reads.
for entry in data:
    entry["text"] = entry.pop("content")

dataset = Dataset.from_list(data)

# Wrap the records in a single "train" split, the split the scoring helpers index.
new_articles_dataset = DatasetDict({"train": dataset})

# Show both dataset summaries side by side (requires `legacy_dataset` from the earlier cell).
print(new_articles_dataset)
print(legacy_dataset)


In [None]:
# Relies on `json`, `Dataset` and `DatasetDict` imported by an earlier cell.
with open("/content/scraped_wiki_articles_8_2023.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Rename each record's "content" key to "text", the field that
# extract_title_text_and_url_legacy() reads.
for entry in data:
    entry["text"] = entry.pop("content")

dataset = Dataset.from_list(data)

# Wrap the records in a single "train" split, the split the scoring helpers index.
august_2023_dataset = DatasetDict({"train": dataset})
print(august_2023_dataset)

In [9]:
import json
import re

# One list of (score, index) tuples is appended here per process_files() call.
all_scores = []
# Running totals of predictions; ai_score() only ever increments them, so they
# accumulate across every dataset scored in this kernel session.
non_ai = 0
ai = 0
def ai_score(lines, truncate_to=5000):
    """Score every text in `lines` with the module-level `bino` detector.

    Each text is whitespace-normalised and cut to its first `truncate_to`
    words, then passed to `bino.compute_score` and `bino.predict`. The score,
    index and prediction are printed as they are produced.

    Args:
        lines: sequence of strings to score.
        truncate_to: maximum number of whitespace-separated words kept per text.

    Returns:
        list of `(score, index)` tuples, one per input, in input order.

    Side effects:
        Increments the global `ai` counter when the prediction string contains
        "AI", otherwise increments the global `non_ai` counter.
    """
    global non_ai, ai
    print(f"number of lines: {len(lines)}")
    scores = []
    for ind, line in enumerate(lines):
        print(len(line))  # length in characters of the untruncated text

        # Build the truncated text once and reuse it for both detector calls.
        truncated = ' '.join(line.strip().split()[:truncate_to])

        score = bino.compute_score(truncated)
        print(score, ind)
        scores.append((score, ind))

        # NOTE(review): predict() presumably re-scores the text internally; if
        # so, thresholding `score` directly would halve inference cost — confirm
        # against the Binoculars API before changing.
        prediction = bino.predict(truncated)
        print(prediction)
        if "AI" in prediction:
            ai += 1
        else:
            non_ai += 1

    return scores

def clean_text(text):
    """Flatten an article into one line of body text.

    Steps, in order:
      1. Tabs become spaces.
      2. Everything from the first "== References ==" or "== External links =="
         marker to the end of the text is removed.
      3. The remainder is split on newlines and lines with 5 or fewer words
         (headings, captions, stray fragments) are discarded.
      4. Surviving lines are joined with single spaces and stripped.

    Args:
        text: raw article text.

    Returns:
        The cleaned text; an empty string when nothing survives.
    """
    text = text.replace('\t', ' ')

    # Cut the trailing sections BEFORE filtering short lines: a heading such as
    # "== References ==" is itself a short line, so filtering first would throw
    # the marker away and leave the whole reference list in the output.
    text = re.sub(r'== References ==.*|== External links ==.*', '', text, flags=re.DOTALL)

    # Split on the newline directly; a sentinel string would mis-split any text
    # that happens to contain the sentinel.
    kept = [segment for segment in text.split('\n') if len(segment.split()) > 5]

    return ' '.join(kept).strip()

def extract_title_text_and_url_legacy(dataset, max_articles=850):
    """Collect cleaned titles, texts and URLs from the head of `dataset["train"]`.

    Args:
        dataset: mapping with a "train" entry that supports `len()` and integer
            indexing, each row exposing "title", "text" and "url".
        max_articles: upper bound on the number of leading rows inspected.
            Splits with fewer rows are read in full instead of raising
            IndexError.

    Returns:
        `(titles, texts, urls)`: three parallel lists covering only the rows
        whose title and cleaned text are both non-empty.
    """
    train = dataset["train"]
    limit = min(max_articles, len(train))

    titles, texts, urls = [], [], []
    for i in range(limit):
        row = train[i]  # fetch the row once rather than once per field
        title = row["title"]
        url = row["url"]
        cleaned_text = clean_text(row["text"])

        # Rows with an empty title or no surviving text are skipped; the URL is
        # allowed to be empty (the Common Crawl cells pass "" for it).
        if title and cleaned_text:
            titles.append(title)
            texts.append(cleaned_text)
            urls.append(url)

    return titles, texts, urls

def process_files(dataset, max_words=400):
    """Extract, truncate and score the texts of `dataset`.

    Texts come from `extract_title_text_and_url_legacy`, are cut to their first
    `max_words` whitespace-separated words, and are scored with `ai_score`.

    Args:
        dataset: dataset mapping accepted by `extract_title_text_and_url_legacy`.
        max_words: number of leading words of each text that is scored.

    Returns:
        The list of `(score, index)` tuples produced by `ai_score`.

    Side effects:
        Appends the returned list to the module-level `all_scores`. The
        `ai` / `non_ai` counters updated inside `ai_score` are not reset here,
        so they keep accumulating across calls.
    """
    _titles, texts, _urls = extract_title_text_and_url_legacy(dataset)
    texts = [' '.join(text.split()[:max_words]) for text in texts]

    print(f"number of texts: {len(texts)}")

    bino_scores = ai_score(texts)
    all_scores.append(bino_scores)
    return bino_scores


In [None]:
# Path placeholders; neither name is read by any active code visible in this notebook.
output_file = "/content/legacy_wikipedia.json"
input_file = ""

**Legacy Articles**

In [None]:
# Score the legacy Wikipedia dataset; the scores are appended to `all_scores`
# and the `ai` / `non_ai` counters are incremented.
process_files(legacy_dataset)

In [None]:
# `all_scores` holds one score list per process_files() call so far;
# `ai` / `non_ai` are running totals across all of those calls.
print(all_scores, ai, non_ai, sep="\n")

In [None]:
import numpy as np

# Keep only the numeric score from each (score, index) pair recorded by the
# first process_files() call.
score_nums = [score for score, _ in all_scores[0]]

# Summary statistics over those scores.
mean, median, std = np.mean(score_nums), np.median(score_nums), np.std(score_nums)

print(f"Mean: {mean}")
print(f"Median: {median}")
print(f"Std dev: {std}")




**New Articles**

In [None]:
# Score the newly scraped articles, then dump every score list collected so far
# together with the running `ai` / `non_ai` totals.
process_files(new_articles_dataset)
print(all_scores, ai, non_ai, sep="\n")

In [None]:
# process_files() appends to `all_scores`, so the list for the dataset that was
# just scored is the LAST entry. Index 0 would re-report whichever dataset was
# scored first in this kernel session.
score_nums = [score for score, _ in all_scores[-1]]

# Summary statistics over those scores (`np` is imported in an earlier cell).
mean = np.mean(score_nums)
median = np.median(score_nums)
std = np.std(score_nums)

print(f"Mean: {mean}")
print(f"Median: {median}")
print(f"Std dev: {std}")

August 2023 Articles

In [None]:
# Score the August 2023 articles; the scores are appended to `all_scores` and
# the `ai` / `non_ai` counters are incremented.
process_files(august_2023_dataset)

In [None]:
# `all_scores` holds one score list per process_files() call so far;
# `ai` / `non_ai` are running totals across all of those calls.
print(all_scores, ai, non_ai, sep="\n")

August 2024 Articles

In [None]:
import json
from datasets import Dataset, DatasetDict

# Read the scraped-article records; the loop below treats `data` as a list of dicts.
with open("/content/scraped_wiki_articles_8_2024.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Rename each record's "content" key to "text", the field that
# extract_title_text_and_url_legacy() reads.
for entry in data:
    entry["text"] = entry.pop("content")

dataset = Dataset.from_list(data)

# Wrap the records in a single "train" split, the split the scoring helpers index.
august_2024_dataset = DatasetDict({"train": dataset})
print(august_2024_dataset)

In [None]:
# Score the August 2024 articles; the scores are appended to `all_scores` and
# the `ai` / `non_ai` counters are incremented.
process_files(august_2024_dataset)

In [None]:
# `all_scores` holds one score list per process_files() call so far;
# `ai` / `non_ai` are running totals across all of those calls.
print(all_scores, ai, non_ai, sep="\n")

Common Crawl CC-MAIN-2022-05 (January 2022 crawl)

In [None]:
import json
from datasets import Dataset, DatasetDict

# The comprehension below reads the page records from the "sites" key of this JSON.
with open("/content/CC-MAIN-2022-05.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Build records with the "text"/"title"/"url" fields the scoring helpers read.
# The placeholder title "temp" is non-empty, so these rows pass the
# `if title and cleaned_text` filter in extract_title_text_and_url_legacy().
# Note: pop() removes "text" from the loaded `data` records as a side effect.
sites_data = [{"text": site.pop("text"), "title": "temp", "url": ""} for site in data["sites"]]

dataset = Dataset.from_list(sites_data)

# Wrap the records in a single "train" split, the split the scoring helpers index.
cc_2022_dict = DatasetDict({"train": dataset})

print(cc_2022_dict)

In [None]:
# Score the CC-MAIN-2022-05 pages; the scores are appended to `all_scores` and
# the `ai` / `non_ai` counters are incremented.
process_files(cc_2022_dict)

In [None]:
# `all_scores` holds one score list per process_files() call so far;
# `ai` / `non_ai` are running totals across all of those calls.
print(all_scores, ai, non_ai, sep="\n")

Common Crawl CC-MAIN-2023-40 (September/October 2023 crawl)

In [None]:
import json
from datasets import Dataset, DatasetDict

# The comprehension below reads the page records from the "sites" key of this JSON.
with open("/content/CC-MAIN-2023-40.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Build records with the "text"/"title"/"url" fields the scoring helpers read.
# The placeholder title "temp" is non-empty, so these rows pass the
# `if title and cleaned_text` filter in extract_title_text_and_url_legacy().
# Note: pop() removes "text" from the loaded `data` records as a side effect.
sites_data = [{"text": site.pop("text"), "title": "temp", "url": ""} for site in data["sites"]]

dataset = Dataset.from_list(sites_data)

# Wrap the records in a single "train" split, the split the scoring helpers index.
cc_2023_dict = DatasetDict({"train": dataset})

print(cc_2023_dict)

In [None]:
# Score the CC-MAIN-2023-40 pages; the scores are appended to `all_scores` and
# the `ai` / `non_ai` counters are incremented.
process_files(cc_2023_dict)

In [None]:
# `all_scores` holds one score list per process_files() call so far;
# `ai` / `non_ai` are running totals across all of those calls.
print(all_scores, ai, non_ai, sep="\n")

Common Crawl CC-MAIN-2024-33 (August 2024 crawl)

In [None]:
import json
from datasets import Dataset, DatasetDict

# The comprehension below reads the page records from the "sites" key of this JSON.
with open("/content/CC-MAIN-2024-33.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Build records with the "text"/"title"/"url" fields the scoring helpers read.
# The placeholder title "temp" is non-empty, so these rows pass the
# `if title and cleaned_text` filter in extract_title_text_and_url_legacy().
# Note: pop() removes "text" from the loaded `data` records as a side effect.
sites_data = [{"text": site.pop("text"), "title": "temp", "url": ""} for site in data["sites"]]

dataset = Dataset.from_list(sites_data)

# Wrap the records in a single "train" split, the split the scoring helpers index.
cc_2024_dict = DatasetDict({"train": dataset})

print(cc_2024_dict)

In [None]:
# Score the CC-MAIN-2024-33 pages; the scores are appended to `all_scores` and
# the `ai` / `non_ai` counters are incremented.
process_files(cc_2024_dict)

In [None]:
# `all_scores` holds one score list per process_files() call so far;
# `ai` / `non_ai` are running totals across all of those calls.
print(all_scores, ai, non_ai, sep="\n")

Common Crawl CC-MAIN-2025-08 (February 2025 crawl)

In [None]:
import json
from datasets import Dataset, DatasetDict

# The comprehension below reads the page records from the "sites" key of this JSON.
with open("/content/CC-MAIN-2025-08.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Build records with the "text"/"title"/"url" fields the scoring helpers read.
# The placeholder title "temp" is non-empty, so these rows pass the
# `if title and cleaned_text` filter in extract_title_text_and_url_legacy().
# Note: pop() removes "text" from the loaded `data` records as a side effect.
sites_data = [{"text": site.pop("text"), "title": "temp", "url": ""} for site in data["sites"]]

dataset = Dataset.from_list(sites_data)

# Wrap the records in a single "train" split, the split the scoring helpers index.
cc_2025_dict = DatasetDict({"train": dataset})

print(cc_2025_dict)

In [None]:
# Score the CC-MAIN-2025-08 pages; the scores are appended to `all_scores` and
# the `ai` / `non_ai` counters are incremented.
process_files(cc_2025_dict)

In [None]:
# `all_scores` holds one score list per process_files() call so far;
# `ai` / `non_ai` are running totals across all of those calls.
print(all_scores, ai, non_ai, sep="\n")