# Downloading and Transforming dataset

first we download datasets library

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets)
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Collecting multiprocess<0.70.19 (from datasets)
  Downloading multiprocess-0.70.18-py312-none-any.whl.metadata (7.5 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<=2025.10.0,>=2023.1.0->datasets)
  Downloading aiohttp-3.13.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (8.1 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.10.0,>=2023.1.0->datasets)
  Downloading aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosignal>=1.4.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.10.0,>=2023.1.0->datasets)
  Downloading aiosig

## Downloading HC3 dataset (Hugging face) and M4 dataset (Github)

### HC3 dataset

- Downloading dataset
- saving it as a csv file

In [23]:
import os
import requests
import pandas as pd
import json
import pyarrow.parquet as pq
import pyarrow as pa
from io import BytesIO

# ============================================================
#   HC3 - T√©l√©charger le vrai split 'all/train' depuis HuggingFace
# ============================================================

# Original API URL to get parquet file metadata/URLs
api_url = "https://huggingface.co/api/datasets/Hello-SimpleAI/HC3/parquet/all/train"

print("üîç R√©cup√©ration de la liste des fichiers parquet...")
response_json = requests.get(api_url).json()

parquet_urls = []
# Robustly extract parquet URLs, accounting for potential variations in API response
if isinstance(response_json, list):
    if all(isinstance(item, dict) and 'url' in item for item in response_json):
        # Expected format: list of dicts, each with a 'url' key
        parquet_urls = [item["url"] for item in response_json]
    elif all(isinstance(item, str) for item in response_json):
        # Fallback: if it's a list of strings (raw URLs), use them directly
        print("‚ö†Ô∏è Warning: API returned a list of strings instead of dictionaries. Proceeding assuming these are direct URLs.")
        parquet_urls = response_json
    else:
        raise ValueError(f"Unexpected API response format: List contains mixed types or missing 'url' key. Response: {response_json}")
elif isinstance(response_json, dict) and 'url' in response_json:
    # If the response is a single dictionary with 'url'
    parquet_urls = [response_json['url']]
else:
    raise ValueError(f"API response is not a list or a dictionary with 'url'. Response: {response_json}")


print(f"üì¶ {len(parquet_urls)} fichiers trouv√©s.")

# Dossier de sortie
os.makedirs("HC3", exist_ok=True)

dfs = []

# T√©l√©charger et charger chaque parquet
for url in parquet_urls:
    print(f"‚¨áÔ∏è T√©l√©chargement : {url}")
    r = requests.get(url)
    r.raise_for_status()

    # Lire le contenu parquet directement en m√©moire
    table = pq.read_table(BytesIO(r.content))
    dfs.append(table.to_pandas())

print("üîó Fusion de tous les blocs parquet...")
df = pd.concat(dfs, ignore_index=True)

# Export final
csv_path = "/content/HC3/hc3_dataset.csv"
df.to_csv(csv_path, index=False)

print("\n‚úÖ Export termin√© !")
print("üìÅ Fichier :", csv_path)
print("üìä Nombre total de lignes :", len(df))
print("\nAper√ßu :")
df.head()

üîç R√©cup√©ration de la liste des fichiers parquet...
üì¶ 1 fichiers trouv√©s.
‚¨áÔ∏è T√©l√©chargement : https://huggingface.co/api/datasets/Hello-SimpleAI/HC3/parquet/all/train/0.parquet
üîó Fusion de tous les blocs parquet...

‚úÖ Export termin√© !
üìÅ Fichier : /content/HC3/hc3_dataset.csv
üìä Nombre total de lignes : 24322

Aper√ßu :


Unnamed: 0,id,question,human_answers,chatgpt_answers,source
0,0,"Why is every book I hear about a "" NY Times # ...","[Basically there are many categories of "" Best...",[There are many different best seller lists th...,reddit_eli5
1,1,"If salt is so bad for cars , why do we use it ...",[salt is good for not dying in car crashes and...,[Salt is used on roads to help melt ice and sn...,reddit_eli5
2,2,Why do we still have SD TV channels when HD lo...,[The way it works is that old TV stations got ...,[There are a few reasons why we still have SD ...,reddit_eli5
3,3,Why has nobody assassinated Kim Jong - un He i...,[You ca n't just go around assassinating the l...,[It is generally not acceptable or ethical to ...,reddit_eli5
4,4,How was airplane technology able to advance so...,[Wanting to kill the shit out of Germans drive...,[After the Wright Brothers made the first powe...,reddit_eli5


### M4 dataset

- downloading dataset (.jsonl) from github
- transforming .jsonl to csv

In [25]:
import requests
import os

sources = ["arxiv", "reddit", "peerread", "wikihow"]

# Define the actual existing models for each source based on the GitHub repository
source_models = {
    "arxiv": ["davinci", "chatGPT", "cohere", "flant5", "dolly"],
    "reddit": ["davinci", "chatGPT", "cohere", "flant5", "dolly"],
    "peerread": ["davinci", "cohere", "dolly"],
    "wikihow": ["davinci", "chatGPT", "cohere"],
}

for src in sources:
  # Iterate only over models known to exist for the current source
  if src in source_models:
    for model in source_models[src]:

      url = f"https://raw.githubusercontent.com/mbzuai-nlp/M4/main/data/{src}_{model}.jsonl"

      # Create the M4 directory if it doesn't exist
      if not os.path.exists("M4"):
        os.makedirs("M4")

      response = requests.get(url)

      if response.status_code == 200:

          with open(f"./M4/{src}_{model}.jsonl", "wb") as file:
              file.write(response.content)
          print(f"Successfully downloaded: {src}_{model}.jsonl")
      else:

          print(f"Error status code {response.status_code}")
          print(f"url : {url}")
  else:
    print(f"No models defined for source: {src}. Skipping.")

Successfully downloaded: arxiv_davinci.jsonl
Successfully downloaded: arxiv_chatGPT.jsonl
Successfully downloaded: arxiv_cohere.jsonl
Successfully downloaded: arxiv_flant5.jsonl
Successfully downloaded: arxiv_dolly.jsonl
Successfully downloaded: reddit_davinci.jsonl
Successfully downloaded: reddit_chatGPT.jsonl
Successfully downloaded: reddit_cohere.jsonl
Successfully downloaded: reddit_flant5.jsonl
Successfully downloaded: reddit_dolly.jsonl
Successfully downloaded: peerread_davinci.jsonl
Successfully downloaded: peerread_cohere.jsonl
Successfully downloaded: peerread_dolly.jsonl
Successfully downloaded: wikihow_davinci.jsonl
Successfully downloaded: wikihow_chatGPT.jsonl
Successfully downloaded: wikihow_cohere.jsonl


In [26]:
import pandas as pd
import json

directory = '/content/M4'

for filename in os.listdir(directory):

  filename = os.path.join(directory, filename)

  if os.path.isfile(filename):

    data = []
    invalide_file = False

    with open(filename, 'r') as f:
        for line in f:
            try:
                # parsing line json and adding it to data
                data.append(json.loads(line))
            except json.JSONDecodeError as e:
                print("skipping invalid json line file")
                print(filename,"\n")
                invalide_file = True
                break

    if not invalide_file:

      dataset = pd.DataFrame(data)

      # specifying the output dir and file name
      csv_filename = os.path.splitext(filename)[0] + '.csv'

      # saving csv
      dataset.to_csv(csv_filename, index=False)

      # removing the .jsonl file
      os.remove(filename)



skipping invalid json line file
/content/M4/arxiv_dolly.jsonl 

skipping invalid json line file
/content/M4/reddit_dolly.jsonl 



## First Human written text dataset

### Human written from GPT2-outputs

- downloading dataset
- saving it as a csv file

In [27]:
import os
import sys
import requests
from tqdm import tqdm

subdir = 'data'
if not os.path.exists(subdir):
    os.makedirs(subdir)
subdir = subdir.replace('\\','/')

for ds in [
    'webtext',

    # '''
    #   other files are for the LLM generated dataset who have been training (fine-tuned) on webtext dataset.
    #   no need to download them because we need LLM generated text that have not been fine tuned
    # '''

    # 'small-117M',  'small-117M-k40',
    # 'medium-345M', 'medium-345M-k40',
    # 'large-762M',  'large-762M-k40',
    # 'xl-1542M',    'xl-1542M-k40',
]:
    for split in ['train']:
        filename = ds + "." + split + '.jsonl'
        r = requests.get("https://openaipublic.azureedge.net/gpt-2/output-dataset/v1/" + filename, stream=True)

        with open(os.path.join(subdir, filename), 'wb') as f:
            file_size = int(r.headers["content-length"])
            chunk_size = 1000
            with tqdm(ncols=100, desc="Fetching " + filename, total=file_size, unit_scale=True) as pbar:
                # 1k for chunk_size, since Ethernet packet size is around 1500 bytes
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
                    pbar.update(chunk_size)

Fetching webtext.train.jsonl: 679Mit [01:28, 7.68Mit/s]                                             


Transforming the .jsonl dataset to pandas Dataframe

- selecting texts that have more than 50 words and less than 500 words
- selecting 20000 exemples
- finally transforming the dataframe to csv then downloading it

In [28]:
import pandas as pd
import json

# transforming jsonl to dataframe
data = []
with open('/content/data/webtext.train.jsonl', 'r') as f:
    for line in f:
        try:
            # parsing line json and adding it to data
            data.append(json.loads(line))
        except json.JSONDecodeError as e:
            print("skipping invalid json line")

# dir to stock human text dataset
os.makedirs("Human-text-dataset", exist_ok=True)

dataset = pd.DataFrame(data)

# selecting just columns that have not been truncated
dataset = dataset[dataset['ended'] == True]


# selecting texts that have  50 < words < 500
sub_dataset = dataset[dataset["length"] < 500 ]

sub_dataset = sub_dataset[sub_dataset["length"] > 50 ]

# shape of the subdataset
print(sub_dataset.shape)

# selecting 35000 text from the dataset randomly
sub_dataset = sub_dataset.sample(n=25000, random_state=42)

# adding label
sub_dataset["label"] = "human"

# droping unecessary columns
drop_columns = [col for col in sub_dataset.columns if col not in {'text','label'}]
sub_dataset.drop(columns=drop_columns, inplace=True)


print(sub_dataset.shape)

# transforming the dataset to csv file
sub_dataset.to_csv("Human-text-dataset/GPT2-outputs-human-written-dataset.csv")

(113557, 4)
(25000, 2)


### Human written from HC3


 - changing dataset structure to match other datasets
 - deleting unecessary columns
 - Downloading some of the humain answers from the HC3 dataset

In [29]:
data = pd.read_csv('/content/HC3/hc3_dataset.csv')

'''
    dataset will have a structure like this :
   text, label
   text : the text itself
   label : the label of the text (human or ai)

'''

# deleting the [ ] from human_answers
data['human_answers'] = data['human_answers'].str.replace(r'[\[\]]', '', regex=True)


# getting the length of the human_answers
data["length"] = data['human_answers'].apply(lambda x: len(x.split()))


# selecting texts that have  50 < words < 500
sub_dataset = data[data["length"] < 500 ]

sub_dataset = sub_dataset[sub_dataset["length"] > 50 ]


data = sub_dataset

# droping unecessary columns
data.drop(columns = ["source","chatgpt_answers","question","id","length"], inplace=True)


# adding label 'human'
data["label"] = "human"

# renaming columns
data.rename(columns = { 'human_answers' : 'text'}, inplace=True)


print("shape of dataset: ",data.shape)

# exporting dataset as a csv file
data.to_csv("Human-text-dataset/HC3-Human-written-dataset.csv")


shape of dataset:  (17784, 2)


### Human written from M4


 - changing dataset structure to match other datasets
 - deleting unecessary columns
 - Downloading some of the humain answers from the M4 dataset

In [30]:
# human written text
'''
        dataset will have a structure like this :
      , text, label
      text : the text itself
      label : the label of the text (human or ai)

'''

directory = '/content/M4'
output_directory = '/content/Human-text-dataset'


#create output dir
os.makedirs(output_directory, exist_ok=True)

# create an empty dataframe to store all the values
all_data = pd.DataFrame()

required_columns = {'human_text'}

for filename in os.listdir(directory):

  if filename.endswith('.csv'): # processing only csv files

    filepath = os.path.join(directory, filename)

    print(f"Processing file: {filename}")

    data = pd.read_csv(filepath)

    if not required_columns.issubset(data.columns):
      print(f"The file {filename} does not contain the required column 'human_text'. Skipping.")
      continue

    # getting the length of the human_text
    try:
      data["length"] = data['human_text'].apply(lambda x: len(x.split()))
    except Exception as e:

      print(f"the file {filename} was not processed")
      continue

    # selecting texts that have  50 < words < 500
    sub_dataset = data[data["length"] < 500 ]

    sub_dataset = sub_dataset[sub_dataset["length"] > 50 ]


    # droping unecessary columns
    drop_columns = [col for col in data.columns if col not in {'human_text'}]
    sub_dataset.drop(columns=drop_columns, inplace=True)

    # assign new dataset to data variable
    data = sub_dataset

    # adding label 'human'
    data["label"] = "human"

    # renaming columns
    data.rename(columns = { 'human_text' : 'text'}, inplace=True)

    # concatinating data to forme a global dataset
    all_data = pd.concat([all_data, data], ignore_index=True)



output_filepath = os.path.join(output_directory, "M4-Human-written-dataset.csv")

print("shape of dataset: ",all_data.shape)

all_data.to_csv(output_filepath, index=False)


count = all_data.duplicated().sum()
print(f"Number of duplicate rows: {count}")

Processing file: peerread_cohere.csv
Processing file: peerread_chatgpt.csv
Processing file: arxiv_davinci.csv
Processing file: arxiv_chatGPT.csv
Processing file: reddit_davinci.csv
Processing file: wikihow_davinci.csv
Processing file: peerread_davinci.csv
Processing file: wikihow_cohere.csv
the file wikihow_cohere.csv was not processed
Processing file: reddit_cohere.csv
Processing file: arxiv_cohere.csv
Processing file: reddit_flant5.csv
Processing file: arxiv_flant5.csv
Processing file: wikihow_chatGPT.csv
Processing file: peerread_dolly.csv
Processing file: reddit_chatGPT.csv
shape of dataset:  (22796, 2)
Number of duplicate rows: 16521


## Second LLM generated dataset

### Ai generated text from HC3

- changing dataset structure to match other datasets
- deleting unecessary columns

In [31]:
data = pd.read_csv('/content/HC3/hc3_dataset.csv')

# '''
#     dataset will have a structure like this :
#    , text, label
#    text : the text itself
#    label : the label of the text (human or ai)

# '''

# deleting the [ ] from chatgpt_answers
data['chatgpt_answers'] = data['chatgpt_answers'].str.replace(r'[\[\]]', '', regex=True)


# getting the length of the chatgpt_answers
data["length"] = data['chatgpt_answers'].apply(lambda x: len(x.split()))


# selecting texts that have  50 < words < 500
sub_dataset = data[data["length"] < 500 ]

sub_dataset = sub_dataset[sub_dataset["length"] > 50 ]


data = sub_dataset

# droping unecessary columns
data.drop(columns = ["source","human_answers","question","id","length"], inplace=True)


# adding label 'Ai'
data["label"] = "Ai"

# renaming columns
data.rename(columns = { 'chatgpt_answers' : 'text'}, inplace=True)


print("shape of dataset: ",data.shape)

data.head()


os.makedirs("Ai-generated-text-dataset", exist_ok=True)

# exporting dataset as a csv file
sub_dataset.to_csv("Ai-generated-text-dataset/HC3-Ai-generated-dataset.csv")

data.head()



shape of dataset:  (23342, 2)


Unnamed: 0,text,label
0,'There are many different best seller lists th...,Ai
1,"""Salt is used on roads to help melt ice and sn...",Ai
2,"""There are a few reasons why we still have SD ...",Ai
3,'It is generally not acceptable or ethical to ...,Ai
4,'After the Wright Brothers made the first powe...,Ai


### Ai generated from M4

- changing dataset structure to match other datasets
- deleting unecessary columns

In [32]:
from logging import exception
# AI written text

# '''
#         dataset will have a structure like this :
#       , text, label
#       text : the text itself
#       label : the label of the text (human or ai)

# '''

directory = '/content/M4'
output_directory = '/content/Ai-generated-text-dataset'

os.makedirs(output_directory, exist_ok=True)

all_data = pd.DataFrame()

required_columns = {'machine_text'}

for filename in os.listdir(directory):

  if filename.endswith('.csv'):

    filepath = os.path.join(directory, filename)

    print(f"Processing file: {filename}")

    data = pd.read_csv(filepath)

    if not required_columns.issubset(data.columns):
      print(f"The file {filename} does not contain the required column 'human_text'. Skipping.")
      continue

    # getting the length of the machine_text
    try :
      data["length"] = data['machine_text'].apply(lambda x: len(x.split()))

    except Exception as e:

      print(f"the file {filename} was not processed")
      continue


    # selecting texts that have  50 < words < 500
    sub_dataset = data[data["length"] < 500 ]

    sub_dataset = sub_dataset[sub_dataset["length"] > 40 ]

    # assign new dataset to data variable
    data = sub_dataset

    # droping unecessary columns
    drop_columns = [col for col in data.columns if col not in {'machine_text'}]
    data.drop(columns=drop_columns, inplace=True)


    # adding label 'Ai'
    data["label"] = "Ai"

    # renaming columns
    data.rename(columns = { 'machine_text' : 'text'}, inplace=True)

    all_data = pd.concat([all_data, data], ignore_index=True)



output_filepath = os.path.join(output_directory, "M4-Ai-generated-dataset.csv")

print("shape of dataset: ",all_data.shape)

all_data.to_csv(output_filepath, index=False)


Processing file: peerread_cohere.csv
Processing file: peerread_chatgpt.csv
Processing file: arxiv_davinci.csv
Processing file: arxiv_chatGPT.csv
Processing file: reddit_davinci.csv
Processing file: wikihow_davinci.csv
Processing file: peerread_davinci.csv
Processing file: wikihow_cohere.csv
Processing file: reddit_cohere.csv
the file reddit_cohere.csv was not processed
Processing file: arxiv_cohere.csv
Processing file: reddit_flant5.csv
Processing file: arxiv_flant5.csv
Processing file: wikihow_chatGPT.csv
Processing file: peerread_dolly.csv
Processing file: reddit_chatGPT.csv
shape of dataset:  (24973, 2)


## Combaining both datasets

In [33]:
import pandas as pd

# human dataset

df1 = pd.read_csv('/content/Human-text-dataset/HC3-Human-written-dataset.csv')
df2 = pd.read_csv('/content/Human-text-dataset/M4-Human-written-dataset.csv')
df3 = pd.read_csv('/content/Human-text-dataset/GPT2-outputs-human-written-dataset.csv')


# droping unnamed : 0 columns for df1 and df3
df1.drop(columns = ["Unnamed: 0"], inplace=True)
df3.drop(columns = ["Unnamed: 0"], inplace=True)

df_all_human = pd.concat([df1, df2, df3], ignore_index=True)


# Ai dataset

df4 = pd.read_csv('/content/Ai-generated-text-dataset/HC3-Ai-generated-dataset.csv')
df5 = pd.read_csv('/content/Ai-generated-text-dataset/M4-Ai-generated-dataset.csv')

df4.drop(columns = ["Unnamed: 0"], inplace=True)


df_all_Ai = pd.concat([df4, df5], ignore_index=True)

# print("the shape of human dataset is : ", df_all_human.shape)
print("the shape of Ai generated text dataest is : ", df_all_Ai.shape)


# droping duplicates to see if there is a balance between classes
df_all_human = df_all_human.drop_duplicates()
df_all_Ai = df_all_Ai.drop_duplicates()

print("The shape of human dataset after removing duplicates is:", df_all_human.shape)
print("The shape of AI generated text dataset after removing duplicates is:", df_all_Ai.shape)

the shape of Ai generated text dataest is :  (48315, 2)
The shape of human dataset after removing duplicates is: (47599, 2)
The shape of AI generated text dataset after removing duplicates is: (47796, 2)


## Downloading datasets

In [34]:
import shutil
from google.colab import files


os.makedirs("datasets", exist_ok=True)

df_all_human.to_csv('datasets/human_dataset.csv', index=False)
df_all_Ai.to_csv('datasets/Ai_dataset.csv', index=False)

dir = '/content/datasets'

zip = '/content/datasets.zip'

shutil.make_archive(zip.replace('.zip', ''), 'zip', dir)

print(f"Directories have been zipped: {zip}")

files.download(zip)

Directories have been zipped: /content/datasets.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>