In [1]:
import os
import shutil
import json


In [2]:
# Directory whose immediate subfolders are scanned for a messages.json file
ROOT_DIR = "E:\\YuyangGPT\\dataset\\raw_data\\Messages"

# Flat directory that receives one copied "<subfolder name>.json" per subfolder
OUTPUT_DIR = "E:\\YuyangGPT\\dataset\\cleaned_data\\cleaned_messages"

# Create the output directory (and any missing parents); no error if it already exists
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [3]:
# Flatten the raw tree: ROOT_DIR/<subfolder>/messages.json -> OUTPUT_DIR/<subfolder>.json
for subfolder_name in os.listdir(ROOT_DIR):
    subfolder_path = os.path.join(ROOT_DIR, subfolder_name)
    if not os.path.isdir(subfolder_path):
        # Plain files sitting directly in ROOT_DIR are not of interest.
        continue

    source_file = os.path.join(subfolder_path, "messages.json")
    if not os.path.isfile(source_file):
        # Subfolder without a messages.json: nothing to copy.
        continue

    # The subfolder name becomes the destination file's base name.
    dest_file = os.path.join(OUTPUT_DIR, f"{subfolder_name}.json")

    # copy2 copies the content and attempts to preserve file metadata as well.
    shutil.copy2(source_file, dest_file)
    print(f"Copied: {source_file} → {dest_file}")


Copied: E:\YuyangGPT\dataset\raw_data\Messages\c1003781145479098388\messages.json → E:\YuyangGPT\dataset\cleaned_data\cleaned_messages\c1003781145479098388.json
Copied: E:\YuyangGPT\dataset\raw_data\Messages\c1003807038939353128\messages.json → E:\YuyangGPT\dataset\cleaned_data\cleaned_messages\c1003807038939353128.json
Copied: E:\YuyangGPT\dataset\raw_data\Messages\c1004929763493822494\messages.json → E:\YuyangGPT\dataset\cleaned_data\cleaned_messages\c1004929763493822494.json
Copied: E:\YuyangGPT\dataset\raw_data\Messages\c1010755173737570345\messages.json → E:\YuyangGPT\dataset\cleaned_data\cleaned_messages\c1010755173737570345.json
Copied: E:\YuyangGPT\dataset\raw_data\Messages\c1011054438871478272\messages.json → E:\YuyangGPT\dataset\cleaned_data\cleaned_messages\c1011054438871478272.json
Copied: E:\YuyangGPT\dataset\raw_data\Messages\c1012925191761969152\messages.json → E:\YuyangGPT\dataset\cleaned_data\cleaned_messages\c1012925191761969152.json
Copied: E:\YuyangGPT\dataset\raw_d

In [6]:
# Directory whose files are read as JSON message lists
# (same path as the copy step's OUTPUT_DIR)
INPUT_DIR = "E:\\YuyangGPT\\dataset\\cleaned_data\\cleaned_messages"

# Path of the single JSONL file that the extracted text is written to
OUTPUT_JSONL = "E:\\YuyangGPT\\dataset\\cleaned_data\\cleaned_discord_messages.jsonl"


In [7]:
def extract_contents_only(path):
    """Return the "Contents" strings of a JSON message file joined by spaces.

    The file at ``path`` is read as UTF-8 JSON. If the top-level value is a
    list, every dict element whose "Contents" value is a string that is not
    blank (after stripping whitespace) contributes that string, unmodified,
    to the result. Elements that are not dicts are ignored.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The collected strings joined with a single space, or "" when the
        top-level JSON value is not a list or nothing qualifies.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file is not valid UTF-8 or not valid JSON
            (json.JSONDecodeError / UnicodeDecodeError are subclasses).
    """
    with open(path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)

    if isinstance(payload, list):
        return " ".join(
            entry["Contents"]
            for entry in payload
            if isinstance(entry, dict)
            and isinstance(entry.get("Contents"), str)
            and entry["Contents"].strip()
        )

    # Anything other than a top-level list carries no usable messages.
    return ""


In [8]:
# Build the JSONL dataset: one {"text": ...} record per input file that yields
# any text. The output file is truncated and rewritten on every run.
with open(OUTPUT_JSONL, "w", encoding="utf-8") as out:
    # os.listdir returns entries in arbitrary order; sorting makes the record
    # order in the JSONL reproducible across runs and machines.
    for filename in sorted(os.listdir(INPUT_DIR)):
        file_path = os.path.join(INPUT_DIR, filename)

        # Ignore subdirectories and anything else that is not a regular file.
        if not os.path.isfile(file_path):
            continue

        try:
            text = extract_contents_only(file_path)
        except Exception as e:
            # Best effort: report the unreadable/malformed file and move on
            # rather than aborting the whole export.
            print(f"⚠️ Skipped {filename}: {e}")
            continue

        # Collapse every run of whitespace (including newlines) to one space
        # and trim the ends.
        text = " ".join(text.split())  # normalize whitespace

        # Files with no usable message text produce no record.
        if not text:
            continue

        # ensure_ascii=False keeps non-ASCII characters readable in the output
        # instead of \uXXXX escapes.
        out.write(
            json.dumps({"text": text}, ensure_ascii=False) + "\n"
        )

print("✅ JSONL file created:", OUTPUT_JSONL)


✅ JSONL file created: E:\YuyangGPT\dataset\cleaned_data\cleaned_discord_messages.jsonl
