In [3]:
import json

def extract_text_from_json(data):
    texts = []
    if "question" in data:
        texts.append(data["question"])
    if "title" in data:
        texts.append(data["title"])
    if "content" in data and data["content"]:
        texts.append(data["content"])
    if "comments" in data:
        for comment in data["comments"]:
            if isinstance(comment, dict) and "text" in comment:
                texts.append(comment["text"])
            elif isinstance(comment, str):
                texts.append(comment)
    return texts

def process_json_files(file_paths):
    all_texts = []
    for file_path in file_paths:
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                if isinstance(data, list):
                    for item in data:
                        all_texts.extend(extract_text_from_json(item))
                else:
                    all_texts.extend(extract_text_from_json(data))
        except Exception as e:
            print(f"Error processing {file_path}: {str(e)}")
    return all_texts

# Usage
if __name__ == "__main__":
    json_files = ['merged_data.json', 'output_updated.json']  # Replace with your file paths
    
    # Extract texts
    texts = process_json_files(json_files)
    
    # Print results
    for i, text in enumerate(texts, 1):
        print(f"\n--- Text {i} ---")
        print(text.strip())
    
    # Save to file
    with open('extracted_texts.txt', 'w', encoding='utf-8') as f:
        for text in texts:
            f.write(text.strip() + '\n\n')



--- Text 1 ---


--- Text 2 ---
از کجا باید شروع کنم؟ آیا می‌توانم به دلیل  ضرب و شتم طلاق بگیرم؟

--- Text 3 ---
یکسال پیش هیچ امیدی نداشتم، همه روش ها رو امتحان کردم تا اینکه بعد از کلی درد کشیدن از طریق ویزیت آنلاین وکاملاً رایگانبا تیم دکتر گلشنی آشنا شدم. خودم قوزپشتی و کمردرد داشتم و دختر بزرگم پای پرانتزی و کف پای صاف داشت که همه شون کاملاً آنلاین با کمک متخصص برطرف شد.
اگر خودتون یا اطرافیانتون دردهایی در زانو، گردن یا کمر دارید یاحتی ناهنجاری هایی مثل گودی کمر و  پای ضربدری داریدقبل از هر کاری با زدن روی این لینک یه نوبت ویزیت 100% رایگان و آنلاین از متخصص دریافت کنید.

--- Text 4 ---
خانوما بنظرتون این فرش قشنگه میخوام بخرمش

--- Text 5 ---
این فرش خوبه

--- Text 6 ---
بزار

--- Text 7 ---
یکسال پیش هیچ امیدی نداشتم، همه روش ها رو امتحان کردم تا اینکه بعد از کلی درد کشیدن از طریق ویزیت آنلاین وکاملاً رایگانبا تیم دکتر گلشنی آشنا شدم. خودم قوزپشتی و کمردرد داشتم و دختر بزرگم پای پرانتزی و کف پای صاف داشت که همه شون کاملاً آنلاین با کمک متخصص برطرف شد.
اگر خودتون یا اطرافیانتو

In [4]:
!pip install datasets transformers


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting transformers
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-18.1.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.11.11-cp311-cp311-manylinux_2_17_x86_64

In [5]:
from datasets import load_dataset

def combine_with_huggingface():
    # Load existing texts from file
    with open('extracted_texts.txt', 'r', encoding='utf-8') as f:
        existing_texts = f.read().split('\n\n')
        # Remove empty strings
        existing_texts = [text.strip() for text in existing_texts if text.strip()]
    
    # Load Hugging Face dataset
    dataset = load_dataset("mshojaei77/PersianTelegramChannels", split="train")
    hf_texts = dataset["text"]
    
    # Combine texts
    all_texts = existing_texts + list(hf_texts)
    
    # Save combined texts
    with open('combined_texts.txt', 'w', encoding='utf-8') as f:
        for text in all_texts:
            f.write(text + '\n\n')
    
    # Print statistics
    print(f"Number of existing texts: {len(existing_texts)}")
    print(f"Number of Hugging Face texts: {len(hf_texts)}")
    print(f"Total number of combined texts: {len(all_texts)}")
    
    return all_texts

if __name__ == "__main__":
    combined_texts = combine_with_huggingface()
    
    # Print sample of combined texts
    print("\nSample of combined texts:")
    for i, text in enumerate(combined_texts[:3], 1):
        print(f"\n--- Text {i} ---")
        print(text[:200] + "..." if len(text) > 200 else text)


README.md:   0%|          | 0.00/1.49k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/9.46M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/60730 [00:00<?, ? examples/s]

Number of existing texts: 10692
Number of Hugging Face texts: 60730
Total number of combined texts: 71422

Sample of combined texts:

--- Text 1 ---
از کجا باید شروع کنم؟ آیا می‌توانم به دلیل  ضرب و شتم طلاق بگیرم؟

--- Text 2 ---
یکسال پیش هیچ امیدی نداشتم، همه روش ها رو امتحان کردم تا اینکه بعد از کلی درد کشیدن از طریق ویزیت آنلاین وکاملاً رایگانبا تیم دکتر گلشنی آشنا شدم. خودم قوزپشتی و کمردرد داشتم و دختر بزرگم پای پرانتزی ...

--- Text 3 ---
خانوما بنظرتون این فرش قشنگه میخوام بخرمش


In [6]:
from datasets import load_dataset
import time

def load_and_extract_text(dataset_name, text_key=None, split="train"):
    """Load dataset and extract texts based on the dataset structure"""
    try:
        print(f"Loading dataset: {dataset_name}")
        ds = load_dataset(dataset_name, split=split)
        texts = []
        
        # If text_key is provided, use it directly
        if text_key:
            texts = [item[text_key] for item in ds if text_key in item]
            return texts
        
        # Try common text field names
        common_keys = ['text', 'content', 'question', 'answer', 'input', 'output', 'premise', 'hypothesis']
        
        for item in ds:
            for key in common_keys:
                if key in item and isinstance(item[key], str):
                    texts.append(item[key])
            
            # Special handling for nested structures
            if 'translation' in item:
                if isinstance(item['translation'], dict) and 'fa' in item['translation']:
                    texts.append(item['translation']['fa'])
        
        return texts
    except Exception as e:
        print(f"Error loading {dataset_name}: {str(e)}")
        return []

def combine_all_datasets():
    # Dictionary of datasets with their text field keys
    datasets_config = {
        "MaralGPT/persian_blogs": "text",
        "MaralGPT/persian_quotes": "text",
        "taesiri/TinyStories-Farsi": "text",
        "community-datasets/farsi_news": "text",
        "alighasemi/farsi_paraphrase_detection": "text1",  # Also has text2
        "sinap/FarsiTinyStories": "text",
        "ZharfaTech/ZharfaTech-Open-Platypus-Persian-Farsi": "text",
        "ZharfaTech/ZharfaTech-OpenAssistant-Guanaco-Persian-Farsi": "text",
        "RohanAiLab/persian_daily_news": "text",
        "RohanAiLab/persian_blog": "text",
        "SajjadAyoubi/persian_qa": "question",  # Also has answer
        "persiannlp/parsinlu_entailment": "premise",  # Also has hypothesis
        "mshojaei77/PersianTelegramChannels": "text"
    }

    # Load existing texts
    try:
        with open('extracted_texts.txt', 'r', encoding='utf-8') as f:
            all_texts = f.read().split('\n\n')
            all_texts = [text.strip() for text in all_texts if text.strip()]
        print(f"Loaded {len(all_texts)} existing texts")
    except FileNotFoundError:
        all_texts = []
        print("No existing texts file found, starting fresh")

    # Load and combine texts from all datasets
    for dataset_name, text_key in datasets_config.items():
        start_time = time.time()
        new_texts = load_and_extract_text(dataset_name, text_key)
        
        if new_texts:
            all_texts.extend(new_texts)
            print(f"Added {len(new_texts)} texts from {dataset_name}")
            print(f"Time taken: {time.time() - start_time:.2f} seconds")

    # Remove duplicates while preserving order
    all_texts = list(dict.fromkeys(all_texts))

    # Save combined texts
    with open('combined_texts.txt', 'w', encoding='utf-8') as f:
        for text in all_texts:
            f.write(text.strip() + '\n\n')

    return all_texts

if __name__ == "__main__":
    print("Starting dataset combination process...")
    combined_texts = combine_all_datasets()
    
    # Print statistics
    print("\nFinal Statistics:")
    print(f"Total number of texts: {len(combined_texts)}")
    
    # Print sample
    print("\nSample of combined texts:")
    for i, text in enumerate(combined_texts[:3], 1):
        print(f"\n--- Text {i} ---")
        print(text[:200] + "..." if len(text) > 200 else text)


Starting dataset combination process...
Loaded 10692 existing texts
Loading dataset: MaralGPT/persian_blogs


dataset.jsonl:   0%|          | 0.00/6.16G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/385900 [00:00<?, ? examples/s]

Added 385900 texts from MaralGPT/persian_blogs
Time taken: 63.69 seconds
Loading dataset: MaralGPT/persian_quotes


README.md:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

data.json:   0%|          | 0.00/329k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/739 [00:00<?, ? examples/s]

Loading dataset: taesiri/TinyStories-Farsi


README.md:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

(…)ain-Translated-To-Farsi-w-Claude-2.0.csv:   0%|          | 0.00/271M [00:00<?, ?B/s]

(…)lid-Translated-To-Farsi-w-Claude-2.0.csv:   0%|          | 0.00/59.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/124411 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/27630 [00:00<?, ? examples/s]

Loading dataset: community-datasets/farsi_news


README.md:   0%|          | 0.00/3.61k [00:00<?, ?B/s]

hamshahri-00000-of-00001.parquet:   0%|          | 0.00/591k [00:00<?, ?B/s]

radiofarda-00000-of-00001.parquet:   0%|          | 0.00/118k [00:00<?, ?B/s]

Generating hamshahri split:   0%|          | 0/2203 [00:00<?, ? examples/s]

Generating radiofarda split:   0%|          | 0/284 [00:00<?, ? examples/s]

Error loading community-datasets/farsi_news: Unknown split "train". Should be one of ['hamshahri', 'radiofarda'].
Loading dataset: alighasemi/farsi_paraphrase_detection


README.md:   0%|          | 0.00/693 [00:00<?, ?B/s]

(…)-00000-of-00001-68fca37d0cee4089.parquet:   0%|          | 0.00/366k [00:00<?, ?B/s]

(…)-00000-of-00001-db766adc01340b9b.parquet:   0%|          | 0.00/48.3k [00:00<?, ?B/s]

(…)-00000-of-00001-6d3aa0a232d76302.parquet:   0%|          | 0.00/48.0k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6260 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/783 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/783 [00:00<?, ? examples/s]

Loading dataset: sinap/FarsiTinyStories


README.md:   0%|          | 0.00/320 [00:00<?, ?B/s]

train-00000-of-00011.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

train-00001-of-00011.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

train-00002-of-00011.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

train-00003-of-00011.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

train-00004-of-00011.parquet:   0%|          | 0.00/210M [00:00<?, ?B/s]

train-00005-of-00011.parquet:   0%|          | 0.00/212M [00:00<?, ?B/s]

train-00006-of-00011.parquet:   0%|          | 0.00/213M [00:00<?, ?B/s]

train-00007-of-00011.parquet:   0%|          | 0.00/210M [00:00<?, ?B/s]

train-00008-of-00011.parquet:   0%|          | 0.00/213M [00:00<?, ?B/s]

train-00009-of-00011.parquet:   0%|          | 0.00/211M [00:00<?, ?B/s]

train-00010-of-00011.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2119719 [00:00<?, ? examples/s]

Added 2119719 texts from sinap/FarsiTinyStories
Time taken: 133.92 seconds
Loading dataset: ZharfaTech/ZharfaTech-Open-Platypus-Persian-Farsi


README.md:   0%|          | 0.00/2.92k [00:00<?, ?B/s]

Open-Platypus_Persian.parquet:   0%|          | 0.00/19.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/24926 [00:00<?, ? examples/s]

Loading dataset: ZharfaTech/ZharfaTech-OpenAssistant-Guanaco-Persian-Farsi


README.md:   0%|          | 0.00/3.67k [00:00<?, ?B/s]

(…)tant-guanaco-persian-instruct-fa.parquet:   0%|          | 0.00/8.60M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9769 [00:00<?, ? examples/s]

Loading dataset: RohanAiLab/persian_daily_news


README.md:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

persian_daily_news.py:   0%|          | 0.00/1.73k [00:00<?, ?B/s]

Error loading RohanAiLab/persian_daily_news: Not enough disk space. Needed: 8.76 GiB (download: 1.74 GiB, generated: 7.02 GiB, post-processed: Unknown size)
Loading dataset: RohanAiLab/persian_blog


README.md:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

persian_blog.py:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

0001.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

0002.parquet:   0%|          | 0.00/235M [00:00<?, ?B/s]

0003.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

0004.parquet:   0%|          | 0.00/114M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/385900 [00:00<?, ? examples/s]

Added 385900 texts from RohanAiLab/persian_blog
Time taken: 29.81 seconds
Loading dataset: SajjadAyoubi/persian_qa


README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

persian_qa.py:   0%|          | 0.00/4.62k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

persian_qa/validation/0000.parquet:   0%|          | 0.00/165k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9008 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/930 [00:00<?, ? examples/s]

Added 9008 texts from SajjadAyoubi/persian_qa
Time taken: 2.18 seconds
Loading dataset: persiannlp/parsinlu_entailment


README.md:   0%|          | 0.00/4.81k [00:00<?, ?B/s]

parsinlu_entailment.py:   0%|          | 0.00/4.93k [00:00<?, ?B/s]

parsinlu-repo/train/0000.parquet:   0%|          | 0.00/136k [00:00<?, ?B/s]

parsinlu-repo/test/0000.parquet:   0%|          | 0.00/302k [00:00<?, ?B/s]

parsinlu-repo/validation/0000.parquet:   0%|          | 0.00/54.4k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/755 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1675 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/270 [00:00<?, ? examples/s]

Loading dataset: mshojaei77/PersianTelegramChannels
Added 60730 texts from mshojaei77/PersianTelegramChannels
Time taken: 1.81 seconds

Final Statistics:
Total number of texts: 2262655

Sample of combined texts:

--- Text 1 ---
از کجا باید شروع کنم؟ آیا می‌توانم به دلیل  ضرب و شتم طلاق بگیرم؟

--- Text 2 ---
یکسال پیش هیچ امیدی نداشتم، همه روش ها رو امتحان کردم تا اینکه بعد از کلی درد کشیدن از طریق ویزیت آنلاین وکاملاً رایگانبا تیم دکتر گلشنی آشنا شدم. خودم قوزپشتی و کمردرد داشتم و دختر بزرگم پای پرانتزی ...

--- Text 3 ---
خانوما بنظرتون این فرش قشنگه میخوام بخرمش


In [7]:
import os

file_path = 'combined_texts.txt'
size_mb = os.path.getsize(file_path) / (1024 * 1024)
print(f"File size: {size_mb:.2f} MB")


File size: 3693.21 MB
