In [None]:
# Scratch check: chaining the dict union operator builds a new merged dict
# and leaves both operands untouched.
metadata = dict()
data = dict([('1', '1')])
metadata | data | {'11': '11'}

In [None]:
from datatrove.executor.local import LocalPipelineExecutor
from datatrove.pipeline.dedup import MinhashDedupSignature
from datatrove.pipeline.dedup.minhash import (
    MinhashConfig,
    MinhashDedupBuckets,
    MinhashDedupCluster,
    MinhashDedupFilter,
)
from datatrove.pipeline.readers import HuggingFaceDatasetReader
from datatrove.pipeline.writers.jsonl import JsonlWriter
from datatrove.pipeline.writers.huggingface import HuggingFaceDatasetWriter
from datatrove.pipeline.tokens import TokensCounter
from datatrove.utils.hashing import HashConfig

from eeve.data.utils import _reader_adapter_with_column_info, _writer_adapter_with_column_restore

In [None]:
# Reader over the 'train' split of the parallel corpus. `text_key='ru'` selects the
# column passed as the document text; the custom adapter comes from eeve.data.utils
# (presumably it keeps the remaining columns in the document metadata — confirm there).
INPUT_READER = HuggingFaceDatasetReader(
    'alexantonov/chuvash_russian_parallel',
    dataset_options={'split': 'train'},
    text_key='ru',
    shuffle_files=False,
    adapter=_reader_adapter_with_column_info,
    doc_progress=True,
)

In [None]:
# gen = INPUT_READER.run()
# doc = next(iter(gen))
# doc.metadata['chv']

In [None]:
# doc.metadata

In [None]:
# import dataclasses

# data = {key: val for key, val in dataclasses.asdict(doc).items() if val}

In [None]:
# data |= data.pop("metadata")

In [None]:
# MinHash parameters (fed into MinhashConfig / HashConfig in the pipeline cell).
num_buckets, hashes_per_bucket = 40, 3
n_grams = 4
precision = 64
language = "rus_Cyrl"

# Executor sizing used by the pipeline stages.
total_tasks, workers = 6, 12

# Every artefact of this run lives under one root directory.
BASE_MINHASH_PATH = "/mnt/d/vscode_projects/eeve/eeve/workdir/minhash_tests"
minhash_base_path = BASE_MINHASH_PATH + "/minhash_parallel"
base_logging_dir = BASE_MINHASH_PATH + "/minhash_logs_parallel"
local_working_dir = BASE_MINHASH_PATH + "/minhash_local_parallel"

# Hub repository id handed to the dataset writer.
upload_path = 'whatisslove11/dedup_chv_parallel_overall_v123455644'

In [None]:
# MinHash settings shared by every stage: signatures, buckets and clusters must all be
# produced with the same configuration.
minhash_config = MinhashConfig(
    hash_config=HashConfig(
        precision=precision,
        hash_fc="sha1"
    ),
    n_grams=n_grams,
    num_buckets=num_buckets,
    hashes_per_bucket=hashes_per_bucket,
)

# Stage 1: compute MinHash signatures for every document of the input dataset.
stage1 = LocalPipelineExecutor(
    pipeline=[
        INPUT_READER,
        MinhashDedupSignature(
            output_folder=f"{minhash_base_path}/signatures",
            config=minhash_config,
            language=language
        ),
    ],
    logging_dir=f'{base_logging_dir}/signatures',
    tasks=total_tasks,
    workers=workers
)

# Stage 2: find signature matches inside each bucket (one task per bucket).
stage2 = LocalPipelineExecutor(
    pipeline=[
        MinhashDedupBuckets(
            input_folder=f"{minhash_base_path}/signatures",
            output_folder=f"{minhash_base_path}/buckets",
            config=minhash_config,
        ),
    ],
    depends=stage1,
    logging_dir=f'{base_logging_dir}/buckets',
    tasks=minhash_config.num_buckets,
    workers=workers
)

# Stage 3: cluster the matched pairs and write the ids of documents to remove.
stage3 = LocalPipelineExecutor(
    pipeline=[
        MinhashDedupCluster(
            input_folder=f"{minhash_base_path}/buckets",
            output_folder=f"{minhash_base_path}/remove_ids",
            config=minhash_config,
        ),
    ],
    depends=stage2,
    logging_dir=f'{base_logging_dir}/clusters',
    tasks=1,
    workers=workers
)

# Stage 4: re-read the original data, drop the duplicates and upload the result.
stage4 = LocalPipelineExecutor(
    pipeline=[
        INPUT_READER,
        TokensCounter(),
        MinhashDedupFilter(
            input_folder=f"{minhash_base_path}/remove_ids",
            exclusion_writer=JsonlWriter(f"{minhash_base_path}/removed"),
        ),
        HuggingFaceDatasetWriter(
            dataset=upload_path,
            private=True,
            local_working_dir=local_working_dir,  # optional argument
            output_filename="data/${rank}.parquet",
            cleanup=True,  # whether to delete the local copy of the dataset afterwards
            adapter=_writer_adapter_with_column_restore
        ),
    ],
    depends=stage3,
    # Own logging dir: sharing stage3's "clusters" dir would make this stage see
    # stage3's completion marker for rank 0 and skip that rank.
    logging_dir=f'{base_logging_dir}/filter',
    # Must equal the task count of stage 1: the remove ids are keyed by the reader
    # shard (rank) that produced each signature.
    tasks=total_tasks,
    workers=workers
)

# Running the last stage triggers its `depends` chain first.
stage4.run()

In [None]:
import pandas as pd
# Load one shard of the documents excluded by the filter stage (JSON Lines, gzip).
# NOTE(review): this path is outside `minhash_base_path`, where the pipeline in this
# notebook writes its `removed` folder — confirm it points at the run you intend to inspect.
jsonObj = pd.read_json(
    '/mnt/d/vscode_projects/eeve/notebooks/minhash_parallel/removed/00001.jsonl.gz',
    lines=True,
)

In [None]:
# Show the text of the removed document in row 13 (single label-based lookup
# instead of chained indexing).
jsonObj.loc[13, 'text']

In [None]:
from datasets import load_dataset

In [None]:
# Load the dataset this notebook uploads. The repo id used to be hard-coded to a
# different suffix than `upload_path`, although its trailing comment named `upload_path`;
# reading the variable keeps the check pointed at the run that was just written.
dd = load_dataset(upload_path)

In [None]:
# Pull the 'ru' column of the 'train' split out of the loaded dataset.
xfd = dd['train']['ru']

In [None]:
# Number of entries in the extracted 'ru' column.
len(xfd)

In [None]:
from tqdm.auto import tqdm


def find_substring_in_list(list_of_strings, substring):
    """Look for the first item of ``list_of_strings`` that contains ``substring``.

    The items are scanned in order behind a tqdm progress bar and the scan stops
    at the first hit.

    Returns:
        ``(True, item)`` for the first matching item, or ``(False, "")`` when
        no item contains ``substring``.
    """
    first_hit = next(
        (candidate for candidate in tqdm(list_of_strings) if substring in candidate),
        None,
    )
    if first_hit is None:
        return False, ""
    return True, first_hit

In [None]:
# Check whether any 'ru' entry of the train split contains this digit string.
find_substring_in_list(dd['train']['ru'], '13011000112115200036992')

In [None]:
import struct
import glob

def read_remove_file(filename):
    """Read a .remove file and return the list of numbers stored in it.

    The file content is interpreted as a flat sequence of 4-byte little-endian
    unsigned integers. Trailing bytes that do not form a complete value are
    reported with a printed warning and skipped.

    Returns:
        The decoded integers in file order, or an empty list if any exception
        occurs while reading (the exception is printed, not raised).
    """
    try:
        with open(filename, 'rb') as f:
            data = f.read()
        # Only whole 4-byte records can be decoded; anything after them is a leftover.
        whole = len(data) - len(data) % 4
        numbers = [value for (value,) in struct.iter_unpack("<I", data[:whole])]
        chunk = data[whole:]
        if chunk:
            print(f"‚ö†Ô∏è –ù–µ–ø–æ–ª–Ω—ã–µ –¥–∞–Ω–Ω—ã–µ –≤ {filename}: –æ—Å—Ç–∞—Ç–æ–∫ {len(chunk)} –±–∞–π—Ç")
        return numbers
    except Exception as e:
        print(f"üö® –û—à–∏–±–∫–∞ –ø—Ä–∏ —á—Ç–µ–Ω–∏–∏ {filename}: {e}")
        return []

# Usage example: decode and print every .remove file found in the remove_ids folder.
# NOTE(review): the glob below is a hard-coded path under notebooks/, not under
# `minhash_base_path` — confirm it matches the run being inspected.
if __name__ == "__main__":
    for filepath in glob.glob("/mnt/d/vscode_projects/eeve/notebooks/minhash_parallel/remove_ids/*.remove"):
        numbers = read_remove_file(filepath)
        print(f"üìÅ {filepath} —Å–æ–¥–µ—Ä–∂–∏—Ç —á–∏—Å–ª–∞: {numbers}")

In [None]:
# Punctuation alphabet: an explicit run of symbols followed by the control-character
# code points 0-8, 11-31 and 127-159 (code points 9 and 10 are not in the ranges).
PUNCTUATION = "!/‚Äî‚Äù:ÔºÖÔºë„Äà&(„ÄÅ‚îÅ\\„Äê#%„Äå„ÄçÔºå„ÄëÔºõ+^]~‚Äú„Ää‚Äû';‚Äô{|‚à∂¬¥[=-`*ÔºéÔºà‚ÄìÔºüÔºÅÔºö$ÔΩû¬´„Äâ,><„Äã)?Ôºâ„ÄÇ‚Ä¶@_.\"}‚ñ∫¬ª" + "".join(
    chr(code_point)
    for low, high in ((0, 9), (11, 13), (13, 32), (127, 160))
    for code_point in range(low, high)
)

In [None]:
# SONNET
from datatrove.executor.local import LocalPipelineExecutor
from datatrove.pipeline.dedup import MinhashDedupSignature
from datatrove.pipeline.dedup.minhash import (
    MinhashConfig,
    MinhashDedupBuckets,
    MinhashDedupCluster,
    MinhashDedupFilter,
    MinhashBuildIndex
)
from datatrove.pipeline.readers import JsonlReader
from datatrove.pipeline.writers.jsonl import JsonlWriter
from datatrove.utils.hashing import HashConfig
from datatrove.utils.typeshelper import Languages
from pathlib import Path

# MinHash configuration shared by every stage below.
minhash_config = MinhashConfig(
    hash_config=HashConfig(precision=64),
    num_buckets=14,
    hashes_per_bucket=8,
)

# Data paths.
BASE_PATH = Path("./minhash_dedup")
LOGS_FOLDER = BASE_PATH / "logs"
FIRST_DATASET_PATH = "path/to/first/dataset"  # path to the first (reference) dataset
SECOND_DATASET_PATH = "path/to/second/dataset"  # path to the second dataset
TOTAL_TASKS = 100  # number of tasks; tune to the size of the dataset

# Create the required directories.
BASE_PATH.mkdir(exist_ok=True)
LOGS_FOLDER.mkdir(exist_ok=True)

# datatrove folder arguments (input/output/index folders and logging_dir) accept a str
# path, a (path, options) tuple or a DataFolder, but not pathlib.Path, so every Path
# is converted with str() before it is handed over.

# Stage 1: compute signatures for the first dataset.
stage1_first = LocalPipelineExecutor(
    pipeline=[
        JsonlReader(FIRST_DATASET_PATH),
        MinhashDedupSignature(
            output_folder=str(BASE_PATH / "signatures_first"),
            config=minhash_config,
            language=Languages.english
        ),
    ],
    tasks=TOTAL_TASKS,
    workers=8,  # number of tasks run in parallel
    logging_dir=str(LOGS_FOLDER / "signatures_first"),
)

# Stage 2: build an index from the signatures of the first dataset.
stage2_index = LocalPipelineExecutor(
    pipeline=[
        MinhashBuildIndex(
            input_folder=str(BASE_PATH / "signatures_first"),
            output_folder=str(BASE_PATH / "index_first"),
            index_name="first_dataset",
            config=minhash_config,
        ),
    ],
    tasks=minhash_config.num_buckets,  # one task per bucket
    workers=minhash_config.num_buckets,
    logging_dir=str(LOGS_FOLDER / "index_first"),
    depends=stage1_first,
)

# Stage 3: compute signatures for the second dataset.
stage3_second = LocalPipelineExecutor(
    pipeline=[
        JsonlReader(SECOND_DATASET_PATH),
        MinhashDedupSignature(
            output_folder=str(BASE_PATH / "signatures_second"),
            config=minhash_config,
            language=Languages.english
        ),
    ],
    tasks=TOTAL_TASKS,
    workers=8,
    logging_dir=str(LOGS_FOLDER / "signatures_second"),
    depends=stage2_index,
)

# Stage 4: find duplicates between the second dataset and the index of the first.
# only_dedup_in_index=True restricts the comparison to the index.
stage4_buckets = LocalPipelineExecutor(
    pipeline=[
        MinhashDedupBuckets(
            input_folder=str(BASE_PATH / "signatures_second"),
            output_folder=str(BASE_PATH / "buckets_second"),
            index_folder=str(BASE_PATH / "index_first"),  # use the index of the first dataset
            config=minhash_config,
            only_dedup_in_index=True,  # deduplicate against the index only
        ),
    ],
    tasks=minhash_config.num_buckets,
    workers=minhash_config.num_buckets,
    logging_dir=str(LOGS_FOLDER / "buckets_second"),
    depends=stage3_second,
)

# Stage 5: cluster the duplicates.
stage5_cluster = LocalPipelineExecutor(
    pipeline=[
        MinhashDedupCluster(
            input_folder=str(BASE_PATH / "buckets_second"),
            output_folder=str(BASE_PATH / "remove_ids_second"),
            config=minhash_config,
            ignore_index_matches=False,  # important: matches against the index must count
        ),
    ],
    tasks=1,  # clustering runs as a single task
    workers=1,
    logging_dir=str(LOGS_FOLDER / "clusters_second"),
    depends=stage4_buckets,
)

# Stage 6: filter the second dataset (remove the duplicates).
stage6_filter = LocalPipelineExecutor(
    pipeline=[
        JsonlReader(SECOND_DATASET_PATH),
        MinhashDedupFilter(
            input_folder=str(BASE_PATH / "remove_ids_second"),
            exclusion_writer=JsonlWriter(str(BASE_PATH / "removed_from_second")),  # optional: keep the removed documents
        ),
        JsonlWriter(output_folder=str(BASE_PATH / "deduplicated_second")),  # write the deduplicated second dataset
    ],
    tasks=TOTAL_TASKS,
    workers=8,
    logging_dir=str(LOGS_FOLDER / "filter_second"),
    depends=stage5_cluster,
)

# Run the pipeline.
if __name__ == "__main__":
    # Running the last stage automatically runs all the previous ones through `depends`.
    stage6_filter.run()

In [None]:
# DEEPSEEK
from datatrove.executor.local import LocalPipelineExecutor
from datatrove.pipeline.dedup.minhash import (
    MinhashConfig,
    MinhashDedupSignature,
    MinhashDedupBuckets,
    MinhashDedupCluster,
    MinhashDedupFilter,
)
from datatrove.pipeline.readers import JsonlReader
from datatrove.pipeline.writers.jsonl import JsonlWriter
from datatrove.utils.hashing import HashConfig


# The index-building step is required for cross-dataset deduplication (see stage1_index).
from datatrove.pipeline.dedup.minhash import MinhashBuildIndex

# MinHash configuration (tune to your needs).
minhash_config = MinhashConfig(
    hash_config=HashConfig(precision=64),
    num_buckets=14,
    hashes_per_bucket=8,
)

# Data paths.
DATASET1_PATH = "path/to/dataset1"  # first dataset (the reference for deduplication)
DATASET2_PATH = "path/to/dataset2"  # second dataset (duplicates are removed from it)
OUTPUT_BASE = "path/to/output"      # folder for the results

# Stage 1: generate signatures for both datasets.
stage1_dataset1 = LocalPipelineExecutor(
    pipeline=[
        JsonlReader(DATASET1_PATH),
        MinhashDedupSignature(
            output_folder=f"{OUTPUT_BASE}/signatures_dataset1",
            config=minhash_config,
        ),
    ],
    tasks=100,  # number of tasks (files) in the dataset
)

# Stage 1b: turn the signatures of dataset1 into an index. MinhashDedupBuckets expects
# `index_folder` to contain index files; raw signature files are a different format and
# cannot be used as an index directly.
stage1_index = LocalPipelineExecutor(
    pipeline=[
        MinhashBuildIndex(
            input_folder=f"{OUTPUT_BASE}/signatures_dataset1",
            output_folder=f"{OUTPUT_BASE}/index_dataset1",
            index_name="dataset1",
            config=minhash_config,
        ),
    ],
    tasks=minhash_config.num_buckets,  # one task per bucket
    depends=stage1_dataset1,
)

stage1_dataset2 = LocalPipelineExecutor(
    pipeline=[
        JsonlReader(DATASET2_PATH),
        MinhashDedupSignature(
            output_folder=f"{OUTPUT_BASE}/signatures_dataset2",
            config=minhash_config,
        ),
    ],
    tasks=100,
    depends=stage1_index,  # runs after dataset1 has been signed and indexed
)

# Stage 2: find duplicates between the datasets.
stage2 = LocalPipelineExecutor(
    pipeline=[
        MinhashDedupBuckets(
            input_folder=f"{OUTPUT_BASE}/signatures_dataset2",  # signatures of the second dataset
            output_folder=f"{OUTPUT_BASE}/buckets",
            index_folder=f"{OUTPUT_BASE}/index_dataset1",  # index built from the first dataset
            config=minhash_config,
            only_dedup_in_index=True,  # ignore duplicates inside dataset2
        ),
    ],
    tasks=minhash_config.num_buckets,  # one task per bucket
    depends=stage1_dataset2,
)

# Stage 3: cluster the duplicates.
stage3 = LocalPipelineExecutor(
    pipeline=[
        MinhashDedupCluster(
            input_folder=f"{OUTPUT_BASE}/buckets",
            output_folder=f"{OUTPUT_BASE}/remove_ids",
            config=minhash_config,
            # Matches against the index are the only matches produced by stage 2, so they
            # must be counted; with the default (ignore them) nothing would be removed.
            ignore_index_matches=False,
        ),
    ],
    tasks=1,
    depends=stage2,
)

# Stage 4: filter out the duplicates.
stage4 = LocalPipelineExecutor(
    pipeline=[
        JsonlReader(DATASET2_PATH),  # read the original data (dataset2)
        MinhashDedupFilter(
            input_folder=f"{OUTPUT_BASE}/remove_ids",
            exclusion_writer=JsonlWriter(f"{OUTPUT_BASE}/removed"),  # keep the removed duplicates
        ),
        JsonlWriter(output_folder=f"{OUTPUT_BASE}/deduplicated_output"),  # the result
    ],
    tasks=100,  # must match stage1_dataset2.tasks
    depends=stage3,
)

# Run the whole pipeline.
stage4.run()