In [2]:
import os
import time
from dask.distributed import Client
import warnings
import dask.dataframe as dd
import dask_cudf
import cudf
import gzip
import json
import dask.bag as db
import glob
from dask.distributed import wait
import numpy as np

from nemo_curator import get_client
from nemo_curator.datasets import DocumentDataset
from nemo_curator.utils.distributed_utils import (
    get_num_workers,
    read_data,
    write_to_disk,
)
from nemo_curator.utils.file_utils import (
    expand_outdir_and_mkdir, 
    get_all_files_paths_under, 
    separate_by_metadata,
    get_batched_files,
)

# Silence every Python warning for the rest of the session to keep cell output readable.
warnings.filterwarnings('ignore')
# Root directory for all inputs and outputs of this notebook. Set the
# NEMO_CURATOR_BASE_DIR environment variable to run it on another machine;
# without it the original location is used.
base_dir = os.environ.get("NEMO_CURATOR_BASE_DIR", "/home/neelesh")

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Scheduler address from the environment. It is read here but not passed to
# get_client() below.
sched_addr = os.environ.get("SCHEDULER_ADDRESS")

# Dask client of type 'gpu' used by the fuzzy-deduplication stages.
gpu_client = get_client(
    cluster_type="gpu",
    set_torch_to_use_rmm=False,
)

cuDF Spilling is enabled


In [8]:
from nemo_curator import MinHash

# --- MinHash stage settings ---

# Columns of the input records that carry the document id and the text.
id_field = "id"
text_field = "text"

# Hashing parameters handed to MinHash() in the next cells.
seed = 42
minhash_length = 260
char_ngram = 5

# Locations under base_dir: input corpus, log directory, MinHash cache/output.
input_data_dir = os.path.join(base_dir, "clean_c4_en_cleaned")
log_dir = expand_outdir_and_mkdir(os.path.join(base_dir, "logs"))
minshah_output_dir = expand_outdir_and_mkdir(os.path.join(base_dir, "00_c4_minhash"))

In [9]:
# Keep only the .jsonl files found directly under the input directory (no recursion).
files = [
    path
    for path in get_all_files_paths_under(root=input_data_dir, recurse_subdirectories=False)
    if path.endswith(".jsonl")
]

# Read one file per partition with the cudf backend, then keep just the id and text columns.
df = read_data(files, file_type="jsonl", backend="cudf", files_per_partition=1, add_filename=False)
df = df[[id_field, text_field]]

Reading 1792 files


In [10]:
t0 = time.time()

# Build the MinHash stage from the settings defined in the configuration cell.
minhasher = MinHash(
    id_field=id_field,
    text_field=text_field,
    seed=seed,
    num_hashes=minhash_length,
    char_ngrams=char_ngram,
    use_64bit_hash=False,
    cache_dir=minshah_output_dir,
    logger=log_dir,
)

# Apply it to the input documents and keep the underlying dataframe of the result.
result = minhasher(DocumentDataset(df))
result = result.df

print(f"Computing minhashes took:{time.time()-t0}")

Computing minhashes took:321.2562484741211


In [11]:
result.head()

Unnamed: 0,id,_minhash_signature
0,c4-train-0000000000,"[30032382, 157261, 5008033, 4311555, 19755091,..."
1,c4-train-0000000001,"[511522, 1015487, 2320335, 651428, 1906819, 14..."
2,c4-train-0000000002,"[15994705, 6370213, 15559465, 9740304, 6210120..."
3,c4-train-0000000003,"[1449615, 1872293, 3654170, 452331, 780352, 39..."
4,c4-train-0000000004,"[6685476, 2415781, 1112347, 742646, 6898911, 4..."


In [12]:
from nemo_curator import LSH
from nemo_curator.utils.fuzzy_dedup_utils.id_mapping import convert_str_id_to_int

# --- LSH stage settings ---

# Column names.
id_field = "id"
minhash_field = "_minhash_signature"

# Signature length (same value as in the MinHash stage) and banding parameters.
minhash_length = 260
num_bands = 20
buckets_per_shuffle = 1

# Input is the MinHash output directory; buckets are written under the fuzzy-dedup output directory.
lsh_input_dir = os.path.join(base_dir, "00_c4_minhash")
output_bucket_dir = expand_outdir_and_mkdir(os.path.join(base_dir, "01_c4_fuzzy_dedup_output"))
# NOTE(review): the other stages in this notebook log to <base_dir>/logs;
# confirm that NEMO_DATA/logs is intended here.
log_dir = os.path.join(base_dir, "NEMO_DATA/logs")

In [14]:
t0 = time.time()

# Load the MinHash output written by the previous stage.
df = dask_cudf.read_parquet(lsh_input_dir, blocksize="2GB", aggregate_files=True)

# Convert the string ids partition by partition; `meta` tells Dask which columns
# the converted partitions are declared to have.
df = df.map_partitions(
    convert_str_id_to_int,
    id_column=id_field,
    meta=cudf.DataFrame(
        {
            minhash_field: [[1, 2, 3]],
            "doc_id": [1],
            "dataset_id": np.uint32(1),
        }
    ),
)

# Configure LSH on the (dataset_id, doc_id) pair and run it.
lsh = LSH(
    id_fields=["dataset_id", "doc_id"],
    minhash_field=minhash_field,
    num_hashes=minhash_length,
    num_buckets=num_bands,
    buckets_per_shuffle=buckets_per_shuffle,
    cache_dir=output_bucket_dir,
    logger=log_dir,
)
lsh_result = lsh(DocumentDataset(df))

print(f"LSH took {time.time()-t0} s")

LSH took 857.9130506515503 s


In [15]:
lsh_result.df.head()

Unnamed: 0,dataset_id,doc_id,_bucket_id
0,2191958705,36525884,6050
1,2191958705,52458138,77
2,2191958705,37051556,2
3,2191958705,60789048,5475
4,2191958705,43778021,1359


In [16]:
from nemo_curator.modules.fuzzy_dedup import _MapBuckets
from nemo_curator.utils.fuzzy_dedup_utils.io_utils import (
    get_bucket_ddf_from_parquet_path,
    get_text_ddf_from_json_path_with_blocksize,
)

# --- Bucket-mapping stage settings ---

# Column names.
id_field = "id"
text_field = "text"
input_bucket_field = "_bucket_id"

# Text input: every file is used (num_files=None), chunked into blocks of this size in MB.
input_data_paths = [os.path.join(base_dir, "clean_c4_en_cleaned")]
num_files = None
text_ddf_blocksize = 256

# Bucket input and shuffle method.
input_bucket_path = os.path.join(base_dir, "01_c4_fuzzy_dedup_output/_buckets.parquet")
shuffle_type = "tasks"

# Log directory and output location for the anchor-docs mapping.
log_dir = os.path.join(base_dir, "logs")
output_anchor_docs_with_bk_path = expand_outdir_and_mkdir(
    os.path.join(base_dir, "01_c4_fuzzy_dedup_output/anchor_docs_with_bk.parquet")
)

In [17]:
# Build the text dataframe from the jsonl inputs using the configured block size.
ddf_text = get_text_ddf_from_json_path_with_blocksize(
    input_data_paths=input_data_paths,
    id_column=id_field,
    text_column=text_field,
    num_files=num_files,
    blocksize=text_ddf_blocksize,
)

# Report how many partitions the text dataframe was split into.
print(f"ddf_text.npartitions  = {ddf_text.npartitions}", flush=True)

Number of files being read for jaccard calculation = 1792
ddf_text.npartitions  = 896


In [18]:
t0 = time.time()
num_workers = get_num_workers(gpu_client)

# Load the LSH buckets from "_buckets.parquet".
ddf_bk = get_bucket_ddf_from_parquet_path(input_bucket_path=input_bucket_path, num_workers=num_workers)

# Configure the bucket-mapping stage.
map_buckets = _MapBuckets(
    id_fields=["dataset_id", "doc_id"],
    text_field=text_field,
    bucket_field=input_bucket_field,
    logger=log_dir,
)

# Map the documents to their buckets and anchor documents.
ddf_anchor_docs_with_bk = map_buckets.map_buckets_with_anchors(
    documents_df=ddf_text,
    buckets_df=ddf_bk,
    shuffle_type=shuffle_type,
)

# Persist the mapping to disk (the index is not written).
ddf_anchor_docs_with_bk.to_parquet(output_anchor_docs_with_bk_path, write_index=False)

print(f"Mapping Bucket took {time.time()-t0} s")

Number of ddf_bk partitions = 4
Mapping Bucket took 88.8824942111969 s


In [19]:
ddf_anchor_docs_with_bk.head()

Unnamed: 0,dataset_id,doc_id,anchor_1_dataset_id,anchor_1_doc_id,anchor_0_dataset_id,anchor_0_doc_id,_output_partition_id
0,2191958705,32549647,2191958705,31358312,2191958705,89594282,9
1,2191958705,35098816,2191958705,85047220,2191958705,51389075,5
2,2191958705,40595502,2191958705,15042562,2191958705,61602848,10
3,2191958705,20786631,2191958705,62139963,2191958705,640427,9
4,2191958705,51827304,2191958705,51827304,2191958705,76096690,12


In [20]:
from nemo_curator.modules.fuzzy_dedup import _Shuffle

# --- Jaccard shuffle stage settings ---

# Column names.
id_field = "id"
text_field = "text"

# Partitioning knobs passed to shuffle_docs_on_buckets() in the next cell.
bucket_mapping_ddf_blocksize = 256
parts_per_worker = 16
bucket_parts_per_worker = 256

# Log directory, the anchor-docs mapping written by the previous stage, and the shuffle output.
log_dir = os.path.join(base_dir, "logs")
input_anchor_docs_with_bk_path = os.path.join(
    base_dir, "01_c4_fuzzy_dedup_output/anchor_docs_with_bk.parquet"
)
output_shuffled_docs_path = expand_outdir_and_mkdir(
    os.path.join(base_dir, "01_c4_fuzzy_dedup_output/shuffled_docs.parquet")
)

In [21]:
t0 = time.time()

# Configure the shuffle stage on the (dataset_id, doc_id) pair.
shuffle = _Shuffle(
    id_fields=["dataset_id", "doc_id"],
    text_field=text_field,
    logger=log_dir,
    int_to_str_id=id_field,
)

# Shuffle the documents using the bucket/anchor mapping; the output is
# partitioned on "_output_partition_id".
shuffle.shuffle_docs_on_buckets(
    documents_df=ddf_text,
    bucket_w_anchors_path=input_anchor_docs_with_bk_path,
    output_shuffled_docs_path=output_shuffled_docs_path,
    partition_on="_output_partition_id",
    bucket_mapping_df_blocksize=bucket_mapping_ddf_blocksize,
    parts_per_worker=parts_per_worker,
    bucket_parts_per_worker=bucket_parts_per_worker,
)

print(f"Jaccard Shuffle took {time.time()-t0} s")

  0%|          | 0/1 [00:00<?, ?it/s]


Started processing bucket-map partitions 0 through 4 of 4
Using 64 text partitions.
Starting text bytes aware shuffle
Will write 2994466 rows to disk
Text-df partition  64/896 completed in 31.12666344642639
Using 64 text partitions.
Starting text bytes aware shuffle
Will write 2999951 rows to disk
Text-df partition  128/896 completed in 33.10323452949524
Using 64 text partitions.
Starting text bytes aware shuffle
Will write 2988432 rows to disk
Text-df partition  192/896 completed in 33.50847387313843
Using 64 text partitions.
Starting text bytes aware shuffle
Will write 2990629 rows to disk
Text-df partition  256/896 completed in 33.49195647239685
Using 64 text partitions.
Starting text bytes aware shuffle
Will write 2986183 rows to disk
Text-df partition  320/896 completed in 31.32063102722168
Using 64 text partitions.
Starting text bytes aware shuffle
Will write 2992957 rows to disk
Text-df partition  384/896 completed in 31.166403770446777
Using 64 text partitions.
Starting text b

100%|██████████| 1/1 [07:45<00:00, 465.34s/it]

Jaccard Shuffle took 465.44443917274475 s





In [22]:
# Peek at the shuffled documents written for output partition 0.
jaccard_shuffle_res = dd.read_parquet(
    os.path.join(output_shuffled_docs_path, "_output_partition_id=0")
)
jaccard_shuffle_res.head()

Unnamed: 0,text,_text_bytes,id,anchor_0_id,anchor_1_id,_output_partition_id
0,"This is a placeholder page for Sarah Benesh, w...",488,2191958705-5801337,2191958705-59295672,2191958705-35642137,0
1,The Columbus Hotel Tenerife is free HD wallpap...,271,2191958705-6360534,2191958705-69858464,2191958705-62562517,0
2,"Here at Low Water Pressure Guys, we will be re...",2361,2191958705-5498196,2191958705-41702786,2191958705-52826321,0
3,Baby Gate Guys will be there for all your goal...,2336,2191958705-2697785,2191958705-73574607,2191958705-12714317,0
4,Los Angeles Rams Baby Clothes is free HD wallp...,273,2191958705-214268,2191958705-77047710,2191958705-69858464,0


In [23]:
from nemo_curator.modules.fuzzy_dedup import JaccardSimilarity

# --- Jaccard similarity stage settings ---

# Column names and n-gram width.
id_field = "id"
text_field = "text"
ngram_size = 5

# Input: the shuffled documents from the previous stage. Output: the Jaccard results.
shuffled_docs_path = os.path.join(base_dir, "01_c4_fuzzy_dedup_output/shuffled_docs.parquet")
jaccard_results_path = expand_outdir_and_mkdir(
    os.path.join(base_dir, "01_c4_fuzzy_dedup_output/jaccard_similarity_results.parquet")
)

In [24]:
t0 = time.time()

# The two anchor id columns are named after the id field: anchor_0_<id>, anchor_1_<id>.
jaccard = JaccardSimilarity(
    id_field=id_field,
    text_field=text_field,
    anchor_id_fields=[f"anchor_0_{id_field}", f"anchor_1_{id_field}"],
    ngram_width=ngram_size,
)

# Compute the similarities on the shuffled documents.
result_df = jaccard.jaccard_compute(shuffled_docs_path)

# Write the results without an index and without a metadata file.
result_df.to_parquet(jaccard_results_path, write_index=False, write_metadata_file=False)

print(f"Jaccard Computing+Writing took {time.time() - t0} seconds")

Jaccard Computing+Writing took 16.437445402145386 seconds


In [25]:
# Peek at the Jaccard results written by the previous cell.
jaccard_compute_res = dd.read_parquet(path=jaccard_results_path)
jaccard_compute_res.head()

Unnamed: 0,id_x,id_y,jaccard
0,2191958705-7885144,2191958705-9304226,0.885671
1,2191958705-11805797,2191958705-11093133,0.898551
2,2191958705-10927567,2191958705-8191823,0.685083
3,2191958705-12284408,2191958705-6891291,0.865922
4,2191958705-18809012,2191958705-12002623,0.956098


In [26]:
from nemo_curator.modules.fuzzy_dedup import ConnectedComponents

# --- Connected components stage settings ---

id_field = "id"
# Threshold handed to ConnectedComponents() in the next cell.
jaccard_threshold = 0.8

# Cache directory, the Jaccard pairs from the previous stage, and the output location.
cache_dir = expand_outdir_and_mkdir(
    os.path.join(base_dir, "01_c4_fuzzy_dedup_output/cc-cache")
)
jaccard_pairs_path = os.path.join(
    base_dir, "01_c4_fuzzy_dedup_output/jaccard_similarity_results.parquet"
)
output_path = expand_outdir_and_mkdir(
    os.path.join(base_dir, "01_c4_fuzzy_dedup_output/connected_components.parquet")
)

In [28]:
t0 = time.time()

# Configure the connected-components stage on the Jaccard pairs.
components_stage = ConnectedComponents(
    jaccard_pairs_path=jaccard_pairs_path,
    jaccard_threshold=jaccard_threshold,
    id_column=id_field,
    cache_dir=cache_dir,
)

# Run the workflow; results are written to output_path.
components_stage.cc_workflow(output_path=output_path)

print(f"Connected Component took {time.time()-t0} seconds")

Connected Component took 18.83707284927368 seconds


In [32]:
def split_id(df):
    """Add ``dataset_id`` and ``doc_id`` columns taken from the ``id`` column.

    The ``id`` values are cast to string and split on '-'; the first piece
    becomes ``dataset_id`` and the second piece becomes ``doc_id``.

    Args:
        df: Frame with an ``id`` column; it is wrapped in ``cudf.DataFrame``.

    Returns:
        The wrapped frame with the two new columns.
    """
    frame = cudf.DataFrame(df)
    # Split once and reuse the pieces for both new columns.
    id_parts = frame['id'].astype('string').str.split('-')
    frame['dataset_id'] = id_parts.str.get(0)
    frame['doc_id'] = id_parts.str.get(1)
    return frame

In [None]:
# Load the connected-components output as a single-partition dask_cudf frame.
output_path = os.path.join(base_dir, "01_c4_fuzzy_dedup_output/connected_components.parquet")
cc_result = dask_cudf.read_parquet(output_path, split_row_groups=False).repartition(npartitions=1)

# Materialize the whole result in memory, then split the "<dataset_id>-<doc_id>"
# string id into its two parts; `id_str` is a temporary string copy of `id`.
df = cc_result.compute()
df['id_str'] = df['id'].astype(str)
df[['dataset_id', 'doc_id']] = df['id_str'].str.split('-', expand=True)
# Wrap the in-memory frame back into a one-partition dask_cudf collection.
cc_result = dask_cudf.from_cudf(df, npartitions=1)

# Set 'group' as the index and shuffle to ensure all same 'group' values are in the same partition
cc_result = cc_result.set_index('group', shuffle='tasks')

cc_result.head()

Unnamed: 0_level_0,id,id_str,dataset_id,doc_id
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2191958705-51036593,2191958705-51036593,2191958705,51036593
0,2191958705-48768313,2191958705-48768313,2191958705,48768313
0,2191958705-40155685,2191958705-40155685,2191958705,40155685
0,2191958705-3159672,2191958705-3159672,2191958705,3159672
0,2191958705-3276793,2191958705-3276793,2191958705,3276793


In [37]:
# Drop the temporary `id_str` column; `id`, `dataset_id` and `doc_id` stay.
cc_result = cc_result.drop(columns=['id_str'])

# Per-partition filter: keep every row except the first one of each index group
def assign_cumcount(df):
    """Return the rows of ``df`` that are not the first row of their index group.

    Rows are numbered 0, 1, 2, ... within each value of the first index level;
    rows numbered >= 1 are kept, so exactly one row per group is left out.

    Args:
        df: Frame whose first index level identifies the group.

    Returns:
        A new frame with the same columns as ``df``. ``df`` itself is not
        modified, so the partition handed in by ``map_partitions`` is left intact.
    """
    # assign() works on a new frame instead of adding the helper column to the input.
    ranked = df.assign(cumcount=df.groupby(level=0).cumcount())
    return ranked[ranked['cumcount'] >= 1].drop(columns=['cumcount'])

# Run the per-partition filter; its output has the same schema as cc_result.
docs_to_remove = cc_result.map_partitions(assign_cumcount, meta=cc_result)

# Turn the index back into a column, keep only the two id parts under
# "to_remove_*" names, renumber the rows and persist the result.
docs_to_remove = (
    docs_to_remove.reset_index()[["dataset_id", "doc_id"]]
    .rename(
        columns={
            "dataset_id": "to_remove_dataset_id",
            "doc_id": "to_remove_doc_id",
        }
    )
    .reset_index(drop=True)
    .persist()
)

# Block until the persisted collection has finished computing.
wait(docs_to_remove)

print("num of docs to remove =", len(docs_to_remove))

num of docs to remove = 4252681


In [38]:
# Count the documents in each group and list the largest groups first.
cc_grouped = (
    cc_result.groupby('group')
    .agg({'doc_id': 'count'})
    .rename(columns={'doc_id': 'count'})
    .sort_values('count', ascending=False)
    .compute()
)
cc_grouped.head()

Unnamed: 0_level_0,count
group,Unnamed: 1_level_1
11,119096
4,58505
0,26948
4140650,24922
1594161,13994


In [39]:
# Pull the members of group 11, the group with the highest count in the
# cc_grouped output above.
dup_group = cc_result.loc[11].compute()
dup_group.head()

Unnamed: 0_level_0,id,dataset_id,doc_id
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11,2191958705-83685852,2191958705,83685852
11,2191958705-76068122,2191958705,76068122
11,2191958705-81946381,2191958705,81946381
11,2191958705-83353108,2191958705,83353108
11,2191958705-43359258,2191958705,43359258


In [40]:
# Re-read the cleaned input corpus, this time with add_filename=True.
input_data_dir = os.path.join(base_dir, "clean_c4_en_cleaned")
input_dataset = DocumentDataset.read_json(
    input_data_dir,
    add_filename=True,
)

Reading 1792 files


In [42]:
t0 = time.time()

# Original string ids whose numeric parts match the first five doc_id values of
# the dup_group output above.
dup_ids = [
    "c4-train-0083685852",
    "c4-train-0076068122",
    "c4-train-0081946381",
    "c4-train-0083353108",
    "c4-train-0043359258",
]

# Filter the input corpus down to those ids and bring the rows into memory.
dup_examples = input_dataset.df[
    input_dataset.df['id'].isin(dup_ids)
].compute()

print(f"Searching for near duplicate examples with specific IDs took {time.time()-t0} seconds")

Searching for near duplicate examples with specific IDs took 1158.019103050232 seconds


In [43]:
dup_examples

Unnamed: 0,filename,id,language,text,timestamp,url
41271,c4-train00851.jsonl,c4-train-0043359258,EN,"This is a placeholder page for Mary Dean, whic...",2019-04-18 10:39:38,http://stmaryshighschool.net/phoenix-az/alumni...
19473,c4-train01494.jsonl,c4-train-0076068122,EN,This is a placeholder page for Brittany Bokenk...,2019-04-24 22:17:20,http://comeauxhighschool.org/alumni/1134075/br...
44507,c4-train01609.jsonl,c4-train-0081946381,EN,"This is a placeholder page for Tim Howell, whi...",2019-04-22 10:20:58,http://grosseilehighschool.com/alumni/1876277/...
25729,c4-train01637.jsonl,c4-train-0083353108,EN,"This is a placeholder page for Lily Clements, ...",2019-04-20 18:16:05,http://polytechnichighschool.net/san_francisco...
2112,c4-train01644.jsonl,c4-train-0083685852,EN,"This is a placeholder page for Joshua Szafran,...",2019-04-23 04:18:59,http://griswoldhighschool.org/jewett_city-ct/a...


In [44]:
# Print every retrieved near-duplicate under a 1-based label. Numbering comes
# from enumerate(), so the fifth example is labelled 5 (it was previously
# printed as a second "Example duplicate 4").
for rank, doc_text in enumerate(dup_examples.text, start=1):
    # Examples after the first are separated by two blank lines.
    prefix = '' if rank == 1 else '\n\n'
    print(f"{prefix}Example duplicate {rank}\n" + doc_text)

Example duplicate 1
This is a placeholder page for Mary Dean, which means this person is not currently on this site. We do suggest using the tools below to find Mary Dean.
You are visiting the placeholder page for Mary Dean. This page is here because someone used our placeholder utility to look for Mary Dean. We created this page automatically in hopes Mary Dean would find it. If you are not Mary Dean, but are an alumni of St Marys High School Phoenix, AZ, register on this site for free now.


Example duplicate 2
This is a placeholder page for Brittany Bokenkamp, which means this person is not currently on this site. We do suggest using the tools below to find Brittany Bokenkamp.
You are visiting the placeholder page for Brittany Bokenkamp. This page is here because someone used our placeholder utility to look for Brittany Bokenkamp. We created this page automatically in hopes Brittany Bokenkamp would find it. If you are not Brittany Bokenkamp, but are an alumni of Comeaux High School,

In [45]:
# From here on `convert_str_id_to_int` is the version from the local `helper`
# module; it rebinds the name imported earlier from nemo_curator.
from helper import convert_str_id_to_int

# Re-read the cleaned corpus with the cudf backend and keep only text and id.
input_dataset = DocumentDataset.read_json(os.path.join(base_dir, "clean_c4_en_cleaned"), backend="cudf")
input_df = input_dataset.df[['text','id']]

# Describe the converted schema: the existing columns plus doc_id and dataset_id.
# Work on a copy: `_meta` is the collection's own schema object, and adding
# columns to it in place would make `input_df` advertise columns it does not have.
meta = input_df._meta.copy()
meta['doc_id']=np.int64([0])
meta['dataset_id']=np.uint32([0])

# Convert the string ids partition by partition.
input_df = input_df.map_partitions(
    convert_str_id_to_int,
    id_column="id",
    meta=meta,
)

Reading 1792 files


In [47]:
dedup_output_dir = expand_outdir_and_mkdir(os.path.join(base_dir, "02_c4_deduped"))

# Cast the join keys on both sides to strings so they can be compared.
input_df = input_df.assign(
    doc_id=input_df['doc_id'].astype(str),
    dataset_id=input_df['dataset_id'].astype(str),
)
docs_to_remove = docs_to_remove.assign(
    to_remove_doc_id=docs_to_remove['to_remove_doc_id'].astype(str),
    to_remove_dataset_id=docs_to_remove['to_remove_dataset_id'].astype(str),
)

# Left-join the removal list onto the input; rows without a match keep a null
# `to_remove_doc_id` and are the ones to retain.
deduped_df = input_df.merge(
    docs_to_remove,
    how='left',
    left_on=['doc_id', 'dataset_id'],
    right_on=["to_remove_doc_id", "to_remove_dataset_id"],
)
deduped_df = (
    deduped_df[deduped_df['to_remove_doc_id'].isna()]
    .drop(columns=['to_remove_doc_id', "to_remove_dataset_id"])
    .reset_index(drop=True)
)

# Only the write is timed here.
t0 = time.time()
deduped_df.to_parquet(dedup_output_dir)
print(f"Removing duplicates and writing deduped dataset took {time.time()-t0} seconds")

Removing duplicates and writing deduped dataset took 145.94984221458435 seconds


In [48]:
len(deduped_df)

86606272

In [49]:
len(input_df)

90858953

90858953 - 86606272 = 4,252,681

In [50]:
# end the gpu client
# The cluster object attached to the client is closed first, then shutdown()
# is called on the client itself.
gpu_client.cluster.close()
gpu_client.shutdown()





In [51]:
# Connect a client for the heuristic-filtering stage to the scheduler named in
# the environment.
scheduler_address = os.environ.get("SCHEDULER_ADDRESS")
cpu_client = get_client(scheduler_address=scheduler_address)

# Report how many workers the client sees.
print(f"Num Workers = {get_num_workers(cpu_client)}", flush=True)

Num Workers = 128


In [54]:
import nemo_curator
from nemo_curator.utils.config_utils import build_filter_pipeline

# --- Heuristic filtering stage settings ---

# YAML file describing the filter pipeline.
filter_config_file = os.path.join(base_dir, "NEMO_DATA/config/heuristic_filter_en.yaml")

# Input: the deduplicated dataset. Output: the documents kept by the filters.
hf_input_data_dir = os.path.join(base_dir, "02_c4_deduped")
kept_document_dir = expand_outdir_and_mkdir(
    os.path.join(base_dir, '03_c4_heuristic_filtering', 'hf.parquet')
)

In [55]:
t0 = time.time()

# Read the deduplicated documents.
dataset = DocumentDataset.read_parquet(hf_input_data_dir)

# Build the filter pipeline from the YAML config and apply it.
filter_pipeline = build_filter_pipeline(filter_config_file)
filtered_dataset = filter_pipeline(dataset)

# Write the kept documents to disk.
filtered_dataset.to_parquet(kept_document_dir)

print(f"Time taken for Heuristic filtering: {time.time()-t0} s")

Reading 1792 files
Writing to disk complete for 1792 partitions
Time taken for Heuristic filtering: 844.9139394760132 s


In [56]:
len(filtered_dataset)

77910950

In [57]:
from helper import get_dataframe_complement

# Filter input (before) and filter output (after), both read from parquet.
original_df = dd.read_parquet(hf_input_data_dir)
filtered_df = dd.read_parquet(kept_document_dir)

# Take the complement of the two frames and keep a few rows as examples of
# documents the filters removed.
removed_df = get_dataframe_complement(original_df, filtered_df)
removed_df_example = removed_df.head()

In [59]:
print(removed_df_example.text.iloc[0])

Rich Reviews initial goal is to provide a shop window for authors with an Oxford link. The Oxford Times is a great promoter of local talent and we will seek to complement this respected outlet by giving priority to those self-­published authors with the talent to be recognised by the established publishing houses.
We will attempt to be honest and positive in our reviews recognising that the achievement in producing a work of fact or fiction is generally a labour of love and fully deserving of a balanced appraisal. We will also be looking to associate the site with many of the local book clubs in the Oxford area giving those who have an opinion on all things literary the opportunity to have a voice and tell us what is good and bad in their reading groups.
Ultimately this is a new venture and we will be looking to review as many books as possible. We look forward to hearing from you and hope to keep you entertained not only through our reviews but also via our blog.
I am a book lover who

In [60]:
print(removed_df_example.text.iloc[1])

Chris, unexpectedly, decided we should head north on the weekend. *All* the way north. To Cape Reinga. Any further north, and we would have had to swim.
Of some significance (to me, the highways nerd) was that this is the start marker for SH1, which runs the length of both islands to south of Bluff. It's more than 2000km long. I've travelled the length of the North Island, but I'm yet to make it south of the Cook Strait.
As Chris has a new car to 'run in', we went via the scenic east coast on the way up, and the scenic west coast on the way back. The east coast roads tend to run closer to the coast. Along SH12 on the west coast, there are more wiggly roads and plenty of trees. I think Chris liked driving there better!
Doubtless Bay is freakin' gorgeous. It's on SH10 as you travel up the northeast coast.
Anyway, Cape Reinga is at least 6 hours north of Auckland (depends how fast you drive, and how many campervans/tourists you get stuck behind). It's of great significance to the Maori po

In [68]:
from huggingface_hub import login

# Never store an access token in the notebook: anything committed here is
# visible to every reader. The token is taken from the HF_TOKEN environment
# variable; when it is unset, login(token=None) asks for it interactively.
# NOTE(review): the token that was previously hard-coded in this cell should be
# revoked in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))

In [72]:
from huggingface_hub import HfApi
from datasets import load_dataset, Dataset
import pyarrow as pa
import pyarrow.parquet as pq
import glob
import os

def push_large_parquet_dataset(filtered_dataset_dir, repo_id, files_per_commit=50):
    """
    Push a directory of parquet files to a Hugging Face dataset repository.

    The files are uploaded as ``data/part_<index>.parquet``, where the index is
    the position of the file in the sorted local file list, so the numbering is
    the same on every run. Files are bundled into commits of
    ``files_per_commit`` files rather than one commit per file, which keeps the
    number of commit requests low (one commit per file ended in an HTTP 429
    "Too Many Requests" response for this dataset). A short dataset card is
    uploaded as ``README.md`` afterwards.

    Args:
        filtered_dataset_dir: Directory containing parquet files
        repo_id: "username/dataset-name"
        files_per_commit: Number of parquet files bundled into one commit.

    Raises:
        ValueError: If ``files_per_commit`` is smaller than 1.
    """
    # Imported locally so the function carries its own dependencies.
    from huggingface_hub import CommitOperationAdd, HfApi

    if files_per_commit < 1:
        raise ValueError("files_per_commit must be >= 1")

    # Create the repository first (no error if it already exists)
    api = HfApi()
    api.create_repo(repo_id, repo_type="dataset", exist_ok=True)

    # Sorted so that a given local file always maps to the same remote part number
    parquet_files = sorted(glob.glob(os.path.join(filtered_dataset_dir, "*.parquet")))
    total_files = len(parquet_files)

    # Upload the files batch by batch, one commit per batch
    for start in range(0, total_files, files_per_commit):
        batch = parquet_files[start:start + files_per_commit]
        operations = [
            CommitOperationAdd(
                path_in_repo=f"data/part_{start + offset:05d}.parquet",
                path_or_fileobj=local_path,
            )
            for offset, local_path in enumerate(batch)
        ]
        first, last = start + 1, start + len(batch)
        api.create_commit(
            repo_id=repo_id,
            operations=operations,
            commit_message=f"Upload parquet files {first}-{last} of {total_files}",
            repo_type="dataset",
        )
        print(f"Uploaded files {first}-{last}/{total_files}")

    # Dataset card; the YAML front matter starts on the very first line
    readme_content = f"""---
license: apache-2.0
---

# C4 Curated Dataset
This dataset contains curated and filtered documents from the C4 dataset.

Total number of parquet files: {total_files}
"""

    # Upload the card from memory so no README.md is written to (or overwritten
    # in) the current working directory
    api.upload_file(
        path_or_fileobj=readme_content.encode("utf-8"),
        path_in_repo="README.md",
        repo_id=repo_id,
        repo_type="dataset",
    )

# Publish the heuristic-filtered parquet files to the Hub dataset repository.
push_large_parquet_dataset(
    filtered_dataset_dir=kept_document_dir,
    repo_id="neeleshg23/c4_curated",
)

No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified 

HfHubHTTPError: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/neeleshg23/c4_curated/commit/main (Request ID: Root=1-672c2d20-4780009417faba6b461f20b2;fc616f2e-11bc-4cef-a909-308122ac590f)

You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.