In [24]:
import os

# Source dataset repo and the sub-folders this notebook knows how to process.
repo_id = "Open-Orca/FLAN"
subfolders = [
    "flan_fsnoopt_data",
    "flan_fsopt_data",
    "flan_zsnoopt_data",
    "flan_zsopt_data",
]
# One entry per sub-folder, in the same order as `subfolders`; the value picked
# below bounds the `range(selected_count)` loops over part files in later cells.
chose_paraquets_count = [18, 42, 8, 9]

# Root folders for the raw download and for the filtered copies.
local_base_path = "datasets"
filtered_datasets_folder = "filtered_datasets"

# Select the dataset
dataset_index = 2
dataset_name, selected_count = subfolders[dataset_index], chose_paraquets_count[dataset_index]
dataset_prefix = dataset_name.partition("_")[0]

# Per-dataset working directories derived from the selection above.
folder_path = os.path.join(local_base_path, dataset_name)
filtered_folder_path = os.path.join(filtered_datasets_folder, dataset_name)

In [10]:
# Get hugging face file names

from huggingface_hub import HfApi


api = HfApi()

def get_subfolder_files(repo_id, subfolders):
    """Return the repo file paths that sit under any of the given sub-folders.

    Uses the module-level ``api`` object to list the files of the dataset repo
    ``repo_id`` and keeps, in listing order, every path that starts with
    ``"<subfolder>/"`` for at least one entry of ``subfolders``.
    """
    repo_files = api.list_repo_files(repo_id, repo_type="dataset")
    # str.startswith accepts a tuple of prefixes, which replaces the inner any().
    prefixes = tuple(subfolder + "/" for subfolder in subfolders)

    matching_files = []
    for path in repo_files:
        if path.startswith(prefixes):
            matching_files.append(path)
    return matching_files

# Fetch the remote listing once; the download cell iterates over `subfolder_files`.
subfolder_files = get_subfolder_files(repo_id=repo_id, subfolders=subfolders)
(len(subfolder_files), subfolder_files)

(721,
 ['flan_fsnoopt_data/part.0.parquet',
  'flan_fsnoopt_data/part.1.parquet',
  'flan_fsnoopt_data/part.10.parquet',
  'flan_fsnoopt_data/part.100.parquet',
  'flan_fsnoopt_data/part.101.parquet',
  'flan_fsnoopt_data/part.102.parquet',
  'flan_fsnoopt_data/part.103.parquet',
  'flan_fsnoopt_data/part.104.parquet',
  'flan_fsnoopt_data/part.105.parquet',
  'flan_fsnoopt_data/part.106.parquet',
  'flan_fsnoopt_data/part.107.parquet',
  'flan_fsnoopt_data/part.108.parquet',
  'flan_fsnoopt_data/part.109.parquet',
  'flan_fsnoopt_data/part.11.parquet',
  'flan_fsnoopt_data/part.110.parquet',
  'flan_fsnoopt_data/part.111.parquet',
  'flan_fsnoopt_data/part.112.parquet',
  'flan_fsnoopt_data/part.113.parquet',
  'flan_fsnoopt_data/part.114.parquet',
  'flan_fsnoopt_data/part.115.parquet',
  'flan_fsnoopt_data/part.116.parquet',
  'flan_fsnoopt_data/part.117.parquet',
  'flan_fsnoopt_data/part.118.parquet',
  'flan_fsnoopt_data/part.119.parquet',
  'flan_fsnoopt_data/part.12.parquet',
 

In [None]:
# Download parquet files

import concurrent.futures

def download_file(api, repo_id, file, local_base_path):
    """Download a single file of a dataset repo into ``local_base_path``.

    Args:
        api: Object exposing ``hf_hub_download`` (the notebook passes an ``HfApi``).
        repo_id: Id of the dataset repository to download from.
        file: Path of the file inside the repository.
        local_base_path: Directory forwarded as ``local_dir``.

    Returns:
        Whatever ``api.hf_hub_download`` returns, so callers can log or verify
        the result instead of receiving ``None``.

    Raises:
        Any exception raised by ``api.hf_hub_download`` is propagated unchanged.
    """
    return api.hf_hub_download(
        repo_id=repo_id,
        repo_type="dataset",
        filename=file,
        local_dir=local_base_path,
    )

# Download every listed file with up to 10 worker threads.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # Remember which file each future belongs to so failures can be reported by name.
    future_to_file = {
        executor.submit(download_file, api, repo_id, file, local_base_path): file
        for file in subfolder_files
    }
    futures = list(future_to_file)

    # concurrent.futures.wait() only blocks; it never re-raises worker errors.
    # Inspect each future explicitly so a failed download is not silently lost.
    failed_downloads = []
    for finished in concurrent.futures.as_completed(futures):
        download_error = finished.exception()
        if download_error is not None:
            failed_downloads.append((future_to_file[finished], download_error))

# Report after all workers are done, then fail loudly instead of continuing
# with an incomplete local copy.
if failed_downloads:
    for failed_file, download_error in failed_downloads:
        print(f"Failed to download {failed_file}: {download_error!r}")
    raise RuntimeError(f"{len(failed_downloads)} of {len(futures)} downloads failed")

In [25]:
# Check all files are downloaded
def get_parquet_files_by_folder(folder_path):
    """Return the names of the ``.parquet`` files located directly in ``folder_path``.

    Only entry names (not full paths) are returned, in ``os.listdir`` order.
    Entries whose name ends in ``.parquet`` but that are not files (for
    example directories) are skipped; sub-folders are not descended into.
    """
    parquet_names = []
    for entry_name in os.listdir(folder_path):
        if not entry_name.endswith('.parquet'):
            continue
        if os.path.isfile(os.path.join(folder_path, entry_name)):
            parquet_names.append(entry_name)
    return parquet_names

# Compare the number of parquet files on disk with the number the repo lists
# for the selected sub-folder.
origin_file_count = len(get_subfolder_files(repo_id, subfolders=[dataset_name]))
parquet_files = get_parquet_files_by_folder(folder_path=folder_path)
print(f"Total files '{folder_path}': {len(parquet_files)} out of {origin_file_count}")

Total files 'datasets/flan_zsnoopt_data': 78 out of 78


In [26]:
# Gather filtered tasks
import os
import pandas as pd
import numpy as np

# Column used to decide which rows are dropped.
column = "_task_name"

# Rows whose `column` value appears in this list are removed from the subset.
skipping_tasks = [
    "wmt16_translate/cs-en:1.0.0",
    "wmt14_translate/fr-en:1.0.0",
    "wmt16_translate/de-en:1.0.0",
    "wmt16_translate/ru-en:1.0.0",
    "wmt16_translate/fi-en:1.0.0",
    "wmt16_translate/ro-en:1.0.0",
    "wmt16_translate/tr-en:1.0.0",
    # NOTE(review): only entry without a ":<version>" suffix; isin() matches whole
    # values, so confirm this is exactly how the task appears in `_task_name`.
    "word_segment",
]

os.makedirs(filtered_folder_path, exist_ok=True)

# Single File Check
# file = f"part.{0}.parquet"
# print(file)
# file_path = os.path.join(folder_path, file)
# df = pd.read_parquet(file_path)
# fldf = df[~df[column].isin(skipping_tasks)]
# len(df), len(fldf)

# Filter parts 0..selected_count-1 and store each result as "<i>.parquet".
for i in range(selected_count):
    file = f"part.{i}.parquet"
    print(file)
    file_path = os.path.join(folder_path, file)
    filtered_file_path = os.path.join(filtered_folder_path, f"{i}.parquet")

    df = pd.read_parquet(file_path)
    # Boolean mask: True for rows whose task is NOT in the skip list.
    keep_rows = ~df[column].isin(skipping_tasks)
    fldf = df.loc[keep_rows]
    fldf.to_parquet(filtered_file_path)

part.0.parquet


part.1.parquet
part.2.parquet
part.3.parquet
part.4.parquet
part.5.parquet
part.6.parquet
part.7.parquet


In [27]:
import pandas as pd

# Load the filtered parts in numeric order (0.parquet, 1.parquet, ...).
filtered_part_paths = [
    os.path.join(filtered_folder_path, f"{part_number}.parquet")
    for part_number in range(selected_count)
]
df_list = [pd.read_parquet(part_path) for part_path in filtered_part_paths]

# ignore_index=True already yields a fresh 0..n-1 index, so no extra reset is needed.
combined_df = pd.concat(df_list, ignore_index=True)
combined_df

Unnamed: 0,inputs,targets,_template_idx,_task_source,_task_name,_template_type
0,"Is it possible to conclude that ""A little girl...",yes,7,Flan2021,snli:1.1.0,zs_noopt
1,Math problem: Solve -24*y - 1654*y = -218*y - ...,91,6,Flan2021,math_dataset/algebra__linear_1d:1.0.0,zs_noopt
2,Coyote\n\nThe coyote (Canis latrans); from Nah...,no,8,Flan2021,bool_q:1.0.0,zs_noopt
3,Solve this math problem\n\nSolve 948 = 205*u -...,9,1,Flan2021,math_dataset/algebra__linear_1d:1.0.0,zs_noopt
4,Write a positive tweet.,Making sausage &amp; peppers for dinner. Mmmmm...,8,Flan2021,sentiment140:1.0.0,zs_noopt
...,...,...,...,...,...,...
1046387,"Write a sentence based on ""ukrainian prosecuto...",ukraine 's top prosecutor 's office said frida...,9,Flan2021,gigaword:1.2.0,zs_noopt
1046388,Write a positive tweet.,"Jonas Brothers! Lines, Vines, And Trying Time!...",8,Flan2021,sentiment140:1.0.0,zs_noopt
1046389,News article: The Mad Men Effect\n\nBetty Drap...,Best Dressed Show Betty Draper: style icon of ...,6,Flan2021,newsroom:1.0.0,zs_noopt
1046390,"Victor Hugo, who stood among the crowd, dutifu...",no,9,Flan2021,glue/mnli:2.0.0,zs_noopt


In [3]:
# # Push by Chunk

# import pandas as pd
# from datasets import Dataset
# import numpy as np

# # Define a function to upload the dataset in chunks
# def upload_in_chunks(df, chunk_size, repo_id, dataset_name):
#     for i in range(0, len(df), chunk_size):
#         chunk = df.iloc[i:i+chunk_size]
#         dataset = Dataset.from_pandas(chunk)
#         split_name = f"{dataset_name}_chunk_{i//chunk_size}"
#         dataset.push_to_hub(repo_id, config_name=dataset_name, split=split_name)
#         print(f"Uploaded chunk {i//chunk_size}")

# # Define chunk size, repo ID, and dataset name
# number_of_chunks = 8
# chunk_size = int(np.ceil(len(combined_df) / number_of_chunks))
# print(chunk_size)
# repo_id = "0xAIT/flan-subset"

# # Upload the DataFrame in chunks
# upload_in_chunks(combined_df, chunk_size, repo_id, dataset_name)

450457


Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/226 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/226 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.27k [00:00<?, ?B/s]

Uploaded chunk 0


Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/226 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/226 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

Uploaded chunk 1


Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/226 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/226 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/3.00k [00:00<?, ?B/s]

Uploaded chunk 2


: 

In [5]:
# # Complete Push

# from datasets import Dataset
# import pandas as pd

# subset_repo_id = "0xAIT/flan-subset"
# dataset = Dataset.from_pandas(combined_df)

: 

# Upload as a folder

In [1]:
# import os
# from datasets import Dataset
# import pandas as pd

# subfolders = ["flan_fsnoopt_data", "flan_fsopt_data", "flan_zsnoopt_data", "flan_zsopt_data"]
# chose_paraquets_count = [18,42,8,9]
# dataset_index = 1

# dataset_name = subfolders[dataset_index]
# selected_count = chose_paraquets_count[dataset_index]

# filtered_datasets_folder = "filtered_datasets"
# filtered_folder_path = os.path.join(filtered_datasets_folder, dataset_name)

# subset_repo_id = "0xAIT/flan-subset"

# df_list = []

# for i in range(selected_count):
#     file = f"{i}.parquet"
#     print(file)
#     file_path = os.path.join(filtered_folder_path, file)
#     df = pd.read_parquet(file_path)
#     df_list.append(df)

# combined_df = pd.concat(df_list, ignore_index=True).reset_index(drop=True)
# combined_df

0.parquet
1.parquet
2.parquet
3.parquet
4.parquet
5.parquet
6.parquet
7.parquet
8.parquet
9.parquet
10.parquet
11.parquet
12.parquet
13.parquet
14.parquet
15.parquet
16.parquet
17.parquet
18.parquet
19.parquet
20.parquet
21.parquet
22.parquet
23.parquet
24.parquet
25.parquet
26.parquet
27.parquet
28.parquet
29.parquet
30.parquet
31.parquet
32.parquet
33.parquet
34.parquet
35.parquet
36.parquet
37.parquet
38.parquet
39.parquet
40.parquet
41.parquet


Unnamed: 0,inputs,targets,_template_idx,_task_source,_task_name,_template_type
0,Question:\nSummarize this: australia on monday...,s. africa 's mbeki defends china-africa relations,1,Flan2021,gigaword:1.2.0,fs_opt
1,Question:\nSolve -13*s + 4151 = 3982 for s..\n...,-2,4,Flan2021,math_dataset/algebra__linear_1d:1.0.0,fs_opt
2,"Problem: Wilhelmus Simon Petrus Fortuijn, know...",He appeared as Charley in an Encores! staged c...,4,Flan2021,quac:1.0.0,fs_opt
3,Q: What is the solution?\nSolve 23 = 9*m - 13 ...,-50,8,Flan2021,math_dataset/algebra__linear_1d:1.0.0,fs_opt
4,Question:\nWrite a tweet that is positive (OPT...,--- just got back from State....work at 7,0,Flan2021,sentiment140:1.0.0,fs_opt
...,...,...,...,...,...,...
1801822,question: Teacher asked me this: Solve -28*q +...,58,5,Flan2021,math_dataset/algebra__linear_1d:1.0.0,fs_opt
1801823,QUESTION: Math problem: Solve 22*a - 43 = -153...,5,7,Flan2021,math_dataset/algebra__linear_1d:1.0.0,fs_opt
1801824,"Concepts: come, engine, station, track\nA: A s...",sun breaking through a cloud,3,Flan2021,gem/common_gen:1.1.0,fs_opt
1801825,"Question: Write a text based on ""india promise...",yugoslavia called monday on the u.n. security ...,8,Flan2021,gigaword:1.2.0,fs_opt


In [13]:
combined_df

Unnamed: 0,inputs,targets,_template_idx,_task_source,_task_name,_template_type
0,input question: Write a random tweet?\n\nA ran...,"@xxnissa Well, I truly love her, but someday I...",6,Flan2021,sentiment140:1.0.0,fs_noopt
1,Problem: uganda 's main opposition leader is k...,chinese arctic research highlights global clim...,6,Flan2021,gigaword:1.2.0,fs_noopt
2,Solve -167*m + 5725 = 1383 for m.\nSolve this ...,-34,6,Flan2021,math_dataset/algebra__linear_1d:1.0.0,fs_noopt
3,"Premise & hypothesis: Is the premise ""Two litt...",yes,5,Flan2021,snli:1.1.0,fs_noopt
4,Question: What is the solution?\n\nSolve -65*d...,0,2,Flan2021,math_dataset/algebra__linear_1d:1.0.0,fs_noopt
...,...,...,...,...,...,...
985404,Problem: Math Problem\nSolve -17*i = -353*i + ...,-2,3,Flan2021,math_dataset/algebra__linear_1d:1.0.0,fs_noopt
985405,input question: Write a random tweet?\n\nA ran...,Just booked tickets to go to Brisbane this wee...,6,Flan2021,sentiment140:1.0.0,fs_noopt
985406,Problem: the armed basque separatist group eta...,china develops anti-terrorism training system,6,Flan2021,gigaword:1.2.0,fs_noopt
985407,mtv announced monday it will move ahead with p...,candle vigils and beds of flowers as stockholm...,0,Flan2021,gigaword:1.2.0,fs_noopt


In [28]:
dataset_name

'flan_zsnoopt_data'

In [29]:
# Chunking the filtered dataset

import numpy as np

# Folder that receives the chunk files for the selected dataset.
output_dir = f"chunks/{dataset_name}"
os.makedirs(output_dir, exist_ok=True)

number_of_chunks = 3
# Ceiling division: every chunk except possibly the last ones holds this many rows.
rows_per_chunk = int(np.ceil(len(combined_df) / number_of_chunks))

for i in range(number_of_chunks):
    start = i * rows_per_chunk
    # iloc slicing clamps a stop past the end, so no explicit min() is required.
    chunk_df = combined_df.iloc[start:start + rows_per_chunk]
    chunk_df.to_parquet(f"{output_dir}/part.{i}.parquet")

    print(f"Chunk {i}: {len(chunk_df)}")

Chunk 0: 348798
Chunk 1: 348798
Chunk 2: 348796


In [6]:
# chunk_files = [f"part.{i}.parquet" for i in range(number_of_chunks)]

# chunk_path = f"{output_dir}/{chunk_files[0]}"
# chunk_df = pd.read_parquet(chunk_path)
# chunk_df

Unnamed: 0,inputs,targets,_template_idx,_task_source,_task_name,_template_type
0,Question:\nSummarize this: australia on monday...,s. africa 's mbeki defends china-africa relations,1,Flan2021,gigaword:1.2.0,fs_opt
1,Question:\nSolve -13*s + 4151 = 3982 for s..\n...,-2,4,Flan2021,math_dataset/algebra__linear_1d:1.0.0,fs_opt
2,"Problem: Wilhelmus Simon Petrus Fortuijn, know...",He appeared as Charley in an Encores! staged c...,4,Flan2021,quac:1.0.0,fs_opt
3,Q: What is the solution?\nSolve 23 = 9*m - 13 ...,-50,8,Flan2021,math_dataset/algebra__linear_1d:1.0.0,fs_opt
4,Question:\nWrite a tweet that is positive (OPT...,--- just got back from State....work at 7,0,Flan2021,sentiment140:1.0.0,fs_opt
...,...,...,...,...,...,...
225224,Text: sudan is ready to send an ambassador to ...,saramago s body taken to portugal for funeral,2,Flan2021,gigaword:1.2.0,fs_opt
225225,"Problem: Article: Adrian Brown, of Brockley Ri...",[[Northern Ireland boss Michael O'Neill is con...,2,Flan2021,huggingface:xsum,fs_opt
225226,Problem: manchester united manager sir alex fe...,top powers meet iran on nukes,6,Flan2021,gigaword:1.2.0,fs_opt
225227,"Question: Write a text based on ""compromise se...","the task force on it manpower , chaired by sec...",8,Flan2021,gigaword:1.2.0,fs_opt


In [30]:
from datasets import Dataset
from huggingface_hub import HfApi
import os

# Upload the local chunk folder to "<dataset_name>/" inside the subset dataset repo.
subset_repo_id = "0xAIT/flan-subset"

api = HfApi()
api.upload_folder(
    repo_id=subset_repo_id,
    repo_type="dataset",
    folder_path=output_dir,
    path_in_repo=dataset_name,
)

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

part.2.parquet:   0%|          | 0.00/187M [00:00<?, ?B/s]

part.0.parquet:   0%|          | 0.00/189M [00:00<?, ?B/s]

part.1.parquet:   0%|          | 0.00/188M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/0xAIT/flan-subset/commit/f73a1c129051836f52aa6882aa29037f4746e0a1', commit_message='Upload folder using huggingface_hub', commit_description='', oid='f73a1c129051836f52aa6882aa29037f4746e0a1', pr_url=None, pr_revision=None, pr_num=None)

In [18]:
# total_rows = 0


# for i in range(5):
#     file = f"{i}.parquet"
#     file_path = os.path.join(filtered_folder_path, file)
#     df = pd.read_parquet(file_path)

#     first_value = df_list[i].iloc[0]
#     real_value = combined_df.iloc[total_rows]
#     total_rows += len(df)
#     assert first_value.equals(real_value)

#     new_df = {
#         "original": first_value["inputs"][:30],
#         "filtered": real_value["inputs"][:30]
#     }
#     print(new_df)



{'original': 'input question: Write a random', 'filtered': 'input question: Write a random'}
{'original': "Question: i'm always afraid th", 'filtered': "Question: i'm always afraid th"}
{'original': 'What should I do to concentrat', 'filtered': 'What should I do to concentrat'}
{'original': 'Solve -132*z - 2310 = 22*z for', 'filtered': 'Solve -132*z - 2310 = 22*z for'}
{'original': 'Q: Write a negative tweet.\n\nA:', 'filtered': 'Q: Write a negative tweet.\n\nA:'}


In [38]:
# # Save Tasks Names
# import os
# import pandas as pd
# import numpy as np
# from multiprocessing import Pool, cpu_count

# column = "_task_name"
# subfolder_file_prefix = "part"

# def get_unique_values(file_path):
#     df = pd.read_parquet(file_path)
#     unique_values = df[column].unique()
#     return unique_values.tolist()


# unique_values = []

# for i in range(origin_file_count):
#     file = f"{subfolder_file_prefix}.{i}.parquet"
#     print(file)
#     file_path = os.path.join(folder_path, file)
#     unique_values.extend(get_unique_values(file_path))


# values = list(set(unique_values))

# outputpath = "tasks/t0.txt"
# with open(outputpath, "w") as f:
#     for value in values:
#         f.write(f"{value}\n")