In [1]:
import pandas as pd
import numpy as np
import os
import random

In [2]:
bakrep_metadata = pd.read_csv("data/bakrep-export_18112024.tsv",sep='\t', low_memory=False, index_col="#id")

In [3]:
[tup for tup in zip(bakrep_metadata.columns, bakrep_metadata.dtypes)]

[('bakta.genome.genus', dtype('O')),
 ('bakta.genome.species', dtype('O')),
 ('bakta.genome.strain', dtype('O')),
 ('bakta.stats.no_sequences', dtype('float64')),
 ('bakta.stats.size', dtype('float64')),
 ('bakta.stats.gc', dtype('float64')),
 ('bakta.stats.n_ratio', dtype('float64')),
 ('bakta.stats.n50', dtype('float64')),
 ('bakta.stats.coding_ratio', dtype('float64')),
 ('gtdbtk.classification.domain', dtype('O')),
 ('gtdbtk.classification.phylum', dtype('O')),
 ('gtdbtk.classification.class', dtype('O')),
 ('gtdbtk.classification.order', dtype('O')),
 ('gtdbtk.classification.family', dtype('O')),
 ('gtdbtk.classification.genus', dtype('O')),
 ('gtdbtk.classification.species', dtype('O')),
 ('mlst.sequence_type', dtype('O')),
 ('checkm2.quality.completeness', dtype('float64')),
 ('checkm2.quality.contamination', dtype('float64')),
 ('metadata.accession', dtype('O')),
 ('metadata.collected_by', dtype('O')),
 ('metadata.collection_date', dtype('O')),
 ('metadata.country', dtype('O'))

In [4]:
bakrep_metadata[["metadata.accession"]]

Unnamed: 0_level_0,metadata.accession
#id,Unnamed: 1_level_1
SAMD00000344,SAMD00000344
SAMD00000345,SAMD00000345
SAMD00000355,SAMD00000355
SAMD00000550,SAMD00000550
SAMD00000552,SAMD00000552
...,...
SAMN10405967,SAMN10405967
SAMN10405968,SAMN10405968
SAMN10407504,SAMN10407504
SAMN10407505,SAMN10407505


In [5]:
est_card = pd.read_csv('/Users/ktruong/data/661k/est_card_661k.csv', header = None, names= ['id','card'], index_col= 'id' )

In [6]:
est_card

Unnamed: 0_level_0,card
id,Unnamed: 1_level_1
SAMN02844126,4722564.00
SAMN04558116,1589918.00
SAMEA2068521,6482382.50
SAMN09281158,2937214.75
SAMN09843229,4991481.50
...,...
SAMEA3481361,2161193.00
SAMN09650548,5065128.50
SAMN09463236,5239581.00
SAMEA2534426,4334674.00


In [8]:
# missing 3 genomes
set(est_card.index.values) - set(bakrep_metadata.index.values)

{'SAMD00080514', 'SAMD00080519', 'SAMD00080520'}

In [9]:
species_groups =  bakrep_metadata.groupby(['gtdbtk.classification.species']).groups
group_sizes = bakrep_metadata.groupby(['gtdbtk.classification.species']).size()

In [10]:
group_sizes.sort_values()

gtdbtk.classification.species
1-14-0-20-39-49 sp002787635         1
Pantoea brenneri                    1
Pantoea anthophila                  1
Pantoea allii                       1
Pannonibacter phragmitetus          1
                                ...  
Staphylococcus aureus           47997
Mycobacterium tuberculosis      49020
Streptococcus pneumoniae        51433
Escherichia coli                89136
Salmonella enterica            178585
Length: 8207, dtype: int64

In [11]:
group_sizes_small = group_sizes[group_sizes>=2]
len(group_sizes_small)

2951

In [12]:
group_sizes_small.sort_values()

gtdbtk.classification.species
Erwinia rhapontici                 2
Halomonas caseinilytica            2
Pseudomonas_E sp003050925          2
Hallerella porci                   2
Hallerella intestinalis            2
                               ...  
Staphylococcus aureus          47997
Mycobacterium tuberculosis     49020
Streptococcus pneumoniae       51433
Escherichia coli               89136
Salmonella enterica           178585
Length: 2951, dtype: int64

In [13]:
group_sizes[(group_sizes>100) & (group_sizes<1000)].sort_values()

gtdbtk.classification.species
Enterobacter kobei              104
Bacillus_A paranthracis         105
Aeromonas veronii               106
Moraxella catarrhalis           107
Salmonella bongori              107
                               ... 
Mycobacterium intracellulare    880
Mycobacterium marinum           891
Bacillus_A anthracis            893
Neisseria lactamica             951
Salmonella houtenae             972
Length: 104, dtype: int64

In [14]:
group_sizes.sort_values(ascending=False).head(20)

gtdbtk.classification.species
Salmonella enterica           178585
Escherichia coli               89136
Streptococcus pneumoniae       51433
Mycobacterium tuberculosis     49020
Staphylococcus aureus          47997
Campylobacter_D jejuni         28336
Neisseria meningitidis         17153
Streptococcus pyogenes         16821
Klebsiella pneumoniae          13637
Clostridioides difficile       13579
Listeria monocytogenes         12245
Listeria monocytogenes_B       12244
Streptococcus agalactiae       10359
Neisseria gonorrhoeae           8904
Campylobacter_D coli            8849
Enterococcus_B faecium          8284
Pseudomonas aeruginosa          6286
Vibrio cholerae                 5562
Acinetobacter baumannii         5124
Mycobacterium abscessus         2689
dtype: int64

In [15]:
species_groups_bakta =  bakrep_metadata.groupby(['bakta.genome.species']).groups
group_sizes_bakta = bakrep_metadata.groupby(['bakta.genome.species']).size()

In [16]:
group_sizes_bakta.sort_values(ascending=False).head(5000)

bakta.genome.species
enterica         177020
coli              81894
pneumoniae        65831
tuberculosis      48945
aureus            48357
                  ...  
nanceiensis           1
namucuonensis         1
namhaensis            1
nakayamae             1
legallii              1
Length: 3915, dtype: int64

In [17]:
species_more_than_50 = group_sizes[group_sizes>=1].index

In [18]:
species_more_than_50

Index(['1-14-0-20-39-49 sp002787635', '1-14-0-20-46-22 sp002796245',
       '1162T-S-0a-05 sp000964245', '14-2 sp000403255', '14-2 sp000403315',
       '14-2 sp000403845', '1XD8-76 sp910573755',
       '2-02-FULL-42-43 sp002793195', '2013Ark19i sp001460935',
       '2013Ark19i sp900078745',
       ...
       'Zavarzinia compransoris', 'Zeaxanthinibacter enoshimensis',
       'Zhouia amylolytica', 'Zobellella_D taiwanensis', 'Zobellia uliginosa',
       'Zoogloea sp002028455', 'Zooshikella ganghwensis',
       'Zunongwangia mangrovi', 'Zunongwangia profunda', 'Zymomonas mobilis'],
      dtype='object', name='gtdbtk.classification.species', length=8207)

In [19]:
len(species_more_than_50)

8207

In [20]:
dict_sample_50 = {key:random.sample(list(species_groups[key].values),1) for key in species_more_than_50}

In [21]:
dict_sample_50

{'1-14-0-20-39-49 sp002787635': ['SAMN07622136'],
 '1-14-0-20-46-22 sp002796245': ['SAMN07622204'],
 '1162T-S-0a-05 sp000964245': ['SAMN02746023'],
 '14-2 sp000403255': ['SAMN01730990'],
 '14-2 sp000403315': ['SAMN01730994'],
 '14-2 sp000403845': ['SAMN01730989'],
 '1XD8-76 sp910573755': ['SAMEA80359918'],
 '2-02-FULL-42-43 sp002793195': ['SAMN07622190'],
 '2013Ark19i sp001460935': ['SAMEA2826832'],
 '2013Ark19i sp900078745': ['SAMEA2826840'],
 '49-20 sp002417685': ['SAMN06453970'],
 '79-D21 sp002432945': ['SAMN06453974'],
 'A37T11 sp900109895': ['SAMN05216436'],
 'AAA044-D11 sp000485495': ['SAMN02194296'],
 'AAA164-E04 sp000383715': ['SAMN02441633'],
 'AAA240-E13 sp000504605': ['SAMN02597171'],
 'AAA240-E13 sp000504625': ['SAMN02597172'],
 'AAA240-E13 sp000513055': ['SAMN02597280'],
 'AAA240-E13 sp000513075': ['SAMN02597281'],
 'AAA240-E13 sp003209225': ['SAMN08886570'],
 'AAA240-E13 sp003211695': ['SAMN08886571'],
 'AAA300-D14 sp002715065': ['SAMN02441642'],
 'AAA536-G10 sp000383115'

In [21]:
# Function to generate n groups with maximum diversity and a specified number of values per group
def generate_diverse_groups(data_dict, n, values_per_group):
    # Step 1: Flatten the dictionary into a list of (key, value) pairs
    flattened = [(key, value) for key, values in data_dict.items() for value in values]
    # Step 2: Shuffle the list randomly
    random.shuffle(flattened)
    
    # Step 3: Initialize the groups with empty lists and a dictionary to track key counts per group
    groups = [[] for _ in range(n)]
    group_keys = [set() for _ in range(n)]  # Tracks the keys in each group
    
    # Step 4: Distribute values into groups while keeping diversity
    for i, (key, value) in enumerate(flattened):
        # Try to find a group that doesn't already have a value from this key
        added = False
        
        # First, try to keep diversity as high as possible
        for group_index in range(n):
            if len(groups[group_index]) < values_per_group and key not in group_keys[group_index]:
                groups[group_index].append(value)
                group_keys[group_index].add(key)
                added = True
                break
        
        # If no group can be found with strict diversity, relax the constraint and add the value
        if not added:
            for group_index in range(n):
                if len(groups[group_index]) < values_per_group:
                    groups[group_index].append(value)
                    group_keys[group_index].add(key)  # Even if key is repeated, still add to track it
                    break
    
    # Step 5: Prepare the results by removing the keys from the tuples
    groups = [[value for value in group] for group in groups]

    # Step 6: Calculate and return the number of keys in each group
    key_counts = [len(keys) for keys in group_keys]

    return groups, key_counts

In [22]:
n = 1  # Number of groups to create
v = 8207
groups,species_count = generate_diverse_groups(dict_sample_50, n, v)

In [23]:
species_count,len(groups[0])

([8207], 8207)

In [24]:
[print(g) for g in groups]

['SAMN02256419', 'SAMN04493492', 'SAMN09083426', 'SAMEA104151675', 'SAMN00120084', 'SAMN02256424', 'SAMEA2248198', 'SAMN02256494', 'SAMN05444005', 'SAMN02927904', 'SAMN02745887', 'SAMN07621184', 'SAMEA3642880', 'SAMN06454619', 'SAMN05421834', 'SAMD00031992', 'SAMN05977996', 'SAMN02787076', 'SAMN03197717', 'SAMN02194853', 'SAMN06560375', 'SAMN03436093', 'SAMN05216562', 'SAMN02199321', 'SAMEA4551742', 'SAMN08772519', 'SAMEA102065668', 'SAMN04458671', 'SAMEA3642864', 'SAMN05444360', 'SAMN05192583', 'SAMN06473646', 'SAMN02440432', 'SAMEA3545396', 'SAMN05428962', 'SAMEA3642803', 'SAMN05366371', 'SAMEA1486436', 'SAMN00761793', 'SAMN04301676', 'SAMEA1024701', 'SAMN05192553', 'SAMEA1411912', 'SAMN07488290', 'SAMN07581354', 'SAMEA104148165', 'SAMN02261504', 'SAMD00012441', 'SAMN06754985', 'SAMN02584951', 'SAMN08885968', 'SAMN05421868', 'SAMN05428956', 'SAMN06454493', 'SAMN02199319', 'SAMN02194394', 'SAMN05421794', 'SAMN09011121', 'SAMN02194417', 'SAMN05428957', 'SAMN08775231', 'SAMN08886397', '

[None]

In [22]:
def write_list_to_file(filename, values):
    """
    Write a list of values to a text file, each value on a new line.
    
    :param filename: Path to the output file.
    :param values: List of values to write.
    """
    with open(filename, "w") as f:
        for value in values:
            f.write(f"{value}\n")

In [28]:
write_list_to_file('data/one_genome_per_species_661k.txt',groups[0])

In [26]:
for idx, values in enumerate(groups):
    path = 'data/batch_5000_species_sameple/'
    if (idx+1)<10:
        write_list_to_file(path+f'sample_0{idx+1}.txt',values)
    else:
        write_list_to_file(path+f'sample_{idx+1}.txt',values)

In [27]:
top19 = group_sizes.sort_values(ascending=False).head(19)

In [28]:
top19.index.values

array(['Salmonella enterica', 'Escherichia coli',
       'Streptococcus pneumoniae', 'Mycobacterium tuberculosis',
       'Staphylococcus aureus', 'Campylobacter_D jejuni',
       'Neisseria meningitidis', 'Streptococcus pyogenes',
       'Klebsiella pneumoniae', 'Clostridioides difficile',
       'Listeria monocytogenes', 'Listeria monocytogenes_B',
       'Streptococcus agalactiae', 'Neisseria gonorrhoeae',
       'Campylobacter_D coli', 'Enterococcus_B faecium',
       'Pseudomonas aeruginosa', 'Vibrio cholerae',
       'Acinetobacter baumannii'], dtype=object)

In [None]:
for species in top19.index.values:
    sample_genomes = random.sample(list(species_groups[species].values),5000)
    
    path = 'data/top_19_species_5000_genomes_sample/'

    write_list_to_file(path+f'genomes_5000_{species}.txt',sample_genomes)

In [30]:
species_groups.keys()

dict_keys(['1-14-0-20-39-49 sp002787635', '1-14-0-20-46-22 sp002796245', '1162T-S-0a-05 sp000964245', '14-2 sp000403255', '14-2 sp000403315', '14-2 sp000403845', '1XD8-76 sp910573755', '2-02-FULL-42-43 sp002793195', '2013Ark19i sp001460935', '2013Ark19i sp900078745', '49-20 sp002417685', '79-D21 sp002432945', 'A37T11 sp900109895', 'AAA044-D11 sp000485495', 'AAA164-E04 sp000383715', 'AAA240-E13 sp000504605', 'AAA240-E13 sp000504625', 'AAA240-E13 sp000513055', 'AAA240-E13 sp000513075', 'AAA240-E13 sp003209225', 'AAA240-E13 sp003211695', 'AAA300-D14 sp002715065', 'AAA536-G10 sp000383115', 'AAA536-G10 sp000384615', 'AAA536-G10 sp000421325', 'AAA536-G10 sp003278095', 'AAA536-G10 sp003282145', 'AAA536-G10 sp003282435', 'AAA536-G10 sp003282885', 'AAA536-G10 sp003283875', 'AAA536-G10 sp003284565', 'AB-137-C09 sp000379225', 'AEGEAN-183 sp012965075', 'AG-337-I02 sp003282155', 'AG-337-I02 sp902554185', 'AG-339-G14 sp003282105', 'AG-363-A16 sp003280345', 'AG-363-B04 sp003278465', 'AG-363-J23 sp003

In [31]:
sample_1_per_species = [species_groups[key][0] for key in species_groups.keys()]

In [32]:
len(sample_1_per_species)

8207

In [33]:
random.sample(sample_1_per_species,5000)

['SAMEA1411909',
 'SAMD00000605',
 'SAMN03436206',
 'SAMEA103957224',
 'SAMN02199108',
 'SAMEA3545292',
 'SAMEA3731268',
 'SAMN06264849',
 'SAMN05421720',
 'SAMN02441190',
 'SAMN06272739',
 'SAMN03996307',
 'SAMN05421737',
 'SAMN09232915',
 'SAMN07658433',
 'SAMEA2393091',
 'SAMN00203945',
 'SAMN02910264',
 'SAMD00000806',
 'SAMD00006368',
 'SAMN05216209',
 'SAMN05428981',
 'SAMEA4535846',
 'SAMEA3642798',
 'SAMN03266145',
 'SAMN03159505',
 'SAMN04324258',
 'SAMN03021531',
 'SAMN02199011',
 'SAMN04488070',
 'SAMN04198240',
 'SAMN03251816',
 'SAMN05421637',
 'SAMEA1710458',
 'SAMN02584946',
 'SAMN07621134',
 'SAMN02256531',
 'SAMN02198979',
 'SAMN06269173',
 'SAMEA2248198',
 'SAMEA3545300',
 'SAMN03267909',
 'SAMN02787161',
 'SAMN02261301',
 'SAMN05421540',
 'SAMN05421663',
 'SAMN08886134',
 'SAMN02441472',
 'SAMN04218346',
 'SAMN05216304',
 'SAMN05880561',
 'SAMN08885950',
 'SAMEA3643094',
 'SAMN02194897',
 'SAMN02194206',
 'SAMN02745834',
 'SAMEA4028999',
 'SAMN02746065',
 'SAMEA17089

In [34]:
write_list_to_file('data/batch_5000_species_sameple/data.txt',sample_1_per_species)

In [35]:
sal_enterica_10000 = random.sample(list(species_groups['Salmonella enterica']),10000)

In [36]:
write_list_to_file('data/batch_5000_species_sameple/sal_ente_10000.txt',sal_enterica_10000)

In [37]:
#create batch for correlation:
genomes_list_1 = [species_groups[key][0] for key in species_groups.keys()]
genomes_list_2 = [species_groups[key][1] for key in species_groups.keys() if key in group_sizes[group_sizes>1].index]
genomes_list_3 = [species_groups[key][2] for key in species_groups.keys() if key in group_sizes[group_sizes>2].index]
genomes_list_4 = [species_groups[key][3] for key in species_groups.keys() if key in group_sizes[group_sizes>3].index]
genomes_list_5 = [species_groups[key][4] for key in species_groups.keys() if key in group_sizes[group_sizes>4].index]
genomes_list_6 = [species_groups[key][5] for key in species_groups.keys() if key in group_sizes[group_sizes>5].index]

In [38]:
mix_15000_genome = genomes_list_1+genomes_list_2+genomes_list_3+genomes_list_4+genomes_list_5+genomes_list_6
random.shuffle(mix_15000_genome)
(mix_15000_genome)

['SAMEA104113837',
 'SAMN05216354',
 'SAMN04487959',
 'SAMEA1710541',
 'SAMEA102345418',
 'SAMN02745944',
 'SAMN02198942',
 'SAMEA103957245',
 'SAMD00079845',
 'SAMEA859929',
 'SAMN02584957',
 'SAMN09074965',
 'SAMN06272746',
 'SAMN03266142',
 'SAMN09399244',
 'SAMN02441543',
 'SAMEA3107686',
 'SAMN08779462',
 'SAMEA1929556',
 'SAMEA3545423',
 'SAMN05230164',
 'SAMEA104138651',
 'SAMN09074860',
 'SAMN02923856',
 'SAMD00090153',
 'SAMEA1569200',
 'SAMEA3545391',
 'SAMEA104141662',
 'SAMN05443377',
 'SAMN09932492',
 'SAMN07327721',
 'SAMEA29746918',
 'SAMD00016710',
 'SAMN07621369',
 'SAMN06269111',
 'SAMN08458246',
 'SAMEA3643052',
 'SAMN07573570',
 'SAMN06165950',
 'SAMEA4040988',
 'SAMN06473673',
 'SAMEA1710555',
 'SAMN02910325',
 'SAMN06452493',
 'SAMN05443144',
 'SAMN00771411',
 'SAMN06767131',
 'SAMN07163118',
 'SAMN04487998',
 'SAMN08458166',
 'SAMD00111862',
 'SAMN04457385',
 'SAMN05428957',
 'SAMN07621075',
 'SAMN02745181',
 'SAMN07710589',
 'SAMN08886168',
 'SAMEA1027305',
 'SA

In [39]:
def split_wit_growing_size(items,begin_size,jump_length):
    result = []
    index = 0
    size = begin_size
    while index+size<len(items):
        result.append(items[index:index+size])
        index += size
        size += jump_length
    return result

In [40]:
lists = split_wit_growing_size(mix_15000_genome,500,25)

In [41]:
len(lists)

21

In [42]:
for l in lists:
    print(len(l))

500
525
550
575
600
625
650
675
700
725
750
775
800
825
850
875
900
925
950
975
1000


In [43]:
if not os.path.exists('data/random_sample_varies_number_of_genome/'):
    os.mkdir('data/random_sample_varies_number_of_genome/')
    
for idx, values in enumerate(lists):
    path = 'data/random_sample_varies_number_of_genome/'
    if (idx+1)<10:
        write_list_to_file(path+f'sample_0{idx+1}.txt',values)
    else:
        write_list_to_file(path+f'sample_{idx+1}.txt',values)

In [44]:
sal_enterica = list(species_groups['Salmonella enterica'])
random.shuffle(sal_enterica)

In [45]:
len(sal_enterica)

178585

In [46]:
sal_lists = split_wit_growing_size(sal_enterica,1000,250)

In [47]:
print(len(sal_lists))
for l in sal_lists[:20]:
    print(len(l))

34
1000
1250
1500
1750
2000
2250
2500
2750
3000
3250
3500
3750
4000
4250
4500
4750
5000
5250
5500
5750


In [49]:
if not os.path.exists('data/sal_enterica_growing_size/'):
    os.mkdir('data/sal_enterica_growing_size/')
    
for idx, values in enumerate(sal_lists[:20]):
    path = 'data/sal_enterica_growing_size/'
    if (idx+1)<10:
        write_list_to_file(path+f'sample_0{idx+1}.txt',values)
    else:
        write_list_to_file(path+f'sample_{idx+1}.txt',values)

In [50]:
species_groups['Salmonella enterica']

Index(['SAMD00002684', 'SAMD00003501', 'SAMD00003784', 'SAMD00003785',
       'SAMD00003786', 'SAMD00003787', 'SAMD00003788', 'SAMD00003789',
       'SAMD00003790', 'SAMD00003791',
       ...
       'SAMN10405214', 'SAMN10405215', 'SAMN10405960', 'SAMN10405961',
       'SAMN10405962', 'SAMN10405964', 'SAMN10405965', 'SAMN10405966',
       'SAMN10405967', 'SAMN10405968'],
      dtype='object', name='#id', length=178585)

In [51]:
species_more_than_10000 = group_sizes[group_sizes>=10000].index

In [52]:
species_more_than_10000

Index(['Campylobacter_D jejuni', 'Clostridioides difficile',
       'Escherichia coli', 'Klebsiella pneumoniae', 'Listeria monocytogenes',
       'Listeria monocytogenes_B', 'Mycobacterium tuberculosis',
       'Neisseria meningitidis', 'Salmonella enterica',
       'Staphylococcus aureus', 'Streptococcus agalactiae',
       'Streptococcus pneumoniae', 'Streptococcus pyogenes'],
      dtype='object', name='gtdbtk.classification.species')

In [None]:
if not os.path.exists('data/dataset_10000_genomes/'):
    os.mkdir('data/dataset_10000_genomes/')
for species in species_more_than_10000:
    path = 'data/dataset_10000_genomes/'
    values = random.sample(list(species_groups[species]),10000)
    write_list_to_file(path+f'file_{species.replace(" ","_")}.txt',values)    
    

In [18]:
species_more_than_10000 = group_sizes[group_sizes>=10000]
species_more_than_10000.sum()/662000

0.8165332326283988

In [19]:
species_more_than_10000.sum()

540545

In [26]:
species_more_than_20000 = group_sizes[group_sizes>=20000]
species_more_than_20000, species_more_than_20000.sum(), species_more_than_20000.sum()/bakrep_metadata.shape[0]

(gtdbtk.classification.species
 Campylobacter_D jejuni         28336
 Escherichia coli               89136
 Mycobacterium tuberculosis     49020
 Salmonella enterica           178585
 Staphylococcus aureus          47997
 Streptococcus pneumoniae       51433
 dtype: int64,
 444507,
 0.6720678195711534)

In [25]:
if not os.path.exists('data/dataset_20000_genomes/'):
    os.mkdir('data/dataset_20000_genomes/')
for species in species_more_than_20000.index:
    path = 'data/dataset_20000_genomes/'
    values = random.sample(list(species_groups[species]),20000)
    write_list_to_file(path+f'file.{species.replace(" ","_")}.txt',values)    

In [22]:
if not os.path.exists('data/dataset_5000_genomes/'):
    os.mkdir('data/dataset_5000_genomes/')
for species in species_more_than_20000.index:
    path = 'data/dataset_5000_genomes/'
    values = random.sample(list(species_groups[species]),5000)
    write_list_to_file(path+f'file.{species.replace(" ","_")}.txt',values)    

In [26]:
species_more_than_100k = group_sizes[group_sizes>=100000]
species_more_than_100k.sum()/662000

0.26976586102719036

In [30]:
species_more_than_100k.index[0]

'Salmonella enterica'

In [32]:
values = random.sample(list(species_groups[species_more_than_100k.index[0]]),100000)

In [34]:
write_list_to_file('data/'+f'file.salmonella_enterica_100k.txt',values)    

In [30]:
E_coli_all = list(species_groups["Escherichia coli"])
path = "data/e_coli/"
if not os.path.exists(path):
    os.mkdir(path)
write_list_to_file(path+f'file.e_coli_all.txt',E_coli_all)

In [43]:
staph_aureus = list(species_groups["Staphylococcus aureus"])
path = "data/staph_aureus/"
if not os.path.exists(path):
    os.mkdir(path)
write_list_to_file(path+f'file.staph_aureus_all.txt',staph_aureus)

In [29]:
for species in species_more_than_20000.index:
    path = 'data/'
    values = list(species_groups[species])
    write_list_to_file(path+f'file.{species.replace(" ","_")}.txt',values)    

In [53]:
# 64233 randomly sampled genomes
values = random.sample(list(bakrep_metadata.index),64233)
write_list_to_file('data/random_64233.txt',values)

In [54]:
values

['SAMN04458879',
 'SAMN03074784',
 'SAMEA1324123',
 'SAMEA1920265',
 'SAMN09218590',
 'SAMEA1033300',
 'SAMEA2743628',
 'SAMEA1559719',
 'SAMEA2275678',
 'SAMN07147054',
 'SAMEA3929272',
 'SAMEA3354051',
 'SAMEA3217937',
 'SAMN05289950',
 'SAMN07695305',
 'SAMN09851899',
 'SAMN08795234',
 'SAMN06437656',
 'SAMN08773088',
 'SAMN00631846',
 'SAMN08815356',
 'SAMN00991432',
 'SAMN06031866',
 'SAMN09060470',
 'SAMEA1117601',
 'SAMEA104354215',
 'SAMEA104416834',
 'SAMN07501555',
 'SAMN03276143',
 'SAMN09273304',
 'SAMN10353479',
 'SAMEA2146850',
 'SAMEA3481594',
 'SAMN09521886',
 'SAMEA2154836',
 'SAMEA104093530',
 'SAMEA104162152',
 'SAMN00811189',
 'SAMN03479670',
 'SAMEA4560352',
 'SAMN07682363',
 'SAMN08795292',
 'SAMEA954624',
 'SAMEA2340688',
 'SAMN02403782',
 'SAMEA3381399',
 'SAMN08518297',
 'SAMEA2041227',
 'SAMEA3293008',
 'SAMEA3946980',
 'SAMN10134243',
 'SAMEA806645',
 'SAMEA3531667',
 'SAMN07136386',
 'SAMEA4429021',
 'SAMEA2821555',
 'SAMEA2238161',
 'SAMEA102418168',
 'SAMN

In [58]:
set_species= set([])
for gen in values:
    set_species.add(bakrep_metadata.loc[gen]['gtdbtk.classification.species'])

In [60]:
len(set_species)

1677