In [1]:
import pandas as pd

In [11]:
# Load the tab-separated count table; the file's first column is consumed as a
# provisional row index by index_col=0.
count_data = pd.read_csv(
    '/Users/matteo/Documents/MATLAS/data/count_data.tsv',
    sep='\t',
    index_col=0,
)
# Promote the first remaining column to the row index (the provisional index is
# discarded). NOTE(review): judging by the printed output further down, this
# column holds the sample identifiers -- confirm against the TSV header.
count_data = count_data.set_index(count_data.columns[0])

In [14]:
# Vocabulary: column name -> integer id. Id 0 is reserved for the 'PAD' token,
# and the columns of count_data are numbered 1..N in their existing order.
otu_to_int = {'PAD': 0}
otu_to_int.update(zip(count_data.columns, range(1, len(count_data.columns) + 1)))

# Persist the vocabulary as JSON (relative path, so it lands in the current
# working directory).
import json
with open('otu_mapping.json', 'w') as mapping_file:
    json.dump(otu_to_int, mapping_file)

# One row per row of count_data, with a single 'otu_arrays' column holding the
# list of ids of every column whose value in that row is strictly positive.
result_df = pd.DataFrame(index=count_data.index)
result_df['otu_arrays'] = count_data.apply(
    lambda sample: [
        otu_to_int[otu_name]
        for otu_name, count in sample.items()
        if count > 0
    ],
    axis=1,
)

In [17]:
# Write result_df to an HDF5 file under the key 'df'; mode='w' opens the file
# for writing rather than appending. The 'otu_arrays' column has object dtype,
# which is why PyTables reports that it will pickle the values.
result_df.to_hdf(path_or_buf='sample_otu_arrays.h5', key='df', mode='w')

# Round-trip check: read the same key back and print the first five rows.
loaded_df = pd.read_hdf(path_or_buf='sample_otu_arrays.h5', key='df')
print(loaded_df.head(n=5))

                                                            otu_arrays
Unnamed: 0                                                            
SRR044623.SRS018995  [5, 19, 22, 24, 27, 30, 43, 45, 52, 56, 58, 65...
SRR046457.SRS044477  [8, 84, 111, 125, 168, 177, 184, 220, 225, 235...
SRR043598.SRS014279  [19, 22, 23, 24, 30, 36, 45, 52, 58, 65, 70, 7...
SRR041301.SRS024633  [19, 22, 30, 46, 52, 58, 65, 70, 71, 74, 80, 8...
SRR328613.SRS062975  [8, 29, 53, 84, 121, 125, 134, 135, 139, 176, ...


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['otu_arrays'], dtype='object')]

  result_df.to_hdf('sample_otu_arrays.h5', key='df', mode='w')


In [18]:
# Spot-check the round trip: print the id list of the first row as held in
# memory and as read back from the HDF5 file, for comparison by eye.
sample_name = result_df.index[0]

in_memory_ids = result_df.loc[sample_name, 'otu_arrays']
print(f"Original array for {sample_name}:", in_memory_ids)

reloaded_ids = loaded_df.loc[sample_name, 'otu_arrays']
print(f"Loaded array for {sample_name}:", reloaded_ids)

Original array for SRR044623.SRS018995: [5, 19, 22, 24, 27, 30, 43, 45, 52, 56, 58, 65, 66, 68, 71, 74, 76, 80, 82, 83, 85, 90, 92, 93, 98, 99, 104, 105, 116, 117, 121, 125, 126, 129, 131, 132, 136, 140, 142, 147, 150, 155, 158, 162, 167, 174, 175, 177, 182, 185, 186, 192, 193, 194, 198, 199, 201, 203, 204, 205, 206, 207, 209, 210, 213, 216, 218, 223, 226, 230, 234, 235, 243, 244, 246, 248, 258, 259, 263, 267, 268, 269, 270, 271, 273, 278, 285, 286, 290, 291, 292, 300, 303, 308, 309, 314, 317, 320, 321, 331, 332, 333, 336, 344, 346, 350, 352, 357, 364, 369, 372, 379, 382, 385, 389, 392, 393, 396, 400, 403, 408, 411, 417, 435, 438, 444, 456, 457, 459, 488, 495, 504, 507]
Loaded array for SRR044623.SRS018995: [5, 19, 22, 24, 27, 30, 43, 45, 52, 56, 58, 65, 66, 68, 71, 74, 76, 80, 82, 83, 85, 90, 92, 93, 98, 99, 104, 105, 116, 117, 121, 125, 126, 129, 131, 132, 136, 140, 142, 147, 150, 155, 158, 162, 167, 174, 175, 177, 182, 185, 186, 192, 193, 194, 198, 199, 201, 203, 204, 205, 206, 207,