In [1]:
import pyarrow as pa
import pyarrow.feather as feather
import pyarrow.parquet as pq
import pyarrow.compute as pc
import numpy as np
from pyarrow import csv
from pathlib import Path
from rich.progress import track

In [2]:
!ls -l /data/aiomics/massspec_cache/uniprot/cho/*.parquet
!ls -l /data/aiomics/massspec_cache/uniprot/cho/*.arrow

'ls' is not recognized as an internal or external command,
operable program or batch file.
'ls' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
for input in track(Path('/data/search_asms2023/cho').glob('*.arrow')):
    output = input.with_suffix('.parquet')
    table = feather.read_table(input)
    pq.write_table(table, output)

In [None]:
table = pq.read_table("/data/aiomics/massspec_cache/uniprot/cho/predicted_cho_uniprot_tryptic_2_0.parquet")
table

In [2]:
query_table = pq.read_table("/home/djs10/asms2023/test.parquet")
query_table

pyarrow.Table
id: uint64
charge: int8
ev: double
instrument: string
instrument_type: string
instrument_model: string
ion_mode: string
ionization: string
name: string
synonyms: string
scan: string
nce: double
collision_energy: double
retention_time: double
collision_gas: string
insource_voltage: int64
sample_inlet: string
intensity: large_list<item: double>
  child 0, item: double
stddev: large_list<item: double>
  child 0, item: double
product_massinfo: struct<tolerance: double, tolerance_type: dictionary<values=string, indices=int32, ordered=0>, mass_type: dictionary<values=string, indices=int32, ordered=0>, neutral_loss: string, neutral_loss_charge: int64, evenly_spaced: bool>
  child 0, tolerance: double
  child 1, tolerance_type: dictionary<values=string, indices=int32, ordered=0>
  child 2, mass_type: dictionary<values=string, indices=int32, ordered=0>
  child 3, neutral_loss: string
  child 4, neutral_loss_charge: int64
  child 5, evenly_spaced: bool
mz: large_list<item: double>


In [4]:
len(query_table)

143044

In [3]:
mods = query_table["mod_names"]
np_mods = mods.combine_chunks().to_numpy(zero_copy_only=False)
mask = pa.array(~np.array([ np.any(x==737) for x in np_mods ]))
no_tmt_table = query_table.filter(mask)
len(no_tmt_table)

116548

In [12]:
cho_names = csv.read_csv('/home/djs10/asms2023/cho_names_uniq.txt',read_options=csv.ReadOptions(column_names=['name']), parse_options=csv.ParseOptions(delimiter='\t'))
len(cho_names)

110374

In [18]:
cho_set = set(cho_names['name'].to_pylist())
cho_mask = [x in cho_set for x in no_tmt_table['name'].to_pylist()]
print(cho_mask.count(True),cho_mask.count(False))
cho_table = no_tmt_table.filter(cho_mask)
len(cho_table)

8819 107729


8819

In [19]:

pq.write_table(cho_table,"/home/djs10/asms2023/test_filtered.parquet")

In [3]:
!dir d:\nist\asms2023\library\*.arrow

 Volume in drive D is Data
 Volume Serial Number is 3E64-D3CD

 Directory of d:\nist\asms2023\library

04/25/2023  08:27 PM    29,249,339,394 predicted_cho_uniprot_tryptic_2_0.arrow
04/25/2023  09:10 PM    29,769,728,626 predicted_cho_uniprot_tryptic_2_1.arrow
04/25/2023  08:26 PM    30,145,136,186 predicted_cho_uniprot_tryptic_2_2.arrow
04/25/2023  08:24 PM    29,257,910,130 predicted_cho_uniprot_tryptic_2_3.arrow
04/25/2023  08:00 PM    29,026,097,490 predicted_cho_uniprot_tryptic_2_4.arrow
04/25/2023  12:27 PM    21,739,170,266 predicted_cho_uniprot_tryptic_2_5.arrow
               6 File(s) 169,187,382,092 bytes
               0 Dir(s)  1,764,818,427,904 bytes free


In [2]:
tables = []
for part in range(6):
   dir_name = "D:/nist/asms2023/library/"
   fname = f"{dir_name}/predicted_cho_uniprot_tryptic_2_{part}.arrow"
   print(f"reading {fname}...", end="")
   with pa.memory_map(fname, 'rb') as source:
      iTable = pa.ipc.open_file(source).read_all()
   jTable = iTable.drop_columns(['starts','stops'])
   tables.append(jTable)
   print("done.")
big_table = pa.concat_tables(tables)

reading D:/nist/asms2023/library//predicted_cho_uniprot_tryptic_2_0.arrow...done.
reading D:/nist/asms2023/library//predicted_cho_uniprot_tryptic_2_1.arrow...done.
reading D:/nist/asms2023/library//predicted_cho_uniprot_tryptic_2_2.arrow...done.
reading D:/nist/asms2023/library//predicted_cho_uniprot_tryptic_2_3.arrow...done.
reading D:/nist/asms2023/library//predicted_cho_uniprot_tryptic_2_4.arrow...done.
reading D:/nist/asms2023/library//predicted_cho_uniprot_tryptic_2_5.arrow...done.


In [17]:

dir_name = "D:/nist/asms2023/library/no_ss"
fname = f"{dir_name}/predicted_cho_uniprot_tryptic_2.feather"
with open(fname, 'wb') as f:
    feather.write_feather(big_table, f)

In [2]:
dir_name = "D:/nist/asms2023/library/no_ss"
fname = f"{dir_name}/predicted_cho_uniprot_tryptic_2.feather"
with pa.memory_map(fname, 'rb') as source:
    table = pa.ipc.open_file(source).read_all()
len(table)

In [7]:
big_table.get_total_buffer_size()/(1024*1024*1024)
big_table.shape

(17113904, 38)

In [9]:
big_table.to_batches()[0].num_rows

5000

In [10]:
len(big_table.to_batches())

3423

In [14]:
sortidx = pc.array_sort_indices(big_table['precursor_mz'])
sortidx

<pyarrow.lib.ChunkedArray object at 0x0000023B872F3C20>
[
  [
    16499348,
    15894044,
    16499347,
    15894043,
    16499346,
    ...
    3556980,
    15835053,
    15835086,
    15835087,
    15835089
  ]
]

In [21]:
list(range(0,len(sortidx),5000))[-1]

17110000

In [23]:
len(sortidx.slice(17110000,5000))

3904

In [24]:
dir_name = "D:/nist/asms2023/library/no_ss"
fname = f"{dir_name}/predicted_cho_uniprot_tryptic_2.arrow"
with pa.OSFile(fname, 'wb') as sink:
   with pa.ipc.new_file(sink, big_table.schema) as writer:
    for start in range(0,len(sortidx),5000):
        subset = sortidx.slice(start,5000)
        batch = big_table.take(subset)
        writer.write(batch)
        print(f"Wrote batch {start}:{start+len(batch)}")