In [1]:
from pathlib import Path
from typing import Union

import datasets
import pandas as pd
import pyarrow as pa
import pyarrow.feather as feather
import pyarrow.parquet as pq
import scipy

from parquet import *

In [2]:
# Root directory of the partitioned parquet dataset and its "train" sub-directory.
root_path = Path("/cmnfs/proj/prosit_astral/datasets/parquet")
train_path = root_path.joinpath("train")

In [8]:
# Load every split found under root_path through the HuggingFace `datasets`
# loader. NOTE(review): the recorded output of this cell ends in a
# DatasetGenerationError, so `ds` may be undefined after a fresh run.
ds = datasets.load_dataset(path=str(root_path))

Resolving data files:   0%|          | 0/78 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/24 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/77 [00:00<?, ?files/s]

Downloading data:   0%|          | 0/23 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetGenerationError: An error occurred while generating the dataset

In [5]:
# Parquet file read later as test input; resolved three directory levels above
# the current working directory.
input_test_file = (
    Path.cwd().parent.parent.parent
    / "oktoberfest/data/intensity_data.parquet"
)

# Hand-written three-row fixture; each key becomes one DataFrame column.
raw_data = {
    "intensities": [
        [0.00004, 0.0, -1.0, 0.0, 0.0, -1.0, 0.03, 0.0, -1.0, 0.4],
        [0.3, 0.0, -1.0, 1.0, 0.0, -1.0, 0.4, 0.0, -1.0, 0.05],
        [0.04, 0.0, 0.0, 0.0, 0.0, 0.0, 0.002, 0.0, 0.0, 0.13],
    ],
    "sequence": [
        "SVFLTFLR",
        "KTSQIFLAK",
        "SPVGRVTPKEWR",
    ],
    "precursor_charge_onehot": [
        [0, 1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0],
    ],
    "collision_energy_normed": [0.25, 0.28, 0.28],
}

In [42]:
# Scratch directory under the current working directory that receives the
# partitioned dataset; created on first run.
output_path = Path.cwd().joinpath("temp")
output_path.mkdir(exist_ok=True)

df = pd.DataFrame(raw_data)

# Stack two copies of the fixture, tagging each copy with its partition label
# in a regular "dataset" column.
df2 = pd.concat([df.assign(dataset=label) for label in ("1", "2")])
table = pa.Table.from_pandas(df2)

# Write one sub-directory per distinct "dataset" value.
pq.write_to_dataset(
    table,
    root_path=output_path,
    partition_cols=["dataset"],
    existing_data_behavior="delete_matching",
)

In [43]:
# Show the Arrow table's schema and a preview of each column.
table

pyarrow.Table
intensities: list<item: double>
  child 0, item: double
sequence: string
precursor_charge_onehot: list<item: int64>
  child 0, item: int64
collision_energy_normed: double
dataset: string
__index_level_0__: int64
----
intensities: [[[0.00004,0,-1,0,0,-1,0.03,0,-1,0.4],[0.3,0,-1,1,0,-1,0.4,0,-1,0.05],...,[0.3,0,-1,1,0,-1,0.4,0,-1,0.05],[0.04,0,0,0,0,0,0.002,0,0,0.13]]]
sequence: [["SVFLTFLR","KTSQIFLAK","SPVGRVTPKEWR","SVFLTFLR","KTSQIFLAK","SPVGRVTPKEWR"]]
precursor_charge_onehot: [[[0,1,0,0,0,0],[0,1,0,0,0,0],...,[0,1,0,0,0,0],[0,0,1,0,0,0]]]
collision_energy_normed: [[0.25,0.28,0.28,0.25,0.28,0.28]]
dataset: [["1","1","1","2","2","2"]]
__index_level_0__: [[0,1,2,0,1,2]]

In [44]:
# Read back only the rows stored under the dataset=1 partition and discard the
# partition column, so the remaining columns match the fixture's.
dataset = pq.ParquetDataset(output_path, filters=[("dataset", "=", "1")])
df = (
    dataset.read()
    .to_pandas()
    .drop(columns="dataset")
)
df
# Disabled round-trip check, kept for reference:
#read_df = read_partition(output_path, '1')
#pd.testing.assert_frame_equal(read_df, df)

Unnamed: 0,intensities,sequence,precursor_charge_onehot,collision_energy_normed
0,"[4e-05, 0.0, -1.0, 0.0, 0.0, -1.0, 0.03, 0.0, ...",SVFLTFLR,"[0, 1, 0, 0, 0, 0]",0.25
1,"[0.3, 0.0, -1.0, 1.0, 0.0, -1.0, 0.4, 0.0, -1....",KTSQIFLAK,"[0, 1, 0, 0, 0, 0]",0.28
2,"[0.04, 0.0, 0.0, 0.0, 0.0, 0.0, 0.002, 0.0, 0....",SPVGRVTPKEWR,"[0, 0, 1, 0, 0, 0]",0.28


In [64]:
# Replace the small fixture frame with the contents of the test input file.
df = pd.read_parquet(path=input_test_file)
df

Unnamed: 0,intensities,sequence,precursor_charge_onehot,collision_energy_aligned_normed
0,"[0.03713018032121684, 0.0, -1.0, 0.0, 0.0, -1....",SVFLTFLR,"[0, 1, 0, 0, 0, 0]",0.25
1,"[0.32880081359926777, 0.0, -1.0, 1.0, 0.0, -1....",KTSQIFLAK,"[0, 1, 0, 0, 0, 0]",0.28
2,"[0.03919235848040409, 0.0, 0.0, 0.0, 0.0, 0.0,...",SPVGRVTPKEWR,"[0, 0, 1, 0, 0, 0]",0.28
3,"[0.11537762755556774, 0.0, 0.0, 0.0, 0.0, 0.0,...",SHIWPEYCSRALR,"[0, 0, 1, 0, 0, 0]",0.30
4,"[0.003340655605539741, 0.0, 0.0, 0.00303169307...",ELESQISELQEDLESERASR,"[0, 0, 1, 0, 0, 0]",0.20
...,...,...,...,...
41587,"[0.12310221158139793, 0.0, 0.0, 0.0, 0.0, 0.0,...",LKFEEITGVINPALDKYFPSDSGVR,"[0, 0, 1, 0, 0, 0]",0.30
41588,"[0.036119027089409034, 0.0, 0.0, 0.0, 0.0, 0.0...",AYVGLERFLAGLRDY,"[0, 0, 1, 0, 0, 0]",0.35
41589,"[0.036547268719584185, 0.0, 0.0, 0.0, 0.0, 0.0...",AACLLTKWTAGR,"[0, 0, 1, 0, 0, 0]",0.23
41590,"[0.053176686541959346, -1.0, -1.0, 0.0, -1.0, ...",SLEKLEIIPASQ,"[1, 0, 0, 0, 0, 0]",0.30


In [65]:
# Stack the frame twice under the outer index labels "1" and "2"; the resulting
# MultiIndex levels are named "dataset" and "index".
table = pd.concat({"1": df, "2": df}, names=["dataset", "index"])
table

Unnamed: 0_level_0,Unnamed: 1_level_0,intensities,sequence,precursor_charge_onehot,collision_energy_aligned_normed
dataset,index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,"[0.03713018032121684, 0.0, -1.0, 0.0, 0.0, -1....",SVFLTFLR,"[0, 1, 0, 0, 0, 0]",0.25
1,1,"[0.32880081359926777, 0.0, -1.0, 1.0, 0.0, -1....",KTSQIFLAK,"[0, 1, 0, 0, 0, 0]",0.28
1,2,"[0.03919235848040409, 0.0, 0.0, 0.0, 0.0, 0.0,...",SPVGRVTPKEWR,"[0, 0, 1, 0, 0, 0]",0.28
1,3,"[0.11537762755556774, 0.0, 0.0, 0.0, 0.0, 0.0,...",SHIWPEYCSRALR,"[0, 0, 1, 0, 0, 0]",0.30
1,4,"[0.003340655605539741, 0.0, 0.0, 0.00303169307...",ELESQISELQEDLESERASR,"[0, 0, 1, 0, 0, 0]",0.20
...,...,...,...,...,...
2,41587,"[0.12310221158139793, 0.0, 0.0, 0.0, 0.0, 0.0,...",LKFEEITGVINPALDKYFPSDSGVR,"[0, 0, 1, 0, 0, 0]",0.30
2,41588,"[0.036119027089409034, 0.0, 0.0, 0.0, 0.0, 0.0...",AYVGLERFLAGLRDY,"[0, 0, 1, 0, 0, 0]",0.35
2,41589,"[0.036547268719584185, 0.0, 0.0, 0.0, 0.0, 0.0...",AACLLTKWTAGR,"[0, 0, 1, 0, 0, 0]",0.23
2,41590,"[0.053176686541959346, -1.0, -1.0, 0.0, -1.0, ...",SLEKLEIIPASQ,"[1, 0, 0, 0, 0, 0]",0.30


In [66]:
# Output directory for the second partitioned write; created if missing.
out_dir = Path.cwd().joinpath("test")
out_dir.mkdir(exist_ok=True)

In [77]:
# Convert the stacked frame to Arrow and write it partitioned by "dataset".
pq.write_to_dataset(
    pa.Table.from_pandas(table),
    root_path=out_dir,
    partition_cols=["dataset"],
    existing_data_behavior="delete_matching",
)

In [78]:
# Read only the rows whose "dataset" partition value equals 1, then drop the
# outermost index level and clear the remaining index name so the result
# carries a plain, unnamed index.
dataset = pq.ParquetDataset(out_dir, filters=[("dataset", "=", 1)])
read_df = (
    dataset.read()
    .to_pandas()
    .reset_index(level=0, drop=True)
    .rename_axis(None)
)

In [85]:
# Display the frame read back from the partitioned dataset.
read_df

Unnamed: 0,intensities,sequence,precursor_charge_onehot,collision_energy_aligned_normed
0,"[0.03713018032121684, 0.0, -1.0, 0.0, 0.0, -1....",SVFLTFLR,"[0, 1, 0, 0, 0, 0]",0.25
1,"[0.32880081359926777, 0.0, -1.0, 1.0, 0.0, -1....",KTSQIFLAK,"[0, 1, 0, 0, 0, 0]",0.28
2,"[0.03919235848040409, 0.0, 0.0, 0.0, 0.0, 0.0,...",SPVGRVTPKEWR,"[0, 0, 1, 0, 0, 0]",0.28
3,"[0.11537762755556774, 0.0, 0.0, 0.0, 0.0, 0.0,...",SHIWPEYCSRALR,"[0, 0, 1, 0, 0, 0]",0.30
4,"[0.003340655605539741, 0.0, 0.0, 0.00303169307...",ELESQISELQEDLESERASR,"[0, 0, 1, 0, 0, 0]",0.20
...,...,...,...,...
41587,"[0.12310221158139793, 0.0, 0.0, 0.0, 0.0, 0.0,...",LKFEEITGVINPALDKYFPSDSGVR,"[0, 0, 1, 0, 0, 0]",0.30
41588,"[0.036119027089409034, 0.0, 0.0, 0.0, 0.0, 0.0...",AYVGLERFLAGLRDY,"[0, 0, 1, 0, 0, 0]",0.35
41589,"[0.036547268719584185, 0.0, 0.0, 0.0, 0.0, 0.0...",AACLLTKWTAGR,"[0, 0, 1, 0, 0, 0]",0.23
41590,"[0.053176686541959346, -1.0, -1.0, 0.0, -1.0, ...",SLEKLEIIPASQ,"[1, 0, 0, 0, 0, 0]",0.30


In [80]:
# Round-trip check: True when the frame read back from the partitioned dataset
# matches `df` exactly (presumably the frame loaded from input_test_file;
# depends on cell execution order).
df.equals(read_df)

True