In [19]:
# Explicitly tests the sqlite functions with valid inputs; these should all pass.

import sqlite3
import pandas as pd
import os
from mzsql import *

def setup_test_db_with_convert():
    """Create a fresh SQLite test database from the demo mzML file.

    Any existing ``test_database.db`` is deleted first, then the demo file is
    handed to ``turn_mzml_sqlite`` with ``ordered="mz"``.

    Returns:
        str: The database path that was passed to the converter.
    """
    source_mzml = "../demo_data/180205_Poo_TruePoo_Full1.mzML"
    db_path = "test_database.db"
    # Remove any leftover file so the conversion never sees a stale database.
    if os.path.exists(db_path):
        os.remove(db_path)
    turn_mzml_sqlite(source_mzml, db_path, ordered="mz")
    return db_path

def teardown_test_db(test_db):
    """Delete the database file at ``test_db`` if it exists.

    Args:
        test_db (str): Path of the file to remove. A missing path is ignored.
    """
    if not os.path.exists(test_db):
        return
    os.remove(test_db)

def test_turn_mzml_sqlite():
    """Convert the demo mzML file to SQLite and check that an ``MS1`` table exists.

    The outcome is reported by printing ``PASSED`` or ``FAILED (<reason>)``
    rather than by raising. The output database is removed afterwards in
    either case.
    """
    output_db = "output_test.db"
    mzml_file = "../demo_data/180205_Poo_TruePoo_Full1.mzML"
    try:
        turn_mzml_sqlite(mzml_file, output_db, ordered="mz")
        conn = sqlite3.connect(output_db)
        try:
            cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='MS1'")
            assert cursor.fetchone() is not None, "MS1 table was not created in the database."
        finally:
            # Close on the failure path too: an open handle would leak and can
            # stop the teardown below from deleting the file on some platforms.
            conn.close()
        print("test_turn_mzml_sqlite: PASSED")
    except Exception as e:
        print(f"test_turn_mzml_sqlite: FAILED ({e})")
    finally:
        teardown_test_db(output_db)

def test_get_chrom_sqlite():
    """Check that ``get_chrom_sqlite`` returns rows inside the requested m/z window.

    The window is ``mz +/- mz * ppm / 1e6``. The outcome is printed as
    ``PASSED`` or ``FAILED (<reason>)``; the temporary database is removed
    either way.
    """
    test_db = setup_test_db_with_convert()
    try:
        mz, ppm = 118.0865, 10  # Valid range
        tolerance = mz * ppm / 1e6  # half-width of the ppm window in m/z units
        chrom = get_chrom_sqlite(test_db, mz, ppm)
        assert not chrom.empty, "Expected non-empty chromatogram result."
        assert (chrom['mz'] >= mz - tolerance).all(), "MZ values are below expected range."
        assert (chrom['mz'] <= mz + tolerance).all(), "MZ values are above expected range."
        print("test_get_chrom_sqlite: PASSED")
    except Exception as err:
        print(f"test_get_chrom_sqlite: FAILED ({err})")
    finally:
        teardown_test_db(test_db)

def test_get_spec_sqlite():
    """Check that ``get_spec_sqlite`` returns only rows for the requested spectrum.

    The outcome is printed as ``PASSED`` or ``FAILED (<reason>)``; the
    temporary database is removed either way.
    """
    test_db = setup_test_db_with_convert()
    try:
        spectrum_idx = 1
        spectrum = get_spec_sqlite(test_db, spectrum_idx)
        assert not spectrum.empty, "Expected non-empty spectrum data."
        # Every returned row must carry the id that was asked for.
        assert (spectrum['id'] == spectrum_idx).all(), "Spectrum IDs do not match the requested index."
        print("test_get_spec_sqlite: PASSED")
    except Exception as err:
        print(f"test_get_spec_sqlite: FAILED ({err})")
    finally:
        teardown_test_db(test_db)

def test_get_rtrange_sqlite():
    """Check that ``get_rtrange_sqlite`` returns rows within ``[rtstart, rtend]``.

    The outcome is printed as ``PASSED`` or ``FAILED (<reason>)``; the
    temporary database is removed either way.
    """
    test_db = setup_test_db_with_convert()
    try:
        rtstart, rtend = 5.0, 10.0
        in_range = get_rtrange_sqlite(test_db, rtstart, rtend)
        assert not in_range.empty, "Expected non-empty result for retention time range."
        # Both bounds are inclusive in these checks.
        assert (in_range['rt'] >= rtstart).all(), "Retention times are below the expected start range."
        assert (in_range['rt'] <= rtend).all(), "Retention times exceed the expected end range."
        print("test_get_rtrange_sqlite: PASSED")
    except Exception as err:
        print(f"test_get_rtrange_sqlite: FAILED ({err})")
    finally:
        teardown_test_db(test_db)

# Run tests. Each test prints its own PASSED/FAILED line and reports failures
# inside its try block by printing rather than raising, so a failed assertion
# does not stop the remaining tests. Errors raised while building the test
# database (outside those try blocks) are not caught.
if __name__ == "__main__":
    test_turn_mzml_sqlite()
    test_get_chrom_sqlite()
    test_get_spec_sqlite()
    test_get_rtrange_sqlite()


test_turn_mzml_sqlite: PASSED
test_get_chrom_sqlite: PASSED
test_get_spec_sqlite: PASSED
test_get_rtrange_sqlite: PASSED


In [2]:
# Explicitly tests the sqlite functions for error cases, using bad data. These should fail.

import sqlite3
import pandas as pd
import os
from mzsql import *

def setup_test_db_with_convert():
    """Create a fresh SQLite test database from the demo mzML file.

    Any existing ``test_database.db`` is deleted first, then the demo file is
    handed to ``turn_mzml_sqlite`` with ``ordered="mz"``.

    Returns:
        str: The database path that was passed to the converter.
    """
    source_mzml = "../demo_data/180205_Poo_TruePoo_Full1.mzML"
    db_path = "test_database.db"
    # Remove any leftover file so the conversion never sees a stale database.
    if os.path.exists(db_path):
        os.remove(db_path)
    turn_mzml_sqlite(source_mzml, db_path, ordered="mz")
    return db_path

def teardown_test_db(test_db):
    """Delete the database file at ``test_db`` if it exists.

    Args:
        test_db (str): Path of the file to remove. A missing path is ignored.
    """
    if not os.path.exists(test_db):
        return
    os.remove(test_db)

def test_turn_mzml_sqlite():
    """Call ``turn_mzml_sqlite`` with an invalid ``ordered`` value.

    This is the bad-input variant of the conversion test: ``ordered`` is set
    to ``"bad value"``. The outcome is reported by printing ``PASSED`` or
    ``FAILED (<reason>)`` rather than by raising, and the output database is
    removed afterwards in either case.
    """
    output_db = "output_test.db"
    mzml_file = "../demo_data/180205_Poo_TruePoo_Full1.mzML"
    try:
        turn_mzml_sqlite(mzml_file, output_db, ordered="bad value")
        conn = sqlite3.connect(output_db)
        try:
            cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='MS1'")
            assert cursor.fetchone() is not None, "MS1 table was not created in the database."
        finally:
            # Close on the failure path too: an open handle would leak and can
            # stop the teardown below from deleting the file on some platforms.
            conn.close()
        print("test_turn_mzml_sqlite: PASSED")
    except Exception as e:
        print(f"test_turn_mzml_sqlite: FAILED ({e})")
    finally:
        teardown_test_db(output_db)

def test_get_chrom_sqlite():
    """Bad-input variant: query ``get_chrom_sqlite`` at m/z 300 with 10 ppm.

    The same checks as the valid-input test are applied (non-empty result,
    all m/z values inside ``mz +/- mz * ppm / 1e6``). The outcome is printed
    as ``PASSED`` or ``FAILED (<reason>)``; the temporary database is removed
    either way.
    """
    test_db = setup_test_db_with_convert()
    try:
        mz, ppm = 300, 10
        tolerance = mz * ppm / 1e6  # half-width of the ppm window in m/z units
        chrom = get_chrom_sqlite(test_db, mz, ppm)
        assert not chrom.empty, "Expected non-empty chromatogram result."
        assert (chrom['mz'] >= mz - tolerance).all(), "MZ values are below expected range."
        assert (chrom['mz'] <= mz + tolerance).all(), "MZ values are above expected range."
        print("test_get_chrom_sqlite: PASSED")
    except Exception as err:
        print(f"test_get_chrom_sqlite: FAILED ({err})")
    finally:
        teardown_test_db(test_db)

def test_get_spec_sqlite():
    """Bad-input variant: request spectrum index 10 from ``get_spec_sqlite``.

    The result must be non-empty and every row must carry the requested id.
    The outcome is printed as ``PASSED`` or ``FAILED (<reason>)``; the
    temporary database is removed either way.
    """
    test_db = setup_test_db_with_convert()
    try:
        spectrum_idx = 10
        spectrum = get_spec_sqlite(test_db, spectrum_idx)
        assert not spectrum.empty, "Expected non-empty spectrum data."
        # Every returned row must carry the id that was asked for.
        assert (spectrum['id'] == spectrum_idx).all(), "Spectrum IDs do not match the requested index."
        print("test_get_spec_sqlite: PASSED")
    except Exception as err:
        print(f"test_get_spec_sqlite: FAILED ({err})")
    finally:
        teardown_test_db(test_db)

def test_get_rtrange_sqlite():
    """Bad-input variant: call ``get_rtrange_sqlite`` with start > end (10.0, 5.0).

    The same checks as the valid-input test are applied. The outcome is
    printed as ``PASSED`` or ``FAILED (<reason>)``; the temporary database is
    removed either way.
    """
    test_db = setup_test_db_with_convert()
    try:
        rtstart, rtend = 10.0, 5.0  # deliberately reversed bounds
        in_range = get_rtrange_sqlite(test_db, rtstart, rtend)
        assert not in_range.empty, "Expected non-empty result for retention time range."
        assert (in_range['rt'] >= rtstart).all(), "Retention times are below the expected start range."
        assert (in_range['rt'] <= rtend).all(), "Retention times exceed the expected end range."
        print("test_get_rtrange_sqlite: PASSED")
    except Exception as err:
        print(f"test_get_rtrange_sqlite: FAILED ({err})")
    finally:
        teardown_test_db(test_db)

# Run tests. These use deliberately bad inputs, so a FAILED line from each
# test is the intended outcome for this cell. Failures inside each test's try
# block are printed rather than raised, so all four tests run.
if __name__ == "__main__":
    test_turn_mzml_sqlite()
    test_get_chrom_sqlite()
    test_get_spec_sqlite()
    test_get_rtrange_sqlite()


test_turn_mzml_sqlite: FAILED (Invalid column for indexing: bad value. Must be one of 'mz', 'int', or 'rt'.)
test_get_chrom_sqlite: FAILED (Expected non-empty chromatogram result.)
test_get_spec_sqlite: FAILED (Expected non-empty spectrum data.)
test_get_rtrange_sqlite: FAILED (Expected non-empty result for retention time range.)


In [27]:
import pandas as pd
from mzsql import *  

def compare_get_chrom_results(file_paths, mz, ppm):
    """Run the matching ``get_chrom_*`` reader on each file and print a preview.

    Args:
        file_paths (dict): Maps a key such as ``'sqlite_file'`` to a file path.
            The text before the first underscore of the key selects the reader.
        mz: Target m/z value, passed through to each reader.
        ppm: Tolerance value, passed through to each reader.

    Returns:
        None. Progress, errors and the first rows of each result are printed.
    """
    readers = {
        'mzml': get_chrom_mzml_pymzml,
        'mza': get_chrom_mza,
        'sqlite': get_chrom_sqlite,
        'mzmlb': get_chrom_mzmlb,
        'mz5': get_chrom_mz5,
        'duckdb' : get_chrom_duckdb,
        'mzdb' :get_chrom_mzdb,
    }

    chroms = {}

    for label, path in file_paths.items():
        print(f"\nProcessing file: {label} ({path})")
        try:
            reader = readers.get(label.split('_')[0])
            if reader is None:
                print(f"Unsupported file type for {label}. Skipping.")
                continue
            chroms[label] = reader(path, mz, ppm)
        except Exception as err:
            # A failing reader is reported and left out of the summary below.
            print(f"Error processing {label}: {err}")

    print("\nComparison of get_chrom results for each file:")
    for label, chrom in chroms.items():
        print(f"\nFile: {label}")
        if chrom is None or chrom.empty:
            print("No data or error encountered.")
            continue
        print("Chromatogram Data (First 5 Rows):")
        print(chrom.head())

# Demo files to read. The text before the first '_' in each key is what
# compare_get_chrom_results uses to choose the reader, so the bare 'mzdb' key
# still resolves to the mzDB reader.
file_paths = {
    'sqlite_file': "../demo_data/180205_Poo_TruePoo_Full1_ordered_rt.sqlite",
    'mzmlb_file': "../demo_data/180205_Poo_TruePoo_Full1.mzMLB",
    'mzml_file': "../demo_data/180205_Poo_TruePoo_Full1_idx.mzML",
    'duckdb_file': "../demo_data/180205_Poo_TruePoo_Full1.duckdb",
    'mza_file': "../demo_data/180205_Poo_TruePoo_Full1.mza",
    'mz5_file': "../demo_data/180205_Poo_TruePoo_Full1.mz5",
    'mzdb': "../demo_data/180205_Poo_TruePoo_Full1.raw.mzDB",
}

# Parameters for the get_chrom function
mz = 118.086
ppm = 10

# Run the comparison
compare_get_chrom_results(file_paths, mz, ppm)



Processing file: sqlite_file (../demo_data/180205_Poo_TruePoo_Full1_ordered_rt.sqlite)

Processing file: mzmlb_file (../demo_data/180205_Poo_TruePoo_Full1.mzMLB)

Processing file: mzml_file (../demo_data/180205_Poo_TruePoo_Full1_idx.mzML)

Processing file: duckdb_file (../demo_data/180205_Poo_TruePoo_Full1.duckdb)

Processing file: mza_file (../demo_data/180205_Poo_TruePoo_Full1.mza)

Processing file: mz5_file (../demo_data/180205_Poo_TruePoo_Full1.mz5)

Processing file: mzdb (../demo_data/180205_Poo_TruePoo_Full1.raw.mzDB)

Comparison of get_chrom results for each file:

File: sqlite_file
Chromatogram Data (First 5 Rows):
   id          mz           int        rt
0   7  118.086632  5.415321e+03  0.061674
1   9  118.086525  1.626918e+06  0.076666
2  11  118.086517  4.403678e+06  0.091470
3  13  118.086533  3.256872e+06  0.108324
4  15  118.086533  1.315048e+06  0.123136

File: mzmlb_file
Chromatogram Data (First 5 Rows):
           mz           int        rt
0  118.086632  5.415321e+0

In [18]:
import pandas as pd
from mzsql import *

def compare_get_spec_results(file_paths, spectrum_id):
    """Run the matching ``get_spec_*`` reader on each file and print a preview.

    Args:
        file_paths (dict): Maps a key such as ``'sqlite_file'`` to a file path.
            The text before the first underscore of the key selects the reader.
        spectrum_id: Spectrum identifier, passed through to each reader.

    Returns:
        None. Progress, errors and the first rows of each result are printed.
    """
    # Reader function for each supported key prefix.
    readers = {
        'mzml': get_spec_mzml_pymzml,
        'mza': get_spec_mza,
        'sqlite': get_spec_sqlite,
        'mzmlb': get_spec_mzmlb,
        'mz5': get_spec_mz5,
        'duckdb': get_spec_duckdb,
        'mzdb' :get_spec_mzdb,
    }

    spectra = {}

    for label, path in file_paths.items():
        print(f"\nProcessing file: {label} ({path})")
        try:
            reader = readers.get(label.split('_')[0])
            if reader is None:
                print(f"Unsupported file type for {label}. Skipping.")
                continue
            spectra[label] = reader(path, spectrum_id)
        except Exception as err:
            # A failing reader is reported and left out of the summary below.
            print(f"Error processing {label}: {err}")

    print("\nComparison of get_spec results for each file:")
    for label, spectrum in spectra.items():
        print(f"\nFile: {label}")
        if spectrum is None or spectrum.empty:
            print("No data or error encountered.")
            continue
        print("Spectra Data (First 5 Rows):")
        print(spectrum.head())

# Demo files to read. The text before the first '_' in each key is what
# compare_get_spec_results uses to choose the reader.
# NOTE(review): the mzDB path here ends in ".mzDB", while other cells in this
# notebook use ".raw.mzDB" -- confirm which file name is intended.
file_paths = {
    'sqlite_file': "../demo_data/180205_Poo_TruePoo_Full1_ordered_rt.sqlite",
    'mza_file': "../demo_data/180205_Poo_TruePoo_Full1.mza",
    'mzml_file': "../demo_data/180205_Poo_TruePoo_Full1_idx.mzML",
    'duckdb_file': "../demo_data/180205_Poo_TruePoo_Full1.duckdb",
    'mzmlb_file': "../demo_data/180205_Poo_TruePoo_Full1.mzMLB",
    'mz5_file': "../demo_data/180205_Poo_TruePoo_Full1.mz5",
    'mzdb': "../demo_data/180205_Poo_TruePoo_Full1.mzDB",
}

# Spectrum identifier requested from every reader.
spectrum_id = 1

# Run the comparison
compare_get_spec_results(file_paths, spectrum_id)



Processing file: sqlite_file (../demo_data/180205_Poo_TruePoo_Full1_ordered_rt.sqlite)

Processing file: mza_file (../demo_data/180205_Poo_TruePoo_Full1.mza)

Processing file: mzml_file (../demo_data/180205_Poo_TruePoo_Full1_idx.mzML)

Processing file: duckdb_file (../demo_data/180205_Poo_TruePoo_Full1.duckdb)

Processing file: mzmlb_file (../demo_data/180205_Poo_TruePoo_Full1.mzMLB)

Processing file: mz5_file (../demo_data/180205_Poo_TruePoo_Full1.mz5)

Processing file: mzdb (../demo_data/180205_Poo_TruePoo_Full1.mzDB)

Comparison of get_spec results for each file:

File: sqlite_file
Spectra Data (First 5 Rows):
   id         mz         int        rt
0   1  62.010094  470.188416  0.005402
1   1  64.353043  496.883057  0.005402
2   1  68.521904  547.366028  0.005402
3   1  76.369545  445.280975  0.005402
4   1  77.480682  436.751068  0.005402

File: mza_file
Spectra Data (First 5 Rows):
          mz         int
0  62.010094  470.188416
1  64.353043  496.883057
2  68.521904  547.366028

In [17]:
import pandas as pd
from mzsql import *

def compare_get_rtrange_results(file_paths, start_rt, end_rt):
    """Run the matching ``get_rtrange_*`` reader on each file and print a preview.

    Args:
        file_paths (dict): Maps a key such as ``'sqlite_file'`` to a file path.
            The text before the first underscore of the key selects the reader.
        start_rt: Start of the retention-time range, passed to each reader.
        end_rt: End of the retention-time range, passed to each reader.

    Returns:
        None. Progress, errors and the first rows of each result are printed.
    """
    # Reader function for each supported key prefix.
    readers = {
        'mzml': get_rtrange_mzml_pymzml,
        'mza': get_rtrange_mza,
        'mzmd': get_rtrange_mzMD,
        'sqlite': get_rtrange_sqlite,
        'mzmlb': get_rtrange_mzmlb,
        'mz5': get_rtrange_mz5,
        'duckdb': get_rtrange_duckdb,
        'mzdb' :get_rtrange_mzdb,
    }

    windows = {}

    for label, path in file_paths.items():
        print(f"\nProcessing file: {label} ({path})")
        try:
            reader = readers.get(label.split('_')[0])
            if reader is None:
                print(f"Unsupported file type for {label}. Skipping.")
                continue
            windows[label] = reader(path, start_rt, end_rt)
        except Exception as err:
            # A failing reader is reported and left out of the summary below.
            print(f"Error processing {label}: {err}")

    print("\nComparison of get_rtrange results for each file:")
    for label, window in windows.items():
        print(f"\nFile: {label}")
        if window is None or window.empty:
            print("No data or error encountered.")
            continue
        print("Retention Time Range Data (First 5 Rows):")
        print(window.head())

# Demo files to read. The text before the first '_' in each key is what
# compare_get_rtrange_results uses to choose the reader.
# NOTE(review): the mzDB path here ends in ".mzDB", while other cells in this
# notebook use ".raw.mzDB" -- confirm which file name is intended.
file_paths = {
    'sqlite_file': "../demo_data/180205_Poo_TruePoo_Full1_ordered_rt.sqlite",
    'mzmlb_file': "../demo_data/180205_Poo_TruePoo_Full1.mzMLB",
    'mzml_file': "../demo_data/180205_Poo_TruePoo_Full1_idx.mzML",
    'duckdb_file': "../demo_data/180205_Poo_TruePoo_Full1.duckdb",
    'mza_file': "../demo_data/180205_Poo_TruePoo_Full1.mza",
    'mz5_file': "../demo_data/180205_Poo_TruePoo_Full1.mz5",
    'mzdb': "../demo_data/180205_Poo_TruePoo_Full1.mzDB",
}

# Retention-time window requested from every reader.
start_rt = 6.5
end_rt = 8.0

# Run the comparison
compare_get_rtrange_results(file_paths, start_rt, end_rt)



Processing file: sqlite_file (../demo_data/180205_Poo_TruePoo_Full1_ordered_rt.sqlite)

Processing file: mzmlb_file (../demo_data/180205_Poo_TruePoo_Full1.mzMLB)

Processing file: mzml_file (../demo_data/180205_Poo_TruePoo_Full1_idx.mzML)

Processing file: duckdb_file (../demo_data/180205_Poo_TruePoo_Full1.duckdb)

Processing file: mza_file (../demo_data/180205_Poo_TruePoo_Full1.mza)

Processing file: mz5_file (../demo_data/180205_Poo_TruePoo_Full1.mz5)

Processing file: mzdb (../demo_data/180205_Poo_TruePoo_Full1.mzDB)

Comparison of get_rtrange results for each file:

File: sqlite_file
Retention Time Range Data (First 5 Rows):
    id         mz           int      rt
0  821  60.045113  34041.824219  6.5154
1  821  60.056366   9991.162109  6.5154
2  821  60.058144  11974.023438  6.5154
3  821  60.064445  26377.527344  6.5154
4  821  60.081547   7059.808594  6.5154

File: mzmlb_file
Retention Time Range Data (First 5 Rows):
          mz           int      rt
0  60.045113  34041.824219 

In [30]:
from mzsql import * 
import pandas as pd
import numpy as np

def compare_results_with_functions(file_paths, start_rt, end_rt):
    """
    Check that every file yields the same 'rt', 'mz' and 'int' values for an RT range.

    Each file is read with the ``get_rtrange_*`` function selected by the text
    before the first underscore of its key. The first usable result becomes
    the reference; every other result is sorted the same way and compared to
    it column by column with ``np.allclose(..., atol=1e-5)``.

    Args:
        file_paths (dict): Dictionary of file keys and file paths.
        start_rt (float): Start of the retention time range.
        end_rt (float): End of the retention time range.

    Returns:
        None. Prints comparison results.
    """
    # Reader function for each supported key prefix.
    readers = {
        'mzml': get_rtrange_mzml_pymzml,
        'mza': get_rtrange_mza,
        'sqlite': get_rtrange_sqlite,
        'mzmlb': get_rtrange_mzmlb,
        'mz5': get_rtrange_mz5,
        'duckdb': get_rtrange_duckdb,
        'mzdb' :get_rtrange_mzdb,
    }

    fetched = {}

    # Read every file; a failing reader is recorded as None.
    for label, path in file_paths.items():
        print(f"\nProcessing file: {label} ({path})")
        try:
            reader = readers.get(label.split('_')[0])
            if reader is None:
                print(f"Unsupported file type for {label}. Skipping.")
                continue
            fetched[label] = reader(path, start_rt, end_rt)
        except Exception as err:
            print(f"Error processing {label}: {err}")
            fetched[label] = None

    # Keep only results that actually contain rows.
    usable = {
        label: frame
        for label, frame in fetched.items()
        if not (frame is None or frame.empty)
    }

    if len(usable) < 2:
        print("Not enough valid files to perform comparisons.")
        return

    # Same columns, same row order, fresh 0..n-1 index for positional lookups.
    columns = ['rt', 'mz', 'int']
    normalized = {
        label: frame[columns].sort_values(by=['rt', 'mz']).reset_index(drop=True)
        for label, frame in usable.items()
    }

    # The first usable file is the reference.
    ref_key = next(iter(normalized))
    ref_df = normalized[ref_key]
    print(f"\nReference file for comparison: {ref_key}\n")

    for label, frame in normalized.items():
        if label == ref_key:
            continue

        print(f"Comparing '{label}' to '{ref_key}':")

        # Element-wise comparison only makes sense for equal row counts.
        if len(frame) != len(ref_df):
            print(f"  Mismatch in number of rows: {len(frame)} vs {len(ref_df)}")
            continue

        for column in columns:
            if np.allclose(frame[column], ref_df[column], atol=1e-5):
                print(f"  Column '{column}' matches exactly.")
                continue
            # Report where the values differ and show up to 5 examples.
            mismatch_indices = np.where(~np.isclose(frame[column], ref_df[column], atol=1e-5))[0]
            print(f"  Column '{column}' has mismatches at rows: {mismatch_indices}")
            for idx in mismatch_indices[:5]:
                print(f"    {column} (ref={ref_df[column][idx]}, file={frame[column][idx]})")

        print("\n")

# File paths and RT range
# Demo files to compare. The text before the first '_' in each key selects
# the reader; the first entry that yields data becomes the reference.
file_paths = {
    'sqlite_file': "../demo_data/180205_Poo_TruePoo_Full1_ordered_rt.sqlite",
    'mzmlb_file': "../demo_data/180205_Poo_TruePoo_Full1.mzMLB",
    'mzml_file': "../demo_data/180205_Poo_TruePoo_Full1_idx.mzML",
    'duckdb_file': "../demo_data/180205_Poo_TruePoo_Full1.duckdb",
    'mza_file': "../demo_data/180205_Poo_TruePoo_Full1.mza",
    'mz5_file': "../demo_data/180205_Poo_TruePoo_Full1.mz5",
    'mzdb': "../demo_data/180205_Poo_TruePoo_Full1.raw.mzDB",
}

# Retention-time window requested from every reader.
start_rt = 6.5
end_rt = 8.0

# Run the comparison
compare_results_with_functions(file_paths, start_rt, end_rt)



Processing file: sqlite_file (../demo_data/180205_Poo_TruePoo_Full1_ordered_rt.sqlite)

Processing file: mzmlb_file (../demo_data/180205_Poo_TruePoo_Full1.mzMLB)

Processing file: mzml_file (../demo_data/180205_Poo_TruePoo_Full1_idx.mzML)

Processing file: duckdb_file (../demo_data/180205_Poo_TruePoo_Full1.duckdb)

Processing file: mza_file (../demo_data/180205_Poo_TruePoo_Full1.mza)

Processing file: mz5_file (../demo_data/180205_Poo_TruePoo_Full1.mz5)

Processing file: mzdb (../demo_data/180205_Poo_TruePoo_Full1.raw.mzDB)

Reference file for comparison: sqlite_file

Comparing 'mzmlb_file' to 'sqlite_file':
  Column 'rt' matches exactly.
  Column 'mz' matches exactly.
  Column 'int' matches exactly.


Comparing 'mzml_file' to 'sqlite_file':
  Column 'rt' matches exactly.
  Column 'mz' matches exactly.
  Column 'int' matches exactly.


Comparing 'duckdb_file' to 'sqlite_file':
  Column 'rt' matches exactly.
  Column 'mz' matches exactly.
  Column 'int' matches exactly.


Comparing 'mz

In [28]:
import pandas as pd
from mzsql import *
import numpy as np

def compare_get_spec_results(file_paths, spectrum_id):
    """
    Check that every file yields the same 'mz' and 'int' values for one spectrum.

    Each file is read with the ``get_spec_*`` function selected by the text
    before the first underscore of its key. The first usable result becomes
    the reference; every other result is sorted by 'mz' and compared to it
    column by column with ``np.allclose(..., atol=1e-5)``.

    Args:
        file_paths (dict): Dictionary of file keys and file paths.
        spectrum_id (int): The spectrum ID to retrieve data for.

    Returns:
        None. Prints comparison results.
    """
    # Reader function for each supported key prefix.
    readers = {
        'mzml': get_spec_mzml_pymzml,
        'mza': get_spec_mza,
        'sqlite': get_spec_sqlite,
        'mzmlb': get_spec_mzmlb,
        'mz5': get_spec_mz5,
        'duckdb': get_spec_duckdb,
        'mzdb': get_spec_mzdb,
    }

    fetched = {}

    # Read every file; a failing reader is recorded as None.
    for label, path in file_paths.items():
        print(f"\nProcessing file: {label} ({path})")
        try:
            reader = readers.get(label.split('_')[0])
            if reader is None:
                print(f"Unsupported file type for {label}. Skipping.")
                continue
            fetched[label] = reader(path, spectrum_id)
        except Exception as err:
            print(f"Error processing {label}: {err}")
            fetched[label] = None

    # Keep only results that actually contain rows.
    usable = {
        label: frame
        for label, frame in fetched.items()
        if not (frame is None or frame.empty)
    }

    if len(usable) < 2:
        print("\nNot enough valid files to perform comparisons.")
        return

    # Same columns, same row order, fresh 0..n-1 index for positional lookups.
    columns = ['mz', 'int']
    normalized = {
        label: frame[columns].sort_values(by=['mz']).reset_index(drop=True)
        for label, frame in usable.items()
    }

    # The first usable file is the reference.
    ref_key = next(iter(normalized))
    ref_df = normalized[ref_key]
    print(f"\nReference file for comparison: {ref_key}\n")

    for label, frame in normalized.items():
        if label == ref_key:
            continue

        print(f"Comparing '{label}' to '{ref_key}':")

        # Element-wise comparison only makes sense for equal row counts.
        if len(frame) != len(ref_df):
            print(f"  Mismatch in number of rows: {len(frame)} vs {len(ref_df)}")
            continue

        for column in columns:
            if np.allclose(frame[column], ref_df[column], atol=1e-5):
                print(f"  Column '{column}' matches exactly.")
                continue
            # Report where the values differ and show up to 5 examples.
            mismatch_indices = np.where(~np.isclose(frame[column], ref_df[column], atol=1e-5))[0]
            print(f"  Column '{column}' has mismatches at rows: {mismatch_indices}")
            for idx in mismatch_indices[:5]:
                print(f"    {column} (ref={ref_df[column][idx]}, file={frame[column][idx]})")

        print("\n")

# File paths and spectrum ID
# Demo files to compare. The text before the first '_' in each key selects
# the reader; the first entry that yields data becomes the reference.
file_paths = {
    'sqlite_file': "../demo_data/180205_Poo_TruePoo_Full1_ordered_rt.sqlite",
    'mza_file': "../demo_data/180205_Poo_TruePoo_Full1.mza",
    'mzml_file': "../demo_data/180205_Poo_TruePoo_Full1_idx.mzML",
    'duckdb_file': "../demo_data/180205_Poo_TruePoo_Full1.duckdb",
    'mzmlb_file': "../demo_data/180205_Poo_TruePoo_Full1.mzMLB",
    'mz5_file': "../demo_data/180205_Poo_TruePoo_Full1.mz5",
    'mzdb': "../demo_data/180205_Poo_TruePoo_Full1.raw.mzDB",
}

# Spectrum identifier requested from every reader.
spectrum_id = 1

# Run the comparison
compare_get_spec_results(file_paths, spectrum_id)



Processing file: sqlite_file (../demo_data/180205_Poo_TruePoo_Full1_ordered_rt.sqlite)

Processing file: mza_file (../demo_data/180205_Poo_TruePoo_Full1.mza)

Processing file: mzml_file (../demo_data/180205_Poo_TruePoo_Full1_idx.mzML)

Processing file: duckdb_file (../demo_data/180205_Poo_TruePoo_Full1.duckdb)

Processing file: mzmlb_file (../demo_data/180205_Poo_TruePoo_Full1.mzMLB)

Processing file: mz5_file (../demo_data/180205_Poo_TruePoo_Full1.mz5)

Processing file: mzdb (../demo_data/180205_Poo_TruePoo_Full1.raw.mzDB)

Reference file for comparison: sqlite_file

Comparing 'mza_file' to 'sqlite_file':
  Column 'mz' matches exactly.
  Column 'int' matches exactly.


Comparing 'mzml_file' to 'sqlite_file':
  Column 'mz' matches exactly.
  Column 'int' matches exactly.


Comparing 'duckdb_file' to 'sqlite_file':
  Column 'mz' matches exactly.
  Column 'int' matches exactly.


Comparing 'mzmlb_file' to 'sqlite_file':
  Mismatch in number of rows: 32 vs 34
Comparing 'mz5_file' to 'sql

In [25]:
import pandas as pd
from mzsql import *
import numpy as np

def compare_get_chrom_results(file_paths, mz, ppm):
    """
    Compare 'rt', 'mz', and 'int' columns across files for a given mz and ppm.

    Each file is read with the ``get_chrom_*`` function selected by the text
    before the first underscore of its key. The first usable result becomes
    the reference; every other result is sorted the same way and compared to
    it column by column with ``np.allclose(..., atol=1e-5)``. Results read
    with the mzDB reader have 'rt' divided by 60 before comparison.

    Args:
        file_paths (dict): Dictionary of file keys and file paths.
        mz (float): Target mz value.
        ppm (int): Parts-per-million tolerance for mz matching.

    Returns:
        None. Prints comparison results.
    """
    file_handlers = {
        'mzml': get_chrom_mzml_pymzml,
        'mza': get_chrom_mza,
        'sqlite': get_chrom_sqlite,
        'mzmlb': get_chrom_mzmlb,
        'mz5': get_chrom_mz5,
        'duckdb': get_chrom_duckdb,
        'mzdb': get_chrom_mzdb,
    }

    results = {}

    # Process each file and store results
    for file_key, file_path in file_paths.items():
        print(f"\nProcessing file: {file_key} ({file_path})")
        try:
            file_extension = file_key.split('_')[0]
            if file_extension in file_handlers:
                results[file_key] = file_handlers[file_extension](file_path, mz, ppm)
            else:
                print(f"Unsupported file type for {file_key}. Skipping.")
        except Exception as e:
            print(f"Error processing {file_key}: {e}")
            results[file_key] = None

    # Normalize and compare results
    valid_files = {k: df for k, df in results.items() if df is not None and not df.empty}

    if len(valid_files) < 2:
        print("\nNot enough valid files to perform comparisons.")
        return

    # Normalize column order and sort for consistent comparison
    for key, df in valid_files.items():
        # Decide by the same key prefix that selected the reader, so a key
        # such as 'mzdb_file' is converted just like the bare 'mzdb' key.
        if key.split('_')[0] == 'mzdb':
            # Convert 'rt' to minutes for mzdb; assign() returns a new frame
            # instead of modifying the reader's DataFrame in place.
            df = df.assign(rt=df['rt'] / 60)
        valid_files[key] = df[['rt', 'mz', 'int']].sort_values(by=['rt', 'mz']).reset_index(drop=True)

    # Reference DataFrame for comparison
    ref_key = list(valid_files.keys())[0]
    ref_df = valid_files[ref_key]
    print(f"\nReference file for comparison: {ref_key}\n")

    # Compare each file against the reference
    for file_key, df in valid_files.items():
        if file_key == ref_key:
            continue

        print(f"Comparing '{file_key}' to '{ref_key}':")

        # Ensure the same length
        if len(df) != len(ref_df):
            print(f"  Mismatch in number of rows: {len(df)} vs {len(ref_df)}")
            continue

        # Compare 'rt', 'mz', and 'int' columns
        for column in ['rt', 'mz', 'int']:
            if not np.allclose(df[column], ref_df[column], atol=1e-5):
                # Identify mismatches
                mismatch_indices = np.where(~np.isclose(df[column], ref_df[column], atol=1e-5))[0]
                print(f"  Column '{column}' has mismatches at rows: {mismatch_indices}")
                for idx in mismatch_indices[:5]:  # Show up to 5 mismatches
                    print(f"    {column} (ref={ref_df[column][idx]}, file={df[column][idx]})")
            else:
                print(f"  Column '{column}' matches exactly.")

        print("\n")

# Demo files to compare. The text before the first '_' in each key selects
# the reader; the first entry that yields data becomes the reference.
file_paths = {
    'sqlite_file': "../demo_data/180205_Poo_TruePoo_Full1_ordered_rt.sqlite",
    'mzmlb_file': "../demo_data/180205_Poo_TruePoo_Full1.mzMLB",
    'mzml_file': "../demo_data/180205_Poo_TruePoo_Full1_idx.mzML",
    'duckdb_file': "../demo_data/180205_Poo_TruePoo_Full1.duckdb",
    'mza_file': "../demo_data/180205_Poo_TruePoo_Full1.mza",
    'mz5_file': "../demo_data/180205_Poo_TruePoo_Full1.mz5",
    'mzdb': "../demo_data/180205_Poo_TruePoo_Full1.raw.mzDB",
}

# Target m/z and ppm tolerance requested from every reader.
mz = 118.086
ppm = 10

# Run the comparison
compare_get_chrom_results(file_paths, mz, ppm)



Processing file: sqlite_file (../demo_data/180205_Poo_TruePoo_Full1_ordered_rt.sqlite)

Processing file: mzmlb_file (../demo_data/180205_Poo_TruePoo_Full1.mzMLB)

Processing file: mzml_file (../demo_data/180205_Poo_TruePoo_Full1_idx.mzML)

Processing file: duckdb_file (../demo_data/180205_Poo_TruePoo_Full1.duckdb)

Processing file: mza_file (../demo_data/180205_Poo_TruePoo_Full1.mza)

Processing file: mz5_file (../demo_data/180205_Poo_TruePoo_Full1.mz5)

Processing file: mzdb (../demo_data/180205_Poo_TruePoo_Full1.raw.mzDB)

Reference file for comparison: sqlite_file

Comparing 'mzmlb_file' to 'sqlite_file':
  Column 'rt' matches exactly.
  Column 'mz' matches exactly.
  Column 'int' matches exactly.


Comparing 'mzml_file' to 'sqlite_file':
  Column 'rt' matches exactly.
  Column 'mz' matches exactly.
  Column 'int' matches exactly.


Comparing 'duckdb_file' to 'sqlite_file':
  Column 'rt' matches exactly.
  Column 'mz' matches exactly.
  Column 'int' matches exactly.


Comparing 'mz