In [None]:
# Input mode flags. The conversion loop tests them in the order
# mgf, sonar, hdmse, swim_dia and uses the first one that is truthy.
# NOTE(review): the import cell runs `from pyteomics import mgf`, which rebinds
# the name `mgf` to that module, so the value assigned here is lost once the
# import cell has executed.
mgf = False
sonar = True
hdmse = False
swim_dia = False

# Folder that is scanned for input files, and folder receiving the converted
# CSV files (written under the same file names).
in_folder = "/home/sander/Documents/Proteomics/data/ecoli"
out_folder = "/home/sander/Documents/Proteomics/data/ion_networks/ecoli_sonar/centroided_csvs"

In [None]:
import numpy as np
import pandas as pd
import os
from pyteomics import mgf


def convert_mgf(in_file_name):
    """Flatten every peak of an MGF file into one table.

    Each peak of each spectrum becomes one row. The spectrum-level retention
    time and precursor m/z are repeated for all peaks of that spectrum.

    Parameters
    ----------
    in_file_name : str
        Path of the MGF file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``MZ2`` (peak m/z), ``RT`` (``rtinseconds`` divided by 60),
        ``LOGINT`` (natural log of the peak intensity) and ``MZ1`` (first
        element of ``pepmass``). Empty, with the same columns, when the file
        holds no spectra.
    """
    # Imported locally under a distinct name: the notebook also uses `mgf` as
    # a boolean mode flag, so the global name cannot be relied on to be the
    # pyteomics module.
    from pyteomics import mgf as mgf_reader

    dimensions = ["MZ2", "RT", "LOGINT", "MZ1"]
    mz1s = []
    mz2s = []
    rts = []
    ints = []
    # The context manager closes the underlying file once all spectra are read.
    with mgf_reader.read(in_file_name) as spectra:
        for spectrum in spectra:
            params = spectrum["params"]
            intensities = spectrum["intensity array"]
            peak_count = len(intensities)
            ints.append(intensities)
            mz2s.append(spectrum["m/z array"])
            rts.append(np.repeat(params["rtinseconds"] / 60, peak_count))
            mz1s.append(np.repeat(params["pepmass"][0], peak_count))
    if not ints:
        # np.concatenate refuses an empty list; return an empty table instead.
        return pd.DataFrame(columns=dimensions, dtype=np.float64)
    data = np.stack(
        [
            np.concatenate(mz2s),
            np.concatenate(rts),
            np.log(np.concatenate(ints)),
            np.concatenate(mz1s),
        ]
    ).T
    return pd.DataFrame(data, columns=dimensions)
    
    
def convert_sonar(in_file_name, mz1_min=400, mz1_max=900, mobility_bins=200):
    """Convert one SONAR peak CSV into an (MZ2, RT, LOGINT, MZ1) table.

    Parameters
    ----------
    in_file_name : str
        Path of a CSV file containing at least the columns ``Function``,
        ``m_z``, ``rt``, ``area`` and ``mobility`` (in any order).
    mz1_min, mz1_max, mobility_bins : number, optional
        Linear rescaling of the ``mobility`` column:
        ``MZ1 = mz1_min + mobility * (mz1_max - mz1_min) / mobility_bins``.
        The defaults (400, 900, 200) reproduce the previously hard-coded
        mapping. NOTE(review): presumably the quadrupole window and bin count
        of the acquisition - confirm against the instrument method.

    Returns
    -------
    pandas.DataFrame
        Columns ``MZ2`` (``m_z``), ``RT`` (``rt``), ``LOGINT`` (natural log of
        ``area``) and ``MZ1`` (rescaled ``mobility``).
    """
    # read_csv ignores the order of `usecols` and returns columns in file
    # order, so select by name to pin the positions used below.
    columns = ["Function", "m_z", "rt", "area", "mobility"]
    frame = pd.read_csv(
        in_file_name,
        engine="c",
        dtype=np.float64,  # the `np.float` alias was removed in NumPy 1.24
        usecols=columns,
    )
    # Copy so the in-place column updates below never write into a
    # read-only view of the DataFrame.
    data = frame[columns].values.copy()
    # Keep rows from the first position where Function reaches 2 onward and
    # drop the Function column. Assumes rows are sorted by Function.
    data = data[np.searchsorted(data[:, 0], 2):, 1:]
    data[:, 2] = np.log(data[:, 2])
    data[:, 3] = mz1_min + data[:, 3] * (mz1_max - mz1_min) / mobility_bins
    dimensions = ["MZ2", "RT", "LOGINT", "MZ1"]
    return pd.DataFrame(data, columns=dimensions)


def convert_hdmse(in_file_name):
    """Convert one HDMSE peak CSV into an (MZ2, RT, LOGINT, DT) table.

    Parameters
    ----------
    in_file_name : str
        Path of a CSV file containing at least the columns ``Function``,
        ``m_z``, ``rt``, ``area`` and ``mobility`` (in any order).

    Returns
    -------
    pandas.DataFrame
        Columns ``MZ2`` (``m_z``), ``RT`` (``rt``), ``LOGINT`` (natural log of
        ``area``) and ``DT`` (``mobility``, unchanged).
    """
    # read_csv ignores the order of `usecols` and returns columns in file
    # order, so select by name to pin the positions used below.
    columns = ["Function", "m_z", "rt", "area", "mobility"]
    frame = pd.read_csv(
        in_file_name,
        engine="c",
        dtype=np.float64,  # the `np.float` alias was removed in NumPy 1.24
        usecols=columns,
    )
    # Copy so the in-place log transform never writes into a read-only view
    # of the DataFrame.
    data = frame[columns].values.copy()
    # Keep rows from the first position where Function reaches 2 onward and
    # drop the Function column. Assumes rows are sorted by Function.
    data = data[np.searchsorted(data[:, 0], 2):, 1:]
    data[:, 2] = np.log(data[:, 2])
    dimensions = ["MZ2", "RT", "LOGINT", "DT"]
    return pd.DataFrame(data, columns=dimensions)


def convert_swim_dia(in_file_name):
    """Convert one SWIM-DIA peak CSV into an (MZ2, RT, LOGINT, DT) table.

    All rows of the file are kept.

    Parameters
    ----------
    in_file_name : str
        Path of a CSV file containing at least the columns ``m_z``, ``rt``,
        ``area`` and ``mobility`` (in any order).

    Returns
    -------
    pandas.DataFrame
        Columns ``MZ2`` (``m_z``), ``RT`` (``rt``), ``LOGINT`` (natural log of
        ``area``) and ``DT`` (``mobility``, unchanged).
    """
    # Only the four output columns are read. The previous version also read
    # `Function` without ever dropping it, which left five columns for four
    # labels and made the DataFrame construction fail.
    # NOTE(review): no filtering on Function is applied here - confirm that
    # every row of a SWIM-DIA file should be kept.
    # read_csv ignores the order of `usecols`, so select by name to pin the
    # column positions.
    columns = ["m_z", "rt", "area", "mobility"]
    frame = pd.read_csv(
        in_file_name,
        engine="c",
        dtype=np.float64,  # the `np.float` alias was removed in NumPy 1.24
        usecols=columns,
    )
    # Copy so the in-place log transform never writes into a read-only view
    # of the DataFrame.
    data = frame[columns].values.copy()
    data[:, 2] = np.log(data[:, 2])
    dimensions = ["MZ2", "RT", "LOGINT", "DT"]
    return pd.DataFrame(data, columns=dimensions)

In [None]:
# Pick the converter once, before touching any file.
# The import cell rebinds `mgf` to the pyteomics module, which is always
# truthy; a plain `if mgf:` therefore sent every file to convert_mgf no matter
# which mode was configured. Only an explicit boolean True selects MGF mode.
# NOTE(review): MGF mode stays unreachable on a top-to-bottom run until the
# flag and the module stop sharing a name; the ".csv" filter below would also
# need revisiting for MGF input.
if mgf is True:
    converter = convert_mgf
elif sonar:
    converter = convert_sonar
elif hdmse:
    converter = convert_hdmse
elif swim_dia:
    converter = convert_swim_dia
else:
    # Previously this case left `data` undefined (or stale from the previous
    # file) and failed or rewrote old data inside the loop.
    raise ValueError(
        "No input mode selected: set one of mgf, sonar, hdmse or swim_dia to True"
    )

# to_csv does not create missing directories.
os.makedirs(out_folder, exist_ok=True)

for file_name in sorted(os.listdir(in_folder)):
    if not file_name.endswith(".csv"):
        continue
    print("Processing:", file_name)
    in_file_name = os.path.join(in_folder, file_name)
    out_file_name = os.path.join(out_folder, file_name)
    data = converter(in_file_name)
    data.to_csv(out_file_name, index=False)