In [5]:
# bulk database export data with multiple same annotation lines. This script can convert multiple lines into one line.
# the final format should be for example LPE 16:0_C21H44NO7P or LPE O-16:1;O_C21H44NO7P for multiple annotations.

import numpy as np
import pandas as pd

bulk_data = pd.read_table("PC-bulk-data-merged.tsv")
# need to convert all data to string so that the drop function can work.
bulk_data = bulk_data.astype('str')
bulk_data.drop_duplicates(subset=None, keep="first", inplace=True)
bulk_data_mass_list = bulk_data['Input Mass'].tolist()

print(bulk_data)
print(bulk_data_mass_list)

     Input Mass Matched Mass   Delta         Name     Formula     Ion  \
0       468.308     468.3085  0.0005     LPC 14:0  C22H46NO7P  [M+H]+   
1       468.308     468.3085  0.0005    PC O-14:0  C22H46NO7P  [M+H]+   
2       468.309     468.3085  0.0005     LPC 14:0  C22H46NO7P  [M+H]+   
3       468.309     468.3085  0.0005    PC O-14:0  C22H46NO7P  [M+H]+   
4       480.345     480.3448  0.0002   LPC O-16:1  C24H50NO6P  [M+H]+   
...         ...          ...     ...          ...         ...     ...   
1141    854.537      854.533   0.004   PC 41:11;O  C49H76NO9P  [M+H]+   
1142    854.538      854.533   0.005   PC 41:11;O  C49H76NO9P  [M+H]+   
1144    862.624      862.632   0.008  PC O-42:7;O  C50H88NO8P  [M+H]+   
1145    882.569     882.5643  0.0047   PC 43:11;O  C51H80NO9P  [M+H]+   
1146     882.57     882.5643  0.0057   PC 43:11;O  C51H80NO9P  [M+H]+   

                                          LMSD Examples  
0     https://lipidmaps.org/resources/tools/chemdb_o...  
1     h

In [6]:
# remove duplicates but keep the order of list.
bulk_data_mass_list_no_dup = list(dict.fromkeys(bulk_data_mass_list))
annotation_full_list = []

for mass in bulk_data_mass_list_no_dup:
    subset_df = bulk_data[bulk_data["Input Mass"].isin([mass])]
    print(subset_df)
    subset_df_formula_list = subset_df['Name'].tolist()
    annotation_full = str()
    if len(subset_df_formula_list) != 1 and len(list(set(subset_df_formula_list))) != 1:
        print("multiple annotations for " + str(mass))
        for subset_index in range(len(subset_df.index.tolist())):
            annotation_single = str(subset_df.iloc[subset_index]['Name']) + "_" + str(subset_df.iloc[subset_index]['Formula'])
            print(annotation_single)
            annotation_full += annotation_single
            if subset_index < len(subset_df.index.tolist()) - 1:
                annotation_full += " or "
        print(annotation_full)
        annotation_full_list.append(annotation_full)
    elif len(subset_df_formula_list) != 1 and len(list(set(subset_df_formula_list))) == 1:
        annotation_single = str(subset_df.iloc[0]['Name']) + "_" + str(subset_df.iloc[0]['Formula'])
        annotation_full_list.append(annotation_single)
        print(str(mass) + " same annotation but multiple times")
    else:
        annotation_single = str(subset_df.iloc[0]['Name']) + "_" + str(subset_df.iloc[0]['Formula'])
        annotation_full_list.append(annotation_single)
        print("only have one annotation")

    Input Mass Matched Mass   Delta          Name     Formula     Ion  \
0      468.308     468.3085  0.0005      LPC 14:0  C22H46NO7P  [M+H]+   
1      468.308     468.3085  0.0005     PC O-14:0  C22H46NO7P  [M+H]+   
359    468.308     468.3085  0.0005  LPC O-14:1;O  C22H46NO7P  [M+H]+   

                                         LMSD Examples  
0    https://lipidmaps.org/resources/tools/chemdb_o...  
1    https://lipidmaps.org/resources/tools/chemdb_o...  
359                                                nan  
multiple annotations for 468.308
LPC 14:0_C22H46NO7P
PC O-14:0_C22H46NO7P
LPC O-14:1;O_C22H46NO7P
LPC 14:0_C22H46NO7P or PC O-14:0_C22H46NO7P or LPC O-14:1;O_C22H46NO7P
    Input Mass Matched Mass   Delta          Name     Formula     Ion  \
2      468.309     468.3085  0.0005      LPC 14:0  C22H46NO7P  [M+H]+   
3      468.309     468.3085  0.0005     PC O-14:0  C22H46NO7P  [M+H]+   
361    468.309     468.3085  0.0005  LPC O-14:1;O  C22H46NO7P  [M+H]+   

                 

807                                                nan  
multiple annotations for 778.537
PC 36:6_C44H76NO8P
PC O-36:7;O_C44H76NO8P
PC 36:6_C44H76NO8P or PC O-36:7;O_C44H76NO8P
    Input Mass Matched Mass   Delta         Name     Formula     Ion  \
187    780.553     780.5538  0.0008      PC 36:5  C44H78NO8P  [M+H]+   
812    780.553     780.5538  0.0008  PC O-36:6;O  C44H78NO8P  [M+H]+   

                                         LMSD Examples  
187  https://lipidmaps.org/resources/tools/chemdb_o...  
812                                                nan  
multiple annotations for 780.553
PC 36:5_C44H78NO8P
PC O-36:6;O_C44H78NO8P
PC 36:5_C44H78NO8P or PC O-36:6;O_C44H78NO8P
    Input Mass Matched Mass   Delta         Name     Formula     Ion  \
188    782.567     782.5694  0.0024      PC 36:4  C44H80NO8P  [M+H]+   
814    782.567     782.5694  0.0024  PC O-36:5;O  C44H80NO8P  [M+H]+   

                                         LMSD Examples  
188  https://lipidmaps.org/resources/tool

In [7]:
final_df = pd.DataFrame(list(zip(bulk_data_mass_list_no_dup, annotation_full_list)), columns=['mass', 'annotation'])
final_df.to_csv('PC-bulk-data.csv', index=False)

In [8]:
annotation_full_list

['LPC 14:0_C22H46NO7P or PC O-14:0_C22H46NO7P or LPC O-14:1;O_C22H46NO7P',
 'LPC 14:0_C22H46NO7P or PC O-14:0_C22H46NO7P or LPC O-14:1;O_C22H46NO7P',
 'LPC O-16:1_C24H50NO6P',
 'LPC O-16:0_C24H52NO6P',
 'LPC O-16:0_C24H52NO6P',
 'LPC 16:1_C24H48NO7P or LPC O-16:2;O_C24H48NO7P',
 'LPC 16:1_C24H48NO7P or LPC O-16:2;O_C24H48NO7P',
 'LPC 16:0_C24H50NO7P or PC O-16:0_C24H50NO7P or LPC O-16:1;O_C24H50NO7P',
 'LPC 16:0_C24H50NO7P or PC O-16:0_C24H50NO7P or LPC O-16:1;O_C24H50NO7P',
 'PC 16:4_C24H40NO8P',
 'LPC 17:2_C25H48NO7P or LPC O-17:3;O_C25H48NO7P',
 'LPC O-18:2_C26H52NO6P',
 'LPC O-18:2_C26H52NO6P',
 'LPC 17:1_C25H50NO7P or LPC O-17:2;O_C25H50NO7P',
 'LPC O-18:1_C26H54NO6P',
 'PC 16:0_C24H48NO8P or LPC 16:1;O_C24H48NO8P',
 'PC 16:0_C24H48NO8P or LPC 16:1;O_C24H48NO8P',
 'LPC 17:0_C25H52NO7P or PC O-17:0_C25H52NO7P or LPC O-17:1;O_C25H52NO7P',
 'LPC 17:0_C25H52NO7P or PC O-17:0_C25H52NO7P or LPC O-17:1;O_C25H52NO7P',
 'LPC O-18:0_C26H56NO6P',
 'LPC 18:3_C26H48NO7P or LPC O-18:4;O_C26H48N

In [None]:
my_list = [1, 2, 3, 4, 5]
result = ""

for element in my_list:
    result += str(element)

print(result)

In [None]:
my_list = ["apple", "banana", "orange", "grape"]
result = ""

for index, element in enumerate(my_list):
    result += element
    if index < len(my_list) - 1:
        result += " and "

print(result)