In [1]:
from project_directories import deepstabp_ext, deepstabp_col

import os, csv
import os.path as osp
import numpy as np

def_header = ['Protein', 'species', 'asymptote', 'mid', 'm', 'Tm', 'r2']

ogt_dict = {
    'Oleispira antarctica_RB-8_lysate_R1': [3,'Oleispira antarctica'],
    'Caenorhabditis elegans lysate': [20,'Caenorhabditis elegans'],
    'Arabidopsis thaliana seedling lysate': [22,'Arabidopsis thaliana'], 
    'Drosophila melanogaster SII lysate': [25,'Drosophila melanogaster'], 
    'Danio rerio Zenodo lysate': [28,'Danio rerio'],
    'Saccharomyces cerevisiae lysate': [32,'Saccharomyces cerevisiae'],
    'Saccharomyces cerevisiae cell': [32,'Saccharomyces cerevisiae'],
    'Bacillus subtilis_168_lysate_R1': [37,'Bacillus subtilis'],
    'Escherichia coli cells': [37,'Escherichia coli'],
    'Escherichia coli lysate': [37,'Escherichia coli'],
    'Homo sapiens Jurkat cells': [37,'Homo sapiens'],
    'Homo sapiens K562 cells': [37,'Homo sapiens'],
    'Homo sapiens K562 cells_Thermal_proximity_coaggregationhermal_proximity_coaggregation': [37,'Homo sapiens'],
    'Homo sapiens K562 lysate': [37,'Homo sapiens'],
    'Homo sapiens K562 lysate_Thermal_proximity_coaggregationhermal_proximity_coaggregation': [37,'Homo sapiens'],
    'Mus musculus BMDC lysate': [37,'Mus musculus'],
    'Mus musculus liver lysate': [37,'Mus musculus'],
    'Geobacillus stearothermophilus NCA26 lysate': [55,'Geobacillus stearothermophilus'],
    'Picrophilus torridus DSM9790 lysate': [60,'Picrophilus torridus'],
    'Thermus thermophilus HB27 cells': [70,'Thermus thermophilus'],
    'Thermus thermophilus HB27 lysate': [70,'Thermus thermophilus']
}

def gather_entries(files, save_path=None):
    '''Gather all entries from a list of files.

    Computes the average Tm for any duplicate proteins.

    Parameters
    ----------
    files : list of str
        List of file paths to read.
    save_path : str, optional
        If specified, save the gathered entries to this path.

    Returns
    -------
    all_entries : list of list of str
        List of entries from all files.
    '''

    print(f'Reading {len(files)} files.\n')

    ####################################################################
    # READ AND GATHER ALL ENTRIES ACROSS FILES
    ####################################################################
    all_entries = []
    for file in files:
        with open(file, 'r') as f:
            reader = csv.reader(
                f,
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL,
                skipinitialspace=True
            )
            file_content = list(reader)

            ### UNIFY COLUMN ORDER
            # find header indices
            file_header = file_content[0]
            header_idx = [file_header.index(h) for h in def_header]

            # reorder data
            file_entries = []
            for row in file_content[1:]:
                file_entries.append(
                    [row[idx] for idx in header_idx]
                     + ogt_dict[osp.basename(file.replace('.csv', ''))]
                )
                # print(ogt_dict[osp.basename(file.replace('.csv', ''))])

            print(f'{len(file_entries)} entries in "{osp.basename(file)}"')

        all_entries += file_entries
    
    print(f'\nTotal entries: {len(all_entries)}\n')

    ### save all columns of all entries
    if save_path is not None:
        with open(save_path.replace('.csv', '-all_columns.csv'),
                  'w+') as f:
            writer = csv.writer(
                f,
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL,
                skipinitialspace=True
            )
            writer.writerow(def_header+['ogt', 'organism'])
            writer.writerows(all_entries)

    ####################################################################
    # FIND DUPLICATES
    ####################################################################

    ### duplicate by protein name
    # (some entries include gene name in identifier)
    unique_proteins, count = np.unique(
        [e[0] for e in all_entries],
        return_counts=True
    )
    duplicate_proteins = unique_proteins[count > 1]
    print(f'Unique proteins: {unique_proteins.size}')
    print(f'Duplicate proteins: {duplicate_proteins.size}\n')

    # entries with duplicates grouped together
    dup_entries = [e for a in duplicate_proteins for e in all_entries
                   if e[0]==a]
    # save duplicate entries
    if save_path is not None:
        with open(save_path.replace('.csv', '-duplicates_by_identifier.csv'),
                  'w+') as f:
            writer = csv.writer(
                f,
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL,
                skipinitialspace=True
            )
            writer.writerow(def_header+['ogt'])
            writer.writerows(dup_entries)

    ### duplicate by UniProt accession
    # (removes gene name if present)
    unique_accessions, count = np.unique(
        [e[0][:e[0].index('_')] if '_' in e[0] else e[0]
         for e in all_entries],
        return_counts=True
    )
    duplicate_accessions = unique_accessions[count > 1]
    print(f'Unique accessions: {unique_accessions.size}')
    print(f'Duplicate accessions: {duplicate_accessions.size}\n')

    # entries with duplicates grouped together
    dup_entries = []
    for accession in duplicate_accessions:
        for entry in all_entries:
            if '_' in entry[0]:
                entry_accession = entry[0][:entry[0].index('_')]
            else:
                entry_accession = entry[0]
            if entry_accession == accession:
                dup_entries.append(entry)
    # save duplicate entries
    if save_path is not None:
        with open(save_path.replace('.csv', '-duplicates_by_accession.csv'),
                  'w+') as f:
            writer = csv.writer(
                f,
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL,
                skipinitialspace=True
            )
            writer.writerow(def_header+['ogt'])
            writer.writerows(dup_entries)

    ### additionally save entries with more than 3 duplicates by accession
    dup3_accessions = unique_accessions[count > 2]
    print(f'Accessions with >2 duplicates: {dup3_accessions.size}')

    # entries with duplicates grouped together
    dup_entries = []
    for accession in dup3_accessions:
        for entry in all_entries:
            if '_' in entry[0]:
                entry_accession = entry[0][:entry[0].index('_')]
            else:
                entry_accession = entry[0]
            if entry_accession == accession:
                dup_entries.append(entry)
    # save duplicate entries
    if save_path is not None:
        with open(save_path.replace('.csv',
                                    '-over_3_duplicates_by_accession.csv'),
                  'w+') as f:
            writer = csv.writer(
                f,
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL,
                skipinitialspace=True
            )
            writer.writerow(def_header+['ogt'])
            writer.writerows(dup_entries)

    ####################################################################
    # SAVE PROCESSED ENTRIES READY FOR USE BY PYG DATASET
    ####################################################################

    final_entries = []
    multiple_species = {}
    multiple_ogt = {}
    for accession in unique_accessions:
        # find all entries with this accession
        matching_entries = [e for e in all_entries 
                            if e[0].startswith(accession)]

        # find average Tm
        Tm = np.mean([float(e[5]) for e in matching_entries])
        
        species = np.unique([str(e[8]) for e in matching_entries])
        if species.size > 1:
            multiple_species[accession] = species
            print(accession)
            print(species)
            continue

        ogt = np.unique([float(e[7]) for e in matching_entries])
        if ogt.size > 1:
            multiple_ogt[accession] = ogt
            print(accession)
            print(ogt)
            continue

        final_entries.append([accession, Tm, ogt[0], species[0]])

    print()
    print(multiple_species)
    print(multiple_ogt)

    # save to file
    with open(save_path, 'w+') as f:
        writer = csv.writer(
            f,
            quotechar='"',
            quoting=csv.QUOTE_MINIMAL,
            skipinitialspace=True
        )
        writer.writerow(['accession', 'Tm', 'ogt', 'species'])
        writer.writerows(final_entries)

filenames = [f for f in os.listdir(deepstabp_ext) if '.csv' in f.lower()]

In [2]:
# gather all lysate entries
save_path = osp.join(deepstabp_col, 'lysate.csv')

files = [osp.join(deepstabp_ext, f) for f in filenames
         if 'lysate' in f.lower()]
entries = gather_entries(files, save_path)

Reading 15 files.

2199 entries in "Mus musculus liver lysate.csv"
161 entries in "Danio rerio Zenodo lysate.csv"
1122 entries in "Oleispira antarctica_RB-8_lysate_R1.csv"
3608 entries in "Homo sapiens K562 lysate.csv"
5321 entries in "Mus musculus BMDC lysate.csv"
2980 entries in "Caenorhabditis elegans lysate.csv"
639 entries in "Geobacillus stearothermophilus NCA26 lysate.csv"
6668 entries in "Homo sapiens K562 lysate_Thermal_proximity_coaggregationhermal_proximity_coaggregation.csv"
685 entries in "Thermus thermophilus HB27 lysate.csv"
1566 entries in "Drosophila melanogaster SII lysate.csv"
1295 entries in "Bacillus subtilis_168_lysate_R1.csv"
2156 entries in "Arabidopsis thaliana seedling lysate.csv"
1920 entries in "Saccharomyces cerevisiae lysate.csv"
843 entries in "Picrophilus torridus DSM9790 lysate.csv"
1561 entries in "Escherichia coli lysate.csv"

Total entries: 32724

Unique proteins: 31666
Duplicate proteins: 1058

Unique accessions: 29758
Duplicate accessions: 2966

Ac

In [3]:
# gather all cell entries
save_path = osp.join(deepstabp_col, 'cell.csv')

files = [osp.join(deepstabp_ext, f) for f in filenames if 'cell' in f.lower()]
entries = gather_entries(files, save_path)

Reading 6 files.

6684 entries in "Homo sapiens K562 cells_Thermal_proximity_coaggregationhermal_proximity_coaggregation.csv"
1329 entries in "Escherichia coli cells.csv"
693 entries in "Thermus thermophilus HB27 cells.csv"
4033 entries in "Homo sapiens K562 cells.csv"
532 entries in "Saccharomyces cerevisiae cell.csv"
6392 entries in "Homo sapiens Jurkat cells.csv"

Total entries: 19663

Unique proteins: 11568
Duplicate proteins: 5048

Unique accessions: 11568
Duplicate accessions: 5048

Accessions with >2 duplicates: 3014

{}
{}


In [4]:
# gather all human cell entries
save_path = osp.join(deepstabp_col, 'cell-human.csv')

files = [osp.join(deepstabp_ext, f) for f in filenames
         if 'homo' in f.lower() and 'cell' in f.lower()]
entries = gather_entries(files, save_path)

Reading 3 files.

6684 entries in "Homo sapiens K562 cells_Thermal_proximity_coaggregationhermal_proximity_coaggregation.csv"
4033 entries in "Homo sapiens K562 cells.csv"
6392 entries in "Homo sapiens Jurkat cells.csv"

Total entries: 17109

Unique proteins: 9014
Duplicate proteins: 5048

Unique accessions: 9014
Duplicate accessions: 5048

Accessions with >2 duplicates: 3014

{}
{}


In [5]:
# gather all human lysate entries
save_path = osp.join(deepstabp_col, 'lysate-human.csv')

files = [osp.join(deepstabp_ext, f) for f in filenames
         if 'homo' in f.lower() and 'lysate' in f.lower()]
entries = gather_entries(files, save_path)

Reading 2 files.

3608 entries in "Homo sapiens K562 lysate.csv"
6668 entries in "Homo sapiens K562 lysate_Thermal_proximity_coaggregationhermal_proximity_coaggregation.csv"

Total entries: 10276

Unique proteins: 10276
Duplicate proteins: 0

Unique accessions: 8383
Duplicate accessions: 1893

Accessions with >2 duplicates: 0

{}
{}


In [6]:
# gather all human lysate k562 entries
save_path = osp.join(deepstabp_col, 'lysate-human-k562.csv')

files = [osp.join(deepstabp_ext, f) for f in filenames
         if 'homo' in f.lower() and 'lysate' in f.lower() and 'k562' in f.lower()]
entries = gather_entries(files, save_path)

Reading 2 files.

3608 entries in "Homo sapiens K562 lysate.csv"
6668 entries in "Homo sapiens K562 lysate_Thermal_proximity_coaggregationhermal_proximity_coaggregation.csv"

Total entries: 10276

Unique proteins: 10276
Duplicate proteins: 0

Unique accessions: 8383
Duplicate accessions: 1893

Accessions with >2 duplicates: 0

{}
{}


In [7]:
# gather all human cell k562 entries
save_path = osp.join(deepstabp_col, 'cell-human-k562.csv')

files = [osp.join(deepstabp_ext, f) for f in filenames
         if 'homo' in f.lower() and 'cell' in f.lower() and 'k562' in f.lower()]
entries = gather_entries(files, save_path)

Reading 2 files.

6684 entries in "Homo sapiens K562 cells_Thermal_proximity_coaggregationhermal_proximity_coaggregation.csv"
4033 entries in "Homo sapiens K562 cells.csv"

Total entries: 10717

Unique proteins: 7397
Duplicate proteins: 3307

Unique accessions: 7397
Duplicate accessions: 3307

Accessions with >2 duplicates: 13

{}
{}


In [8]:
# gather all human cell jurkat entries
save_path = osp.join(deepstabp_col, 'cell-human-jurkat.csv')

files = [osp.join(deepstabp_ext, f) for f in filenames
         if 'homo' in f.lower() and 'cell' in f.lower() and 'jurkat' in f.lower()]
entries = gather_entries(files, save_path)

Reading 1 files.

6392 entries in "Homo sapiens Jurkat cells.csv"

Total entries: 6392

Unique proteins: 6361
Duplicate proteins: 31

Unique accessions: 6361
Duplicate accessions: 31

Accessions with >2 duplicates: 0

{}
{}
