## Utils

In [2]:
import os, gzip, shutil

'''__get_related_files

DESC:
    Given some substring, return all files with that substring
PARAMS:
    files: list of strings of file names
    sub: string substring to find in the files
OPTIONAL: 
    not_sub: string a substring that if found in file name don't add
'''
def __get_related_files(files, sub, not_sub=None):
    if not_sub is not None and not_sub != '':
        return [x for x in files if sub in x and not_sub not in x]
    return [x for x in files if sub in x]

'''__make_valid_dir_string

DESC:
    add / to end of director string if it doesn't have it alread
PARAMS:
    dir_path: string path to directory
RETURNS:
    dir path with / at end
'''
def __make_valid_dir_string(dir_path):
    return dir_path + '/' if dir_path[-1] != '/' else dir_path

'''__make_dir

DESC:
    Given a path to directory, check if it exists and if not create it
PARAMS:
    dir_path: string path to a directory to make or check
RETURNS:
    None
'''
def __make_dir(dir_path):
    dir_path = __make_valid_dir_string(dir_path)
    if not os.path.exists(dir_path): 
        os.makedirs(dir_path)

'''__make_valid_text_file

DESC:
    make a string into the name for a text file and make sure directory exists
PARAMS:
    file_name: string a name of the file to save
RETURNS:
    file name with .txt after
'''
def __make_valid_text_file(file_name):
    file_name = file_name + '.txt' if '.txt' not in file_name else file_name
    return file_name

'''__make_valid_json_file

DESC:
    make a string into the name for a text file and make sure the directory exists
PARAMS:
    file_name: string of the file to save
RETURNS:
    file name with .json after it
'''
def __make_valid_json_file(file_name):
    file_name = file_name + '.json' if '.json' not in file_name else file_name
    return file_name

'''__make_valid_csv_file

DESC:
    make a string into the name for a text file 
PARAMS:
    file_name: string of the file to save
RETURNS:
    file name with .csv after it
'''
def __make_valid_csv_file(file_name):
    file_name = file_name + '.csv' if '.csv' not in file_name else file_name
    return file_name

'''__make_valid_fasta_file

DESC:
    make a string into the name for a text file 
PARAMS:
    file_name: string of the file to save
RETURNS:
    file name with .fasta after it
'''
def __make_valid_fasta_file(file_name):
    file_name = file_name + '.fasta' if '.fasta' not in file_name else file_name
    return file_name

'''__file_exists

DESC:
    find out if a file exists
PARAMS:
    file_name: string name of the file to check for
RETURNS:
    bool true if file exists false otherwise
'''
def __file_exists(file_name):
    return os.path.isfile(file_name)

'''__gzip

DESC:
    zip up a file
PARAMS: 
    file_name: str path to the file name to compress
OPTIONAL:
    delete_old: bool delete the uncompressed file. Default=True
RETURNS:
    str name of the new compressed file
'''
def __gzip(file_name, delete_old=True):
    compressed_file_name = file_name + '.gz'
    with open(file_name, 'rb') as f_in:
        with gzip.open(compressed_file_name, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    delete_old and os.remove(file_name)
    return compressed_file_name

'''__gunzip

DESC:   
    unzip a file
PARAMS:
    compressed_file_name: str name of the compressed file to unzip
OPTIONAL:
    delete_old: bool delete the compressed file. Default=True
RETURNS:
    str name of the file unziped
'''  
def __gunzip(compressed_file_name, delete_old=True):
    file_name = compressed_file_name if '.gz' not in compressed_file_name else compressed_file_name.replace('.gz', '')
    with gzip.open(compressed_file_name, 'rb') as f_in:
        with open(file_name, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    delete_old and os.remove(compressed_file_name)
    return file_name

'''__is_gzipped

DESC:
    determines if a file has been gzipped
PARAMS:
    file_name: str path to file in question
RETURNS:
    bool True if file is compressed else False
'''
def __is_gzipped(file_name):
    return '.gz' == file_name[-3:]

'''__gzip_dir

DESC:
    compress a directory with gzip
PARAMS:
    d: str path to directory
OPTIONAL:
    delete_old: bool delete the unziped directory. Default=True
RETURNS:
    path to the new zipped folder
'''
def __gzip_dir(d, delete_old=True):
    root = '/'.join(d.split('/')[:-1])
    shutil.make_archive(d, 'zip', root)
    delete_old and shutil.rmtree(d)
    return d + '.zip'

'''__is_json

DESC:
    determine if a file is a json file based purely on name
PARAMS:
    file: file to determine if its a json file
RETURNS:
    bool True if it is a json file False otherwise
'''
def __is_json(file):
    return True if '.json' in file else False

'''__is_fasta

DESC:
    determine if a file is a fasta file based purely on name
PARAMS:
    file: file to determine if its a fasta file
RETURNS:
    bool True if it is a fasta file False otherwise
'''
def __is_fasta(file):
    return True if '.fasta' in file else False

## Testing giving all scores of same value same rank

In [11]:
# (protein, score, position)
correct_position = 3
scores = [
    ('insulin', 2, 3),
    ('bdnf', 2, 3),
    ('caprin', 2, 3),
    ('insulin', 0.5, 1),
    ('caprin', 0, 4)
]

In [12]:
def give_ranks(scores):
    ranking = {}
    scores.sort(reverse=True, key=lambda x: x[1])
    # if scores are all the same, don't increment the rank
    rank = 0
    last_score = None
    for score in scores:
        if int(score[1]) != last_score:
            rank += 1
            last_score = int(score[1])
        if int(score[2]) == correct_position:
            ranking[score[0]] = rank
    return ranking

In [13]:
give_ranks(scores)

{'insulin': 1, 'bdnf': 1, 'caprin': 1}

## Pandas issues

In [19]:
import pandas as pd

score_funcs = ['custom', 'crux']
column_names = {
    'custom': {
        'file_name': 'file',
        'score': 'score',
        'scan_number': 'scan_no'
    },
    'crux': {
        'file_name': 'file',
        'score': 'xcorr score',
        'scan_number': 'scan'
    }
}

def __get_correct_col_names(col_names):
    d_to_l = lambda d: [x for _, x in d.items()]
    for s_func, d in column_names.items():
        ks = d_to_l(d)
        if set.issubset(set(ks), set(col_names)):
            return s_func

def get_scores_scan_pos_label(file, search_substring=''):
    sep = '\t' if '.tsv' in file else ','
    if __is_gzipped(file):
        file = __gunzip(file)

    df = pd.read_csv(file, sep, header=0)
        
    s_func = __get_correct_col_names(df.columns)
    print(s_func)
    print(df.columns)
    col_names = column_names[s_func]

    df = df.sort_values(col_names['score'], ascending=False)
    df = df.drop_duplicates(subset=col_names['scan_number'])
    df = df.sort_values(col_names['scan_number'])
    
    return aligned_scores, [], ''


In [20]:
f = '/Users/zacharymcgrath/Desktop/Experiment output/START SCORING SUM/search_output/insulin_20_vs_peptide_0_index/tide-search.target.tsv'
s = get_scores_scan_pos_label(f)

crux
Index(['file', 'scan', 'charge', 'spectrum precursor m/z',
       'spectrum neutral mass', 'peptide mass', 'delta_cn', 'delta_lcn',
       'xcorr score', 'xcorr rank', 'distinct matches/spectrum', 'sequence',
       'modifications', 'cleavage type', 'protein id', 'flanking aa',
       'target/decoy'],
      dtype='object')


NameError: name 'aligned_scores' is not defined