# Utils

In [4]:
import os, gzip, shutil

def __get_related_files(files, sub, not_sub=None):
    '''__get_related_files

    DESC:
        Collect, in their original order, the file names that contain a substring
    PARAMS:
        files: list of strings of file names
        sub: string substring that must appear in a file name
    OPTIONAL:
        not_sub: string substring that disqualifies a file name when present.
                 None or '' turns the exclusion off. Default=None
    RETURNS:
        list of strings of the matching file names
    '''
    exclude = not_sub is not None and not_sub != ''
    related = []
    for name in files:
        if sub not in name:
            continue
        if exclude and not_sub in name:
            continue
        related.append(name)
    return related

def __make_valid_dir_string(dir_path):
    '''__make_valid_dir_string

    DESC:
        add / to the end of a directory string if it doesn't have it already
    PARAMS:
        dir_path: string path to directory. An empty string is treated as the
                  current directory
    RETURNS:
        dir path with / at end ('./' when dir_path is empty)
    '''
    if dir_path == '':
        # '' is what you get after stripping the file name off a bare 'file.ext'.
        # Indexing dir_path[-1] would raise IndexError, and returning '/' would
        # silently point at the filesystem root instead of the working directory.
        return './'
    return dir_path if dir_path.endswith('/') else dir_path + '/'

def __make_dir(dir_path):
    '''__make_dir

    DESC:
        Given a path to directory, create it (and any missing parents) unless
        it already exists
    PARAMS:
        dir_path: string path to a directory to make or check. A trailing /
                  is accepted but not required
    RETURNS:
        None
    '''
    # exist_ok replaces the separate exists() check, so a directory that appears
    # between the check and the creation no longer raises FileExistsError
    os.makedirs(dir_path, exist_ok=True)

def __make_valid_text_file(file_name):
    '''__make_valid_text_file

    DESC:
        make a string into the name for a text file. Nothing is created on disk
    PARAMS:
        file_name: string a name of the file to save
    RETURNS:
        file name ending in .txt
    '''
    # test the suffix, not a substring: '.txt' somewhere earlier in the path
    # (e.g. a directory called 'old.txt_runs') must not suppress the extension
    return file_name if file_name.endswith('.txt') else file_name + '.txt'

def __make_valid_json_file(file_name):
    '''__make_valid_json_file

    DESC:
        make a string into the name for a json file. Nothing is created on disk
    PARAMS:
        file_name: string of the file to save
    RETURNS:
        file name ending in .json
    '''
    # test the suffix, not a substring: '.json' somewhere earlier in the path
    # must not suppress the extension
    return file_name if file_name.endswith('.json') else file_name + '.json'

def __make_valid_csv_file(file_name):
    '''__make_valid_csv_file

    DESC:
        make a string into the name for a csv file
    PARAMS:
        file_name: string of the file to save
    RETURNS:
        file name ending in .csv
    '''
    # test the suffix, not a substring: '.csv' somewhere earlier in the path
    # must not suppress the extension
    return file_name if file_name.endswith('.csv') else file_name + '.csv'

def __make_valid_fasta_file(file_name):
    '''__make_valid_fasta_file

    DESC:
        make a string into the name for a fasta file
    PARAMS:
        file_name: string of the file to save
    RETURNS:
        file name ending in .fasta
    '''
    # test the suffix, not a substring: '.fasta' somewhere earlier in the path
    # must not suppress the extension
    return file_name if file_name.endswith('.fasta') else file_name + '.fasta'

def __file_exists(file_name):
    '''__file_exists

    DESC:
        report whether a path points at an existing regular file
    PARAMS:
        file_name: string name of the file to check for
    RETURNS:
        bool True if os.path.isfile accepts the path, False otherwise
    '''
    is_present = os.path.isfile(file_name)
    return is_present

def __gzip(file_name, delete_old=True):
    '''__gzip

    DESC:
        gzip-compress a file into a sibling named <file_name>.gz
    PARAMS:
        file_name: str path to the file to compress
    OPTIONAL:
        delete_old: bool remove the uncompressed file afterwards. Default=True
    RETURNS:
        str name of the new compressed file
    '''
    gz_path = file_name + '.gz'
    with open(file_name, 'rb') as raw, gzip.open(gz_path, 'wb') as packed:
        shutil.copyfileobj(raw, packed)
    if delete_old:
        os.remove(file_name)
    return gz_path

def __gunzip(compressed_file_name, delete_old=True):
    '''__gunzip

    DESC:
        unzip a gzip file into a sibling with the trailing .gz removed
    PARAMS:
        compressed_file_name: str name of the compressed file to unzip.
                              Must end in .gz
    OPTIONAL:
        delete_old: bool delete the compressed file. Default=True
    RETURNS:
        str name of the file unzipped
    RAISES:
        ValueError if compressed_file_name does not end in .gz
    '''
    if not compressed_file_name.endswith('.gz'):
        # Without a .gz suffix the output name would equal the input name:
        # opening it for writing truncates the data before it is read, and
        # delete_old would then remove what is left. Refuse instead.
        raise ValueError('expected a path ending in .gz, got {!r}'.format(compressed_file_name))

    # strip only the final suffix; str.replace would also rewrite '.gz' inside
    # directory names or earlier in the file name
    file_name = compressed_file_name[:-len('.gz')]
    with gzip.open(compressed_file_name, 'rb') as f_in, open(file_name, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    if delete_old:
        os.remove(compressed_file_name)
    return file_name

def __is_gzipped(file_name):
    '''__is_gzipped

    DESC:
        decide from the name alone whether a file has been gzipped
    PARAMS:
        file_name: str path to file in question
    RETURNS:
        bool True if the name ends in .gz else False
    '''
    return file_name.endswith('.gz')

def __gzip_dir(d, delete_old=True):
    '''__gzip_dir

    DESC:
        compress a directory into <d>.zip. Despite the name the archive is a
        zip file, not gzip. Entries are stored under the directory's own name
    PARAMS:
        d: str path to directory (a trailing / is ignored)
    OPTIONAL:
        delete_old: bool delete the unzipped directory. Default=True
    RETURNS:
        path to the new zipped folder
    '''
    # normalise 'dir/' to 'dir' so the archive is 'dir.zip' rather than 'dir/.zip'
    target = d.rstrip('/')
    parent, name = os.path.split(target)
    # root_dir alone would archive everything inside the parent directory;
    # base_dir restricts the archive to the requested directory
    shutil.make_archive(target, 'zip', root_dir=parent or '.', base_dir=name)
    if delete_old:
        shutil.rmtree(d)
    return target + '.zip'

def __is_json(file):
    '''__is_json

    DESC:
        decide from the name alone whether a file is a json file
    PARAMS:
        file: string file name to inspect
    RETURNS:
        bool True if '.json' occurs anywhere in the name, False otherwise
    '''
    return '.json' in file

def __is_fasta(file):
    '''__is_fasta

    DESC:
        decide from the name alone whether a file is a fasta file
    PARAMS:
        file: string file name to inspect
    RETURNS:
        bool True if '.fasta' occurs anywhere in the name, False otherwise
    '''
    return '.fasta' in file

In [5]:
import json

def load_experiment(f):
    '''load_experiment

    DESC:
        read an experiment description from a json file
    PARAMS:
        f: str path to the json file
    RETURNS:
        the decoded json content
    '''
    # the context manager closes the handle; json.load(open(...)) left it open
    with open(f, 'r') as experiment_file:
        return json.load(experiment_file)

# Spectra generation

In [6]:
# residue mass for each single-letter amino acid code; calc_masses sums these
# and adds the mass of water to obtain the peptide mass
# (presumably monoisotopic masses in Da -- confirm against the reference table used)
amino_acids={
    "A":71.037114,
    "R":156.101111,
    "N":114.042927,
    "D":115.026943,
    "C":103.009185,
    "E":129.042593,
    "Q":128.058578,
    "G":57.021464,
    "H":137.058912,
    "I":113.084064,
    "L":113.084064,
    "K":128.094963,
    "M":131.040485,
    "F":147.068414,
    "P":97.052764,
    "S":87.032028,
    "T":101.047679,
    "U":150.95363,
    "W":186.079313,
    "Y":163.06332,
    "V":99.068414
}

def calc_masses(sequence, charge):
    '''calc_masses

    DESC:
        calculates the b and y ion m/z values and the precursor m/z for a
        sequence at one charge state
    PARAMS:
        sequence: str amino acid sequence to change to list of masses
        charge: int charge value to calculate masses for. Any charge >= 1
                yields fragment ions; charges below 1 yield an empty spectrum
    RETURNS:
        list of floats, float       spectrum (all b ions from the N-terminus
                                    followed by all y ions from the C-terminus)
                                    and the precursor m/z
    RAISES:
        KeyError if sequence contains a letter missing from amino_acids
        ZeroDivisionError if charge is 0
    '''
    hydrogen = 1.007825035
    oxygen = 15.99491463
    electron = 0.0005486
    proton = 1.0072764

    residues = [amino_acids[aa] for aa in sequence]

    # Adding the mass of water to the sum of all the residue masses gives the
    # mass of the peptide. Accumulate with += in sequence order (not sum()) so
    # the floating point result stays identical to the original arithmetic.
    total = 2 * hydrogen + oxygen
    for residue in residues:
        total += residue
    pre_mz = (total + charge * proton) / charge

    masses = []
    if charge < 1:
        # no fragment ions are defined here for non-positive charge states
        return masses, pre_mz

    # b ions: one H minus one electron per charge (turns the N-terminal residue
    # NH into NH2 and adds the charge-carrying protons), then N-terminal prefixes
    total = charge * hydrogen - charge * electron
    for residue in residues:
        total += residue
        masses.append(total / charge)

    # y ions: OH on the C-terminus plus H on the N-terminal side (2 H + O), plus
    # one proton per charge, then C-terminal suffixes
    total = (charge + 2) * hydrogen + oxygen - charge * electron
    for residue in reversed(residues):
        total += residue
        masses.append(total / charge)

    return masses, pre_mz

def gen_spectra(sequenences):
    '''gen_spectra

    DESC:
        builds one merged, sorted spectrum per sequence from the charge 1 and
        charge 2 fragment masses given by calc_masses
    PARAMS:
        sequenences: list of strings sequences to generate spectra for
    RETURNS:
        list of dictionaries of the form {'spectrum': list of floats, 'precursor_mass': float}
        where precursor_mass is the value calc_masses returns for charge 2
    '''
    def build_entry(sequence):
        singly_charged, _ = calc_masses(sequence, 1)
        doubly_charged, precursor = calc_masses(sequence, 2)
        return {
            'spectrum': sorted(singly_charged + doubly_charged),
            'precursor_mass': precursor,
        }

    return [build_entry(sequence) for sequence in sequenences]

# Spectra writing

In [7]:
from pyopenms import MSExperiment, MSSpectrum, MzMLFile, Peak1D, Precursor

def write_mzml(file_name, spectra, title_prefix='Spectrum ', output_dir='./', compress=True):
    '''write_mzml

    DESC:
        create a mass spectrum file in mzml of sequences
    PARAMS:
        file_name: str name to save the file in. '.mzML' is appended unless
                   '.mzml' (any case) already occurs in the name
        spectra: list of dictionaries of the form [{spectrum: list[floats], precursor_mass: float, sequence: str}]
                 This data is written to file (the spectrum and the precursor)
    OPTIONAL:
        title_prefix: str name to give as prefix to the name of each spectrum. Default=Spectrum
        output_dir: str name of the directory to save files to. Default=./
        compress: bool whether or not to compress the file. Compresses with gzip. Default=True
    RETURNS:
        str path of the file written (the .gz path when compress is True)
    '''
    if '.mzml' not in file_name.lower():
        file_name += '.mzML'
    output_dir = __make_valid_dir_string(output_dir)
    __make_dir(output_dir)
    output_file = output_dir + file_name

    exp = MSExperiment()
    for sp_count, spectrum in enumerate(spectra):
        spec = MSSpectrum()
        spec.setMSLevel(2)
        spec.setName(str.encode(title_prefix + str(sp_count)))

        # every peak gets the same placeholder intensity
        intensities = [500 for _ in spectrum['spectrum']]
        spec.set_peaks([spectrum['spectrum'], intensities])

        # the precursor is always written as doubly charged
        prec = Precursor()
        prec.setCharge(2)
        prec.setMZ(spectrum['precursor_mass'])
        spec.setPrecursors([prec])
        spec.sortByPosition()
        exp.addSpectrum(spec)

    MzMLFile().store(output_file, exp)
    if compress:
        output_file = __gzip(output_file)

    return output_file

Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail


# Crux scoring

In [8]:
from subprocess import call

# extra files (presumably written by crux tide-search alongside tide-search.target.txt)
# that __crux_search deletes from each search output directory when the input
# spectra were gzipped
crux_to_rm = ['tide-search.decoy.txt', 'tide-search.log.txt', 'tide-search.params.txt']

def __parse_spectrum_name(spec_name):
    '''__parse_spectrum_name

    DESC:
        reduce a spectrum file path to a lower-cased name without '.mzml'
    PARAMS:
        spec_name: str path to a spectrum file
    RETURNS:
        str the part after the last /, lower-cased, with every '.mzml' removed
    '''
    base_name = str(spec_name.rpartition('/')[2])
    lowered = base_name.lower()
    return lowered.replace('.mzml', '')

def __parse_db_name(db_name):
    '''__parse_db_name

    DESC:
        reduce a database path to its file name without '.fasta'
    PARAMS:
        db_name: str path to a database file or index
    RETURNS:
        str the part after the last / with every '.fasta' removed (case kept)
    '''
    base_name = str(db_name.rpartition('/')[2])
    return base_name.replace('.fasta', '')

def __index_db_files(path_to_crux_cmd, db_files):
    '''__index_db_files

    DESC:
        run `crux tide-index` on every database file. Each index is written to
        an 'indexed/' directory next to its fasta file, and the params/log
        files tide-index leaves in that directory are deleted afterwards
    PARAMS:
        path_to_crux_cmd: str path to the executable for crux
        db_files: list of str paths to the database (.fasta) files
    RETURNS:
        list of str paths of the indices, in the order of db_files
    '''
    total = len(db_files)
    index_paths = []

    for position, fasta in enumerate(db_files, start=1):
        percent = int((position / total) * 100)
        print('On database {}/{}[{}%]\r'.format(position, total, percent), end="")

        # split the path once: directory part and file name
        parent_dir, _, fasta_base = str(fasta).rpartition('/')
        index_dir = __make_valid_dir_string(parent_dir) + 'indexed/'
        __make_dir(index_dir)
        index_path = index_dir + fasta_base.replace('.fasta', '_index')

        call([
            path_to_crux_cmd,
            'tide-index',
            fasta,
            index_path,
            '--min-length', '2',
            '--min-mass', '50',
            '--output-dir', index_dir,
            '--overwrite', 'T',
            '--min-peaks', '2',
            '--precursor-window', '1000000000',
            '--enzyme', 'no-enzyme',
            '--verbosity', '0',
        ])

        # remove extra output files
        for leftover in ('tide-index.params.txt', 'tide-index.log.txt'):
            os.remove(index_dir + leftover)

        index_paths.append(index_path)

    return index_paths

def __remove_indices(index_file):
    '''__remove_indices

    DESC:
        delete the directory that holds each index file
    PARAMS:
        index_file: str path to an index, or list of such paths. An empty
                    list is a no-op
    RETURNS:
        None
    '''
    index_files = index_file if isinstance(index_file, list) else [index_file]

    removed = set()
    for path in index_files:
        rm_dir = path.rpartition('/')[0]
        # indices built from fasta files in the same directory share one
        # 'indexed/' directory -- only remove it once
        if rm_dir in removed:
            continue
        removed.add(rm_dir)
        shutil.rmtree(rm_dir)


'''crux_search

DESC:
    use the crux tool to score spectra against databases
PARAMS:
    spectra_files: list of str paths to all the spectra (.mzML) files
    database_files: list of str paths to all the database (.fasta) files
    path_to_crux_cmd: str path to the executable for crux.
    output_dir: str path to the directory to save files
OPTIONAL:
    compress: bool compress the output result. Default=True
RETURNS:
    list of str of output files
'''
def __crux_search(spectra_files, database_files, path_to_crux_cmd, output_dir, compress=True):
    '''Score every spectrum file against every database with `crux tide-search`.

    The databases are indexed first with __index_db_files. Each
    (spectrum, database) pair gets its own directory
    <output_dir>/search_output/<spectrum>_vs_<database>, and the
    tide-search.target.txt found there is renamed to .tsv (gzipped first when
    compress is True). The index directory is removed before returning.

    Side effects: gzipped spectra are decompressed with __gunzip, whose default
    also deletes the .gz file. Whether the inputs are gzipped is decided from
    the first spectrum file only.

    Returns the list of result file paths, one per (spectrum, database) pair.
    '''
    output_dir = __make_valid_dir_string(output_dir) + 'search_output/'
    __make_dir(output_dir)
    # NOTE(review): spec_dir is only referenced by the commented-out cleanup at the end
    spec_dir = '/'.join(spectra_files[0].split('/')[:-1])

    # NOTE(review): spectra_files[0] raises IndexError when no spectra are passed
    is_compressed = __is_gzipped(spectra_files[0])
    print('Pre-indexing database files...')
    indexed_db_files = __index_db_files(path_to_crux_cmd, database_files)
    print('\nDone. Scoring..')

    # NOTE(review): output_count is incremented below but never read
    output_count = 0
    output_files = []
    num_dbs = len(indexed_db_files)
    num_specs = len(spectra_files)

    for i, spec_file in enumerate(spectra_files):
        # work on the uncompressed file; __gunzip removes the .gz by default
        spec_file = spec_file if not is_compressed else __gunzip(spec_file)
        for j, database_file in enumerate(indexed_db_files):
            this_db_name = __parse_db_name(database_file)
            print('On spectrum {}/{}[{}%]\tOn database file {}/{}[{}%]\r'.format(i+1, num_specs, int(((i+1)/num_specs) * 100), j+1, num_dbs, int(((j+1)/num_dbs)*100)), end="")
            this_output_dir = output_dir + '{}_vs_{}'.format(__parse_spectrum_name(spec_file), this_db_name)
            search_cmd = [
                path_to_crux_cmd, 
                'tide-search', 
                spec_file, 
                database_file, 
                '--min-length', '2', 
                '--min-mass', '50', 
                '--output-dir', this_output_dir, 
                '--overwrite', 'T', 
                '--min-peaks', '2', 
                '--precursor-window', '1000000000',
                '--enzyme', 'no-enzyme', 
                '--verbosity', '0'
                ] 
            # NOTE(review): the return code of crux is not checked
            call(search_cmd)
            output_count += 1
            o = this_output_dir + '/tide-search.target.txt' if not compress else __gzip(this_output_dir + '/tide-search.target.txt')
            # NOTE(review): replace() rewrites every '.txt' in the path, including
            # any inside directory or spectrum names, not only the file suffix
            o_tsv = o.replace('.txt', '.tsv')
            os.rename(o, o_tsv)
            output_files.append(o_tsv)

            # saving space, so should remove the extra stuff
            # NOTE(review): this is keyed on the INPUT being gzipped, not on `compress` -- confirm intent
            if is_compressed:
                for rm in crux_to_rm:
                    os.remove(this_output_dir + '/' + rm)

        # if the files were compressed, we were trying to save disk space so just remove the mzml files
        # is_compressed and os.remove(spec_file)

    # is_compressed and os.rmdir(spec_dir)
    __remove_indices(indexed_db_files)
    return output_files

# Load crux search results

### NOTE: the (int) after a protein name is the start index of the sequence, 1 based not 0, so subtract 1

In [9]:
import pandas as pd

# columns of a tide-search result table that the analysis below uses
desired_cols = ['file', 'xcorr score', 'sequence', 'protein id', 'target/decoy', 'xcorr rank']

def load_search_result(f):
    '''load_search_result

    DESC:
        read a tab separated crux search result and keep the columns listed
        in desired_cols
    PARAMS:
        f: str path to the result file. A name ending in .gz is read as gzip
    RETURNS:
        pandas DataFrame with the desired_cols columns
    '''
    # pandas infers gzip from the .gz suffix, so the file is read in place.
    # Decompressing on disk removed the .gz and made a second load of the same
    # path fail. sep is passed by keyword because pandas 2 rejects it positionally.
    df = pd.read_csv(f, sep='\t', header=0)
    return df[desired_cols]

# Add hybrid proteins to fasta

In [10]:
def add_hyb_prots(fasta_file, prots):
    '''add_hyb_prots

    DESC:
        append every protein whose name contains 'hybrid' (any case) to a
        fasta file. Entries are always appended, so calling this twice on the
        same file writes the hybrids twice
    PARAMS:
        fasta_file: str path to the fasta file to append to
        prots: list of dictionaries with a 'name' key and the sequence under
               'protein' (or 'sequence' when 'protein' is absent)
    RETURNS:
        None
    '''
    with open(fasta_file, 'a') as o:
        # start on a fresh line in case the file does not end with a newline
        o.write('\n')
        c = 0
        for p in prots:
            if 'hybrid' not in p['name'].lower():
                continue
            # proteins elsewhere in this notebook may carry the sequence under
            # either key; 'protein' keeps priority so existing input is unchanged
            sequence = p['protein'] if 'protein' in p else p['sequence']
            o.write('>sp|idsomething{}|{}\n'.format(c, p['name']))
            o.write(sequence + '\n')
            c += 1

---
---
# START RUNNING FROM HERE

In [76]:
import copy

# where the experiment description lives and where this run writes its output
experiment_file = '/Users/zacharymcgrath/Desktop/Experiment output/small prots 1000 peps 5 hybs zscoresum crux scoring func/experiment_data.json'
save_dir = __make_valid_dir_string('/Users/zacharymcgrath/Desktop/Experiment output/small prots 1000 peps 5 hybs crux run')

exp = load_experiment(experiment_file)
peptides = exp['experiment_info']['peptides']

# independent copies of the peptide records, looked up by peptide name
keyed_peps = {
    pep['peptide_name']: copy.deepcopy(pep)
    for pep in peptides
}
print('Done')

Done


# Generate spectra files

In [77]:
# paths of the spectrum files written below, one per peptide
spec_files = []

# all spectrum files go to <save_dir>/spectra/
spec_save_dir = __make_valid_dir_string(__make_valid_dir_string(save_dir) + 'spectra')
__make_dir(spec_save_dir)

pep_c = 0
pep_num = len(peptides)
for pep in peptides:
    # the counter is printed before it is incremented, so the last line shown is (n-1)/n
    print('Generating spectra {}/{}[{}%]\r'.format(pep_c, pep_num, int( (float(pep_c)/(pep_num)) * 100)), end="")
    pep_c += 1
    
    name = pep['peptide_name']
    seq = pep['peptide_sequence']
    # one spectrum per file, named after the peptide; write_mzml compresses by default
    spectra = gen_spectra([seq])
    spec_files.append(write_mzml(name, spectra, output_dir=spec_save_dir))
print('\nDone')


Generating spectra 1004/1005[99%]
Done


# Run crux search on spectra files

In [78]:
# NOTE(review): machine-specific absolute paths -- adjust before running elsewhere
fasta_file = '/Users/zacharymcgrath/Documents/Layer_Research/Proteomics_Experiments/Database_Experiments/test-protein.fasta'
path_to_crux_cmd = '/Users/zacharymcgrath/Documents/Layer_Research/crux/bin/crux'

# add hybrid proteins to file before we run search
# NOTE(review): add_hyb_prots opens the fasta in append mode, so every re-run of
# this cell appends the hybrid entries to the file again
add_hyb_prots(fasta_file, exp['experiment_info']['proteins'])

# score every spectrum file against the single database; collects the result file paths
score_files = __crux_search(spec_files, [fasta_file], path_to_crux_cmd, save_dir)
print('\nDone')

Pre-indexing database files...
On database 1/1[100%]
Done. Scoring..
On spectrum 1005/1005[100%]	On database file 1/1[100%]
Done


# Next steps

Ok by this point we have files with the scores for every peptide against the one database. We can get their names now and reconstruct this. We need to do the following:
1. find out which protein crux said it was from 
2. Determine if it was at the proper position
3. If more than 1 candidate, then filter that out
4. construct a list of scores for a protein/peptide pair to see if it properly scored or not. For situations where there was no peptide generated, rank = 0. If the correct position or protein was not found, give a score of -1

In [79]:
# one table per score file, stacked into a single frame with a fresh index
dfs = [load_search_result(sf) for sf in score_files]

df = pd.concat(dfs, ignore_index=True, sort=False)
print('Done')
# print(df)

Done


# Organize search results

In [80]:
# peptide name -> list of non-decoy search hits whose 'file' value contains that name
search_results = {}

def take_name(s):
    '''take_name

    DESC:
        pull the protein names out of a comma separated 'protein id' value
    PARAMS:
        s: str entries separated by ',', each with at least three '|' separated
           fields; the third field may carry a trailing '(start)'
    RETURNS:
        list of str the third field of every entry, cut at the first '('
    '''
    names = []
    for entry in s.split(','):
        third_field = entry.split('|')[2]
        names.append(third_field.split('(')[0])
    return names
        

# Collect, per peptide, every non-decoy row of the combined result table that
# belongs to that peptide's spectrum file.
for p in peptides:
    name = p['peptide_name']
    search_results[name] = []
    
    # start position = the number inside the first '(...)' of the protein id string
    # NOTE(review): with several comma separated proteins only the first position is
    # taken, while take_name returns all of the protein names
    take_start_pos = lambda s: int(s.split('(')[1].split(')')[0])
    # NOTE(review): the whole table is scanned once per peptide
    for i, row in df.iterrows():
        # NOTE(review): substring match -- a peptide name that is a prefix of another
        # (e.g. 'x_1' and 'x_10') would also pick up the other peptide's rows; verify naming
        if name in row['file'] and row['target/decoy'] != 'decoy':
            search_results[name].append({
                'protein': take_name(row['protein id']),
                'starting_position': take_start_pos(row['protein id']),
                'sequence': row['sequence'],
                'peptide_name': name,
                'rank': row['xcorr rank']
            })
print('Done')
# print(search_results)
        

Done


# Organize by parent protein

In [81]:
# protein name -> sequence; a record stores it under 'sequence' or under 'protein'
ps = exp['experiment_info']['proteins']
prots = {
    p['name']: (p['sequence'] if 'sequence' in p else p['protein'])
    for p in ps
}

# regroup the per-peptide hits under every protein each hit names
sorted_results = {prot_name: [] for prot_name in prots}

for hits in search_results.values():
    for r in hits:
        for prot in r['protein']:
            sorted_results[prot].append(r)
print('Done')
# print(sorted_results)

Done


# Compress results
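For each protein, we want to know whether each peptide associated with a search result was scored correctly. The outcome is stored in a list with one entry per residue, so its length equals the length of the protein sequence; each entry collects the results for peptides that start at that position.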

In [82]:
# wrong_parent = 'r'
# wrong_loc = 'y'
# colour codes stored as the first element of every data point
hit = 'g'
no_data = 'r'

# protein name -> list with one slot per residue; each slot holds the
# (colour, rank, peptide name) tuples of correct hits starting at that residue
protein_peptide_data = {}

prot_c = 0
prot_num = len(prots)
for prot in prots:
    # the counter is printed before it is incremented, so the last line shown is (n-1)/n
    print('On protein {}/{}[{}%]\r'.format(prot_c, prot_num, int( (float(prot_c)/(prot_num)) * 100)), end="")
    prot_c += 1
    
    prot_data = [[] for _ in range(len(prots[prot]))]
    for r in sorted_results[prot]:
        # get the REAL peptide data
        info = keyed_peps[r['peptide_name']]
        
        if prot in r['protein']:
            # -1 because the search result uses 1 based counting
            result_start_idx = int(r['starting_position']) -1
            # we have the correct parent protein
            if prot == info['parent_name']:
                # we have the right starting position
                if int(info['start_index']) == result_start_idx:
                    # NOTE(review): the comparison converts start_index with int() but the
                    # index below uses it as-is -- assumes it is already an int; confirm
                    prot_data[info['start_index']].append((hit, int(r['rank']), r['peptide_name']))
                    
# THESE ARE COMMENTED OUT TO REDUCE THE NUMBER OF DATA POINTS WE SHOW
#                 else:
#                     prot_data[result_start_idx].append((wrong_loc, int(r['rank'])))
                    
#             # we don't have the correct parent protein
#             else:
#                 prot_data[result_start_idx].append((wrong_parent, int(r['rank'])))
                
    protein_peptide_data[prot] = prot_data
print('\nDone')
#print(protein_peptide_data)

On protein 0/9[0%]On protein 1/9[11%]On protein 2/9[22%]On protein 3/9[33%]On protein 4/9[44%]On protein 5/9[55%]On protein 6/9[66%]On protein 7/9[77%]On protein 8/9[88%]
Done


# Peptide cleanup

Go through every peptide and check whether the search found it in its correct parent protein at the correct position. If not, record a "no data" point with a rank of -1 at that position. Otherwise, reduce the number of points by keeping a single entry per peptide.

In [83]:
def has_correct(prot_data, pos):
    '''has_correct

    DESC:
        tell whether any data point stored at a position is marked as a hit
    PARAMS:
        prot_data: list of lists of data point tuples, one list per position
        pos: int position to inspect
    RETURNS:
        bool True if a tuple at pos has the hit marker as first element
    '''
    return any(point[0] == hit for point in prot_data[pos])

# For every peptide, look at its true parent protein and start position:
# record a miss when no hit is stored there, otherwise thin out repeated hits.
number_missed = 0
pep_c = 0
pep_num = len(keyed_peps)
for pep_name, pep_info in keyed_peps.items():
    # the counter is printed before it is incremented, so the last line shown is (n-1)/n
    print('On peptide {}/{}[{}%]\r'.format(pep_c, pep_num, int( (float(pep_c)/(pep_num)) * 100)), end="")
    pep_c += 1
    
    pos = pep_info['start_index']
    corr_parent = pep_info['parent_name']
    # if we don't have a point in the right starting position, add a point at it saying its wrong
    # NOTE(review): has_correct accepts a hit from ANY peptide at this position, so a peptide
    # sharing its start with a correctly found one is not counted as missed -- confirm intent
    if not has_correct(protein_peptide_data[corr_parent], pos):
        # miss marker is a 2-tuple (no peptide name), unlike the 3-tuple hits
        protein_peptide_data[corr_parent][pos].append((no_data, -1))
        number_missed += 1
    # remove any duplicates from this peptide and keep only the entry with the lowest rank value
    else:
        pep_points = [x for x in protein_peptide_data[corr_parent][pos] if x[2] == pep_name]
        # ascending sort: the entry kept is the first, i.e. the smallest rank number
        pep_points.sort(key=lambda x: x[1])
        to_remove = [] if len(pep_points) < 2 else pep_points[1:]
        for rm in to_remove:
            protein_peptide_data[corr_parent][pos].remove(rm)
        
print('\n{}/{} peptides did not get correct hit'.format(number_missed, len(keyed_peps)))

On peptide 0/1005[0%]On peptide 1/1005[0%]On peptide 2/1005[0%]On peptide 3/1005[0%]On peptide 4/1005[0%]On peptide 5/1005[0%]On peptide 6/1005[0%]On peptide 7/1005[0%]On peptide 8/1005[0%]On peptide 9/1005[0%]On peptide 10/1005[0%]On peptide 11/1005[1%]On peptide 12/1005[1%]On peptide 13/1005[1%]On peptide 14/1005[1%]On peptide 15/1005[1%]On peptide 16/1005[1%]On peptide 17/1005[1%]On peptide 18/1005[1%]On peptide 19/1005[1%]On peptide 20/1005[1%]On peptide 21/1005[2%]On peptide 22/1005[2%]On peptide 23/1005[2%]On peptide 24/1005[2%]On peptide 25/1005[2%]On peptide 26/1005[2%]On peptide 27/1005[2%]On peptide 28/1005[2%]On peptide 29/1005[2%]On peptide 30/1005[2%]On peptide 31/1005[3%]On peptide 32/1005[3%]On peptide 33/1005[3%]On peptide 34/1005[3%]On peptide 35/1005[3%]On peptide 36/1005[3%]On peptide 37/1005[3%]On peptide 38/1005[3%]On peptide 39/1005[3%]On peptide 40/1005[3%]On peptide 41/1005[4%]On peptide 42/1005[4%]On peptide 43/1005[4%

# Plot these bad bois

In [84]:
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib import gridspec
from math import ceil
from statistics import mean

# number of residues drawn per subplot row
max_aas = 75
# figure width passed to figsize
plot_width = 14

# NOTE(review): red is labelled 'wrong parent', but the plotting loop colours a
# position red when its mean rank is negative, which only the no_data (-1)
# points produce -- the label may be stale; confirm
legend_els = [
    Line2D([0], [0], color='r', lw=4, label='wrong parent'),
    #Line2D([0], [0], color='y', lw=4, label='right parent wrong location'),
    Line2D([0], [0], color='g', lw=4, label='correct')
    #Line2D([0], [0], color='c', lw=4, label='correct hit missing')
]

# figures are saved to <save_dir>/protein_plots/
prot_plot_save_dir = __make_valid_dir_string(__make_valid_dir_string(save_dir) + 'protein_plots/')
__make_dir(prot_plot_save_dir)

# Draw one figure per protein: the sequence is cut into rows of max_aas
# residues, and every position that has data gets a box plot of its ranks
# (red when the mean rank is negative, i.e. not found, green otherwise).
prot_c = 0
prot_num = len(protein_peptide_data)
for prot in protein_peptide_data:
    prot_c += 1
    print('On protein {}/{}[{}%]\r'.format(prot_c, prot_num, int( (float(prot_c)/(prot_num)) * 100)), end="")
    num_subplots = ceil(len(prots[prot]) / max_aas)
    # squeeze=False keeps `ax` a 2-D array even for a single row; by default
    # plt.subplots returns a bare Axes when num_subplots == 1 and indexing it
    # raises TypeError, so proteins of at most max_aas residues could not be drawn
    fig, ax = plt.subplots(num_subplots, figsize=(plot_width, 2.5 * num_subplots), squeeze=False)

    for i in range(num_subplots):
        this_axis = ax[i][0]
        # get the data needed for this row
        end = min([((i+1) * max_aas), len(prots[prot])])
        this_data = protein_peptide_data[prot][i*max_aas:end]
        this_label = prots[prot][i*max_aas:end]
        for x, points in enumerate(this_data):
            if len(points) > 0:
                # boxplot the ranks recorded at this position
                to_box_plot = [point[1] for point in points]
                color = 'r' if mean(to_box_plot) < 0 else 'g'
                bplot = this_axis.boxplot(to_box_plot, positions=[x])
                # color it
                for attr in ['boxes', 'whiskers', 'caps', 'fliers', 'medians']:
                    for item in bplot[attr]:
                        item.set(color=color)

        this_axis.set_xticks([j for j in range(len(this_data))])
        this_axis.set_xticklabels(this_label)
        this_axis.set_yticks([-2, -1, 0, 1, 2, 3, 4, 5, 6])
        this_axis.set_yticklabels(['', 'not found', '0', '1', '2', '3', '4', '5', ''])
        y_label = 'sequence:  {} - {}'.format(i * max_aas, end - 1)
        this_axis.set_ylabel(y_label, fontsize=8)
        this_axis.legend(handles=legend_els, loc='center left', bbox_to_anchor=(1, 0.5))
    fig.suptitle(prot)
    plot_save_name = prot_plot_save_dir + prot
    fig.savefig(plot_save_name)
    #fig.show()
    # close this figure explicitly so figures do not pile up across proteins
    plt.close(fig)
print('\nDone')

On protein 9/9[100%]
Done
