Skip to content

Commit

Permalink
Merge branch 'master' into production-3.1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
kevinkle authored May 26, 2017
2 parents 2cccefc + 13e17ff commit 07ae827
Show file tree
Hide file tree
Showing 12 changed files with 5,298 additions and 228 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,6 @@ To enable:
1. `docker exec -it backend_worker_1 sh` this drops a shell into the rq worker container which has rq-dashboard installed via conda
2. `rq-dashboard -H redis` runs rq-dashboard and specifies the *redis* host automatically defined by docker-compose
3. then on your host machine visit http://localhost:9181

## Blazegraph:
* We are currently running Blazegraph version 2.1.4. If you want to run Blazegraph separately, please use the same version; otherwise there may be problems with the endpoint URLs and return values (notably with version 2.1.1). See https://github.com/superphy/backend/issues/63
218 changes: 54 additions & 164 deletions app/modules/beautify/beautify.py
Original file line number Diff line number Diff line change
@@ -1,147 +1,28 @@
import logging
import pandas as pd
import cPickle as pickle
from itertools import tee, izip
from os.path import basename
from modules.loggingFunctions import initialize_logging
from modules.beautify.find_widest import check_alleles


def pairwise(iterable):
    '''
    Lazily pairs every item with its successor:
    s -> (s0,s1), (s1,s2), (s2, s3), ...

    args:
        iterable: any iterable; it is consumed through itertools.tee
    return:
        an iterator of 2-tuples; empty when iterable has fewer than two items
    '''
    try:
        # Python 2: izip is the lazy counterpart of the list-building zip
        from itertools import izip as lazy_zip
    except ImportError:
        # Python 3: izip was removed and the builtin zip is already lazy
        lazy_zip = zip

    a, b = tee(iterable)
    # advance the second copy by one so the two iterators are offset
    next(b, None)
    return lazy_zip(a, b)

def widest(reading_list):
    '''
    Picks the hit that spans the most positions.
    args:
        reading_list(list): hits exposing hitstart and hitstop attributes
    return:
        the hit with the largest abs(hitstart - hitstop); on a tie the
        earliest one in the list wins. A falsy reading_list is returned as is.
    '''
    # sanity check: nothing to compare, hand the input straight back
    if not reading_list:
        return reading_list

    def span(hit):
        return abs(hit.hitstart - hit.hitstop)

    # max() keeps the first of several equally wide hits
    return max(reading_list, key=span)

def overlap(row2, reading_window):
    '''
    Returns True if any part of row2 overlaps with the reading_window.

    Both spans are treated as closed intervals, and row2 may be given in
    either orientation (hitstart > hitstop is fine). Checking only whether
    one of row2's ends falls inside the window misses a hit that fully
    encloses the window, so the two intervals are intersected instead.

    args:
        row2: a hit exposing hitstart and hitstop attributes
        reading_window(dict): current window with 'min' and 'max' keys
    return:
        truthy when the two spans share at least one position
    '''
    row2_min = min(row2.hitstart, row2.hitstop)
    row2_max = max(row2.hitstart, row2.hitstop)
    # two closed intervals intersect iff each one starts before the other ends
    return row2_min <= reading_window['max'] and row2_max >= reading_window['min']

def check_alleles_multiple(hits, new_hits):
    '''
    Checks for multiple hits of the same gene and appends to new_hits. Also
    strips out overlap.

    hits is sorted in place and then walked as consecutive (row1, row2)
    pairs. A pair is a boundary when the rows differ in analysis, filename,
    contigid or hitname, or when row2 does not overlap the current reading
    window. At a boundary the pending run is flushed to new_hits (row1 itself
    if nothing is pending, otherwise the widest pending hit) and the window
    restarts at row2. Otherwise the window grows to include row2 and row2
    joins the pending run.

    args:
        hits(pandas.DataFrame): rows read through the analysis, filename,
            contigid, hitname, hitstart and hitstop columns
        new_hits(list): output list; dicts are appended to it in place
    return:
        new_hits, or hits itself when hits is empty
    '''
    ## sanity check: an empty frame is handed back as is (not new_hits)
    if hits.empty:
        return hits

    # sort so hits of the same gene on the same contig are adjacent and
    # ordered by position; this is what makes the pairwise walk meaningful
    hits.sort_values(['analysis','filename','contigid','hitname','hitstart','hitstop'], inplace=True)

    # reading_list holds the rows of the current overlapping run;
    # reading_window is the span covered so far, seeded from the first row
    reading_list = []
    reading_window = {'min':min(hits.iloc[0].hitstart,hits.iloc[0].hitstop),'max':max(hits.iloc[0].hitstart,hits.iloc[0].hitstop)}

    # NOTE(review): with a single row pairwise() yields no pairs, so nothing
    # is appended to new_hits for that row - confirm this is intended
    for (i1, row1), (i2, row2) in pairwise(hits.iterrows()):
        if row1.analysis != row2.analysis:
            # at intersection between two hits
            at_intersection = True
        elif row1.filename != row2.filename:
            at_intersection = True
        elif row1.contigid != row2.contigid:
            at_intersection = True
        elif row1.hitname != row2.hitname:
            at_intersection = True
        elif not overlap(row2, reading_window):
            # same gene but no overlap: row2 is a second, separate (possibly
            # doubly expressed) occurrence of the gene
            at_intersection = True
        else:
            at_intersection = False

        if at_intersection:
            if not reading_list:
                # ie reading_list is empty
                # since we're already at an intersection, row1 is unique
                new_hits.append(dict(row1))
            else:
                # flush the pending run, keeping only its widest hit
                new_hits.append(dict(widest(reading_list)))
                reading_list = []
            # restart the window at row2
            reading_window['min'] = min(row2.hitstart, row2.hitstop)
            reading_window['max'] = max(row2.hitstart, row2.hitstop)
        else:
            # ie we found an overlap
            # expand the reading_window
            reading_window['min']=min(reading_window['min'],row2.hitstart,row2.hitstop)
            reading_window['max']=max(reading_window['max'],row2.hitstart,row2.hitstop)
            # NOTE(review): only row2 is appended here, so the first row of
            # an overlapping run never becomes a candidate for widest() -
            # confirm this is intended
            reading_list.append(row2)

        # check for end of iteration: row2 equal to the last row of hits.
        # cmp() is a Python 2 builtin.
        # NOTE(review): if reading_list is empty at this point, widest()
        # returns the empty list and dict([]) appends {} rather than the last
        # row; a row identical to the last one earlier in hits would also
        # trigger this flush - verify
        if cmp(dict(row2),dict(hits.iloc[-1])) == 0:
            new_hits.append(dict(widest(reading_list)))

    return new_hits

def weird_name(subq,subp):
    '''
    Tells whether either of the two names is one of the special-cased names
    ('st', 'tia') that should be ignored.
    return:
        True if subq or subp is one of those names, False otherwise
    '''
    ignored = ('st','tia')
    return any(name in ignored for name in (subq, subp))

def substring_cut(hits):
    '''
    Collapses hit names that contain one another onto the shorter name.

    Every row is compared against the rows whose index is greater than its
    own. When one hitname is a case-insensitive substring of the other (and
    neither is a name flagged by weird_name), the longer name is overwritten
    in hits with the shorter one. Names of equal length are left alone.

    The original author's caveat still applies: iterrows should return deep
    copies, so it is not certain this works properly.

    return:
        hits, modified in place
    '''
    for idx_a, hit_a in hits.iterrows():
        later_hits = hits.loc[hits.index>idx_a]
        for idx_b, hit_b in later_hits.iterrows():
            name_a = hit_a.hitname
            name_b = hit_b.hitname
            lowered_a = name_a.lower()
            lowered_b = name_b.lower()

            # unrelated names: neither contains the other
            if lowered_a not in lowered_b and lowered_b not in lowered_a:
                continue
            # special-cased names are never merged
            if weird_name(name_a, name_b):
                continue

            if len(name_a) > len(name_b):
                hits.loc[idx_a,'hitname']=name_b
            elif len(name_a) < len(name_b):
                hits.loc[idx_b, 'hitname']=name_a
    return hits

def check_alleles(gene_dict):
    '''
    Reduces the beautified results to one entry per gene occurrence.

    args:
        gene_dict: the results in the new format that is directly converted
            to json; anything pandas.DataFrame() accepts. Rows are read
            through their 'analysis' and 'hitname' fields here.
    return:
        whatever check_alleles_multiple returns for the remaining hits, or
        an empty pandas.DataFrame when gene_dict holds no rows
    '''
    #we are working with the new dict format that is directly converted to json
    hits = pd.DataFrame(gene_dict)
    new_hits = []

    # no rows means no 'analysis' column to inspect (hits.analysis would
    # raise AttributeError); hand back the empty frame, which has_failed
    # recognises through DataFrame.empty
    if hits.empty:
        return hits

    # we're not interested in checking serotype, so we keep its first row
    # aside and drop it from the frame
    if 'Serotype' in hits.analysis.unique():
        new_hits.append(dict(hits[hits['analysis']=='Serotype'].iloc[0]))
        # copy so the hitname assignment below does not write into a slice
        hits = hits[hits['analysis'] != 'Serotype'].copy()

    #we've update the db for VF so an allele check is only needed for AMR
    if 'Antimicrobial Resistance' in hits.analysis.unique():
        #strip allele info from data
        # assumes if an underscore is in a gene name, that anything after the underscore refers to an allele
        # NOTE(review): the stripping is applied to every remaining row, not
        # only to the Antimicrobial Resistance rows - confirm this is intended
        hits['hitname'] = hits['hitname'].apply(lambda x: x.split('_')[0].split('-I')[0].split('-V')[0])
        hits = substring_cut(hits)

    #this checks for alleles overlap
    new_hits = check_alleles_multiple(hits, new_hits)
    return new_hits

# logging
# initialize_logging() comes from modules.loggingFunctions; its return value
# is kept in log_file
log_file = initialize_logging()
# module-level logger named after this module
log = logging.getLogger(__name__)

def json_return(args_dict, gene_dict):
"""
this controls the actual return to Redis (& hence the result polled by the frontend)
This converts the gene dict into a json format for return to the front end
"""
log.info('args_dict: ' + str(args_dict))
log.info('gene_dict: ' + str(gene_dict))
json_r = []

# strip gene_dicts that user doesn't want to see
# remember, we want to run all analysis on our end so we have that data in blazegraph
d = dict(gene_dict)

#log.info('Results Gene Dict: ' + str(d))

for analysis in gene_dict:
if analysis == 'Serotype' and not args_dict['options']['serotype']:
del d['Serotype']
Expand All @@ -151,7 +32,7 @@ def json_return(args_dict, gene_dict):
del d['Virulence Factors']
gene_dict = d


log.info('After deletion from gene_dict: ' + str(gene_dict))

for analysis in gene_dict:
if analysis == 'Serotype':
Expand Down Expand Up @@ -187,9 +68,9 @@ def json_return(args_dict, gene_dict):
else:
instance_dict['hitcutoff'] = args_dict['pi']
json_r.append(instance_dict)
return json_r

json_r = check_alleles(json_r)

def has_failed(json_r):
# check if we tried to beautify a failed analysis
failed = False
if isinstance(json_r, list):
Expand All @@ -198,35 +79,33 @@ def json_return(args_dict, gene_dict):
elif isinstance(json_r,pd.DataFrame):
if json_r.empty:
failed = True

# if we beautified a failed analysis add this info to return
if failed:
ret = []
instance_dict = {}
instance_dict['filename'] = basename(args_dict['i'])[27:]
instance_dict['contigid'] = 'n/a'
#instance_dict['analysis'] = analysis
instance_dict['hitname'] = 'No Results Found.'
instance_dict['hitorientation'] = 'n/a'
instance_dict['hitstart'] = 'n/a'
instance_dict['hitstop'] = 'n/a'
instance_dict['hitcutoff'] = 'n/a'

if not args_dict['disable_serotype']:
t = dict(instance_dict)
t.update({'analysis':'Serotype'})
ret.append(t)
if not args_dict['disable_vf']:
t = dict(instance_dict)
t.update({'analysis':'Virulence Factors'})
ret.append(t)
if not args_dict['disable_amr']:
t = dict(instance_dict)
t.update({'analysis':'Antimicrobial Resistance'})
ret.append(t)
return ret
else:
return json_r
return failed

def handle_failed(json_r, args_dict):
    '''
    Builds the placeholder rows returned when beautifying a failed analysis.

    One 'No Results Found.' row is produced for every analysis the user
    asked for. args_dict['options'][...] is truthy when an analysis was
    requested (json_return deletes the results whose option is falsy), so
    the placeholders are emitted for the truthy options.

    args:
        json_r: the failed result; not used, kept for the call signature
        args_dict(dict): needs 'i' (path of the input file) and 'options'
            with the 'serotype', 'vf' and 'amr' flags
    return:
        list(dict): placeholder rows in the same shape as regular results
    '''
    placeholder = {
        # presumably [27:] strips a fixed-width prefix from the stored
        # file name - TODO confirm against the upload code
        'filename': basename(args_dict['i'])[27:],
        'contigid': 'n/a',
        'hitname': 'No Results Found.',
        'hitorientation': 'n/a',
        'hitstart': 'n/a',
        'hitstop': 'n/a',
        'hitcutoff': 'n/a',
    }

    # option flag -> analysis label, in the order the rows are returned
    analyses = (
        ('serotype', 'Serotype'),
        ('vf', 'Virulence Factors'),
        ('amr', 'Antimicrobial Resistance'),
    )

    options = args_dict['options']
    ret = []
    for option, analysis in analyses:
        if options[option]:
            row = dict(placeholder)
            row['analysis'] = analysis
            ret.append(row)
    return ret

def beautify(args_dict, pickled_dictionary):
'''
Expand All @@ -237,4 +116,15 @@ def beautify(args_dict, pickled_dictionary):
:return: json representation of the results, as required by the front-end.
'''
gene_dict = pickle.load(open(pickled_dictionary, 'rb'))
return json_return(args_dict, gene_dict)
# this converts our dictionary structure into json and adds metadata (filename, etc.)
json_r = json_return(args_dict, gene_dict)
log.info('First parse into json_r: ' + str(json_r))
# if looking for only serotype, skip this step
if args_dict['options']['vf'] or args_dict['options']['amr']:
json_r = check_alleles(json_r)
log.info('After checking alleles json_r: ' + str(json_r))
# check if there is an analysis module that has failed in the result
if has_failed(json_r):
return handle_failed(json_r, args_dict)
else:
return json_r
Loading

0 comments on commit 07ae827

Please sign in to comment.