Neo4J v053 Test2 - Sample Based Analysis v02 (Add relationship and dstat messages)
================

## Set up the environment


### Install py2neo for querying Neo4J 

In [196]:
#!pip3 install py2neo

### Import Packages

In [197]:
from py2neo import Graph
import pandas as pd
import numpy as np
import subprocess
import matplotlib 
import matplotlib.pyplot as plt
import seaborn as sns

#plt.style.use('fivethirtyeight')
#plt.style.use('ggplot')
pd.set_option('display.float_format', lambda x: '%.2f' % x)

### Load Neo4J DB

In [198]:
graph = Graph("bolt://35.230.80.57:7687", auth=("neo4j", "bZjcc1XTd71ft2oVGj7A8aU8UkttdBhRAecV4x6USC3wpeOLmCmfCAH2bAMllQQlwtzCbzLuX1TgoX-Enc7MUA"))

## Generate Sample Status Table

### The number of nodes (Fastq, Ubam, Vcf, Cram, Crai) 

In [199]:
# Fastq
query = "Match (j:Json:FromPersonalis), (f:Fastq) WHERE f.sample = j.sample RETURN j.sample AS sample, count(f) AS fastq"
num_fastq = graph.run(query).to_data_frame()
num_fastq.set_index('sample')

# Ubam
query = "Match (j:Json:FromPersonalis), (u:Ubam) WHERE u.sample = j.sample RETURN DISTINCT j.sample AS sample, count(u) AS ubam"
num_ubam = graph.run(query).to_data_frame()
num_ubam.set_index('sample')

# Vcf
query = "Match (j:Json:FromPersonalis), (v:Merged:Vcf) WHERE v.sample = j.sample RETURN DISTINCT j.sample AS sample, count(v) AS vcf"
num_vcf = graph.run(query).to_data_frame()
num_vcf.set_index('sample')

# Cram
query = "Match (j:Json:FromPersonalis), (cm:Cram) WHERE cm.sample = j.sample RETURN DISTINCT j.sample AS sample, count(cm) AS cram"
num_cram = graph.run(query).to_data_frame()
num_cram.set_index('sample')

# Crai
query = "Match (j:Json:FromPersonalis), (ci:Crai) WHERE ci.sample = j.sample RETURN DISTINCT j.sample AS sample, count(ci) AS crai"
num_crai = graph.run(query).to_data_frame()
num_crai.set_index('sample')
print("")

Failed to write data to connection ('35.230.80.57', 7687) (Address(host='35.230.80.57', port=7687)); ("104; 'Connection reset by peer'")





### The number of jobs (FQ2U, GATK) 

In [200]:
# Fq2u
query = "Match (j:Json:FromPersonalis), (e:Job:Dsub {name:'fastq-to-ubam'}) WHERE e.sample = j.sample RETURN j.sample AS sample, count(e) AS fq2u"
num_fq2u = graph.run(query).to_data_frame()
num_fq2u.set_index('sample')

# Gatk
query = "Match (j:Json:FromPersonalis), (g:Job:Cromwell) WHERE g.sample = j.sample RETURN j.sample AS sample, count(g) AS gatk"
num_gatk = graph.run(query).to_data_frame()
num_gatk.set_index('sample')
print("")




### Merge all node and job dfs to one df

In [201]:
#-- only nodes
#sample_qc_df=num_fastq.merge(num_ubam, how='outer').merge(num_vcf, how='outer').merge(num_cram, how='outer').merge(num_crai, how='outer')
#columnlist=["sample","fastq","ubam","vcf","cram","crai"]

#-- nodes and jobs
sample_qc_df=num_fastq.merge(num_ubam, how='outer').merge(num_vcf, how='outer').merge(num_cram, how='outer').merge(num_crai, how='outer').merge(num_fq2u,how='outer').merge(num_gatk,how='outer')
columnlist=["sample","fastq","ubam","vcf","cram","crai","fq2u","gatk"]
numsample=len(sample_qc_df)
print("The number of samples : " + str(numsample) + "\n")

sample_qc_df=sample_qc_df[columnlist]
sample_qc_df.fillna(0,inplace=True)
#sample_qc_df.head(3)

The number of samples : 1330



### Classification based on sample status and success.

In [202]:
##-- Passed

#- 1. Successful jobs
sample_qc_df.loc[(sample_qc_df['ubam']==sample_qc_df['fastq']/2)&(sample_qc_df['vcf']==1)&(sample_qc_df['cram']==1) \
                                   &(sample_qc_df['crai']==1)&(sample_qc_df['fq2u']==sample_qc_df['fastq']/2)&(sample_qc_df['gatk']==1),'status']="success"
#- 2. Duplicated jobs
sample_qc_df.loc[(sample_qc_df['ubam']==sample_qc_df['fastq']/2)&(sample_qc_df['vcf']==1)&(sample_qc_df['cram']==1) \
                                   &(sample_qc_df['crai']==1)&((sample_qc_df['fq2u']>sample_qc_df['fastq']/2)|(sample_qc_df['gatk']>1)),'status']="duplicated jobs"
#- 3. Duplicated jobs and nodes
sample_qc_df.loc[((sample_qc_df['ubam']>sample_qc_df['fastq']/2)|(sample_qc_df['vcf']>1)|(sample_qc_df['cram']>1) \
                                 |(sample_qc_df['crai']>1))&((sample_qc_df['fq2u']>=sample_qc_df['fastq']/2)|(sample_qc_df['gatk']>=1)),'status']="duplicated nodes"
#- Success 
sample_qc_df.loc[(sample_qc_df['status'].isin(["success", "duplicated jobs", "duplicated nodes"])), 'success'] = "pass"


##-- failed

#- 4. failed fq2u jobs
sample_qc_df.loc[((sample_qc_df['ubam']<sample_qc_df['fastq']/2)&(sample_qc_df['vcf']==0)&(sample_qc_df['cram']==0) \
                                   &(sample_qc_df['crai']==0))&((sample_qc_df['fq2u']>=sample_qc_df['fastq']/2)|(sample_qc_df['gatk']==0)),'status']="failed fq2u"
#- 5. failed gatk jobs
sample_qc_df.loc[(sample_qc_df['ubam']==sample_qc_df['fastq']/2)&((sample_qc_df['vcf']<1)|(sample_qc_df['cram']<1) \
                                   |(sample_qc_df['crai']<1))&((sample_qc_df['fq2u']>=sample_qc_df['fastq']/2)&(sample_qc_df['gatk']>=1)),'status']="failed gatk"
#- 6. no gatk jobs
sample_qc_df.loc[((sample_qc_df['ubam']==sample_qc_df['fastq']/2)&(sample_qc_df['vcf']==0)&(sample_qc_df['cram']==0) \
                                   &(sample_qc_df['crai']==0))&((sample_qc_df['fq2u']>=sample_qc_df['fastq']/2)&(sample_qc_df['gatk']==0)),'status']="no gatk"
#- Fail
sample_qc_df.loc[(sample_qc_df['status'].isin(["failed fq2u", "failed gatk", "no gatk"])), 'success'] = "fail"

##-- Check unclassified samples.
num_unclassified = len(sample_qc_df[sample_qc_df.status.isna()==True])
print("The number of unclassified samples : " + str(num_unclassified)+"\n")

if num_unclassified != 0 :
    display(sample_qc_df[sample_qc_df.status.isna()==True])
    

The number of unclassified samples : 0



In [203]:
##-- Display of this table
pd.set_option('display.float_format', lambda x: '%.f' % x)

display(sample_qc_df[sample_qc_df['status']=='success'].head(2))
display(sample_qc_df[sample_qc_df['status']=='duplicated jobs'].head(2))
display(sample_qc_df[sample_qc_df['status']=='duplicated nodes'].head(2))
display(sample_qc_df[sample_qc_df['status']=='failed fq2u'].head(2))
display(sample_qc_df[sample_qc_df['status']=='no gatk'].head(2))
display(sample_qc_df[sample_qc_df['status']=='failed gatk'].head(2))

Unnamed: 0,sample,fastq,ubam,vcf,cram,crai,fq2u,gatk,status,success
0,SHIP4946368,8,4,1,1,1,4,1,success,pass
1,SHIP4946372,8,4,1,1,1,4,1,success,pass


Unnamed: 0,sample,fastq,ubam,vcf,cram,crai,fq2u,gatk,status,success
75,SHIP4954744,8,4,1,1,1,8,1,duplicated jobs,pass
105,SHIP4954745,8,4,1,1,1,8,1,duplicated jobs,pass


Unnamed: 0,sample,fastq,ubam,vcf,cram,crai,fq2u,gatk,status,success
97,SHIP5119429,8,5,1,1,1,7,1,duplicated nodes,pass
365,SHIP5142661,8,4,1,1,2,4,1,duplicated nodes,pass


Unnamed: 0,sample,fastq,ubam,vcf,cram,crai,fq2u,gatk,status,success
92,SHIP5119443,8,3,0,0,0,4,0,failed fq2u,fail
175,SHIP5141860,8,3,0,0,0,4,0,failed fq2u,fail


Unnamed: 0,sample,fastq,ubam,vcf,cram,crai,fq2u,gatk,status,success
226,SHIP5119492,8,4,0,0,0,4,0,no gatk,fail
233,SHIP5141891,12,6,0,0,0,6,0,no gatk,fail


Unnamed: 0,sample,fastq,ubam,vcf,cram,crai,fq2u,gatk,status,success
210,SHIP5141905,8,4,0,1,1,4,1,failed gatk,fail
457,SHIP5165752,8,4,0,0,0,4,1,failed gatk,fail


## Generate Status Table

### The number of samples by status and success

In [204]:
stat_status_qc=sample_qc_df['status'].value_counts().to_frame()
stat_status_qc['rate']=100*(stat_status_qc['status']/numsample)
stat_status_qc=stat_status_qc.reindex(index = ['success', 'duplicated jobs', 'duplicated nodes', 'failed fq2u', 'no gatk', 'failed gatk'])
display(stat_status_qc)

pd.set_option('display.float_format', lambda x: '%.2f' % x)
print("Success Rate : " + str(sum(stat_status_qc['rate'][0:3])) + "%")
print("Failed Rate : " + str(sum(stat_status_qc['rate'][3:6])) + "%")

Unnamed: 0,status,rate
success,1285,97
duplicated jobs,15,1
duplicated nodes,8,1
failed fq2u,15,1
no gatk,3,0
failed gatk,4,0


Success Rate : 98.34586466165413%
Failed Rate : 1.6541353383458646%


## Check the relationship informaiton and dstat message of successful samples

### fq2u issued relationship of successful samples

In [205]:
success_sample_df = sample_qc_df[sample_qc_df['status']=='success']
print("The number of success samples: " + str(len(success_sample_df)))
success_sample=success_sample_df['sample'].to_list()

query = "Match (:Fastq)-[:INPUT_TO]->(j:Job:Dsub) WHERE not (j)-[:OUTPUT]->(:Ubam) and j.sample IN {} RETURN distinct j.sample AS sample_no_fq2utoubam".format(str(success_sample))
norelation_ubam = graph.run(query).to_data_frame()
print("The number of samples without the relationship between fq2u and ubam: " + str(len(norelation_ubam)))
print("\n")

display(norelation_ubam.head(3))

The number of success samples: 1285
The number of samples without the relationship between fq2u and ubam: 3




Unnamed: 0,sample_no_fq2utoubam
0,SHIP5119430
1,SHIP5153310
2,SHIP5185260


### gatk issued relationship of successful samples

In [206]:
# success_sample_df = sample_qc_df[sample_qc_df['status']=='success']
# print("The number of success samples: " + str(len(success_sample_df)))
# success_sample=norelation_sample_df['sample'].to_list()

query = "Match (j:Job:Cromewell) WHERE (not (:Ubam)-[:INPUT_TO]->(j) or not (j)-[:OUTPUT]->(:Merged:Vcf) or not (j)-[:OUTPUT]->(:Cram) or not (j)-[:OUTPUT]->(:Crai)) and (j.sample IN {}) RETURN distinct j.sample AS sample_missed_output".format(str(success_sample))
norelation_output = graph.run(query).to_data_frame()
print("The number of samples without the relationship between gatk and outputs: " + str(len(norelation_output)))

display(norelation_output.head(3))
#display(set(norelation_sample)-set(norelation_dstat['sample']))

The number of samples without the relationship between gatk and outputs: 0


### Issued fq2u dstat message in successful samples

In [207]:
# success_sample_df = sample_qc_df[sample_qc_df['status']=='success']
# print("The number of success samples: " + str(len(success_sample_df)))
# success_sample=norelation_sample_df['sample'].to_list()

query = "Match (j:Job:Dsub)-[:STATUS]->(s:Dstat) WHERE j.sample IN {} and not s.status='SUCCESS' RETURN j.sample AS sample, j.readGroup As RG, j.logging As log, s.status AS status, s.statusMessage AS Message".format(str(success_sample))
issued_fq2u_dstat = graph.run(query).to_data_frame()
print("The number of samples with 'FAILURE' as dstat message  : " + str(len(issued_fq2u_dstat)))

if(len(issued_fq2u_dstat)):
    ## Issued Sample List
    display(issued_fq2u_dstat.head(3))

    ## Classified by Messange
    display(issued_fq2u_dstat.groupby(['Message','status']).count())

The number of samples with 'FAILURE' as dstat message  : 3


Unnamed: 0,Message,RG,log,sample,status
0,ServiceException: 401 Anonymous caller does no...,2,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5169288,FAILURE
1,ServiceException: 401 Anonymous caller does no...,2,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5153375,FAILURE
2,ServiceException: 401 Anonymous caller does no...,1,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5169413,FAILURE


Unnamed: 0_level_0,Unnamed: 1_level_0,RG,log,sample
Message,status,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ServiceException: 401 Anonymous caller does not have storage.objects.list access to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\nServiceException: 401 Anonymous caller does not have storage.objects.list access to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\nServiceException: 401 Anonymous caller does not have storage.objects.list access to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\n,FAILURE,3,3,3


### Issued gatk dstat message in successful samples

In [208]:
# success_sample_df = sample_qc_df[sample_qc_df['status']=='success']
# print("The number of success samples: " + str(len(success_sample_df)))
# success_sample=norelation_sample_df['sample'].to_list()

query = "Match (j:Job:Cromwell)-[:STATUS]->(s:Dstat) WHERE j.sample IN {} and not s.status='SUCCESS' RETURN j.sample AS sample, j.logging As log, s.status AS status, s.statusMessage AS Message".format(str(success_sample))
issued_gatk_dstat = graph.run(query).to_data_frame()
print("The number of samples with 'FAILURE' as dstat message  : " + str(len(issued_gatk_dstat)))


if(len(issued_gatk_dstat)):
    ## Issued Sample List
    display(issued_gatk_dstat.head(3))
    
    ## Classified by Message
    display(issued_gatk_dstat.groupby(['Message','status']).count())
    

The number of samples with 'FAILURE' as dstat message  : 0


### Add relationship information and issued dstat message

In [210]:
pd.set_option('display.float_format', lambda x: '%.f' % x)

success_re_dstat_df=pd.DataFrame()
success_re_dstat_df['sample']=success_sample_df['sample']

## Add the column in 'success_re_dstat_df' with input and output connections of fq2u jobs
try:
  success_re_dstat_df.loc[success_re_dstat_df['sample'].isin(norelation_ubam['sample_no_fq2utoubam']),'re_fq2u']="missing"
  success_re_dstat_df.loc[~success_re_dstat_df['sample'].isin(norelation_ubam['sample_no_fq2utoubam']),'re_fq2u']="all"
except:
  print("No missing input and output connections of fq2u jobs")
  success_re_dstat_df.loc[:,'re_fq2u']="all"

## Add the column in 'success_re_dstat_df' with input and output connections of gatk jobs
try:
  success_re_dstat_df.loc[success_re_dstat_df['sample'].isin(norelation_output['sample_missed_output']),'re_gatk']="missing"
  success_re_dstat_df.loc[~success_re_dstat_df['sample'].isin(norelation_output['sample_missed_output']),'re_gatk']="all"
except:
  print("No missing input and output connections of gatk jobs")
  success_re_dstat_df.loc[:,'re_gatk']="all"


## Add the column in 'success_re_dstat_df' with dstat message

try:
  issued_fq2u_dstat_df=issued_fq2u_dstat[['sample','RG','Message','log']]
  issued_fq2u_dstat_df.rename(columns={'RG': 'dstat_job', 'Message': 'dstat_msg'}, inplace=True)
  success_re_dstat_df=success_re_dstat_df.merge(issued_fq2u_dstat_df, on='sample', how='left')
except:
  print("No issued dstat message of fq2u jobs")

try:
  issued_gatk_dstat_df=issued_gatk_dstat[['sample','Message','log']]
  issued_gatk_dstat_df.rename(columns={'Message': 'dstat_msg'}, inplace=True)
  issued_gatk_dstat_df.loc[:,'dstat_job']="GATK"
  success_re_dstat_df=success_re_dstat_df.merge(issued_gatk_dstat_df, on='sample', how='left')
except:
  print("No issued dstat message of gatk jobs")

display(success_re_dstat_df.head(3))
display(success_re_dstat_df[success_re_dstat_df['sample'].isin(norelation_ubam['sample_no_fq2utoubam'])])
display(success_re_dstat_df[success_re_dstat_df['sample'].isin(issued_fq2u_dstat['sample'])])


sample_qc_df=sample_qc_df.merge(success_re_dstat_df, on='sample', how='left')
display(sample_qc_df.head(3))
display(sample_qc_df[sample_qc_df['sample'].isin(norelation_ubam['sample_no_fq2utoubam'])])
display(sample_qc_df[sample_qc_df['sample'].isin(issued_fq2u_dstat['sample'])])

No missing input and output connections of gatk jobs
No issued dstat message of gatk jobs


Unnamed: 0,sample,re_fq2u,re_gatk,dstat_job,dstat_msg,log
0,SHIP4946368,all,all,,,
1,SHIP4946372,all,all,,,
2,SHIP4948530,all,all,,,


Unnamed: 0,sample,re_fq2u,re_gatk,dstat_job,dstat_msg,log
87,SHIP5119430,missing,all,,,
603,SHIP5153310,missing,all,,,
1206,SHIP5185260,missing,all,,,


Unnamed: 0,sample,re_fq2u,re_gatk,dstat_job,dstat_msg,log
654,SHIP5153375,all,all,2,ServiceException: 401 Anonymous caller does no...,gs://gbsc-gcp-project-mvp-test-from-personalis...
943,SHIP5169288,all,all,2,ServiceException: 401 Anonymous caller does no...,gs://gbsc-gcp-project-mvp-test-from-personalis...
1057,SHIP5169413,all,all,1,ServiceException: 401 Anonymous caller does no...,gs://gbsc-gcp-project-mvp-test-from-personalis...


Unnamed: 0,sample,fastq,ubam,vcf,cram,crai,fq2u,gatk,status,success,re_fq2u,re_gatk,dstat_job,dstat_msg,log
0,SHIP4946368,8,4,1,1,1,4,1,success,pass,all,all,,,
1,SHIP4946372,8,4,1,1,1,4,1,success,pass,all,all,,,
2,SHIP4948530,8,4,1,1,1,4,1,success,pass,all,all,,,


Unnamed: 0,sample,fastq,ubam,vcf,cram,crai,fq2u,gatk,status,success,re_fq2u,re_gatk,dstat_job,dstat_msg,log
88,SHIP5119430,8,4,1,1,1,4,1,success,pass,missing,all,,,
624,SHIP5153310,8,4,1,1,1,4,1,success,pass,missing,all,,,
1240,SHIP5185260,8,4,1,1,1,4,1,success,pass,missing,all,,,


Unnamed: 0,sample,fastq,ubam,vcf,cram,crai,fq2u,gatk,status,success,re_fq2u,re_gatk,dstat_job,dstat_msg,log
676,SHIP5153375,8,4,1,1,1,4,1,success,pass,all,all,2,ServiceException: 401 Anonymous caller does no...,gs://gbsc-gcp-project-mvp-test-from-personalis...
968,SHIP5169288,8,4,1,1,1,4,1,success,pass,all,all,2,ServiceException: 401 Anonymous caller does no...,gs://gbsc-gcp-project-mvp-test-from-personalis...
1085,SHIP5169413,8,4,1,1,1,4,1,success,pass,all,all,1,ServiceException: 401 Anonymous caller does no...,gs://gbsc-gcp-project-mvp-test-from-personalis...


## Check dstat message of samples with duplicated jobs and nodes

### Duplicated Jobs (Fq2u)

In [140]:
dupjobs_sample_df = sample_qc_df[sample_qc_df['status']=='duplicated jobs']
print("The number of success samples with duplicated jobs: " + str(len(dupjobs_sample_df)))
dupjobs_sample=dupjobs_sample_df['sample'].to_list()

#display(dupjobs_sample_df)

query = "Match (j:Job:Dsub)-[:STATUS]->(s:Dstat) WHERE j.sample IN {} and not s.status='SUCCESS' RETURN j.sample AS sample, j.logging As log, s.status AS status, s.statusMessage AS Message".format(str(dupjobs_sample))
dupfq2u_dstat = graph.run(query).to_data_frame()
print("The number of duplicated fq2u jobs : " + str(len(dupfq2u_dstat)))

## Issued Sample and Job List
display(dupfq2u_dstat.head())

## Classified by Messange
dupfq2u_dstat.groupby(['Message','status']).count()

The number of success samples with duplicated jobs: 15
The number of duplicated fq2u jobs : 47


Unnamed: 0,Message,log,sample,status
0,Stopped running 'prepare',gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP4954745,RUNNING
1,Worker 'google-pipelines-worker-6b640db5e96c87...,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP4954745,RUNNING
2,worker was terminated,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP4954744,FAILURE
3,worker was terminated,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP4954745,FAILURE
4,worker was terminated,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5142424,FAILURE


Unnamed: 0_level_0,Unnamed: 1_level_0,log,sample
Message,status,Unnamed: 2_level_1,Unnamed: 3_level_1
Started pulling 'gcr.io/gbsc-gcp-project-mvp-test/broadinstitute/gatk:4.1.0.0',RUNNING,13,13
Started running 'localization',RUNNING,15,15
Started running 'prepare',RUNNING,2,2
Started running 'user-command',RUNNING,1,1
Stopped pulling 'gcr.io/gbsc-gcp-project-mvp-test/broadinstitute/gatk:4.1.0.0',RUNNING,4,4
Stopped running 'prepare',RUNNING,1,1
Worker 'google-pipelines-worker-115ef3db30e81d0fe0ae9ac02676b21b' assigned in 'us-west1-a',RUNNING,1,1
Worker 'google-pipelines-worker-6b640db5e96c874bc90e6972f27ef0ae' assigned in 'us-west1-c',RUNNING,1,1
Worker 'google-pipelines-worker-882a96720b399174d1205054b74f7557' assigned in 'us-west1-a',RUNNING,1,1
Worker 'google-pipelines-worker-c30f07ea065aebd51e7e2499b3023d6b' assigned in 'us-west1-a',RUNNING,1,1


### Duplicated Jobs (GATK)

In [141]:
#dupjobs_sample_df = sample_qc_df[sample_qc_df['status']=='duplicated jobs']
print("The number of success samples with duplicated jobs: " + str(len(dupjobs_sample_df)))
#dupjobs_sample=dupjobs_sample_df['sample'].to_list()

#display(dupjobs_sample_df)

query = "Match (j:Job:Cromwell)-[:STATUS]->(s:Dstat) WHERE j.sample IN {} and not s.status='SUCCESS' RETURN j.sample AS sample, j.logging As log, s.status AS status, s.statusMessage AS Message".format(str(dupjobs_sample))
dupgatk_dstat = graph.run(query).to_data_frame()
print("The number of duplicated fq2u jobs : " + str(len(dupgatk_dstat)))
display(dupgatk_dstat.head(3))

dupgatk_dstat.groupby(['Message','status']).count()

The number of success samples with duplicated jobs: 15
The number of duplicated fq2u jobs : 1


Unnamed: 0,Message,log,sample,status
0,Started running 'user-command',gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5141855,RUNNING


Unnamed: 0_level_0,Unnamed: 1_level_0,log,sample
Message,status,Unnamed: 2_level_1,Unnamed: 3_level_1
Started running 'user-command',RUNNING,1,1


### Duplicated Job and Node

In [142]:
dupnodes_sample_df = sample_qc_df[sample_qc_df['status']=='duplicated nodes']
print("The number of success samples with duplicated nodes: " + str(len(dupnodes_sample_df)))
dupnodes_sample=dupnodes_sample_df['sample'].to_list()

display(dupnodes_sample_df)

query = "Match (j:Job:Cromwell)-[:STATUS]->(s:Dstat) WHERE j.sample IN {} and not s.status='SUCCESS' RETURN j.sample AS sample, j.logging As log, s.status AS status, s.statusMessage AS Message".format(str(dupnodes_sample))
dupnodes_dstat = graph.run(query).to_data_frame()
print("The number of duplicated nodes: " + str(len(dupnodes_dstat)))
display(dupnodes_dstat.head(3))

#dupjob_dstat.groupby(['Message','status']).count()

The number of success samples with duplicated nodes: 8


Unnamed: 0,sample,fastq,ubam,vcf,cram,crai,fq2u,gatk,status,success
97,SHIP5119429,8,5,1.0,1.0,1.0,7,1.0,duplicated nodes,pass
365,SHIP5142661,8,4,1.0,1.0,2.0,4,1.0,duplicated nodes,pass
368,SHIP5142468,8,4,1.0,2.0,1.0,4,1.0,duplicated nodes,pass
633,SHIP5153353,8,4,1.0,1.0,2.0,4,1.0,duplicated nodes,pass
1050,SHIP5169373,8,4,1.0,1.0,2.0,4,1.0,duplicated nodes,pass
1248,SHIP5185273,8,4,1.0,2.0,2.0,4,1.0,duplicated nodes,pass
1313,SHIP5185330,8,4,2.0,1.0,1.0,4,1.0,duplicated nodes,pass
1319,SHIP5185343,8,4,1.0,2.0,1.0,4,1.0,duplicated nodes,pass


The number of duplicated nodes: 0


## Check the log list and the dstat message of issued jobs

### Failed Fq2u

In [189]:
failed_fq2u_sample_df = sample_qc_df[sample_qc_df['status']=='failed fq2u']
print("The number of samples with failed fq2u jobs: " + str(len(failed_fq2u_sample_df)))
failed_fq2u_sample=failed_fq2u_sample_df['sample'].to_list()

query = "Match (j:Job:Dsub)-[:STATUS]->(s:Dstat) WHERE j.sample IN {} and not s.status='SUCCESS' RETURN j.sample AS sample, j.readGroup AS RG, j.logging As log, s.status AS status, s.statusMessage AS Message".format(str(failed_fq2u_sample))
failed_fq2u_dstat = graph.run(query).to_data_frame()
print("The number of failed fq2u jobs : " + str(len(failed_fq2u_dstat)))
display(failed_fq2u_dstat.head(3))

failed_fq2u_dstat.groupby('Message').count()

The number of samples with failed fq2u jobs: 15
The number of failed fq2u jobs : 15


Unnamed: 0,Message,RG,log,sample,status
0,to gbsc-gcp-project-mvp-test-from-personalis-...,0,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5119443,FAILURE
1,ServiceException: 401 Anonymous caller does no...,3,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5141860,FAILURE
2,ServiceException: 401 Anonymous caller does no...,1,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5142426,FAILURE


Unnamed: 0_level_0,RG,log,sample,status
Message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\nServiceException: 401 Anonymous caller does not have storage.objects.list access to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\nServiceException: 401 Anonymous caller does not have storage.objects.list access to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\nServiceException: 401 Anonymous caller does not have storage.objects.list access to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\nServiceException: 401 Anonymous caller does not have storage.objects.list access to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\nServiceException: 401 Anonymous caller does not have storage.objects.list access to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\nServiceException: 401 Anonymous caller does not have storage.objects.list access to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\nServiceException: 401 Anonymous caller does not have storage.objects.list access to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\n,1,1,1,1
ServiceException: 401 Anonymous caller does not have storage.objects.list access to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\nServiceException: 401 Anonymous caller does not have storage.objects.list access to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\nServiceException: 401 Anonymous caller does not have storage.objects.list access to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\n,14,14,14,14


### Failed GATK

In [144]:
failed_gatk_sample_df = sample_qc_df[sample_qc_df['status']=='failed gatk']
print("The number of samples with failed gatk jobs: " + str(len(failed_gatk_sample_df)))
failed_gatk_sample=failed_gatk_sample_df['sample'].to_list()

query = "Match (j:Job:Cromwell)-[:STATUS]->(s:Dstat) WHERE j.sample IN {} and not s.status='SUCCESS' RETURN j.sample AS sample, j.logging As log, s.status AS status, s.statusMessage AS Message".format(str(failed_gatk_sample))
failed_gatk_dstat = graph.run(query).to_data_frame()
print("The number of duplicated gatk jobs : " + str(len(failed_gatk_dstat)))
display(failed_gatk_dstat.head(3))

failed_gatk_dstat.groupby('Message').count()

The number of samples with failed gatk jobs: 4
The number of duplicated gatk jobs : 4


Unnamed: 0,Message,log,sample,status
0,The assigned worker has failed to complete the...,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5141905,FAILURE
1,Workflow 467872d3-5a11-4f6d-a25d-f9f90e61b72d ...,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5165752,FAILURE
2,Workflow e75a10d0-1e1c-4523-840b-982cea8e506c ...,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5153220,FAILURE


Unnamed: 0_level_0,log,sample,status
Message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
The assigned worker has failed to complete the operation,1,1,1
Workflow 467872d3-5a11-4f6d-a25d-f9f90e61b72d transitioned to state Failed\n,1,1,1
Workflow 95c7e923-6249-4b25-a269-05984f4069af transitioned to state Failed\n,1,1,1
Workflow e75a10d0-1e1c-4523-840b-982cea8e506c transitioned to state Failed\n,1,1,1


### Add failed dstat message

In [195]:
pd.set_option('display.float_format', lambda x: '%.f' % x)

failed_re_dstat_df=pd.DataFrame()
df1=failed_fq2u_dstat[['sample','RG','Message','log']]
df2=failed_fq2u_dstat[['sample','Message','log']]
failed_re_dstat_df=pd.concat([df1,df2])

failed_re_dstat_df.rename(columns={'RG': 'dstat_job', 'Message': 'dstat_msg'}, inplace=True)

## Add the column in 'failed_re_dstat_df' with failed fq2u jobs
# try:
#   failed_re_dstat_df.loc[failed_re_dstat_df['sample'].isin(failed_fq2u_sample),'re_fq2u']="missing"
#   failed_re_dstat_df.loc[~failed_re_dstat_df['sample'].isin(norelation_ubam['sample_no_fq2utoubam']),'re_fq2u']="all"
# except:
#   print("No missing input and output connections of fq2u jobs")
#   success_re_dstat_df.loc[:,'re_fq2u']="all"

# ## Add the column in 'success_re_dstat_df' with input and output connections of gatk jobs
# try:
#   success_re_dstat_df.loc[success_re_dstat_df['sample'].isin(norelation_output['sample_missed_output']),'re_gatk']="missing"
#   success_re_dstat_df.loc[~success_re_dstat_df['sample'].isin(norelation_output['sample_missed_output']),'re_gatk']="all"
# except:
#   print("No missing input and output connections of gatk jobs")
#   success_re_dstat_df.loc[:,'re_gatk']="all"


# ## Add the column in 'success_re_dstat_df' with dstat message

# try:
#   issued_fq2u_dstat_df=issued_fq2u_dstat[['sample','RG','Message','log']]
#   issued_fq2u_dstat_df.rename(columns={'RG': 'dstat_job', 'Message': 'dstat_msg'}, inplace=True)
#   success_re_dstat_df=success_re_dstat_df.merge(issued_fq2u_dstat_df, on='sample', how='left')
# except:
#   print("No issued dstat message of fq2u jobs")

# try:
#   issued_gatk_dstat_df=issued_gatk_dstat[['sample','Message','log']]
#   issued_gatk_dstat_df.rename(columns={'Message': 'dstat_msg'}, inplace=True)
#   issued_gatk_dstat_df.loc[:,'dstat_job']="GATK"
#   success_re_dstat_df=success_re_dstat_df.merge(issued_gatk_dstat_df, on='sample', how='left')
# except:
#   print("No issued dstat message of gatk jobs")

display(failed_re_dstat_df)
# display(success_re_dstat_df[success_re_dstat_df['sample'].isin(norelation_ubam['sample_no_fq2utoubam'])])
# display(success_re_dstat_df[success_re_dstat_df['sample'].isin(issued_fq2u_dstat['sample'])])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


Unnamed: 0,dstat_msg,dstat_job,log,sample
0,to gbsc-gcp-project-mvp-test-from-personalis-...,0.0,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5119443
1,ServiceException: 401 Anonymous caller does no...,3.0,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5141860
2,ServiceException: 401 Anonymous caller does no...,1.0,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5142426
3,ServiceException: 401 Anonymous caller does no...,1.0,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5142623
4,ServiceException: 401 Anonymous caller does no...,1.0,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5132721
5,ServiceException: 401 Anonymous caller does no...,3.0,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5142647
6,ServiceException: 401 Anonymous caller does no...,2.0,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5153233
7,ServiceException: 401 Anonymous caller does no...,0.0,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5183667
8,ServiceException: 401 Anonymous caller does no...,2.0,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5183694
9,ServiceException: 401 Anonymous caller does no...,0.0,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5165713
