Neo4J v053 Test2 - Sample Based Analysis
================

## Set up the environment


### Install py2neo for querying Neo4J 

In [124]:
#!pip3 install py2neo

### Import Packages

In [125]:
import os
import subprocess

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from py2neo import Graph

#plt.style.use('fivethirtyeight')
#plt.style.use('ggplot')
# Render floats without decimals in displayed frames (the counts become floats after the outer merges).
pd.set_option('display.float_format', '{:.0f}'.format)

### Load Neo4J DB

In [126]:
# Connection settings are read from the environment so that no credential is stored in
# the notebook. NEO4J_URI and NEO4J_USER fall back to the values used so far; the
# password deliberately has no default, so a missing NEO4J_PASSWORD raises KeyError
# here instead of failing later with an authentication error.
# NOTE(review): the password that used to be hardcoded in this cell has been exposed
# through the notebook history and should be rotated.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://35.230.80.57:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
graph = Graph(NEO4J_URI, auth=(NEO4J_USER, os.environ["NEO4J_PASSWORD"]))

## Check the number of nodes (Fastq, Ubam, Vcf, Cram, Crai) and jobs (FQ2U, GATK)

### The number of nodes (Fastq, Ubam, Vcf, Cram, Crai) 

In [127]:
# Count, per sample, the nodes of each type whose `sample` property matches a
# (:Json:FromPersonalis) node. Every query yields a frame with the columns
# 'sample' and one count column.
# 'sample' is intentionally kept as a regular column (not the index): the merge cell
# joins the frames on that column. The former `set_index('sample')` calls discarded
# their result, so they (and the `print("")` that hid their display) were removed.

# Fastq
query = "Match (j:Json:FromPersonalis), (f:Fastq) WHERE f.sample = j.sample RETURN j.sample AS sample, count(f) AS fastq"
num_fastq = graph.run(query).to_data_frame()

# Ubam
query = "Match (j:Json:FromPersonalis), (u:Ubam) WHERE u.sample = j.sample RETURN DISTINCT j.sample AS sample, count(u) AS ubam"
num_ubam = graph.run(query).to_data_frame()

# Vcf
query = "Match (j:Json:FromPersonalis), (v:Merged:Vcf) WHERE v.sample = j.sample RETURN DISTINCT j.sample AS sample, count(v) AS vcf"
num_vcf = graph.run(query).to_data_frame()

# Cram
query = "Match (j:Json:FromPersonalis), (cm:Cram) WHERE cm.sample = j.sample RETURN DISTINCT j.sample AS sample, count(cm) AS cram"
num_cram = graph.run(query).to_data_frame()

# Crai
query = "Match (j:Json:FromPersonalis), (ci:Crai) WHERE ci.sample = j.sample RETURN DISTINCT j.sample AS sample, count(ci) AS crai"
num_crai = graph.run(query).to_data_frame()




### The number of jobs (FQ2U, GATK) 

In [128]:
# Count, per sample, the jobs of each kind whose `sample` property matches a
# (:Json:FromPersonalis) node. 'sample' stays a regular column because the merge cell
# joins on it; the former `set_index('sample')` calls discarded their result, so they
# (and the `print("")` that hid their display) were removed.

# Fq2u: Dsub jobs named 'fastq-to-ubam'
query = "Match (j:Json:FromPersonalis), (e:Job:Dsub {name:'fastq-to-ubam'}) WHERE e.sample = j.sample RETURN j.sample AS sample, count(e) AS fq2u"
num_fq2u = graph.run(query).to_data_frame()

# Gatk: Cromwell jobs
query = "Match (j:Json:FromPersonalis), (g:Job:Cromwell) WHERE g.sample = j.sample RETURN j.sample AS sample, count(g) AS gatk"
num_gatk = graph.run(query).to_data_frame()




### Merge all node and job dfs to one df

In [129]:
# Combine the per-sample node and job counts into one frame, one row per sample.
# The join key is spelled out (on='sample') instead of relying on "all shared columns",
# and an outer join keeps samples that are missing from some of the count frames.
columnlist = ["sample", "fastq", "ubam", "vcf", "cram", "crai", "fq2u", "gatk"]

sample_qc_df = num_fastq
for counts in (num_ubam, num_vcf, num_cram, num_crai, num_fq2u, num_gatk):
    sample_qc_df = sample_qc_df.merge(counts, on='sample', how='outer')

numsample = len(sample_qc_df)
print("The number of samples : " + str(numsample) + "\n")

# A count that is absent after the outer join means "no such node/job", i.e. 0.
# fillna returns a new frame; the previous inplace fill on a column subset could
# trigger pandas' SettingWithCopyWarning.
sample_qc_df = sample_qc_df[columnlist].fillna(0)

The number of samples : 1330



### Extract issued samples (ideal)

In [130]:
# Ideal test on nodes and jobs: a sample passes only when it has exactly fastq/2 ubam
# nodes and fq2u jobs, and exactly one vcf, cram, crai node and gatk job.
# (presumably fastq/2 because each ubam is built from a pair of fastq files — TODO confirm)
ideal_half_fastq = sample_qc_df['fastq'] / 2
ideal_fail_mask = (
    sample_qc_df['ubam'].ne(ideal_half_fastq)
    | sample_qc_df['vcf'].ne(1)
    | sample_qc_df['cram'].ne(1)
    | sample_qc_df['crai'].ne(1)
    | sample_qc_df['fq2u'].ne(ideal_half_fastq)
    | sample_qc_df['gatk'].ne(1)
)
issued_sample_qc_df = sample_qc_df[ideal_fail_mask]
numissuedsample = len(issued_sample_qc_df)

print("The number of issued samples : " + str(numissuedsample) + "\n")
print("Success Rate : " + str((numsample - numissuedsample) / numsample) + "\n")

The number of issued samples : 45

Success Rate : 0.9661654135338346



### Extract issued samples (except the duplicated jobs)

In [131]:
# Same test as the ideal one, except that extra (duplicated) jobs are tolerated:
# node counts must still match exactly, while job counts only fail when too low.
dupj_half_fastq = sample_qc_df['fastq'] / 2
dupj_fail_mask = (
    sample_qc_df['ubam'].ne(dupj_half_fastq)
    | sample_qc_df['vcf'].ne(1)
    | sample_qc_df['cram'].ne(1)
    | sample_qc_df['crai'].ne(1)
    | sample_qc_df['fq2u'].lt(dupj_half_fastq)
    | sample_qc_df['gatk'].lt(1)
)
issued_sample_dupj_qc_df = sample_qc_df[dupj_fail_mask]
numissueddupjsample = len(issued_sample_dupj_qc_df)

print("The number of issued samples except the duplicated jobs : " + str(numissueddupjsample) + "\n")
print("Success Rate including the duplicated jobs : " + str((numsample - numissueddupjsample) / numsample) + "\n")

The number of issued samples except the duplicated jobs : 30

Success Rate including the duplicated jobs : 0.9774436090225563



### Extract issued samples (except the duplicated nodes and jobs)

In [132]:
# Duplicates-tolerated test: a sample is flagged only when something is MISSING
# (fewer nodes or jobs than expected); extra nodes or jobs are not treated as issues.
fastq_pairs = sample_qc_df['fastq'] / 2
missing_mask = (
    (sample_qc_df['ubam'] < fastq_pairs)
    | (sample_qc_df['vcf'] < 1)
    | (sample_qc_df['cram'] < 1)
    | (sample_qc_df['crai'] < 1)
    | (sample_qc_df['fq2u'] < fastq_pairs)
    | (sample_qc_df['gatk'] < 1)
)
# .copy() gives an independent frame, so adding the 'issue' column below does not
# write into a slice of sample_qc_df (avoids pandas' SettingWithCopyWarning).
issued_sample_dup_qc_df = sample_qc_df[missing_mask].copy()
numissueddupsample = len(issued_sample_dup_qc_df)

issued_pairs = issued_sample_dup_qc_df['fastq'] / 2
gatk_output_missing = (
    (issued_sample_dup_qc_df['vcf'] < 1)
    | (issued_sample_dup_qc_df['cram'] < 1)
    | (issued_sample_dup_qc_df['crai'] < 1)
)
ubam_missing = issued_sample_dup_qc_df['ubam'] < issued_pairs

## keep this order ----
# Later assignments overwrite earlier ones, so an fq2u label wins over a gatk label.
# Job counts are compared with >= / < (not ==) so that samples with duplicated jobs
# still receive a label, in line with the duplicates-tolerated filter above.
issued_sample_dup_qc_df.loc[gatk_output_missing & (issued_sample_dup_qc_df['gatk'] >= 1), 'issue'] = "failed gatk"
issued_sample_dup_qc_df.loc[gatk_output_missing & (issued_sample_dup_qc_df['gatk'] < 1), 'issue'] = "no gatk"

issued_sample_dup_qc_df.loc[ubam_missing & (issued_sample_dup_qc_df['fq2u'] >= issued_pairs), 'issue'] = "failed fq2u"
issued_sample_dup_qc_df.loc[ubam_missing & (issued_sample_dup_qc_df['fq2u'] < issued_pairs), 'issue'] = "no fq2u"
## --------------------

print("The number of issued samples except duplication issues : " + str(numissueddupsample) + "\n")
print("Success Rate including the duplicated jobs and notes: " + str((numsample-numissueddupsample)/numsample) + "\n")

print("The number of samples with faied fq2u jobs : " + str(len(issued_sample_dup_qc_df[issued_sample_dup_qc_df['issue']=='failed fq2u'])))
print("The number of samples without fq2u jobs : " + str(len(issued_sample_dup_qc_df[issued_sample_dup_qc_df['issue']=='no fq2u'])))
print("The number of samples with faied gatk jobs : " + str(len(issued_sample_dup_qc_df[issued_sample_dup_qc_df['issue']=='failed gatk'])))
print("The number of samples without gatk jobs : " + str(len(issued_sample_dup_qc_df[issued_sample_dup_qc_df['issue']=='no gatk'])))

issued_sample_dup_qc_df.head(25)

The number of issued samples except duplication issues : 22

Success Rate including the duplicated jobs and notes: 0.9834586466165414

The number of samples with faied fq2u jobs : 15
The number of samples without fq2u jobs : 0
The number of samples with faied gatk jobs : 4
The number of samples without gatk jobs : 3


Unnamed: 0,sample,fastq,ubam,vcf,cram,crai,fq2u,gatk,issue
92,SHIP5119443,8,3,0,0,0,4,0,failed fq2u
175,SHIP5141860,8,3,0,0,0,4,0,failed fq2u
210,SHIP5141905,8,4,0,1,1,4,1,failed gatk
226,SHIP5119492,8,4,0,0,0,4,0,no gatk
233,SHIP5141891,12,6,0,0,0,6,0,no gatk
272,SHIP5142426,8,3,0,0,0,4,0,failed fq2u
392,SHIP5132721,8,3,0,0,0,4,0,failed fq2u
405,SHIP5142623,8,3,0,0,0,4,0,failed fq2u
430,SHIP5142647,8,3,0,0,0,4,0,failed fq2u
457,SHIP5165752,8,4,0,0,0,4,1,failed gatk


## Check the log list and the dstat message of issued jobs

### Failed Fq2u

In [145]:
# Samples labelled "failed fq2u" by the duplicates-tolerated test.
failed_fq2u = issued_sample_dup_qc_df[issued_sample_dup_qc_df['issue'] == "failed fq2u"]
failed_fq2u_sample = failed_fq2u['sample'].to_list()

# Non-SUCCESS status records of the Dsub jobs of those samples. The sample list is
# passed as a Cypher parameter ($samples) instead of being formatted into the query
# text, so quoting of the sample names cannot break or alter the query.
query = (
    "Match (j:Job:Dsub)-[:STATUS]->(s:Dstat) "
    "WHERE j.sample IN $samples and not s.status='SUCCESS' "
    "RETURN j.sample AS sample, j.logging As log, s.status AS status, s.statusMessage AS Message"
)
failed_fq2u_dstat = graph.run(query, samples=failed_fq2u_sample).to_data_frame()
# The label used to say "duplicated fq2u jobs", which is not what this cell counts.
print("The number of non-successful fq2u job status records : " + str(len(failed_fq2u_dstat)))
display(failed_fq2u_dstat.head(3))

# How often each status message occurs.
failed_fq2u_dstat.groupby('Message').count()

The number of duplicated fq2u jobs : 15


Unnamed: 0,Message,log,sample,status
0,to gbsc-gcp-project-mvp-test-from-personalis-...,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5119443,FAILURE
1,ServiceException: 401 Anonymous caller does no...,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5141860,FAILURE
2,ServiceException: 401 Anonymous caller does no...,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5142426,FAILURE


Unnamed: 0_level_0,log,sample,status
Message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\nServiceException: 401 Anonymous caller does not have storage.objects.list access to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\nServiceException: 401 Anonymous caller does not have storage.objects.list access to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\nServiceException: 401 Anonymous caller does not have storage.objects.list access to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\nServiceException: 401 Anonymous caller does not have storage.objects.list access to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\nServiceException: 401 Anonymous caller does not have storage.objects.list access to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\nServiceException: 401 Anonymous caller does not have storage.objects.list access to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\nServiceException: 401 Anonymous caller does not have storage.objects.list access to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\n,1,1,1
ServiceException: 401 Anonymous caller does not have storage.objects.list access to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\nServiceException: 401 Anonymous caller does not have storage.objects.list access to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\nServiceException: 401 Anonymous caller does not have storage.objects.list access to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\n,14,14,14


### Failed GATK

In [134]:
# Samples labelled "failed gatk" by the duplicates-tolerated test.
failed_gatk = issued_sample_dup_qc_df[issued_sample_dup_qc_df['issue'] == "failed gatk"]
failed_gatk_sample = failed_gatk['sample'].to_list()

# Non-SUCCESS status records of the Cromwell jobs of those samples. The sample list is
# passed as a Cypher parameter ($samples) instead of being formatted into the query
# text, so quoting of the sample names cannot break or alter the query.
query = (
    "Match (j:Job:Cromwell)-[:STATUS]->(s:Dstat) "
    "WHERE j.sample IN $samples and not s.status='SUCCESS' "
    "RETURN j.sample AS sample, j.logging As log, s.status AS status, s.statusMessage AS Message"
)
failed_gatk_dstat = graph.run(query, samples=failed_gatk_sample).to_data_frame()
# The label used to say "duplicated gatk jobs", which is not what this cell counts.
print("The number of non-successful gatk job status records : " + str(len(failed_gatk_dstat)))
display(failed_gatk_dstat.head(3))

# How often each status message occurs.
failed_gatk_dstat.groupby('Message').count()

The number of duplicated gatk jobs : 4


Unnamed: 0,Message,log,sample,status
0,The assigned worker has failed to complete the...,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5141905,FAILURE
1,Workflow 467872d3-5a11-4f6d-a25d-f9f90e61b72d ...,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5165752,FAILURE
2,Workflow e75a10d0-1e1c-4523-840b-982cea8e506c ...,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5153220,FAILURE


Unnamed: 0_level_0,log,sample,status
Message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
The assigned worker has failed to complete the operation,1,1,1
Workflow 467872d3-5a11-4f6d-a25d-f9f90e61b72d transitioned to state Failed\n,1,1,1
Workflow 95c7e923-6249-4b25-a269-05984f4069af transitioned to state Failed\n,1,1,1
Workflow e75a10d0-1e1c-4523-840b-982cea8e506c transitioned to state Failed\n,1,1,1


### Duplicated Jobs (GATK)

In [135]:
# Samples whose node counts and fq2u job count are exactly as expected but that have
# more than one gatk (Cromwell) job.
dupjob_sample_df = sample_qc_df[(sample_qc_df['ubam']==sample_qc_df['fastq']/2)&(sample_qc_df['vcf']==1)&(sample_qc_df['cram']==1) \
                                   &(sample_qc_df['crai']==1)&(sample_qc_df['fq2u']==sample_qc_df['fastq']/2)&(sample_qc_df['gatk']>1)]
dupjob_sample = dupjob_sample_df['sample'].to_list()

# Non-SUCCESS status records of the Cromwell jobs of those samples. The sample list is
# passed as a Cypher parameter ($samples) instead of being formatted into the query text.
query = (
    "Match (j:Job:Cromwell)-[:STATUS]->(s:Dstat) "
    "WHERE j.sample IN $samples and not s.status='SUCCESS' "
    "RETURN j.sample AS sample, j.logging As log, s.status AS status, s.statusMessage AS Message"
)
dupjob_dstat = graph.run(query, samples=dupjob_sample).to_data_frame()
# The label used to say "fq2u" although this cell inspects gatk (Cromwell) jobs.
print("The number of non-successful status records of duplicated gatk jobs : " + str(len(dupjob_dstat)))
display(dupjob_dstat.head(3))

# How often each (message, status) combination occurs.
dupjob_dstat.groupby(['Message','status']).count()

The number of duplicated fq2u jobs : 1


Unnamed: 0,Message,log,sample,status
0,Started running 'user-command',gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5141855,RUNNING


Unnamed: 0_level_0,Unnamed: 1_level_0,log,sample
Message,status,Unnamed: 2_level_1,Unnamed: 3_level_1
Started running 'user-command',RUNNING,1,1


### Duplicated Jobs (Fq2u)

In [144]:
# Samples whose node counts are exactly as expected and that have at least one gatk
# job, but more fq2u jobs than fastq/2.
dupjob_sample_df = sample_qc_df[(sample_qc_df['ubam']==sample_qc_df['fastq']/2)&(sample_qc_df['vcf']==1)&(sample_qc_df['cram']==1) \
                                   &(sample_qc_df['crai']==1)&(sample_qc_df['fq2u']>sample_qc_df['fastq']/2)&(sample_qc_df['gatk']>0)]
dupjob_sample = dupjob_sample_df['sample'].to_list()

# Non-SUCCESS status records of the Dsub jobs of those samples. The sample list is
# passed as a Cypher parameter ($samples) instead of being formatted into the query text.
query = (
    "Match (j:Job:Dsub)-[:STATUS]->(s:Dstat) "
    "WHERE j.sample IN $samples and not s.status='SUCCESS' "
    "RETURN j.sample AS sample, j.logging As log, s.status AS status, s.statusMessage AS Message"
)
dupjob_dstat = graph.run(query, samples=dupjob_sample).to_data_frame()
# Note: this counts status records (rows), not distinct jobs or samples.
print("The number of duplicated fq2u jobs : " + str(len(dupjob_dstat)))
display(dupjob_sample)
display(dupjob_dstat.head(3))

# How often each (message, status) combination occurs.
dupjob_dstat.groupby(['Message','status']).count()

The number of duplicated fq2u jobs : 46


['SHIP4954744',
 'SHIP4954745',
 'SHIP5142424',
 'SHIP5142433',
 'SHIP5153200',
 'SHIP5183799',
 'SHIP5185263',
 'SHIP5185268',
 'SHIP5185271',
 'SHIP5185278',
 'SHIP5185301',
 'SHIP5185335',
 'SHIP5185350',
 'SHIP5185351']

Unnamed: 0,Message,log,sample,status
0,Stopped running 'prepare',gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP4954745,RUNNING
1,Worker 'google-pipelines-worker-6b640db5e96c87...,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP4954745,RUNNING
2,worker was terminated,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP4954744,FAILURE


Unnamed: 0_level_0,Unnamed: 1_level_0,log,sample
Message,status,Unnamed: 2_level_1,Unnamed: 3_level_1
Started pulling 'gcr.io/gbsc-gcp-project-mvp-test/broadinstitute/gatk:4.1.0.0',RUNNING,13,13
Started running 'localization',RUNNING,15,15
Started running 'prepare',RUNNING,2,2
Stopped pulling 'gcr.io/gbsc-gcp-project-mvp-test/broadinstitute/gatk:4.1.0.0',RUNNING,4,4
Stopped running 'prepare',RUNNING,1,1
Worker 'google-pipelines-worker-115ef3db30e81d0fe0ae9ac02676b21b' assigned in 'us-west1-a',RUNNING,1,1
Worker 'google-pipelines-worker-6b640db5e96c874bc90e6972f27ef0ae' assigned in 'us-west1-c',RUNNING,1,1
Worker 'google-pipelines-worker-882a96720b399174d1205054b74f7557' assigned in 'us-west1-a',RUNNING,1,1
Worker 'google-pipelines-worker-c30f07ea065aebd51e7e2499b3023d6b' assigned in 'us-west1-a',RUNNING,1,1
Worker 'google-pipelines-worker-db80acc6901b7f63ada6885d38937d65' assigned in 'us-west1-c',RUNNING,1,1


### Duplicated Job and Node

In [136]:
# Samples with at least one duplicated node type (more ubam than fastq/2, or more than
# one vcf/cram/crai) that also have enough fq2u jobs or at least one gatk job.
dupjob_sample_df = sample_qc_df[((sample_qc_df['ubam']>sample_qc_df['fastq']/2)|(sample_qc_df['vcf']>1)|(sample_qc_df['cram']>1) \
                                   |(sample_qc_df['crai']>1))&((sample_qc_df['fq2u']>=sample_qc_df['fastq']/2)|(sample_qc_df['gatk']>=1))]
dupjob_sample = dupjob_sample_df['sample'].to_list()

# Non-SUCCESS status records of the Cromwell jobs of those samples. The sample list is
# passed as a Cypher parameter ($samples) instead of being formatted into the query text.
query = (
    "Match (j:Job:Cromwell)-[:STATUS]->(s:Dstat) "
    "WHERE j.sample IN $samples and not s.status='SUCCESS' "
    "RETURN j.sample AS sample, j.logging As log, s.status AS status, s.statusMessage AS Message"
)
dupjob_dstat = graph.run(query, samples=dupjob_sample).to_data_frame()
# The label used to say "duplicated fq2u jobs" although the query targets Cromwell jobs.
print("The number of non-successful gatk job status records in samples with duplicated nodes : " + str(len(dupjob_dstat)))
display(dupjob_dstat.head(3))

# The per-message breakdown is left disabled, presumably because the query returned no
# rows for this dataset and grouping a column-less empty frame raises KeyError.
#dupjob_dstat.groupby(['Message','status']).count()

The number of duplicated fq2u jobs : 0


### Jobs with issues in successful samples

In [138]:
# Samples that pass the ideal test (all node and job counts exactly as expected).
# The variable names are kept as in the other cells of this section.
dupjob_sample_df = sample_qc_df[(sample_qc_df['ubam']==sample_qc_df['fastq']/2)&(sample_qc_df['vcf']==1)&(sample_qc_df['cram']==1) \
                                   &(sample_qc_df['crai']==1)&(sample_qc_df['fq2u']==sample_qc_df['fastq']/2)&(sample_qc_df['gatk']==1)]
dupjob_sample = dupjob_sample_df['sample'].to_list()

# Non-SUCCESS status records of the Dsub jobs of those samples. The sample list is
# passed as a Cypher parameter ($samples) instead of being formatted into the query text.
query = (
    "Match (j:Job:Dsub)-[:STATUS]->(s:Dstat) "
    "WHERE j.sample IN $samples and not s.status='SUCCESS' "
    "RETURN j.sample AS sample, j.logging As log, s.status AS status, s.statusMessage AS Message"
)
dupjob_dstat = graph.run(query, samples=dupjob_sample).to_data_frame()
# The label used to say "duplicated fq2u jobs"; these samples have no duplicated jobs.
print("The number of non-successful fq2u job status records in successful samples : " + str(len(dupjob_dstat)))
display(dupjob_dstat.head(3))

# How often each (message, status) combination occurs.
dupjob_dstat.groupby(['Message','status']).count()

The number of duplicated fq2u jobs : 3


Unnamed: 0,Message,log,sample,status
0,ServiceException: 401 Anonymous caller does no...,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5169288,FAILURE
1,ServiceException: 401 Anonymous caller does no...,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5153375,FAILURE
2,ServiceException: 401 Anonymous caller does no...,gs://gbsc-gcp-project-mvp-test-from-personalis...,SHIP5169413,FAILURE


Unnamed: 0_level_0,Unnamed: 1_level_0,log,sample
Message,status,Unnamed: 2_level_1,Unnamed: 3_level_1
ServiceException: 401 Anonymous caller does not have storage.objects.list access to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\nServiceException: 401 Anonymous caller does not have storage.objects.list access to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\nServiceException: 401 Anonymous caller does not have storage.objects.list access to gbsc-gcp-project-mvp-test-from-personalis-wgs35-logs.\n,FAILURE,3,3


## Check the relationship information of successful samples

### fq2u issued relationship of successful samples

In [139]:
# Samples that pass the ideal test (all node and job counts exactly as expected).
norelation_sample_df = sample_qc_df[(sample_qc_df['ubam']==sample_qc_df['fastq']/2)&(sample_qc_df['vcf']==1)&(sample_qc_df['cram']==1) \
                                   &(sample_qc_df['crai']==1)&(sample_qc_df['fq2u']==sample_qc_df['fastq']/2)&(sample_qc_df['gatk']==1)]
# Fix: the list was previously taken from `dupjob_sample_df`, a variable left over from
# another cell, so the result depended on which cell had been executed last.
norelation_sample = norelation_sample_df['sample'].to_list()
display(len(norelation_sample))

# Dsub jobs that take a Fastq as input but have no OUTPUT relationship to a Ubam,
# restricted to the successful samples. The sample list is passed as a Cypher
# parameter ($samples) instead of being formatted into the query text.
query = (
    "Match (:Fastq)-[:INPUT_TO]->(j:Job:Dsub) "
    "WHERE not (j)-[:OUTPUT]->(:Ubam) and j.sample IN $samples "
    "RETURN distinct j.sample AS sample_no_fq2utoubam"
)
norelation_ubam = graph.run(query, samples=norelation_sample).to_data_frame()
print("The number of samples without the relationship between fq2u and ubam: " + str(len(norelation_ubam)))

display(norelation_ubam.head(3))

1285

The number of samples without the relationship between fq2u and ubam: 3


Unnamed: 0,sample_no_fq2utoubam
0,SHIP5119430
1,SHIP5153310
2,SHIP5185260


### gatk issued relationship of successful samples

In [140]:
# Samples that pass the ideal test (all node and job counts exactly as expected).
norelation_sample_df = sample_qc_df[(sample_qc_df['ubam']==sample_qc_df['fastq']/2)&(sample_qc_df['vcf']==1)&(sample_qc_df['cram']==1) \
                                   &(sample_qc_df['crai']==1)&(sample_qc_df['fq2u']==sample_qc_df['fastq']/2)&(sample_qc_df['gatk']==1)]
# Fix: the list was previously taken from `dupjob_sample_df`, a variable left over from
# another cell, so the result depended on which cell had been executed last.
norelation_sample = norelation_sample_df['sample'].to_list()
display(len(norelation_sample))

# Cromwell jobs of the successful samples that lack a Ubam input or any of the
# Merged Vcf / Cram / Crai outputs.
# Fix: the node label was misspelled "Cromewell" (every other query in this notebook
# uses "Cromwell"), so the pattern could not match the jobs and the check reported 0.
# The sample list is passed as a Cypher parameter ($samples) instead of being
# formatted into the query text.
query = (
    "Match (j:Job:Cromwell) "
    "WHERE (not (:Ubam)-[:INPUT_TO]->(j) or not (j)-[:OUTPUT]->(:Merged:Vcf) "
    "or not (j)-[:OUTPUT]->(:Cram) or not (j)-[:OUTPUT]->(:Crai)) "
    "and (j.sample IN $samples) "
    "RETURN distinct j.sample AS sample_missed_output"
)
norelation_output = graph.run(query, samples=norelation_sample).to_data_frame()
print("The number of samples without the relationship between gatk and outputs: " + str(len(norelation_output)))

display(norelation_output.head(3))

1285

The number of samples without the relationship between gatk and outputs: 0
