Neo4J v054 - Sample Based Analysis
================

## Set up the environment


### Install py2neo for querying Neo4J 

In [1]:
#!pip3 install py2neo

### Import Packages

In [2]:
import os
import subprocess

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from py2neo import Graph

#plt.style.use('fivethirtyeight')
#plt.style.use('ggplot')
def _two_decimals(value):
    """Render a float with exactly two digits after the decimal point."""
    return '%.2f' % value


# Default float rendering for every DataFrame shown in this notebook.
pd.set_option('display.float_format', _two_decimals)

### Load Neo4J DB

In [3]:
# Connection settings are read from the environment so that credentials are
# never stored in the notebook or its version-control history.
#   NEO4J_URI      - bolt URI of the database (default: bolt://10.138.0.53:7687;
#                    bolt://34.82.40.181:7687 was the previously used alternative)
#   NEO4J_USER     - database user (default: "neo4j")
#   NEO4J_PASSWORD - database password (required, no default)
# NOTE(review): the password that used to be hard-coded in this cell should be
# treated as leaked and rotated.
neo4j_uri = os.environ.get("NEO4J_URI", "bolt://10.138.0.53:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD")
if not neo4j_password:
    raise RuntimeError(
        "Set the NEO4J_PASSWORD environment variable before running this notebook."
    )

graph = Graph(neo4j_uri, auth=(neo4j_user, neo4j_password))

## Generate Sample Status Table

### The number of nodes (Fastq, Ubam, Vcf, Cram, Crai) 

In [4]:
def count_nodes_per_sample(node_labels, count_column):
    """Count nodes with `node_labels` for each sample that has a Json:FromPersonalis node.

    Parameters
    ----------
    node_labels : str
        Cypher label expression of the nodes to count, e.g. "Fastq" or "Merged:Vcf".
    count_column : str
        Name of the count column in the returned frame.

    Returns
    -------
    pandas.DataFrame
        A frame with the columns 'sample' and `count_column`.
    """
    query = (
        "MATCH (j:Json:FromPersonalis), (n:{labels}) "
        "WHERE n.sample = j.sample "
        "RETURN j.sample AS sample, count(n) AS {column}"
    ).format(labels=node_labels, column=count_column)
    # An empty query result may come back as a frame without any columns;
    # reindexing keeps the schema stable so the later merge on 'sample' works.
    return graph.run(query).to_data_frame().reindex(columns=["sample", count_column])


# 'sample' is deliberately kept as a regular column (not the index): the merge
# cell further down joins these frames on it.
num_fastq = count_nodes_per_sample("Fastq", "fastq")
num_ubam = count_nodes_per_sample("Ubam", "ubam")
num_vcf = count_nodes_per_sample("Merged:Vcf", "vcf")
num_cram = count_nodes_per_sample("Cram", "cram")
num_crai = count_nodes_per_sample("Crai", "crai")

ServiceUnavailable: Timed out trying to establish connection to ('10.138.0.53', 7687)

### The number of jobs (FQ2U, GATK) 

In [None]:
def count_jobs_per_sample(job_pattern, count_column):
    """Count job nodes matching `job_pattern` for each sample that has a Json:FromPersonalis node.

    Parameters
    ----------
    job_pattern : str
        Cypher labels (and optional property map) of the job nodes to count,
        e.g. "Job:Dsub {name:'fastq-to-ubam'}".
    count_column : str
        Name of the count column in the returned frame.

    Returns
    -------
    pandas.DataFrame
        A frame with the columns 'sample' and `count_column`.
    """
    query = (
        "MATCH (j:Json:FromPersonalis), (job:{pattern}) "
        "WHERE job.sample = j.sample "
        "RETURN j.sample AS sample, count(job) AS {column}"
    ).format(pattern=job_pattern, column=count_column)
    # An empty query result may come back as a frame without any columns;
    # reindexing keeps the schema stable so the later merge on 'sample' works.
    return graph.run(query).to_data_frame().reindex(columns=["sample", count_column])


# 'sample' stays a regular column because the merge cell joins on it.
num_fq2u = count_jobs_per_sample("Job:Dsub {name:'fastq-to-ubam'}", "fq2u")
num_gatk = count_jobs_per_sample("Job:CromwellWorkflow", "gatk")

### Merge all node and job dfs to one df

In [None]:
# Per-sample count frames to combine: node counts first, then job counts.
# (For a nodes-only table, drop num_fq2u / num_gatk here and "fq2u" / "gatk"
# from columnlist.)
count_frames = [num_fastq, num_ubam, num_vcf, num_cram, num_crai, num_fq2u, num_gatk]
columnlist = ["sample", "fastq", "ubam", "vcf", "cram", "crai", "fq2u", "gatk"]

# Outer-merge on the explicit key so that a sample missing from any single
# query is still kept in the table.
sample_qc_df = count_frames[0]
for count_frame in count_frames[1:]:
    sample_qc_df = sample_qc_df.merge(count_frame, on="sample", how="outer")

numsample = len(sample_qc_df)
print("The number of samples : " + str(numsample) + "\n")

# A count that is missing after the outer merge means the query returned no
# row for that sample, so it is recorded as 0.
sample_qc_df = sample_qc_df[columnlist].fillna(0)

### Classification based on sample status and success.

In [None]:
# Shorthand for the count columns that every rule below compares.
ubam = sample_qc_df['ubam']
vcf = sample_qc_df['vcf']
cram = sample_qc_df['cram']
crai = sample_qc_df['crai']
fq2u = sample_qc_df['fq2u']
gatk = sample_qc_df['gatk']
# Reference value the ubam and fq2u counts are compared against.
half_fastq = sample_qc_df['fastq'] / 2

# Exactly the expected number of ubams and one each of vcf / cram / crai.
outputs_exact = (ubam == half_fastq) & (vcf == 1) & (cram == 1) & (crai == 1)
# No vcf / cram / crai at all.
no_gatk_outputs = (vcf == 0) & (cram == 0) & (crai == 0)

##-- Passed
passed_rules = [
    #- 1. Successful jobs
    ("success", outputs_exact & (fq2u == half_fastq) & (gatk == 1)),
    #- 2. Duplicated jobs
    ("duplicated jobs", outputs_exact & ((fq2u > half_fastq) | (gatk > 1))),
    #- 3. Duplicated jobs and nodes
    ("duplicated nodes",
     ((ubam > half_fastq) | (vcf > 1) | (cram > 1) | (crai > 1))
     & ((fq2u >= half_fastq) | (gatk >= 1))),
]
for status_label, rule_mask in passed_rules:
    sample_qc_df.loc[rule_mask, 'status'] = status_label
#- Success
sample_qc_df.loc[sample_qc_df['status'].isin(["success", "duplicated jobs", "duplicated nodes"]), 'success'] = "pass"

##-- failed
failed_rules = [
    #- 4. failed fq2u jobs
    ("failed fq2u",
     ((ubam < half_fastq) & no_gatk_outputs)
     & ((fq2u >= half_fastq) | (gatk == 0))),
    #- 5. failed gatk jobs
    ("failed gatk",
     (ubam == half_fastq)
     & ((vcf < 1) | (cram < 1) | (crai < 1))
     & ((fq2u >= half_fastq) & (gatk >= 1))),
    #- 6. no gatk jobs
    ("no gatk",
     ((ubam == half_fastq) & no_gatk_outputs)
     & ((fq2u >= half_fastq) & (gatk == 0))),
]
for status_label, rule_mask in failed_rules:
    sample_qc_df.loc[rule_mask, 'status'] = status_label
#- Fail
sample_qc_df.loc[sample_qc_df['status'].isin(["failed fq2u", "failed gatk", "no gatk"]), 'success'] = "fail"

##-- Check unclassified samples (rows that matched none of the six rules).
unclassified_df = sample_qc_df[sample_qc_df['status'].isna()]
num_unclassified = len(unclassified_df)
print("The number of unclassified samples : " + str(num_unclassified)+"\n")

if num_unclassified != 0:
    display(unclassified_df)
    

In [None]:
##-- Show a few example rows for every status class
pd.set_option('display.float_format', lambda x: '%.f' % x)

# (status label, number of rows to show)
preview_plan = [
    ('success', 2),
    ('duplicated jobs', 10),
    ('duplicated nodes', 20),
    ('failed fq2u', 2),
    ('no gatk', 2),
    ('failed gatk', 2),
]
for status_label, row_limit in preview_plan:
    display(sample_qc_df[sample_qc_df['status'] == status_label].head(row_limit))

## Generate Status Table

### The number of samples by status and success

In [None]:
# Fixed row order of the summary table: passing classes first, then failing ones.
passed_statuses = ['success', 'duplicated jobs', 'duplicated nodes']
failed_statuses = ['failed fq2u', 'no gatk', 'failed gatk']

# Name the count column explicitly: pandas >= 2.0 calls the value_counts()
# result 'count', earlier versions call it after the source column.
stat_status_qc = sample_qc_df['status'].value_counts().to_frame(name='status')
stat_status_qc['rate'] = 100 * (stat_status_qc['status'] / numsample)

# Statuses that never occur are shown with a count and rate of 0.
stat_status_qc = stat_status_qc.reindex(index=passed_statuses + failed_statuses).fillna(0)
display(stat_status_qc)

pd.set_option('display.float_format', lambda x: '%.2f' % x)
# Sum by label rather than by row position so the totals do not depend on row order.
print("Success Rate : " + str(stat_status_qc.loc[passed_statuses, 'rate'].sum()) + "%")
print("Failed Rate : " + str(stat_status_qc.loc[failed_statuses, 'rate'].sum()) + "%")

## Check the relationship information and dstat message of successful samples

### fq2u issued relationship of successful samples

In [None]:
success_sample_df = sample_qc_df[sample_qc_df['status']=='success']
print("The number of success samples: " + str(len(success_sample_df)))
success_sample = success_sample_df['sample'].to_list()

# Dsub jobs that take a Fastq as input but have no OUTPUT relationship to a Ubam.
# The sample list is passed as a query parameter instead of being formatted
# into the Cypher text, so sample names never need quoting or escaping.
query = (
    "MATCH (:Fastq)-[:INPUT_TO]->(j:Job:Dsub) "
    "WHERE NOT (j)-[:OUTPUT]->(:Ubam) AND j.sample IN $samples "
    "RETURN DISTINCT j.sample AS sample_no_fq2utoubam"
)
norelation_ubam = graph.run(query, samples=success_sample).to_data_frame()
print("The number of samples without the relationship between fq2u and ubam: " + str(len(norelation_ubam)))
print("\n")

display(norelation_ubam.head(3))

### gatk issued relationship of successful samples

In [None]:
# Reuses `success_sample` (list of successful sample names) from the previous cell.

# GATK jobs of successful samples that lack the Ubam input relationship or any
# of the Vcf / Cram / Crai output relationships.
# The job label was misspelled 'Cromewell', which can never match; it now uses
# 'CromwellWorkflow', the label the GATK job-count query in this notebook uses.
# NOTE(review): confirm the label against the database schema.
query = (
    "MATCH (j:Job:CromwellWorkflow) "
    "WHERE (NOT (:Ubam)-[:INPUT_TO]->(j) "
    "OR NOT (j)-[:OUTPUT]->(:Merged:Vcf) "
    "OR NOT (j)-[:OUTPUT]->(:Cram) "
    "OR NOT (j)-[:OUTPUT]->(:Crai)) "
    "AND (j.sample IN $samples) "
    "RETURN DISTINCT j.sample AS sample_missed_output"
)
norelation_output = graph.run(query, samples=success_sample).to_data_frame()
print("The number of samples without the relationship between gatk and outputs: " + str(len(norelation_output)))

display(norelation_output.head(3))

### Issued fq2u dstat message in successful samples

In [None]:
# Reuses `success_sample` (list of successful sample names) from an earlier cell.

# Dsub jobs of successful samples whose Dstat status is anything but 'SUCCESS'.
# The sample list is passed as a query parameter rather than formatted into the text.
query = (
    "MATCH (j:Job:Dsub)-[:STATUS]->(s:Dstat) "
    "WHERE j.sample IN $samples AND NOT s.status='SUCCESS' "
    "RETURN j.sample AS sample, j.readGroup AS RG, j.logging AS log, "
    "s.status AS status, s.statusMessage AS Message"
)
issued_fq2u_dstat = graph.run(query, samples=success_sample).to_data_frame()
print("The number of samples with 'FAILURE' as dstat message  : " + str(len(issued_fq2u_dstat)))

if len(issued_fq2u_dstat):
    ## Issued Sample List
    display(issued_fq2u_dstat.head(3))

    ## Classified by Message
    display(issued_fq2u_dstat.groupby(['Message','status']).count())

### Issued gatk dstat message in successful samples

In [None]:
# Reuses `success_sample` (list of successful sample names) from an earlier cell.

# GATK jobs of successful samples whose Dstat status is anything but 'SUCCESS'.
# NOTE(review): this query used the label 'Cromwell' while every other GATK
# query in this notebook uses 'CromwellWorkflow'; it is aligned with the
# others here - confirm against the database schema.
query = (
    "MATCH (j:Job:CromwellWorkflow)-[:STATUS]->(s:Dstat) "
    "WHERE j.sample IN $samples AND NOT s.status='SUCCESS' "
    "RETURN j.sample AS sample, j.logging AS log, "
    "s.status AS status, s.statusMessage AS Message"
)
issued_gatk_dstat = graph.run(query, samples=success_sample).to_data_frame()
print("The number of samples with 'FAILURE' as dstat message  : " + str(len(issued_gatk_dstat)))


if len(issued_gatk_dstat):
    ## Issued Sample List
    display(issued_gatk_dstat.head(3))

    ## Classified by Message
    display(issued_gatk_dstat.groupby(['Message','status']).count())
    

### Add relationship information and issued dstat message

In [None]:
pd.set_option('display.float_format', lambda x: '%.f' % x)

# One row per successful sample; the columns below are added per sample.
success_re_dstat_df = success_sample_df[['sample']].copy()

## Flag whether the input/output relationships of the fq2u and gatk jobs are complete.
# (flag column, frame listing affected samples, its column, message when nothing was found)
relation_checks = [
    ('re_fq2u', norelation_ubam, 'sample_no_fq2utoubam',
     "No missing input and output connections of fq2u jobs"),
    ('re_gatk', norelation_output, 'sample_missed_output',
     "No missing input and output connections of gatk jobs"),
]
for flag_column, flagged_df, flagged_column, nothing_missing_msg in relation_checks:
    if flagged_column in flagged_df.columns:
        is_missing = success_re_dstat_df['sample'].isin(flagged_df[flagged_column])
    else:
        # The frame has no such column (e.g. the query returned no rows),
        # so no sample is flagged.
        print(nothing_missing_msg)
        is_missing = pd.Series(False, index=success_re_dstat_df.index)
    success_re_dstat_df[flag_column] = np.where(is_missing, "missing", "all")

## Collect the non-SUCCESS dstat messages of fq2u and gatk jobs in one common layout.
dstat_frames = []
if len(issued_fq2u_dstat):
    dstat_frames.append(
        issued_fq2u_dstat[['sample', 'RG', 'Message', 'log']]
        .rename(columns={'RG': 'dstat_job', 'Message': 'dstat_msg'})
    )
else:
    print("No issued dstat message of fq2u jobs")

if len(issued_gatk_dstat):
    dstat_frames.append(
        issued_gatk_dstat[['sample', 'Message', 'log']]
        .rename(columns={'Message': 'dstat_msg'})
        .assign(dstat_job="GATK")
    )
else:
    print("No issued dstat message of gatk jobs")

if dstat_frames:
    # Stack both sources before merging: merging them one after the other
    # would yield suffixed duplicates (dstat_msg_x / dstat_msg_y, ...).
    issued_dstat_df = pd.concat(dstat_frames, ignore_index=True)[
        ['sample', 'dstat_job', 'dstat_msg', 'log']
    ]
    success_re_dstat_df = success_re_dstat_df.merge(issued_dstat_df, on='sample', how='left')

display(success_re_dstat_df.head(3))

if len(norelation_ubam):
    display(success_re_dstat_df[success_re_dstat_df['sample'].isin(norelation_ubam['sample_no_fq2utoubam'])])
if len(issued_fq2u_dstat):
    display(success_re_dstat_df[success_re_dstat_df['sample'].isin(issued_fq2u_dstat['sample'])])

## Attach the new columns to the main QC table.
# Columns (and row copies) left behind by a previous run of this cell are
# removed first, so running the cell again does not pile up *_x / *_y
# columns or multiply rows.
added_columns = [col for col in success_re_dstat_df.columns if col != 'sample']
sample_qc_df = (
    sample_qc_df
    .drop(columns=added_columns, errors='ignore')
    .drop_duplicates()
    .merge(success_re_dstat_df, on='sample', how='left')
)
display(sample_qc_df.head(3))

if len(norelation_ubam):
    display(sample_qc_df[sample_qc_df['sample'].isin(norelation_ubam['sample_no_fq2utoubam'])])
if len(issued_fq2u_dstat):
    display(sample_qc_df[sample_qc_df['sample'].isin(issued_fq2u_dstat['sample'])])

## Check dstat message of samples with duplicated jobs and nodes

### Duplicated Jobs (Fq2u)

In [None]:
dupjobs_sample_df = sample_qc_df[sample_qc_df['status']=='duplicated jobs']
print("The number of success samples with duplicated jobs: " + str(len(dupjobs_sample_df)))
dupjobs_sample = dupjobs_sample_df['sample'].to_list()

# Dsub jobs of these samples whose Dstat status is anything but 'SUCCESS'.
# The sample list is passed as a query parameter rather than formatted into the text.
query = (
    "MATCH (j:Job:Dsub)-[:STATUS]->(s:Dstat) "
    "WHERE j.sample IN $samples AND NOT s.status='SUCCESS' "
    "RETURN j.sample AS sample, j.logging AS log, "
    "s.status AS status, s.statusMessage AS Message"
)
dupfq2u_dstat = graph.run(query, samples=dupjobs_sample).to_data_frame()
print("The number of duplicated fq2u jobs : " + str(len(dupfq2u_dstat)))

## Issued Sample and Job List
display(dupfq2u_dstat.head())

## Classified by Message
# Skipped when the query returned nothing: the grouping columns would be absent.
if len(dupfq2u_dstat):
    display(dupfq2u_dstat.groupby(['Message','status']).count())

### Duplicated Jobs (GATK)

In [None]:
# Reuses `dupjobs_sample_df` / `dupjobs_sample` from the previous cell.
print("The number of success samples with duplicated jobs: " + str(len(dupjobs_sample_df)))

# CromwellWorkflow jobs of these samples whose Dstat status is anything but 'SUCCESS'.
# The sample list is passed as a query parameter rather than formatted into the text.
query = (
    "MATCH (j:Job:CromwellWorkflow)-[:STATUS]->(s:Dstat) "
    "WHERE j.sample IN $samples AND NOT s.status='SUCCESS' "
    "RETURN j.sample AS sample, j.logging AS log, "
    "s.status AS status, s.statusMessage AS Message"
)
dupgatk_dstat = graph.run(query, samples=dupjobs_sample).to_data_frame()
print("The number of duplicated gatk jobs : " + str(len(dupgatk_dstat)))
display(dupgatk_dstat.head(3))

## Classified by Message
# Skipped when the query returned nothing: the grouping columns would be absent.
if len(dupgatk_dstat):
    display(dupgatk_dstat.groupby(['Message','status']).count())

### Duplicated Job and Node

In [None]:
dupnodes_sample_df = sample_qc_df[sample_qc_df['status']=='duplicated nodes']
print("The number of success samples with duplicated nodes: " + str(len(dupnodes_sample_df)))
dupnodes_sample = dupnodes_sample_df['sample'].to_list()

display(dupnodes_sample_df)

# CromwellWorkflow jobs of these samples whose Dstat status is anything but 'SUCCESS'.
# The sample list is passed as a query parameter rather than formatted into the text.
query = (
    "MATCH (j:Job:CromwellWorkflow)-[:STATUS]->(s:Dstat) "
    "WHERE j.sample IN $samples AND NOT s.status='SUCCESS' "
    "RETURN j.sample AS sample, j.logging AS log, "
    "s.status AS status, s.statusMessage AS Message"
)
dupnodes_dstat = graph.run(query, samples=dupnodes_sample).to_data_frame()
print("The number of duplicated nodes: " + str(len(dupnodes_dstat)))
display(dupnodes_dstat.head(3))

## Classified by Message
# Skipped when the query returned nothing: the grouping columns would be absent.
if len(dupnodes_dstat):
    display(dupnodes_dstat.groupby(['Message','status']).count())

## Check the log list and the dstat message of issued jobs

### Failed Fq2u

In [None]:
failed_fq2u_sample_df = sample_qc_df[sample_qc_df['status']=='failed fq2u']
print("The number of samples with failed fq2u jobs: " + str(len(failed_fq2u_sample_df)))
failed_fq2u_sample = failed_fq2u_sample_df['sample'].to_list()

# Dsub jobs of these samples whose Dstat status is anything but 'SUCCESS'.
# The sample list is passed as a query parameter rather than formatted into the text.
query = (
    "MATCH (j:Job:Dsub)-[:STATUS]->(s:Dstat) "
    "WHERE j.sample IN $samples AND NOT s.status='SUCCESS' "
    "RETURN j.sample AS sample, j.readGroup AS RG, j.logging AS log, "
    "s.status AS status, s.statusMessage AS Message"
)
failed_fq2u_dstat = graph.run(query, samples=failed_fq2u_sample).to_data_frame()
print("The number of failed fq2u jobs : " + str(len(failed_fq2u_dstat)))
display(failed_fq2u_dstat.head(3))

## Classified by Message
if len(failed_fq2u_dstat):
    display(failed_fq2u_dstat.groupby('Message').count())
  


### Failed GATK

In [None]:
failed_gatk_sample_df = sample_qc_df[sample_qc_df['status']=='failed gatk']
print("The number of samples with failed gatk jobs: " + str(len(failed_gatk_sample_df)))
failed_gatk_sample = failed_gatk_sample_df['sample'].to_list()

# CromwellWorkflow jobs of these samples whose Dstat status is anything but 'SUCCESS'.
# The sample list is passed as a query parameter rather than formatted into the text.
query = (
    "MATCH (j:Job:CromwellWorkflow)-[:STATUS]->(s:Dstat) "
    "WHERE j.sample IN $samples AND NOT s.status='SUCCESS' "
    "RETURN j.sample AS sample, j.logging AS log, "
    "s.status AS status, s.statusMessage AS Message"
)
failed_gatk_dstat = graph.run(query, samples=failed_gatk_sample).to_data_frame()
# The message used to say "duplicated gatk jobs" (copied from the duplicated-jobs cell).
print("The number of failed gatk jobs : " + str(len(failed_gatk_dstat)))
display(failed_gatk_dstat.head(3))

## Classified by Message
if len(failed_gatk_dstat):
    display(failed_gatk_dstat.groupby('Message').count())

### Add failed dstat message

In [None]:
pd.set_option('display.float_format', lambda x: '%.f' % x)

# Column layout shared by the fq2u and gatk failure rows.
failed_dstat_columns = ['sample', 'dstat_job', 'dstat_msg', 'log']

failed_dstat_frames = []
if len(failed_fq2u_dstat):
    # fq2u rows: the read group identifies the job.
    failed_dstat_frames.append(
        failed_fq2u_dstat[['sample', 'RG', 'Message', 'log']]
        .rename(columns={'RG': 'dstat_job', 'Message': 'dstat_msg'})
    )
if len(failed_gatk_dstat):
    # gatk rows carry no read group; they are labelled "GATK", as in the
    # table built for the successful samples.
    # (This branch used to re-read failed_fq2u_dstat, so fq2u rows appeared
    # twice and gatk failures were never included.)
    failed_dstat_frames.append(
        failed_gatk_dstat[['sample', 'Message', 'log']]
        .rename(columns={'Message': 'dstat_msg'})
        .assign(dstat_job="GATK")
    )

if failed_dstat_frames:
    failed_re_dstat_df = pd.concat(failed_dstat_frames, ignore_index=True)[failed_dstat_columns]
else:
    failed_re_dstat_df = pd.DataFrame(columns=failed_dstat_columns)

# TODO: add relationship flags (re_fq2u / re_gatk) for failed samples, mirroring
# the table built for the successful samples.

display(failed_re_dstat_df)