Neo4J v057 test1 - Trellis : Job based Analysis
================

## Set up the environment


### Install py2neo for querying Neo4J 

In [3]:
#!pip3 install -U py2neo

# Add the py2neo install location to the system Python path

#!pip3 install -U neotime
#!pip3 install -U neobolt

### Import Packages

In [4]:
from py2neo import Graph
from google.cloud import storage
import yaml

import pandas as pd
import numpy as np
import subprocess
import matplotlib 
import matplotlib.pyplot as plt
import seaborn as sns

#plt.style.use('fivethirtyeight')
#plt.style.use('ggplot')
# Render every float in displayed frames with two decimal places.
pd.options.display.float_format = lambda value: '%.2f' % value

### Load Neo4J DB

In [5]:
## Option 1 : Read DB and Account Information in Google Storage (YAML)

# NOTE(review): the bucket and blob names below contain a literal '{env}' that is never
# substituted in this cell -- presumably a placeholder for the environment name; confirm.

# create storage client
storage_client = storage.Client()
# get bucket with name
bucket = storage_client.get_bucket('gbsc-gcp-project-mvp-{env}-trellis')
# get bucket data as blob
blob = bucket.get_blob('credentials/{env}-wgs35.yaml')
# get_blob() returns None when the object does not exist; fail here with a clear message
# instead of an AttributeError on the download call below.
if blob is None:
    raise FileNotFoundError("Credentials blob 'credentials/{env}-wgs35.yaml' not found in bucket 'gbsc-gcp-project-mvp-{env}-trellis'")
# convert to string
yaml_data = blob.download_as_string()

# NOTE(review): yaml.load is applied to externally stored data; if the file only holds
# plain scalars/mappings, yaml.safe_load would be the stricter choice -- confirm and switch.
account = yaml.load(yaml_data, Loader=yaml.FullLoader)

## Main Account
# Connection URI is assembled as <scheme>://<host>:<port> from the loaded settings.
neo4j_uri = account['NEO4J_SCHEME']+'://'+account['NEO4J_HOST']+":"+str(account['NEO4J_PORT'])
graph = Graph(neo4j_uri, auth=(account['NEO4J_USER'],account['NEO4J_PASSPHRASE']))

## Option 2 : Read DB and Account Information in VM (TXT)

# f=open("./authentication/account.txt","r")
# lines=f.readlines()
# db=lines[0][:-1]
# username=lines[1][:-1]
# password=lines[2][:-1]
# f.close()

-------
## FQ2U Job

### FQ2U table

In [26]:
## Query
# One row per fastq-to-ubam Dsub job together with the Dstat node linked by :STATUS.
query = "Match (fu:Job:Dsub {name:'fastq-to-ubam'})-[:STATUS]->(s:Dstat) RETURN fu.sample AS sample, fu.readGroup AS fq2urg_gatkid, fu.duplicate AS dup, fu.machineType AS VMtype, fu.durationMinutes as runtime, s.status as dstat_status, s.statusMessage as dstat_msg, s.logging as dstat_log"
job_fq2u = graph.run(query).to_data_frame()
# 'sample' deliberately stays a regular column (no set_index): it is read by name below
# and by the duplication check that follows.

## Variable
num_fq2u_sample=len(job_fq2u['sample'].unique())
num_fq2u_job=len(job_fq2u)

## Print (Info)
print("The number of samples with FQ2U jobs : " + str(num_fq2u_sample))
print("The number of FQ2U jobs : " + str(num_fq2u_job))

## Bigquery Table Format
# Add the columns the shared job-table layout expects; attempts and unit_runtime are
# only populated for GATK substeps, so they are left empty here.
job_fq2u['job']='FQ2U'
job_fq2u['attempts']=None
job_fq2u['unit_runtime']=None
columnlist=['sample','job','fq2urg_gatkid','dup','attempts','runtime','unit_runtime','VMtype','dstat_status','dstat_msg','dstat_log']
job_fq2u=job_fq2u[columnlist]

#display(job_fq2u.head())

The number of samples with FQ2U jobs : 288
The number of FQ2U jobs : 1155


### FQ2U Duplication Check

In [17]:
## Query
# Keep only the FQ2U jobs whose `dup` flag equals True (missing values do not match).
fq2u_dup=job_fq2u.loc[job_fq2u['dup']==True,:]

## Variable
num_dup_fq2u_sample=len(fq2u_dup['sample'].unique())
num_dup_fq2u_job=len(fq2u_dup)

# Percentages of the FQ2U totals; fall back to 0 when the FQ2U table is empty so the
# cell does not raise ZeroDivisionError.
pct_dup_fq2u_sample=(num_dup_fq2u_sample/num_fq2u_sample)*100 if num_fq2u_sample else 0.0
pct_dup_fq2u_job=(num_dup_fq2u_job/num_fq2u_job)*100 if num_fq2u_job else 0.0

## Print (Info)
# '.2f' = two decimal places (a bare '2f' only sets a minimum width and prints six decimals).
print("The number(percentage) of samples with duplicated FQ2U jobs : " + str(num_dup_fq2u_sample)+" ("+'{:.2f}'.format(pct_dup_fq2u_sample)+"%)")
print("The number(percentage) of FQ2U duplicated jobs : " + str(num_dup_fq2u_job)+" ("+'{:.2f}'.format(pct_dup_fq2u_job)+"%)")

#display(fq2u_dup)

The number(percentage) of samples with duplicated FQ2U jobs : 0 (0.000000%)
The number(percentage) of FQ2U duplicated jobs : 0 (0.000000%)


-------
## GATK Job

### GATK table

In [27]:
## Query
# One row per CromwellWorkflow job together with the Dstat node linked by :STATUS.
query = "MATCH (j:Job:CromwellWorkflow)-[:STATUS]->(s:Dstat) RETURN j.sample AS sample, j.cromwellWorkflowId AS fq2urg_gatkid, \
j.duplicate AS dup, j.durationMinutes as runtime, j.machineType as VMtype, s.status as dstat_status, s.statusMessage as dstat_msg, s.logging as dstat_log"
job_gatk = graph.run(query).to_data_frame()
# 'sample' deliberately stays a regular column (no set_index): it is read by name below
# and used as a merge key when the substep table is joined to this one.

## Variable
num_gatk_sample=len(job_gatk['sample'].unique())
num_gatk_job=len(job_gatk)

## Print (Info)
print("The number of samples with GATK jobs : " + str(num_gatk_sample))
print("The number of GATK jobs : " + str(num_gatk_job))

## Bigquery Table Format
# Add the columns the shared job-table layout expects; attempts and unit_runtime are
# only populated for GATK substeps, so they are left empty at workflow level.
job_gatk['job']='GATK'
job_gatk['attempts']=None
job_gatk['unit_runtime']=None
columnlist=['sample','job','fq2urg_gatkid','dup','attempts','runtime','unit_runtime','VMtype','dstat_status','dstat_msg','dstat_log']
job_gatk=job_gatk[columnlist]

display(job_gatk.head())

The number of samples with GATK jobs : 286
The number of GATK jobs : 286


Unnamed: 0,sample,job,fq2urg_gatkid,dup,attempts,runtime,unit_runtime,VMtype,dstat_status,dstat_msg,dstat_log
0,SHIP4962328,GATK,b034c09e-9bc2-4fb5-9a48-6e47bf379b6e,,,1427,,custom-2-12288,SUCCESS,Success,gs://gbsc-gcp-project-mvp-test-from-personalis...
1,SHIP4962321,GATK,db779834-315c-48df-a3d8-1f2485674de3,,,877,,custom-2-12288,FAILURE,ient.googleapis.services.AbstractGoogleClient ...,gs://gbsc-gcp-project-mvp-test-from-personalis...
2,SHIP4962320,GATK,36bdca37-5ee8-4017-81a3-f031b4977fcc,,,1842,,custom-2-12288,SUCCESS,Success,gs://gbsc-gcp-project-mvp-test-from-personalis...
3,SHIP5119463,GATK,9ff28026-c993-4387-99bb-06bab3e6c175,,,1773,,custom-2-12288,FAILURE,nt.googleapis.services.AbstractGoogleClient <i...,gs://gbsc-gcp-project-mvp-test-from-personalis...
4,SHIP5119434,GATK,b40bc509-3301-458f-adc6-bbba98d7acfb,,,1527,,custom-2-12288,SUCCESS,Success,gs://gbsc-gcp-project-mvp-test-from-personalis...


### GATK Duplication Check

In [19]:
## Query
# Keep only the GATK jobs whose `dup` flag equals True (missing values do not match).
gatk_dup=job_gatk.loc[job_gatk['dup']==True,:]

## Variable
num_dup_gatk_sample=len(gatk_dup['sample'].unique())
num_dup_gatk_job=len(gatk_dup)

# Percentages of the GATK totals; fall back to 0 when the GATK table is empty so the
# cell does not raise ZeroDivisionError.
pct_dup_gatk_sample=(num_dup_gatk_sample/num_gatk_sample)*100 if num_gatk_sample else 0.0
pct_dup_gatk_job=(num_dup_gatk_job/num_gatk_job)*100 if num_gatk_job else 0.0

## Print (Info)
# '.2f' = two decimal places (a bare '2f' only sets a minimum width and prints six decimals).
print("The number(percentage) of samples with duplicated GATK jobs : " + str(num_dup_gatk_sample)+" ("+'{:.2f}'.format(pct_dup_gatk_sample)+"%)")
print("The number(percentage) of GATK duplicated jobs : " + str(num_dup_gatk_job)+" ("+'{:.2f}'.format(pct_dup_gatk_job)+"%)")

#display(gatk_dup)

The number(percentage) of samples with duplicated GATK jobs : 0 (0.000000%)
The number(percentage) of GATK duplicated jobs : 0 (0.000000%)


-------
## GATK substeps

### Attempts table

In [30]:
## Query
# For every CromwellStep reached from a CromwellWorkflow (same cromwellWorkflowId), collect
# the Job nodes reachable through HAS_ATTEMPT and aggregate them per returned
# (sample, workflow id, step alias, machine type) combination:
#   attempts     = number of distinct jobs
#   runtime      = (latest stop - earliest start) / 60  -- presumably epoch seconds -> minutes; confirm
#   unit_runtime = mean durationMinutes of those jobs
query = "MATCH (g:Job:CromwellWorkflow)-[:LED_TO*]->(s:CromwellStep)-[:HAS_ATTEMPT]-()-[*0..100]->(j:Job) \
WHERE g.cromwellWorkflowId=s.cromwellWorkflowId RETURN g.sample as sample, s.cromwellWorkflowId as fq2urg_gatkid, \
s.wdlCallAlias as job, count(distinct j) as attempts, (max(j.stopTimeEpoch)-min(j.startTimeEpoch))/60 as runtime, avg(j.durationMinutes) as unit_runtime, j.machineType as VMtype"
job_gatk_step = graph.run(query).to_data_frame()
# 'sample' deliberately stays a regular column (no set_index): it is read by name below
# and used as a merge key against the GATK job table.

## Variable
# NOTE(review): this reassigns num_gatk_sample, which the GATK table cell also sets (there
# from job_gatk). Re-running the GATK duplication check after this cell would use this
# step-based count as its denominator.
num_gatk_sample=len(job_gatk_step['sample'].unique())
num_gatk_subjobs=len(job_gatk_step)

## Print (Info)
print("The number of samples with GATK steps : " + str(num_gatk_sample))
print("The number of GATK subjobs : " + str(num_gatk_subjobs))

The number of samples with GATK steps : 286
The number of GATK subjobs : 4452


### Bring dup and dstat info into the GATK step table

In [33]:
## Bigquery Table Format
# Workflow-level duplicate flag and dstat fields, keyed by sample and workflow id.
job_gatk_info=job_gatk.loc[:, ['sample','fq2urg_gatkid','dup','dstat_status','dstat_msg','dstat_log']]

# Right join: every substep row is kept and picks up its parent workflow's info.
job_gatk_stepm=job_gatk_info.merge(job_gatk_step, on=['sample','fq2urg_gatkid'], how='right')

# Reorder into the shared job-table column layout.
columnlist=['sample','job','fq2urg_gatkid','dup','attempts','runtime','unit_runtime','VMtype','dstat_status','dstat_msg','dstat_log']
job_gatk_stepm=job_gatk_stepm.loc[:, columnlist]

display(job_gatk_stepm.head())

Unnamed: 0,sample,job,fq2urg_gatkid,dup,attempts,runtime,unit_runtime,VMtype,dstat_status,dstat_msg,dstat_log
0,SHIP4962328,getbwaversion,b034c09e-9bc2-4fb5-9a48-6e47bf379b6e,,1,3.43,3.0,custom-1-1024,SUCCESS,Success,gs://gbsc-gcp-project-mvp-test-from-personalis...
1,SHIP4962328,createsequencegroupingtsv,b034c09e-9bc2-4fb5-9a48-6e47bf379b6e,,1,3.43,3.0,custom-1-2048,SUCCESS,Success,gs://gbsc-gcp-project-mvp-test-from-personalis...
2,SHIP4962328,scatterintervallist,b034c09e-9bc2-4fb5-9a48-6e47bf379b6e,,1,3.93,3.0,custom-1-2048,SUCCESS,Success,gs://gbsc-gcp-project-mvp-test-from-personalis...
3,SHIP4962328,samtofastqandbwamemandmba,b034c09e-9bc2-4fb5-9a48-6e47bf379b6e,,5,169.75,131.5,custom-16-14848,SUCCESS,Success,gs://gbsc-gcp-project-mvp-test-from-personalis...
4,SHIP4962328,sumfloats,b034c09e-9bc2-4fb5-9a48-6e47bf379b6e,,1,3.08,3.0,custom-2-10240,SUCCESS,Success,gs://gbsc-gcp-project-mvp-test-from-personalis...


-------
## Merge 

In [36]:
# Stack the FQ2U, GATK and GATK-substep tables, then order rows by sample, job and id.
job_df=pd.concat([job_fq2u, job_gatk, job_gatk_stepm])
job_df=job_df.sort_values(by=['sample','job','fq2urg_gatkid'])
display(job_df)

Unnamed: 0,sample,job,fq2urg_gatkid,dup,attempts,runtime,unit_runtime,VMtype,dstat_status,dstat_msg,dstat_log
486,SHIP4946367,FQ2U,0,,,56.00,,custom-2-7680,SUCCESS,Success,gs://gbsc-gcp-project-mvp-test-from-personalis...
501,SHIP4946367,FQ2U,1,,,54.00,,custom-2-7680,SUCCESS,Success,gs://gbsc-gcp-project-mvp-test-from-personalis...
599,SHIP4946367,FQ2U,2,,,55.00,,custom-2-7680,SUCCESS,Success,gs://gbsc-gcp-project-mvp-test-from-personalis...
612,SHIP4946367,FQ2U,3,,,55.00,,custom-2-7680,SUCCESS,Success,gs://gbsc-gcp-project-mvp-test-from-personalis...
130,SHIP4946367,GATK,73470e9e-1a2d-4ad6-b55d-a8a3ddc54700,,,1705.00,,custom-2-12288,FAILURE,ient.googleapis.services.AbstractGoogleClient ...,gs://gbsc-gcp-project-mvp-test-from-personalis...
...,...,...,...,...,...,...,...,...,...,...,...
1796,SHIP5141934,samtofastqandbwamemandmba,ab5fe741-accc-4baa-b406-5275577e177c,,5,414.14,203.20,custom-16-14848,SUCCESS,Success,gs://gbsc-gcp-project-mvp-test-from-personalis...
1795,SHIP5141934,scatterintervallist,ab5fe741-accc-4baa-b406-5275577e177c,,1,4.16,4.00,custom-1-2048,SUCCESS,Success,gs://gbsc-gcp-project-mvp-test-from-personalis...
1799,SHIP5141934,sortsamplebam,ab5fe741-accc-4baa-b406-5275577e177c,,1,270.81,270.00,custom-2-5120,SUCCESS,Success,gs://gbsc-gcp-project-mvp-test-from-personalis...
1797,SHIP5141934,sumfloats,ab5fe741-accc-4baa-b406-5275577e177c,,1,2.97,2.00,custom-2-10240,SUCCESS,Success,gs://gbsc-gcp-project-mvp-test-from-personalis...


In [35]:
# Persist the combined job table as CSV, leaving out the DataFrame index.
job_df.to_csv(path_or_buf='job-based-analysis-v057.csv', index=False)