# Neo4J Trellis v1.2: Sample based Analysis

    -  Check the success case only based on outputs
    -  Load the account info from Google Cloud Storage
    -  Remove duplicate jobs and duplicate job and node statuses
    -  Add 'dup' column with the number of duplicated jobs
    
================

In [None]:
# Install the third-party packages for this notebook with pip (IPython shell
# escapes). py2neo, pandas-gbq, google-cloud-storage, matplotlib and seaborn
# are imported in the next cell.
!pip3 install py2neo

# NOTE: the py2neo install location may need to be added to the system
# Python path before it can be imported.

!pip3 install neotime
!pip3 install neobolt
!pip3 install pandas-gbq

!pip3 install papermill

!pip3 install google-cloud-storage
!pip3 install matplotlib
!pip3 install seaborn
!pip3 install fsspec
!pip3 install gcsfs

In [None]:
import py2neo as neo
from py2neo import Graph

from google.cloud import storage
import yaml

import pandas as pd
import pandas_gbq

import time
import numpy as np
import subprocess
import matplotlib 
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery

# Render floats with exactly two decimal places whenever pandas displays them.
pd.options.display.float_format = lambda value: '%.2f' % value

## Connect to Neo4j database

In [None]:
# Location of the credentials YAML in Google Cloud Storage.
#
# bucket_info     -- name of the bucket, read from the BUCKET_INFO
#                    environment variable.
# credential_info -- name of the object inside that bucket, read from the
#                    CREDENTIAL_INFO environment variable.
#
# Reading both from the environment keeps bucket and object names out of the
# notebook. When a variable is unset the value falls back to the previous
# hard-coded default, an empty string.
import os

bucket_info = os.environ.get('BUCKET_INFO', '')
credential_info = os.environ.get('CREDENTIAL_INFO', '')

In [None]:
## Option 1 : Read DB and Account Information in Google Storage (YAML)

# Fail early with a clear message when the storage location has not been
# configured, instead of calling the storage client with empty names.
if not bucket_info or not credential_info:
    raise ValueError(
        'bucket_info and credential_info must name the bucket and the '
        'object that hold the Neo4j account YAML'
    )

# create storage client
storage_client = storage.Client()
# get bucket with name
bucket = storage_client.get_bucket(bucket_info)
# get bucket data as blob; get_blob returns None when the object is missing,
# which would otherwise surface as an AttributeError on the download call
blob = bucket.get_blob(credential_info)
if blob is None:
    raise FileNotFoundError(
        'gs://{}/{} does not exist'.format(bucket_info, credential_info)
    )
# download the object's contents
yaml_data = blob.download_as_string()

# NOTE(review): yaml.load is kept as-is; if the file only ever contains plain
# mappings and scalars, yaml.safe_load would be the more defensive choice.
account = yaml.load(yaml_data, Loader=yaml.FullLoader)

## Main Account
# Connection URI is built as <scheme>://<host>:<port> from the loaded account.
neo4j_uri = '{}://{}:{}'.format(
    account['NEO4J_SCHEME'],
    account['NEO4J_HOST'],
    account['NEO4J_PORT'],
)
graph = Graph(
    neo4j_uri,
    auth=(account['NEO4J_USER'], account['NEO4J_PASSPHRASE']),
)

## Data import queries

### Count nodes

In [2]:
# Count every (:Sample) node in the graph.
query = '''
    MATCH (s:Sample)
    RETURN COUNT(s)
'''

# Time running the query and converting its result to a DataFrame.
start = time.time()
cursor = graph.run(query)
result = cursor.to_data_frame()
end = time.time()

# Elapsed seconds.
elapsed = end - start
print(elapsed)

In [1]:
# Group (:Sample) nodes by their `sample` property and keep only the values
# that do not map to exactly one node.
query = '''
    MATCH (s:Sample)
    WITH s.sample AS sample, COLLECT(s) AS samples
    WHERE size(samples) <> 1
    RETURN sample, size(samples)
'''

# Time running the query and converting its result to a DataFrame.
start = time.time()
cursor = graph.run(query)
result = cursor.to_data_frame()
end = time.time()

# Elapsed seconds.
elapsed = end - start
print(elapsed)

In [None]:
# Count the (:Fastq) nodes attached to each `sample` value.
query = '''
    MATCH (f:Fastq)
    WITH f.sample AS sample, COLLECT(f) AS fastqs
    RETURN sample, size(fastqs)
'''

# Time running the query and converting its result to a DataFrame.
start = time.time()
cursor = graph.run(query)
result = cursor.to_data_frame()
end = time.time()

# Elapsed seconds.
elapsed = end - start
print(elapsed)

In [None]:
# Group (:PersonalisSequencing) nodes by `sample` and keep only the values
# that do not map to exactly one node.
query = '''
    MATCH (p:PersonalisSequencing)
    WITH p.sample AS sample, COLLECT(p) AS seqs
    WHERE size(seqs) <> 1
    RETURN sample, size(seqs)
'''

# Time running the query and converting its result to a DataFrame.
start = time.time()
cursor = graph.run(query)
result = cursor.to_data_frame()
end = time.time()

# Elapsed seconds.
elapsed = end - start
print(elapsed)

In [None]:
# Group (:Person) nodes by `sample` and keep only the values that do not map
# to exactly one node.
query = '''
    MATCH (p:Person)
    WITH p.sample AS sample, COLLECT(p) AS persons
    WHERE size(persons) <> 1
    RETURN sample, size(persons)
'''

# Time running the query and converting its result to a DataFrame.
start = time.time()
cursor = graph.run(query)
result = cursor.to_data_frame()
end = time.time()

# Elapsed seconds.
elapsed = end - start
print(elapsed)

In [None]:
# Group nodes labelled both BiologicalOme and Genome by `sample` and keep only
# the values that do not map to exactly one node.
query = '''
    MATCH (g:BiologicalOme:Genome)
    WITH g.sample AS sample, COLLECT(g) AS genomes
    WHERE size(genomes) <> 1
    RETURN sample, size(genomes)
'''

# Time running the query and converting its result to a DataFrame.
start = time.time()
cursor = graph.run(query)
result = cursor.to_data_frame()
end = time.time()

# Elapsed seconds.
elapsed = end - start
print(elapsed)

### Compare the number of (:Genome)-[]->(:Fastq) relationships to (:Sequencing)-[]->(:Fastq) ones as cross validation

In [1]:
# Cross-validate the Fastq nodes of each sample along two paths: the path
# through (:Person) via HAS_BIOLOGICAL_OME / HAS_SEQUENCING_READS, and the
# path from (:Sample) via WAS_USED_BY / GENERATED. Only samples whose two
# Fastq counts differ are returned.
query = '''
    MATCH (s:Sample)<-[:GENERATED]-(:Person)-[:HAS_BIOLOGICAL_OME|HAS_SEQUENCING_READS*2]->(f1:Fastq)
    WITH s.sample AS sample, COLLECT(f1) AS bio_fastqs
    MATCH (s:Sample)-[:WAS_USED_BY|GENERATED*2]->(f2:Fastq)
    WHERE s.sample = sample
    WITH sample, bio_fastqs, COLLECT(f2) AS prov_fastqs
    WHERE size(bio_fastqs) <> size(prov_fastqs)
    RETURN sample, size(bio_fastqs), size(prov_fastqs)
'''
start = time.time()
result = graph.run(query).to_data_frame()
end = time.time()
print(end - start)

# `num_fastq` was referenced here without ever being assigned; bind it to the
# query result so the cell can be displayed.
num_fastq = result
# The query returns only mismatching samples, so a result with no rows is
# possible; index by sample only when that column is actually present.
if 'sample' in num_fastq.columns:
    num_fastq = num_fastq.set_index('sample')
num_fastq

NameError: name 'time' is not defined

In [3]:
# For each sample, count the Genome nodes and the Fastq nodes linked to them
# through HAS_SEQUENCING_READS.
#
# Changes from the previous version of this cell:
#   * `g.sample` is aliased to `sample` so the frame can be indexed by it.
#   * COUNT(g) counted one entry per matched (g)->(f) pair and therefore always
#     equalled COUNT(f); COUNT(DISTINCT g) reports the number of genome nodes.
#   * `num_fastq` was never assigned; it is now bound to the query result.
query = '''
    MATCH (g:Genome)-[:HAS_SEQUENCING_READS]->(f:Fastq)
    RETURN g.sample AS sample, COUNT(DISTINCT g), COUNT(f)
'''
start = time.time()
result = graph.run(query).to_data_frame()
end = time.time()
print(end - start)

num_fastq = result
# Index by sample only when the column is present (a result with no rows may
# come back without any columns).
if 'sample' in num_fastq.columns:
    num_fastq = num_fastq.set_index('sample')
num_fastq

SyntaxError: EOF while scanning triple-quoted string literal (<ipython-input-3-54c20ea3f3cc>, line 8)

## Post-variant calling queries

In [None]:
# For every relationship leaving a (:Genome) node, report the relationship
# type, the labels of the target node, and how many targets there are.
query = '''
    MATCH (g:Genome)-[r]->(f)
    RETURN type(r), labels(f), COUNT(f)
'''

# Time running the query and converting its result to a DataFrame.
start = time.time()
cursor = graph.run(query)
result = cursor.to_data_frame()
end = time.time()

# Elapsed seconds.
elapsed = end - start
print(elapsed)

In [None]:
# Count HAS_INDEX relationships, grouped by the labels of the node on each
# end of the relationship.
query = '''
    MATCH (g)-[r:HAS_INDEX]->(f)
    RETURN labels(g), labels(f), COUNT(r)
'''

# Time running the query and converting its result to a DataFrame.
start = time.time()
cursor = graph.run(query)
result = cursor.to_data_frame()
end = time.time()

# Elapsed seconds.
elapsed = end - start
print(elapsed)

In [None]:
# Count (:Sample) nodes grouped by the value of their `trellis_snvQa` property.
query = '''
    MATCH (s:Sample)
    RETURN s.trellis_snvQa, COUNT(s)
'''

# Time running the query and converting its result to a DataFrame.
start = time.time()
cursor = graph.run(query)
result = cursor.to_data_frame()
end = time.time()

# Elapsed seconds.
elapsed = end - start
print(elapsed)

In [None]:
# Count the number of objects connected to the genome node and check that they still exist
# (targets are grouped by the value of their `obj_exists` property).
query = '''
    MATCH (:Genome)-[]->(b)
    RETURN b.obj_exists, COUNT(b)
'''
# This cell previously only defined the query; run and time it the same way
# as the other cells so that `result` actually holds its output.
start = time.time()
result = graph.run(query).to_data_frame()
end = time.time()
print(end - start)

In [None]:
# Make sure indexes haven't been deleted: group the nodes at the target end of
# HAS_INDEX relationships by their `obj_exists` property and count each group.
# NOTE(review): the lines visible here only define the query string; no
# statement in view runs it.
query = '''
    MATCH ()-[:HAS_INDEX]->(b)
    RETURN b.obj_exists, COUNT(b)
'''