In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
import h5py
import numpy as np
import pandas as pd
import logging
import re

from matplotlib import pyplot as plt
import seaborn as sns
# Apply seaborn's global plot theme: "whitegrid" style with fonts scaled by 1.8.
sns.set(font_scale=1.8, style="whitegrid")

In [2]:
# Configure the root logger once for the whole notebook: DEBUG verbosity,
# records written through a default StreamHandler, each prefixed with the
# wall-clock time and the name of the emitting logger.
logging.basicConfig(
    level=logging.DEBUG,
    format='[%(asctime)s - %(name)s] %(message)s',
    datefmt='%H:%M:%S',
    handlers=[logging.StreamHandler()],
)

# Logger used by the cells of this notebook.
logger = logging.getLogger(__name__)

# Import run settings

In [6]:
def import_gdrive_sheet(gdrive_key, sheet_id):
    """Download one worksheet of a Google Sheets document as a DataFrame.

    Parameters
    ----------
    gdrive_key : str
        Document key, inserted as the ``key`` query parameter of the URL.
    sheet_id : str or int
        Worksheet id, inserted as the ``gid`` query parameter of the URL.
        Integers are accepted as well as strings.

    Returns
    -------
    pandas.DataFrame
        The worksheet parsed from its CSV export. If a "date" column is
        present it is cast to ``str``.
    """
    # An f-string formats both str and int ids; plain "+" concatenation
    # raised TypeError for an integer sheet_id.
    url = (
        "https://docs.google.com/spreadsheet/ccc"
        f"?key={gdrive_key}&output=csv&gid={sheet_id}"
    )
    run_spreadsheet = pd.read_csv(url)
    if "date" in run_spreadsheet.columns:
        run_spreadsheet["date"] = run_spreadsheet["date"].astype(str)
    return run_spreadsheet

# Google Sheets document key, plus the gid of each worksheet to fetch.
gdrive_key = "gsheet_id_here"
sheet_id, set_sheet_id = "0", "512509543"

# One DataFrame per worksheet, fetched in order: the run sheet first, then
# the molbit-set sheet.
run_spreadsheet, set_spreadsheet = (
    import_gdrive_sheet(gdrive_key, gid) for gid in (sheet_id, set_sheet_id)
)

In [7]:
# Path to the molbit sequence file, relative to the working directory.
# NOTE(review): presumably FASTA, judging by the ".fa" extension — confirm.
molbit_file = "../porcupine_sequences.fa"

In [10]:
# Names of the sequencing runs selected as training data; they are matched
# against the "run_name" column of the run spreadsheet.
training_run_names = [
    "08_09_2019_run_01",
    "08_13_2019_run_02",
    "08_13_2019_run_03",
    "08_15_2019_run_02",
    "08_15_2019_run_03",
    "08_16_2019_run_01",
]

In [13]:
# Keep only the spreadsheet rows whose run_name is one of the selected
# training runs. Series.isin is the vectorised membership test and always
# returns a boolean mask.
training_runs = run_spreadsheet[run_spreadsheet["run_name"].isin(training_run_names)]

In [14]:
# Display the spreadsheet rows that were selected as training runs.
training_runs

Unnamed: 0,date,run_name,molbit_set,description,approx_runtime,approx_reads_per_min,read_count,lengths,DNAse,flowcell,...,mako_calls_20181203,mako_labels_20181203,mako_calls_20190304,mako_labels_20190304,mako_calls_20190306,mako_labels_20190306,sw_calls_pval_v2,sw_labels_pval_v2,sw_calls_pval_v3,sw_labels_pval_v3
0,20190809,08_09_2019_run_01,0,training data,0:20,10000.0,209650,400,no,FAL16929,...,,,,,,,,,,
2,20190813,08_13_2019_run_02,1,training data,0:35,,304000,400,no,FAL16929,...,,,,,,,,,,
3,20190813,08_13_2019_run_03,2,training data,0:47,,305380,400,no,FAL16929,...,,,,,,,,,,
5,20190815,08_15_2019_run_02,4,training data,0:34,10000.0,318660,400,no,FAL01293,...,,,,,,,,,,
6,20190815,08_15_2019_run_03,5,training data,0:42,,309290,400,no,FAL01293,...,,,,,,,,,,
7,20190816,08_16_2019_run_01,3,training data,0:25,,328150,400,no,FAL01293,...,,,,,,,,,,


## Define which molbits are in each set

In [15]:
# Build molbit_sets from the set spreadsheet: one entry per row, keyed by the
# set number as a string (e.g. "3") and holding that set's molbit ids as a
# list of strings.
molbit_sets = {}
for _, row in set_spreadsheet.iterrows():
    set_match = re.search(r"set ([\d]+)", row["set"])
    if set_match is None:
        # Name the offending label instead of failing with a bare IndexError.
        raise ValueError(f"Could not find 'set <number>' in set label {row['set']!r}")
    set_no = set_match.group(1)
    molbits = row["molbits_in_set"]
    # Split on the bare comma and strip every entry so that "1, 2", "1,2" and
    # "1 , 2" all yield the same ids; blank entries (e.g. after a trailing
    # comma) are dropped.
    molbit_sets[set_no] = [m.strip() for m in molbits.split(",") if m.strip()]

## Specify which molbits are present in each run

In [20]:
# For each training run, record the molbits loaded in that run and every
# molbit that may be on the flowcell (this run's set plus any sets listed in
# the run's "prev_on_flowcell" entry).
molbits_by_run = {}
for i, run_data in training_runs.iterrows():
    run_name = run_data["run_name"]
    print(run_name)

    # molbit_sets is keyed by the set number as a string, hence str().
    molbit_set_in_run = str(run_data.get("molbit_set"))
    molbits_in_run = molbit_sets[molbit_set_in_run]
    # Copy, so extending the flowcell list never mutates the list stored in
    # molbit_sets.
    molbits_on_flowcell = list(molbits_in_run)

    molbit_sets_on_flowcell = run_data.get("prev_on_flowcell")
    # A missing column (None) or an empty spreadsheet cell (NaN) is treated
    # like the literal "none" instead of crashing on .split().
    if molbit_sets_on_flowcell is None or pd.isna(molbit_sets_on_flowcell):
        molbit_sets_on_flowcell = "none"
    molbit_sets_on_flowcell = str(molbit_sets_on_flowcell).strip()
    if molbit_sets_on_flowcell != "none":
        # Strip each entry so "1, 2" and "1,2" resolve to the same set keys.
        for m in molbit_sets_on_flowcell.split(","):
            molbits_on_flowcell.extend(molbit_sets[m.strip()])

    molbits_by_run[run_name] = {
        "molbits_in_run": molbits_in_run,
        "molbits_on_flowcell": molbits_on_flowcell,
    }

08_09_2019_run_01
08_13_2019_run_02
08_13_2019_run_03
08_15_2019_run_02
08_15_2019_run_03
08_16_2019_run_01


## Create new file with only the reads/molbits we want to use for training

In [40]:
# Minimum best alignment score for a read to be accepted.
MIN_SW_SCORE = 15


def _select_training_reads(sw, molbits_in_run, min_score=MIN_SW_SCORE):
    """Pick the reads whose best-scoring molbit is confident and in the run.

    Parameters
    ----------
    sw : pandas.DataFrame
        One row per read, with one score column per molbit whose name
        contains ``score_molbit_<number>``.
    molbits_in_run : list of str
        Molbit numbers (as strings) that belong to the run.
    min_score : number, optional
        A read is kept only if its best score is >= this value.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``sw`` (kept rows only) with columns "molbit" (the
        number of the best-scoring molbit, as a string) and "sw_score"
        (that molbit's score).
    """
    # Rows with no score at all can never pass the threshold; dropping them
    # first keeps idxmax well defined.
    sw_scores = sw.filter(regex="score_molbit_.*").dropna(how="all")
    if sw_scores.empty:
        return pd.DataFrame({"molbit": [], "sw_score": []}, index=sw.index[:0])

    best_score = sw_scores.max(axis=1)
    # idxmax returns the column *label* of the row maximum on every pandas
    # version; the molbit number is then parsed out of that label.
    best_molbit = sw_scores.idxmax(axis=1).str.extract(
        r"score_molbit_([\d]+)", expand=False
    )

    keep = (best_score >= min_score) & best_molbit.isin(molbits_in_run)
    # Build a fresh frame instead of assigning into a filtered slice, which
    # is what raised SettingWithCopyWarning.
    selected = pd.DataFrame(
        {"molbit": best_molbit, "sw_score": best_score}, index=sw_scores.index
    )
    return selected.loc[keep]


# For every training run: read its score table, keep the usable reads and
# write them next to the input file as a tab-separated table.
for i, run_data in training_runs.iterrows():
    run_name = run_data["run_name"]
    molbits_in_run = molbits_by_run[run_name]["molbits_in_run"]
    sw_calls_file = run_data["sw_calls_file"]
    sw = pd.read_csv(sw_calls_file, sep="\t", index_col=0)

    use_for_training = _select_training_reads(sw, molbits_in_run)

    # Rename only the file name, and only its first "all", so a directory
    # that happens to contain "all" is left untouched.
    sw_dir, sw_name = os.path.split(sw_calls_file)
    training_calls_file = os.path.join(
        sw_dir, sw_name.replace("all", "filtered_molbits_in_run", 1)
    )
    if training_calls_file == sw_calls_file:
        # Without this guard the filtered table would overwrite its input.
        raise ValueError(
            f"Expected 'all' in the file name of {sw_calls_file!r}; "
            "refusing to overwrite the input file"
        )
    print(training_calls_file)

    use_for_training.to_csv(training_calls_file, sep="\t", index=True)

  # Remove the CWD from sys.path while we load stuff.


/path/to/data/MinION_sequencing_data_20190809/guppy_3.2.2_08_09_19_run_01_exec_20190809/sw/filtered_molbits_in_run_536cafc874cb995b6c8b47a5baabb9598eaded02.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


/path/to/data/MinION_sequencing_data_20190813/guppy_3.2.2_08_13_2019_run_02_exec_20190813/sw/filtered_molbits_in_run_cbeb224beba4132d960067800238defaeed962bf.tsv
/path/to/data/MinION_sequencing_data_20190813/guppy_3.2.2_08_13_2019_run_03_exec_20190814/sw/filtered_molbits_in_run_7efa32b6b7c4b21dadad2a4c078a9f93bd0cc657.tsv
/path/to/data/MinION_sequencing_data_20190815/guppy_3.2.2_08_15_2019_run_02_exec_20190815/sw/filtered_molbits_in_run_e33d821ff769cbb24ed5af866a50887f3e1ea5c3.tsv
/path/to/data/MinION_sequencing_data_20190815/guppy_3.2.2_08_15_2019_run_03_exec_20190816/sw/filtered_molbits_in_run_3af76ac4a3c3302724a333ac8c108c258c9ceb0d.tsv
/path/to/data/MinION_sequencing_data_20190816/guppy_3.2.2_08_16_2019_run_01_exec_20190819/sw/filtered_molbits_in_run_c00d4d8a06893e7015df48ef09339662a1747216.tsv


In [42]:
# NOTE: use_for_training is reassigned on every pass of the loop above, so
# this displays only the frame produced for the last run in training_runs.
use_for_training

Unnamed: 0_level_0,molbit,sw_score
read_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ea9cc3cd-9bad-4c0b-861a-dcbe6edf2546,62,19.0
0944069b-2066-4cd2-93f2-747432dbfe44,57,24.0
ecb61ab2-04b5-43c0-aec7-21e5e328e832,60,32.0
569620ef-2c59-461d-bdd8-87bab42d5200,60,23.0
26d66a8f-49a3-4f2a-aaf2-067f390809ee,48,35.0
8eb47a61-bd26-47d7-8856-ed6b374518fa,51,31.0
85fbec01-fcc6-4c53-a80f-bd495363ab0c,58,37.0
74d59968-ae05-4052-bbf3-50115ac25f1a,58,37.0
a2ce7d35-03fe-4c96-bdf9-162087831757,57,20.0
94a5d927-c398-4b3e-9119-8d62d5c488d1,63,25.0
