In [None]:
import requests
import tables
import io
import scipy
import numpy
from scipy import sparse
import pandas
import os

In [None]:
# --- Run configuration -------------------------------------------------------
# Base URL of the service; endpoint names are appended directly to it.
HOST_URL = "https://scrap.caltech.edu/"
# Name used to look up (or create) the sequencing run record.
SEQUENCING_RUN_NAME = "JPMT35"

# Sample names, in order: PBMC-MULT-14-1..6, PBMC-MULT-16-1..6, and then the
# same twelve names again with a "-TAGS" suffix.
SAMPLES = [
    "PBMC-MULT-%d-%d%s" % (batch, replicate, suffix)
    for suffix in ("", "-TAGS")
    for batch in (14, 16)
    for replicate in range(1, 7)
]

# Cell set name for each sample; indexed in parallel with SAMPLES. To use names
# that differ from the sample names, replace this alias with an explicit list
# of the same length, e.g. ["LC2-B-V-2", "LM1-B-V", "LM1-S-2"].
CELL_SET_NAMES = SAMPLES

# FASTQ file-name prefix for each sample; indexed in parallel with SAMPLES.
SAMPLE_FASTQ_PREFIXES = SAMPLES

In [None]:
# Create a cell set for each sample, or get it if it already exists.
# Result: `sample_cell_sets` maps each sample name to its cell set record.
cell_sets_URL = HOST_URL + "cell_sets"

# NOTE(review): verify=False disables TLS certificate verification - confirm this is intended.
response = requests.get(cell_sets_URL, verify=False)
response.raise_for_status()
cell_sets = response.json()["cell_sets"]

sample_cell_sets = {}

for sample_index, sample in enumerate(SAMPLES):
    # CELL_SET_NAMES is indexed in parallel with SAMPLES.
    cell_set_name = CELL_SET_NAMES[sample_index]
    cell_set_id = None

    for cell_set in cell_sets:
        if cell_set["name"] == cell_set_name:
            cell_set_id = cell_set["_id"]
            break

    if not cell_set_id:
        # Create the cell set under the same name the lookup above searches
        # for; otherwise a later run would not find it and would create a
        # duplicate whenever CELL_SET_NAMES differs from SAMPLES.
        cell_set = {"name": cell_set_name}

        print("Creating Cell Set %s" % cell_set_name)

        response = requests.post(cell_sets_URL, json=cell_set, verify=False)
        # Fail here rather than storing an error body as a cell set record.
        response.raise_for_status()
        cell_set = response.json()

    # Either the existing record matched above or the one just created.
    sample_cell_sets[sample] = cell_set

In [None]:
# Find the sequencing run associated with these samples, or create it if it doesn't exist.
# Afterwards `sequencing_run` holds the run's record as returned by the server
# and `sequencing_run_id` holds its "_id".
sequencing_runs_URL = HOST_URL + "sequencing_runs"

# NOTE(review): verify=False disables TLS certificate verification - confirm this is intended.
response = requests.get(sequencing_runs_URL, verify=False)
response.raise_for_status()
sequencing_runs = response.json()["sequencing_runs"]

sequencing_run_id = None

for sequencing_run in sequencing_runs:
    if sequencing_run["name"] == SEQUENCING_RUN_NAME:
        sequencing_run_id = sequencing_run["_id"]
        break

if not sequencing_run_id:
    sequencing_run = {
        "type": "local_BCLs",
        "name": SEQUENCING_RUN_NAME,
        "status": "available",
    }

    print("Creating sequencing run %s" % SEQUENCING_RUN_NAME)
    response = requests.post(sequencing_runs_URL, json=sequencing_run, verify=False)
    # Fail here rather than continuing with an error body.
    response.raise_for_status()
    # Keep the server's record rather than the request payload, so that
    # `sequencing_run` is a server record in both the found and created cases.
    sequencing_run = response.json()
    sequencing_run_id = sequencing_run["_id"]

In [None]:
# Show the sequencing run that the previous cell found or created.
sequencing_run

In [None]:
# Folder (key prefix inside the scrap-dm bucket) that is searched for this run's files.
S3_SEARCH_DIRECTORY = "sequencing_runs/20200116T013344Z_JPMT35-2/200109_A00351_0310_AH2MH3DSXY_10x/JPMT35/"

# Fetch every read set currently known to the server.
read_sets_URL = HOST_URL + "read_sets"
response = requests.get(read_sets_URL, verify=False)
read_sets = response.json()["read_sets"]

# List the S3 folder with the AWS CLI and keep, for every listing line that
# ends in "gz", only its last space-separated field (the file name).
s3_listing = os.popen('aws s3 ls s3://scrap-dm/%s' % S3_SEARCH_DIRECTORY).read()
out = [
    listing_line.split(' ')[-1]
    for listing_line in s3_listing.split('\n')
    if listing_line.endswith('gz')
]

In [None]:
# For every sample: find its read set (or create one linked to the sequencing
# run and the sample's cell set), then move the sample's files from the S3
# search folder into the read set's folder and register each moved file on
# the read set record.
for sample_index, sample_name in enumerate(SAMPLES):

    read_set_id = None

    for read_set in read_sets:
        if read_set["name"] == sample_name:
            read_set_id = read_set["_id"]
            break

    if not read_set_id:
        read_set = {}
        read_set["name"] = sample_name
        read_set["status"] = "available"

        print("Creating read set %s" % sample_name)

        # Post the read set payload built above. Posting `cell_set` here would
        # send whatever cell set an earlier cell happened to leave behind.
        response = requests.post(read_sets_URL, json=read_set, verify=False)
        response.raise_for_status()
        read_set = response.json()
        read_set["sequencing_run_id"] = sequencing_run_id
        read_set["status"] = "available"
        read_set["cell_set_id"] = sample_cell_sets[sample_name]["_id"]

        response = requests.put(read_sets_URL + "/%s" % read_set["_id"], json=read_set, verify=False)
        response.raise_for_status()
        read_set = response.json()

    print(read_set["path_id"])

    # Files already present in this read set's folder.
    destination_listing = os.popen('aws s3 ls s3://scrap-dm/reads/%s/' % read_set["path_id"]).read()
    destination_files = [
        listing_line.split(' ')[-1]
        for listing_line in destination_listing.split('\n')
        if listing_line.endswith('.fastq.gz')
    ]

    # A source file belongs to this sample when its name, minus the last four
    # "_"-separated fields, equals the sample's FASTQ prefix.
    source_files = []

    for file_name in out:
        file_sample_name = "_".join(file_name.split("_")[0:-4])
        if file_sample_name == SAMPLE_FASTQ_PREFIXES[sample_index]:
            source_files.append(file_name)

    print("Source files:")
    print(source_files)
    print("Destination files:")
    print(destination_files)

    for source_file in source_files:
        # If the name is already taken at the destination, bump the trailing
        # numeric field (zero-padded to 3 digits) until the name is free.
        new_file_name = source_file
        while new_file_name in destination_files:
            # [0:-9] strips the ".fastq.gz" extension.
            new_file_name_parts = new_file_name[0:-9].split("_")
            new_file_name = "_".join(new_file_name_parts[0:-1]) + "_%03d" % (int(new_file_name_parts[-1])+1) + ".fastq.gz"

        if new_file_name != source_file:
            print("Will rename %s to %s" % (source_file, new_file_name))

        source_path = "s3://scrap-dm/%s%s" % (S3_SEARCH_DIRECTORY, source_file)
        destination_path = "s3://scrap-dm/reads/%s/%s" % (read_set["path_id"], new_file_name)
        print("Moving %s to %s" % (source_path, destination_path))
        exit_status = os.system('aws s3 mv %s %s' % (source_path, destination_path))
        if exit_status != 0:
            # Do not register a file that never arrived at the destination.
            print("Move of %s failed (exit status %s); not registering it" % (source_path, exit_status))
            continue

        # Record the name as taken so a later source file from this same batch
        # is renamed instead of overwriting the file that was just moved.
        destination_files.append(new_file_name)

        # NOTE(review): assumes the read set record carries a "FASTQ_files" mapping - confirm.
        read_set["FASTQ_files"][new_file_name] = {"remote_path" : new_file_name, "status" : "available"}
        response = requests.put(read_sets_URL + "/%s" % read_set["_id"], json=read_set, verify=False)
        response.raise_for_status()