In [None]:
import os
import requests

In [None]:
# Base URL of the SCRAP server. Endpoint names are appended directly to this string,
# so it must end with a trailing slash.
HOST_URL = "https://172.31.1.62/"

# The name of the Sequencing Run you are importing. An existing run with this name is
# reused; otherwise a new one is created.
SEQUENCING_RUN_NAME = "TKMT01"

# Name of the S3 bucket (without the "s3://" scheme) that holds the sequencing run directory
S3_BUCKET = "scrap-dm"

# The existing path to the sequencing run directory, relative to the root of S3_BUCKET.
# It is searched recursively for .fastq.gz files.
SEQUENCING_RUN_DIRECTORY = "reads/210910_A01102_0279_AHM2KKDSX2_nomismatches/"

## Search for FASTQ files in directory

In [None]:
# Ask the AWS CLI for a recursive listing of everything under the sequencing run directory
listing_command = "aws s3 ls --recursive %s/%s" % (S3_BUCKET, SEQUENCING_RUN_DIRECTORY)
listing_lines = os.popen(listing_command).read().split('\n')

# Keep only the gzipped FASTQ entries; the object key is the last space-separated field of each line
source_files = []
for listing_line in listing_lines:
    if listing_line.endswith('.fastq.gz'):
        source_files.append(listing_line.split(' ')[-1])

In [None]:
new_sample_names = set()

# Loop through all the detected FASTQs and extract the unique sample names
for file in source_files:
    file_name = os.path.split(file)[1]

    # The sample name is everything before the last four underscore-separated fields of the file name
    sample_name = "_".join(file_name.split("_")[0:-4])

    if not sample_name:
        # Too few underscore-separated fields: nothing is left to use as a sample name,
        # and an empty name must not be turned into a cell set / read set later on
        print("Skipping %s -- cannot extract a sample name" % file)
        continue

    new_sample_names.add(sample_name)

# We don't want to save the Undetermined reads, so we drop them.
# discard() is used instead of remove() so a run without Undetermined files does not raise KeyError.
new_sample_names.discard("Undetermined")

In [None]:
# Show the detected sample names so they can be checked before anything is created in SCRAP
new_sample_names

## Create samples in SCRAP

In [None]:
# Get a list of all existing Cell Sets, so that ones that already exist are reused instead of recreated
cell_sets_URL = HOST_URL + "cell_sets"

# NOTE(review): verify=False turns off TLS certificate verification -- presumably the host
# uses a self-signed certificate; confirm this is intended
response = requests.get(cell_sets_URL, verify=False)
# Stop on an HTTP error status here rather than failing later on a missing "cell_sets" key
response.raise_for_status()
cell_sets = response.json()["cell_sets"]

In [None]:
# Create a Cell Set in SCRAP for every detected sample that does not already have one
cell_sets_URL = HOST_URL + "cell_sets"

# Names of the Cell Sets SCRAP already knows about, for direct membership checks
existing_cell_set_names = {existing_cell_set["name"] for existing_cell_set in cell_sets}

for cell_set_name in new_sample_names:

    if cell_set_name in existing_cell_set_names:
        print("%s already exists -- using" % cell_set_name)
        continue

    print("Creating cell set %s" % cell_set_name)

    cell_set = {
        "name": cell_set_name,
        "status": "available"
    }

    response = requests.post(cell_sets_URL, json=cell_set, verify=False)
    # Fail right away if the request was rejected; otherwise the missing Cell Set would only
    # be noticed later, when a read set tries to reference it
    response.raise_for_status()

In [None]:
# Find the sequencing run associated with these samples, or create it if it doesn't exist
sequencing_runs_URL = HOST_URL + "sequencing_runs"

response = requests.get(sequencing_runs_URL, verify=False)
# Stop on an HTTP error status instead of failing on a missing "sequencing_runs" key
response.raise_for_status()
sequencing_runs = response.json()["sequencing_runs"]

sequencing_run_id = None

# On a match the loop breaks, so `sequencing_run` stays bound to the matching record
for sequencing_run in sequencing_runs:
    if sequencing_run["name"] == SEQUENCING_RUN_NAME:
        sequencing_run_id = sequencing_run["_id"]
        print("Found sequencing run %s" % SEQUENCING_RUN_NAME)
        break

if not sequencing_run_id:
    sequencing_run = {
        "type": "local_BCLs",
        "name": SEQUENCING_RUN_NAME,
        "status": "available",
        "cell_sets": [],
    }

    print("Creating sequencing run %s" % SEQUENCING_RUN_NAME)
    response = requests.post(sequencing_runs_URL, json=sequencing_run, verify=False)
    # Do not continue with a run that SCRAP refused to create
    response.raise_for_status()

    # Keep the record returned by the server: besides "_id" it carries the "path_id"
    # that the final cell of this notebook reads
    sequencing_run = response.json()
    sequencing_run_id = sequencing_run["_id"]

In [None]:
# For every detected sample: find or create its read set in SCRAP, then move the sample's
# FASTQ files into the read set's folder in S3 and register each moved file on the read set.


def _fastq_sample_name(fastq_path):
    """Return the sample name encoded in a FASTQ file name.

    The sample name is everything before the last four underscore-separated fields of the
    file name; directory components of `fastq_path` are ignored.
    """
    return "_".join(os.path.split(fastq_path)[1].split("_")[0:-4])


def _next_free_fastq_name(file_name, taken_names):
    """Return `file_name`, or a variant of it that is not in `taken_names`.

    While the candidate name is taken, the number in its last underscore-separated field
    (just before ".fastq.gz") is incremented by one and re-formatted with three digits.
    Raises ValueError if that field is not an integer.
    """
    candidate = file_name
    while candidate in taken_names:
        # [0:-9] strips the 9-character ".fastq.gz" suffix
        name_parts = candidate[0:-9].split("_")
        candidate = "_".join(name_parts[0:-1]) + "_%03d" % (int(name_parts[-1]) + 1) + ".fastq.gz"
    return candidate


read_sets_URL = HOST_URL + "read_sets"
response = requests.get(read_sets_URL, verify=False)
response.raise_for_status()
read_sets = response.json()["read_sets"]

# Fetch the cell sets again so the lookup below works from the server's current list
response = requests.get(cell_sets_URL, verify=False)
response.raise_for_status()
cell_sets = response.json()["cell_sets"]

# Map each sample name to its cell set; setdefault keeps the first cell set seen per name
sample_cell_sets = {}
for cell_set in cell_sets:
    if cell_set["name"] in new_sample_names:
        sample_cell_sets.setdefault(cell_set["name"], cell_set)

for sample_name in new_sample_names:

    read_set_id = None

    for read_set in read_sets:
        if read_set["name"] == sample_name:
            read_set_id = read_set["_id"]
            print("Found existing read set %s" % sample_name)
            break

    if not read_set_id:
        # Check for the cell set BEFORE creating anything: failing after the POST would leave
        # a read set without sequencing run / cell set that a re-run silently reuses
        if sample_name not in sample_cell_sets:
            raise RuntimeError("No cell set named %s was found -- cannot create its read set" % sample_name)

        read_set = {}
        read_set["name"] = sample_name
        read_set["status"] = "available"

        print("Creating read set %s" % sample_name)

        response = requests.post(read_sets_URL, json=read_set, verify=False)
        response.raise_for_status()
        read_set = response.json()
        read_set["sequencing_run_id"] = sequencing_run_id
        read_set["status"] = "available"
        read_set["cell_set_id"] = sample_cell_sets[sample_name]["_id"]

        response = requests.put(read_sets_URL + "/%s" % read_set["_id"], json=read_set, verify=False)
        response.raise_for_status()
        read_set = response.json()

    print(read_set["path_id"])

    # Folder of this read set, in the configured bucket rather than a hard-coded one
    read_set_folder = "s3://%s/reads/%s" % (S3_BUCKET, read_set["path_id"])

    # List the FASTQ files already present in the read set's folder
    destination_listing = os.popen('aws s3 ls %s/' % read_set_folder).read()
    destination_files = [
        listing_line.split(' ')[-1]
        for listing_line in destination_listing.split('\n')
        if listing_line.endswith('.fastq.gz')
    ]

    sample_source_files = [
        source_file for source_file in source_files
        if _fastq_sample_name(source_file) == sample_name
    ]

    print("Source files:")
    print(sample_source_files)
    print("Destination files:")
    print(destination_files)

    for source_file in sample_source_files:

        file_name = os.path.split(source_file)[1]
        new_file_name = _next_free_fastq_name(file_name, destination_files)

        if new_file_name != file_name:
            print("Will rename %s to %s" % (file_name, new_file_name))

        source_path = "s3://%s/%s" % (S3_BUCKET, source_file)
        destination_path = "%s/%s" % (read_set_folder, new_file_name)
        print("Moving %s to %s" % (source_path, destination_path))

        exit_status = os.system('aws s3 mv %s %s' % (source_path, destination_path))
        if exit_status != 0:
            # Never register a file as available when the move did not succeed
            raise RuntimeError("aws s3 mv failed for %s (exit status %s)" % (source_path, exit_status))

        # Remember the name just written: the listing is recursive, so another source file
        # from a different subfolder may have the same base name and must not overwrite this one
        destination_files.append(new_file_name)

        read_set["FASTQ_files"][new_file_name] = {"remote_path" : new_file_name, "status" : "available"}
        response = requests.put(read_sets_URL + "/%s" % read_set["_id"], json=read_set, verify=False)
        response.raise_for_status()

In [None]:
# Move whatever is still under the sequencing run directory into the run's
# sequencing_runs/<path_id>/ folder of the same bucket
run_directory_path = "s3://%s/%s" % (S3_BUCKET, SEQUENCING_RUN_DIRECTORY)
run_archive_path = "s3://%s/sequencing_runs/%s/" % (S3_BUCKET, sequencing_run["path_id"])

# Left as the last expression so the cell shows the command's exit status
os.system("aws s3 mv --recursive %s %s" % (run_directory_path, run_archive_path))