In [1]:
#set cwd to /PiperNET
import os
os.chdir('..')

In [2]:
from pathlib import Path
from src.rnaseq_utils import get_config
import yaml

In [3]:
########## Load config.yaml (NEW) ##########
def get_config(config_path, **kwargs):
    
    #load config.yaml as dict
    with open(config_path, "r") as handle:
        config = yaml.safe_load(handle)
    
    #keep portion of config.yaml based on provided keys 
    try:
        for level in kwargs.values():
            config = config[level]
        
        #convert str paths to Path objects
        config = {k: Path(v) if isinstance(v, str) else v for k, v in config.items()}

        return config
    except KeyError:
        raise KeyError("Key(s) not found in config.yaml")

In [5]:
#load sample folder paths from config.yaml
config_path = Path('config/rnaseq_dataprep.yaml')
config = get_config(config_path, data='rna-seq', file='assembly')
config

{'piper09': PosixPath('data/rna-seq/assemblies/piper09/transcriptome.pep'),
 'piper10': PosixPath('data/rna-seq/assemblies/piper10/transcriptome.pep'),
 'piper12': PosixPath('data/rna-seq/assemblies/piper12/transcriptome.pep'),
 'piper20': PosixPath('data/rna-seq/assemblies/piper20/transcriptome.pep'),
 'piper23': PosixPath('data/rna-seq/assemblies/piper23/transcriptome.pep')}

In [6]:

import subprocess
import argparse
from src.rnaseq_utils import get_config
from pathlib import Path


config_path = Path('config/rnaseq_dataprep.yaml')

#load sample folder paths from config.yaml
assembly_paths = get_config(config_path, data='rna-seq', file='assembly')

#iterate over path dictionary
for input in assembly_paths.values():
    print(input)
    
    try:
        #i/o file paths
        output = Path(input).with_name('transcriptome_clstr.pep')
        print(output)

        #construct command
        min_identity = str(0.98)
        print(f'running CD-HIT on {input} with identity threshold: {min_identity}')
        command = f"cd-hit -i {input} -o {output} -c {min_identity} -n 5 -g 1 a"

        #run command
        subprocess.run(command, shell=True)

    #skip to next file if input is not found
    except FileNotFoundError:
        print(f'{input} not found. Skipping to next sample.')
        continue

data/rna-seq/assemblies/piper09/transcriptome.pep
data/rna-seq/assemblies/piper09/transcriptome_clstr.pep
running CD-HIT on data/rna-seq/assemblies/piper09/transcriptome.pep with identity threshold: 0.98
Program: CD-HIT, V4.8.1, Jan 21 2024, 14:55:34
Command: cd-hit -i
         data/rna-seq/assemblies/piper09/transcriptome.pep -o
         data/rna-seq/assemblies/piper09/transcriptome_clstr.pep
         -c 0.98 -n 5 -g 1 a

Started: Tue Feb 13 04:17:25 2024
                            Output                              
----------------------------------------------------------------

Usage: cd-hit [Options] 

Options

   -i	input filename in fasta format, required, can be in .gz format
   -o	output filename, required
   -c	sequence identity threshold, default 0.9
 	this is the default cd-hit's "global sequence identity" calculated as:
 	number of identical amino acids or bases in alignment
 	divided by the full length of the shorter sequence
   -G	use global sequence identity, default

In [None]:
data/rna-seq/assemblies/piper09/transcriptome.pep
data/rna-seq/assemblies/piper09/transcriptome.pep