# Convert non-CoV FASTA to standard FASTA file

## Setup

In [None]:
try:
    from google.colab import drive
    ON_COLAB = True
    print('The notebook is running on colab')
    !pip install ecutilities
except:
    ON_COLAB = False
    print('The notebook is running locally')

The notebook is running locally


In [None]:
from ecutilities.ipython import nb_setup

In [None]:
# Run the notebook setup helper imported from ecutilities
# (the cell output reports that autoreload mode is set).
nb_setup()

Set autoreload mode


In [None]:
from metagentools.art import ArtIllumina
from metagentools.core import TextFileBaseIterator
from metagentools.cnn_virus.data import FastaFileIterator, FastaFileReader
from pathlib import Path

# Define Paths

In [None]:
# Root directory of the non-CoV data, given relative to the working directory
# and resolved to an absolute path.
p2nCoV = Path('../../../data/ncov_data').resolve()
assert p2nCoV.is_dir(), f"Data directory not found: {p2nCoV}"

# Directory holding the DNA files of the species processed in this notebook.
species = 'rhinolophus_ferrumequinum'
p2dna = p2nCoV / f"{species}/dna"
assert p2dna.is_dir(), f"DNA directory not found: {p2dna}"

# Process fasta.gz and fasta

In [None]:
# Raw FASTA file to convert; fail early with the offending path if it is missing.
p2fa = p2dna/'Rhinolophus_ferrumequinum.mRhiFer1_v1.p.dna_rm.primary_assembly.1.fa'
assert p2fa.is_file(), f"FASTA file not found: {p2fa}"

In [None]:
# Open the raw FASTA file with the FASTA reader and print its first chunks,
# to check how the reader splits the file into sequences.
fa = FastaFileReader(p2fa)
fa.it.print_first_chuncks()


Sequence 1:
>1 dna_rm:primary_assembly primary_assembly:mRhiFer1_v1.p:1:1:124933378:1 REF

CTGAGGGATTTNNNNNNNNNNNNNNNNNNNNNNNNTCTATGATATATTTCTAGGGTTAGG ...

Sequence 2:
CTTAGAAGGAGAAAGGGAGGCACTATCTCAAAGAAAGCAGATGAGGACACAGATGCAGGG

AGAAAAGTAAACGCTGTAAATGATTCTGGGGTTTGTGTATGTGTAGAGGAAAACATCCAT ...

Sequence 3:
CCATCCATTTCTTCTTCGTCGTCCATGGCAATCAAGTTCTGTGACTACTTTCTGCTAAAT

GTCCCCGATGTACATCTCTTTCTCTACCTTTCTGCTACCATCCCCCTAGGCAAGAACATC ...

Sequence 4:
AGCACTTCACAGCTAGGTGACTGTAACAGTCTCCTTGCTGTTTCGACTAAGAACTATTCT

TCCTAGTCAAACTAAATACCCTACGAGACTAACTGTAATTAAACCTTGTCATCCCCACAA ...


This FASTA file has a problem: it contains a single sequence, but that sequence is split over many lines by `\n` characters. The file needs preprocessing to remove every `\n` inside the sequence.

## Clean up line breaks (LF) in the sequence

In [None]:
# Iterate over the same file as plain text and print the first chunks, to see
# the raw line layout (the output shows one line per chunk).
it = TextFileBaseIterator(p2fa)
it.print_first_chuncks()

1-line chunk 1
>1 dna_rm:primary_assembly primary_assembly:mRhiFer1_v1.p:1:1:124933378:1 REF

1-line chunk 2
CTGAGGGATTTNNNNNNNNNNNNNNNNNNNNNNNNTCTATGATATATTTCTAGGGTTAGG

1-line chunk 3
CTTAGAAGGAGAAAGGGAGGCACTATCTCAAAGAAAGCAGATGAGGACACAGATGCAGGG



In [None]:
# Write a cleaned copy of the FASTA file in which each record's sequence sits on a
# single line: header lines are copied as-is, line breaks inside sequences are removed.
# Assumes TextFileBaseIterator yields one line of text per iteration, as the preview
# output suggests -- TODO confirm.
p2cleanfa = p2dna / f"{p2fa.stem}.clean.fa"
print(p2cleanfa)

# Mode 'w' truncates any previous output, so re-running the cell starts from scratch
# without having to delete the file first.
with open(p2cleanfa, 'w') as fp:
    in_sequence = False  # True once sequence text has been written for the current record
    for line in TextFileBaseIterator(p2fa):
        if line.startswith('>'):
            # Terminate the previous record's sequence line before a new header;
            # otherwise the header would be glued onto the end of that sequence.
            if in_sequence:
                fp.write('\n')
                in_sequence = False
            fp.write(line)
        else:
            # Drop the line terminator (LF or CRLF) so that consecutive sequence
            # lines are concatenated into one.
            residues = line.rstrip('\r\n')
            if residues:
                fp.write(residues)
                in_sequence = True

/home/vtec/projects/bio/metagentools/data/ncov_data/rhinolophus_ferrumequinum/dna/Rhinolophus_ferrumequinum.mRhiFer1_v1.p.dna_rm.primary_assembly.1.clean.fa


`>1 dna:primary_assembly primary_assembly:mRhiFer1_v1.p:1:1:124933378:1 REF`

In [None]:
# Regex for a FASTA header line of the form
#   >1 dna:primary_assembly primary_assembly:mRhiFer1_v1.p:1:1:124933378:1 REF
# Named groups, in order: seq_type (the literal `dna`), id_type (must appear twice,
# enforced by the backreference), assy and seq_level (the parts before and after the
# dot), then two unnamed numeric fields followed by taxonomy and id.
# NOTE(review): the header printed earlier for this file starts with `dna_rm:`, which
# the literal `dna` followed by `:` does not match -- confirm which variant is intended.
# NOTE(review): `^>\d` accepts exactly one digit before the first whitespace, so
# multi-digit sequence names would not match -- confirm.
pattern = r'^>\d[\s\t](?P<seq_type>dna):(?P<id_type>[\w\_]*)[\s\w](?P=id_type):(?P<assy>[\w\d\_]*)\.(?P<seq_level>[\w]*):\d*:\d*:(?P<taxonomy>\d*):(?P<id>\d*)[\s\t]REF$'

In [None]:
# TEMP - TODO - Replaced by function from ecutil package

def show_subdirectory_tree(path, pattern=''):
    """Print the tree of what is in path: subdiretories and their content"""
    parents = [p.name for p in path.parents]
    paths = []
    pad = ' ' * 2
    idx = 0
    print(f"{parents[0]}")
    print(f"{pad}|--{path.name}")
    for f in [p for p in path.glob(f"*{pattern}*") if p.is_file()]:
        paths.append(f)
        print(f"{pad}|{pad*2}|--{f.name} ({idx})")
        idx += 1
    for d in [p for p in path.glob(f"*{pattern}*") if p.is_dir()]:
        for f in d.iterdir():
            paths.append(f)
            print(f"{pad}|{pad*2}|--{f.name} ({idx})")
            idx += 1
    return paths
    
# List the entries of the DNA directory whose name contains 'clean' and keep the
# returned paths, indexed as shown in the printed tree.
files = show_subdirectory_tree(p2dna, pattern='clean')

rhinolophus_ferrumequinum
  |--dna
  |    |--Rhinolophus_ferrumequinum.mRhiFer1_v1.p.dna_rm.primary_assembly.1.clean.fa (0)
