# core

> This module is a placeholder and is a WIP that needs to be completed

In [None]:
#| default_exp core

In [None]:
#| hide
from ecutilities.ipython import nb_setup
from nbdev import nbdev_export, show_doc
from pprint import pprint

In [None]:
#| hide
# Prepare the notebook session; the cell output below reports that autoreload mode is set
nb_setup()

Set autoreload mode


In [None]:
#| export
import json
import re
from ecutilities.core import validate_path, validate_type
from pathlib import Path
from typing import Any, Optional

# Base iterators

In [None]:
#| export
class TextFileBaseIterator:
    """Iterator going through a text file by chunks of `nb_lines` lines"""
    def __init__(
        self, 
        path:str|Path,  # path to the file to read 
        nb_lines:int=1, # number of text lines in each text chunk 
    ):
        validate_path(path, raise_error=True)
        self.fp = open(path, 'r')
        self.nb_lines = nb_lines

    def __iter__(self):
        return self

    def _safe_readline(self):
        """Read a new line and handle end of file tasks"""
        line = self.fp.readline()
        if line == '':
            self.fp.close()
            raise StopIteration()
        return line

    def __next__(self):
        """Return one chunk at the time"""
        lines = []
        for i in range(self.nb_lines):
            lines.append(self._safe_readline())
        return ''.join(lines)
    
    def print_first_chuncks(self, nb_chunks=3):
        """Print the first few chuncks of text from the file"""
        for i, chunk in enumerate(self.__iter__()):
            if i > nb_chunks-1: break
            print(f"{self.nb_lines}-line chunk {i+1}")
            print(chunk)
        

In [None]:
# Render the signature, docstring and parameter table of the class (shown below)
show_doc(TextFileBaseIterator)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/core.py#L14){target="_blank" style="float:right; font-size:smaller"}

### TextFileBaseIterator

>      TextFileBaseIterator (path:str|pathlib.Path, nb_lines:int=1)

Iterator going through a text file by chunks of `nb_lines` lines

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| path | str \| pathlib.Path |  | path to the file to read |
| nb_lines | int | 1 | number of text lines in each text chunk |

In [None]:
# Sample text file from the development data
p2textfile = Path('data_dev/train_short')

# Group the file into 3-line chunks and preview the first three of them
it = TextFileBaseIterator(p2textfile, nb_lines=3)
it.print_first_chuncks(nb_chunks=3)

3-line chunk 1
TCAAAATAATCAGAAATGTTGAACCTAGGGTTGGACACATAATGACCAGC	76	0
ATTGTTTAACAATTTGTGCTCGTCCCGGTCACCCGCATCCAATCTTGATG	4	9
AATCTTGTCCTATCCTACCCGCAGGGGAATTGATGATAGANGTGCTTTTA	181	0

3-line chunk 2
GGAGCGGAGCCAACCCCTATGCTCACTTGCAACCCAAGGGGCGTTCCAGT	74	3
TGGATCCTGCGCGGGACGTCCTTTGTCTACGTCCCGTCGGCGCATCCCGC	60	3
GAGAGACTTACTAAAAAGCTGGCACTTACCATCAGTGTTTCACCTACATG	44	0

3-line chunk 3
ACACACGACACTAGAGATAATGTGTCAGTGGATTATAAACAAACCAAGTT	43	7
TTGTAGCATAAGAACTGGTCTTCGCTGAAATTCTTGTCTTGATCTCATCT	35	2
TGGCCCTGCGGTCTGGGGCCCAGAAGCATATGTCAAGTCCTTTGAGAAGT	73	4

3-line chunk 4
TAGATTTAGTGGTTAGGTAGTAAGGCTACAATGTAAACACGTAGTGGCAA	11	6
AACCCCTGGGGCTATAAAAGGCGCGGTCTGTGCACGGGGACTTCGGTNGG	7	7
AGAATGGATAGTAAGGCAGACAGTAATAGGGGAGGCAATGAAGGAAACCA	9	2



This is a base class.

It is easy to override the `__next__` method to customize how the iterator parses files.

For instance, the following class takes a fasta sequence file, extracts the definition line and the sequence, and returns them as a dictionary (to keep the output clean, only the first 25 bases of the sequence are shown below):
```
    {
    'definition line': '>2591237:ncbi:1 [MK211378]\t2591237\tncbi\t1 [MK211378] '
                    '2591237\tCoronavirus BtRs-BetaCoV/YN2018D\t\tscientific '
                    'name\n',
    'sequence': 'TATTAGGTTTTCTACCTACCCAGGA'
    }
```

In [None]:
#| export
class FastaFileIterator(TextFileBaseIterator):
    """Iterator going through a fasta file, one sequence at a time

    Each item is built from two consecutive lines of the file: the first one is taken as
    the definition line and the second one as the sequence. `nb_lines` is not used here.
    NOTE(review): this assumes each sequence is stored on a single line - confirm for the
    files used with this class.
    """
    def __next__(self):
        """Return one definition line and the corresponding sequence

        The result is a dictionary with the keys `'definition line'` and `'sequence'`. Both
        values are the raw lines as read from the file.
        """
        definition_line = self._safe_readline()
        sequence = self._safe_readline()
        return {'definition line': definition_line, 'sequence': sequence}

    def print_first_chuncks(self, nb_chunks=3):
        """Print the next `nb_chunks` sequences from the file (fewer if the file ends)

        Only the first 80 characters of each sequence are printed.
        """
        for i in range(nb_chunks):
            # Fetch sequences one by one so that exactly `nb_chunks` are read and printed
            try:
                seq_dict = next(self)
            except StopIteration:
                break
            print(f"\nSequence {i+1}:")
            print(seq_dict['definition line'])
            print(f"{seq_dict['sequence'][:80]} ...")

In [None]:
# Sample fasta file from the development data
p2fasta = Path('data_dev/cov_virus_sequences_two.fa')

# Preview up to three sequences from the file
it = FastaFileIterator(path=p2fasta)
it.print_first_chuncks(nb_chunks=3)


Sequence 1:
>2591237:ncbi:1 [MK211378]	2591237	ncbi	1 [MK211378] 2591237	Coronavirus BtRs-BetaCoV/YN2018D		scientific name

TATTAGGTTTTCTACCTACCCAGGAAAAGCCAACCAACCTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAAT ...

Sequence 2:
>11128:ncbi:2 [LC494191]

CATCCCGCTTCACTGATCTCTTGTTAGATCTTTTCATAATCTAAACTTTATAAAAACATCCACTCCCTGTAGTCTATGCC ...


In [None]:
#| hide
# Write the exported cells of the notebook to the library module
nbdev_export()