# core

> Base classes, functions and other objects used across the package.

In [105]:
#| default_exp core

In [106]:
#| hide
from ecutilities.ipython import nb_setup
from ecutilities.core import path_to_parent_dir
from fastcore.test import test_fail
from nbdev import nbdev_export, show_doc

In [107]:
#| hide
nb_setup()
NBS_ROOT = path_to_parent_dir('nbs')
assert NBS_ROOT.is_dir()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Set autoreload mode


In [108]:
#| export
import configparser
import json
import os
import re
import sys
import warnings
from ecutilities.core import is_type, validate_path, safe_path
from IPython.display import display, Markdown, HTML
from pathlib import Path
from pprint import pprint
from typing import Any, Optional

try: from google.colab import drive
except: pass

In [109]:
#| export
# Retrieve the package root
# NOTE: this import rebinds the name `__file__` in the current namespace to the
# `__file__` attribute of the `metagentools` package, so both constants below are
# derived from the location of the package file, whatever module runs this code.
from metagentools import __file__
# Directory that contains the package file (first parent of `__file__`)
CODE_ROOT = Path(__file__).parents[0]
# Directory one level above `CODE_ROOT` (second parent of `__file__`)
PACKAGE_ROOT = Path(__file__).parents[1]

This module includes all base classes, functions and other objects that are used across the package. It is imported by all other modules in the package.

`core` includes utility classes and functions to make it easier to work with the complex file systems adopted for the project, as well as base classes such as a file reader with additional functionality.

# Utility Classes and Functions

## Handling files and file structure

Utility classes to represent the project file system and to locate its key directories.

In [131]:
#| export
class ProjectFileSystem:
    """Represent a project file system, return paths to key directories, provide methods to manage the file system.

    - Paths to key directories are based on whether the code is running locally or in the cloud.
    - First time it is used on a local computer, it must be registered as local and a project root path must be set.
    - A user configuration file is created in the user's home directory to store the project root path and whether the machine is local or not.

    >Technical note: `ProjectFileSystem` is a simpleton class
    """

    _instance = None
    _config_dir = '.metagentools'
    _config_fname = 'metagentools.cfg'
    _shared_project_dir = 'Metagenomics'
    
    def __new__(cls, *args, **kwargs):
        # Create instance if it does not exist yet
        if cls._instance is None:
            cls.home = Path.home().resolve()
            cls.p2config = cls.home / cls._config_dir / cls._config_fname
            cls._instance = super().__new__(cls)
        return cls._instance
    
    def __init__(
        self, 
        mount_gdrive:bool=True,  # True to mount Google Drive if running on Colab
        project_file:Path=None  # Path to the project file. If None, use the one saved in the config file
        ):
            self.is_colab = 'google.colab' in sys.modules       
            if self.is_colab and mount_gdrive:
                drive.mount('/content/gdrive')
                self.gdrive = Path('/content/gdrive/MyDrive')

            self.is_kaggle = 'kaggle_web_client' in sys.modules
            if self.is_kaggle:
                raise NotImplemented(f"ProjectFileSystem is not implemented for Kaggle yet")

            if not self.is_colab and not self.is_kaggle and not self.is_local:
                msg = """
                      Code does not seem to run on the cloud but computer is not registered as local
                      If you are running on a local computer, you must register it as local by running
                        `ProjectFileSystem().register_as_local()`
                      before you can use the ProjectFileSystem class.
                      """
                warnings.warn(msg, UserWarning)

            self._project_root = Path()
            if self.is_local:
                cfg = self.read_config()
                path_str = cfg.get('Infra', 'project_root', fallback=None)
                if path_str is None: 
                    msg = """
                    Project root is not yet set in config file.
                    To set it, use `ProjectFileSystem().set_project_root()`.
                    """
                    warnings.warn(msg)
                else:
                    self._project_root = Path(path_str)

    def __call__(self): return self.is_local

    def info(self):
        """Print basic info on the file system and the device"""
        print(f"Running {self.os} on {self.running_on}")
        print(f"Device's home directory: {self.home}")
        print(f"Project file structure:")
        print(f" - Root ........ {self.project_root} \n - Data Dir .... {self.data} \n - Notebooks ... {self.nbs}")
    
    def read_config(self):
        """Read config from the configuration file if it exists and return an empty config if does not"""
        cfg = configparser.ConfigParser()
        if self.p2config.is_file(): 
            cfg.read(self.p2config)
        else:
            cfg.add_section('Infra')
        return cfg

    def register_as_local(self):
        """Update the configuration file to register the machine as local machine"""
        cfg = self.read_config()
        os.makedirs(self.home/self._config_dir, exist_ok=True)
        cfg['Infra']['registered_as_local'] = 'True'
        with open(self.p2config, 'w') as fp:
            cfg.write(fp)
        return cfg

    def set_project_root(
        self, 
        p2project: str|Path   # string or Path to the project directory. Can be absolute or relative to home
        ):
        """Update the configuration file to set the project root"""
        # Build and validate the path to the project root
        if isinstance(p2project, str): 
            p2project = Path(p2project)
            if not p2project.is_absolute():
                p2project = self.home / p2project
        if not p2project.is_dir(): raise FileNotFoundError(f"{p2project.absolute()} does not exist")
        
        # Update the configuration file        
        cfg = self.read_config()
        os.makedirs(self.home/self._config_dir, exist_ok=True)
        cfg['Infra']['project_root'] = str(p2project.absolute())
        with open(self.p2config, 'w') as fp:
            cfg.write(fp)
        self._project_root = p2project
        print(f"Project Root set to {p2project.absolute()}")
        return cfg

    @property
    def os(self): return sys.platform

    @property
    def project_root(self):
        if self.is_local:
            return self._project_root
        elif self.is_colab:
            return self.gdrive / self._shared_project_dir
        elif self.is_kaggle:
            raise NotImplemented(f"ProjectFileSystem is not implemented for Kaggle yet")
        else:
            raise ValueError('Not running locally, on Colab or on Kaggle')

    @property
    def data(self): return self.project_root / 'data'

    @property
    def nbs(self): return self.project_root / 'nbs'        

    @property
    def p2config(self): return self.home / self._config_dir / self._config_fname
        
    @property
    def is_local(self):
        """Return `True` if the current machine was registered as a local machine"""
        cfg = self.read_config()
        return cfg['Infra'].getboolean('registered_as_local', False)

    @property
    def running_on(self):
        """Return the device on which this is run: local, colab, kaggle, ..."""
        if self.is_local: device = 'local computer'
        elif self.is_colab: device = 'colab'
        elif self.is_kaggle: device = 'kaggle'
        else: device = 'unknown cloud server'
        return device
    
    def readme(
        self, 
        dir_path:Path =None, # Path to the directory to inquire. If None, display readme file from project_root.
        ):
        """Display `readme.md` file or any other `.md` file in `dir_path`. 

        This provides a convenient way to get information on each direcotry content
        """
        if dir_path is None: 
            path = self.data
        elif validate_path(dir_path, path_type='dir'):
            path = dir_path
        else:
            raise ValueError(f"'dir_path' is not a directory: {dir_path.absolute()}")

        
        if path.is_relative_to(self.project_root):
            path_to_display = path.relative_to(self.project_root)
        else:
            path_to_display = path.absolute()
        display(HTML('<hr>'))
        # display(Markdown(f"ReadMe file for directory `{path.relative_to(self.project_root)}`:"))
        display(Markdown(f"ReadMe file for directory `{path_to_display}`:"))
        mdfiles = {p.stem: p for p in path.glob('*.md')}
        if mdfiles:
            mdfile = mdfiles.get('readme', None)
            if mdfile is None:
                mdfile = mdfiles.get(list(mdfiles.keys())[0])
            display(HTML('<hr>'))
            display(Markdown(filename=mdfile))
            display(HTML('<hr>'))
        else:
            print('No markdown file in this folder')
        

In [111]:
show_doc(ProjectFileSystem)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/core.py#L55){target="_blank" style="float:right; font-size:smaller"}

### ProjectFileSystem

>      ProjectFileSystem (*args, **kwargs)

Represent a project file system, return paths to key directories, provide methods to manage the file system.

- Paths to key directories are based on whether the code is running locally or in the cloud.
- First time it is used on a local computer, it must be registered as local and a project root path must be set.
- A user configuration file is created in the user's home directory to store the project root path and whether the machine is local or not.

Technical note: this is a singleton class

**Reference Project File System**:

This project adopts a unified file structure to make coding and collaboration easier. In addition, we can run the code locally (from a `project-root` directory) or in the cloud (colab, kaggle, others).

The unified file structure when running locally is:
```text
    project-root   
        |--- data
        |      |--- CNN_Virus_data  (all data from CNN Virus original paper)
        |      |--- saved           (trained and finetuned models, saved preprocessed datasets)
        |      |--- ....            (raw or pre-processed data from various sources, results, ... )  
        |      
        |--- nbs  (all reference and work notebooks)
        |      |--- cnn_virus
        |      |        |--- notebooks.ipynb
```

When running on *google colab*, it is assumed that a google drive is mounted on the colab server instance, and that this google drive root includes a shortcut named `Metagenomics` pointing to the project shared directory. The project shared directory is accessible [here](https://drive.google.com/drive/folders/134uei5fmt08TpzhmjG4sW0FQ06kn2ZfZ) if you are an authorized project member.

**`ProjectFileSystem` at work**:

If you use this class for the first time on a local computer, read the two **Important Notes** below.

In [112]:
pfs = ProjectFileSystem()

In [113]:
#| hide
if not pfs.is_local and not pfs.is_kaggle: 
    pfs.register_as_local()

Once created, the instance of `ProjectFileSystem` gives access to key directories' paths:

- `project root`: `Path` to the project root directory
- `data`: `Path` to the data directory
- `nbs`: `Path` to the notebooks directory

It also provides additional information regarding the computer on which the code is running:

- `os`: a string providing the name of the operating system the code is running on
- `is_colab`: True if the code is running on google colab
- `is_kaggle`: True if the code is running on kaggle server (NOT IMPLEMENTED YET)
- `is_local`: True if the code is running on a computer registered as local

In [116]:
for p in [pfs.project_root, pfs.data, pfs.nbs]:
    print(p)

/home/vtec/projects/bio/metagentools
/home/vtec/projects/bio/metagentools/data
/home/vtec/projects/bio/metagentools/nbs


In [118]:
print(f"OSL {pfs.os}")
print(f"Local: {pfs.is_local}, Colab: {pfs.is_colab}, Kaggle: {pfs.is_kaggle}")

OSL linux
Local: True, Colab: False, Kaggle: False


In [119]:
show_doc(ProjectFileSystem.info)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/core.py#L109){target="_blank" style="float:right; font-size:smaller"}

### ProjectFileSystem.info

>      ProjectFileSystem.info ()

Print basic info on the file system and the device

In [58]:
pfs.info()

Running linux on local computer
Device's home directory: /home/vtec
Project file structure:
 - Root ........ /home/vtec/projects/bio/metagentools 
 - Data Dir .... /home/vtec/projects/bio/metagentools/data 
 - Notebooks ... /home/vtec/projects/bio/metagentools/nbs


In [120]:
show_doc(ProjectFileSystem.readme)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/core.py#L194){target="_blank" style="float:right; font-size:smaller"}

### ProjectFileSystem.readme

>      ProjectFileSystem.readme (dir_path=None)

Display `readme.md` file or any other `.md` file in `dir_path`, to get information on the directory content

In [134]:
pfs.readme(Path('data_dev'))

ReadMe file for directory `/home/vtec/projects/bio/metagentools/nbs-dev/data_dev`:

### Data directory for `metagentools` development 
This directory includes all the data required to test and validate `metagentools` code.

```text
data_dev
 |--- paired_1seq_150bp
 |--- single_1seq_150bp
 |--- 50mer_ds_100_seq
 |--- 150mer_ds_100_seq
 |--- another_sequence.fa
 |--- ....           
 |--- jsondict-test.json
 |--- readme.md           
 |--- .... 
 |--- train_short     
 |--- val_short
 |--- weight_of_classes
     
```

>**Important Note 1**:
>
>When using the package on a local computer for the **first time**, you must register the computer as a local computer. Otherwise, `ProjectFileSystem` will issue a warning and will not be able to return the project paths. Once registered, the configuration file will be updated and `ProjectFileSystem` will detect that and run without warning.

In [59]:
show_doc(ProjectFileSystem.register_as_local)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/core.py#L125){target="_blank" style="float:right; font-size:smaller"}

### ProjectFileSystem.register_as_local

>      ProjectFileSystem.register_as_local ()

Update the configuration file to register the machine as local machine

In [60]:
cfg = pfs.register_as_local()

>**Important Note 2**:
>
>When using the package on a local computer for the **first time**, it is also required to *set the project root directory*. This is necessary to allow users to locate their local project folder anywhere. Once set, the path to the project root will be saved in the configuration file.

In [61]:
show_doc(ProjectFileSystem.set_project_root)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/core.py#L134){target="_blank" style="float:right; font-size:smaller"}

### ProjectFileSystem.set_project_root

>      ProjectFileSystem.set_project_root (p2project:str|pathlib.Path)

Update the configuration file to set the project root

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| p2project | str \| pathlib.Path | string or Path to the project directory. Can be absolute or relative to home |

In [62]:
pfs.set_project_root('/home/vtec/projects/bio/metagentools/');

Project Root set to /home/vtec/projects/bio/metagentools


In [63]:
show_doc(ProjectFileSystem.read_config)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/core.py#L116){target="_blank" style="float:right; font-size:smaller"}

### ProjectFileSystem.read_config

>      ProjectFileSystem.read_config ()

Read config from the configuration file if it exists and return an empty config if does not

In [64]:
cfg = pfs.read_config()
cfg['Infra']['registered_as_local']

'True'

In [65]:
cfg['Infra']['project_root']

'/home/vtec/projects/bio/metagentools'

## Other utility classes

In [73]:
# | export
class JsonDict(dict):
    """Dictionary whose current value is mirrored in a json file and can be initated from a json file
    
    `JsonDict` requires a path to json file at creation. An optional dict can be passed as argument.

    Behavior at creation:
    
    - `JsonDict(p2json, dict)` will create a `JsonDict` with key-values from `dict`, and mirrored in `p2json`
    - `JsonDict(p2json)` will create a `JsonDict` with empty dictionary and load json content if file exists

    Once created, `JsonDict` instances behave exactly as a dictionary
    """
    def __init__(
        self, 
        p2json: str|Path,       # path to the json file to mirror with the dictionary 
        dictionary: dict =None  # optional dictionary to initialize the JsonDict
        ):
        """Create dict from a passed dict or from json. Create the json file if required"""
        self.p2json = Path(p2json) if isinstance(p2json, str) else p2json
        if dictionary is None:
            if self.p2json.is_file():
                dictionary = self.load()
                self.initial_dict_from_json = True
            else:
                dictionary = dict()
                self.initial_dict_from_json = False
        super().__init__(dictionary)
        self.save()
    
    def __setitem__(self, __k:Any, v:Any) -> None:
        super().__setitem__(__k, v)
        self.save()

    def __delitem__(self, k:Any):
        super().__delitem__(k)
        self.save()

    def __repr__(self):
        txt1 = super().__repr__()
        txt2 = f"\ndict mirrored in {self.p2json.absolute()}"
        return txt1 + txt2

    def load(self):
        with open(self.p2json, 'r') as fp:
            return json.load(fp)

    def save(self):
        with open(self.p2json, 'w') as fp:
            json.dump(self, fp, indent=4)



Create a new dictionary mirrored to a JSON file:

In [96]:
d = {'a': 1, 'b': 2, 'c': 3}
p2json = Path('data_dev/jsondict-test.json')
jsond = JsonDict(p2json, d)
jsond

{'a': 1, 'b': 2, 'c': 3}
dict mirrored in /home/vtec/projects/bio/metagentools/nbs-dev/data_dev/jsondict-test.json

Once created, the `JsonDict` instance behaves exactly like a dictionary, with the added benefit that any change to the dictionary is automatically saved to the JSON file.

In [97]:
jsond['a'], jsond['b'], jsond['c']

(1, 2, 3)

In [98]:
for k, v in jsond.items():
    print(f"key: {k}; value: {v}")

key: a; value: 1
key: b; value: 2
key: c; value: 3


Adding or removing a value from the dictionary works in the same way as for a normal dictionary. But the json file is automatically updated.

In [99]:
jsond['d'] = 4
jsond

{'a': 1, 'b': 2, 'c': 3, 'd': 4}
dict mirrored in /home/vtec/projects/bio/metagentools/nbs-dev/data_dev/jsondict-test.json

In [100]:
with open(p2json, 'r') as fp:
    print(fp.read())

{
    "a": 1,
    "b": 2,
    "c": 3,
    "d": 4
}


In [101]:
del jsond['a']
jsond

{'b': 2, 'c': 3, 'd': 4}
dict mirrored in /home/vtec/projects/bio/metagentools/nbs-dev/data_dev/jsondict-test.json

In [102]:
with open(p2json, 'r') as fp:
    print(fp.read())

{
    "b": 2,
    "c": 3,
    "d": 4
}


In [None]:
#| export
class JsonFileReader:
    """Mirror a JSON file and a dictionary"""
    def __init__(self, 
                 path:str|Path # path to the json file
                ):
        self.path = safe_path(path)
        with open(path, 'r') as fp:
            self.d = json.load(fp)
    
    def add_item(self, 
                 key:str,  # key for the new item
                 item:dict # new item to add to the json as a dict
                ):
        self.d[key] = item
        return self.d

    def save_to_file(self, path=None):
        if path is None: 
            path = self.path
        else:
            path = safe_path(path)

        with open(path, 'w') as fp:
            json.dump(self.d, fp, indent=4)

In [None]:
show_doc(JsonFileReader)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/core.py#L29){target="_blank" style="float:right; font-size:smaller"}

### JsonFileReader

>      JsonFileReader (path:str|pathlib.Path)

Mirror a JSON file and a dictionary

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| path | str \| pathlib.Path | path to the json file |

In [None]:
jd = JsonFileReader('data_dev/test.json')
pprint(jd.d)

{'item 1': {'keys': 'key key key key', 'pattern': 'pattern 1'},
 'item 2': {'keys': 'key key key key', 'pattern': 'pattern 2'},
 'item 3': {'keys': 'key key key key', 'pattern': 'pattern 3'}}


Now we can add an item to the dictionary/json 

In [None]:
new_item = {'keys': 'key key key key', 'pattern': 'another pattern'}
jd.add_item(key='another item', item=new_item)

{'item 1': {'keys': 'key key key key', 'pattern': 'pattern 1'},
 'item 2': {'keys': 'key key key key', 'pattern': 'pattern 2'},
 'item 3': {'keys': 'key key key key', 'pattern': 'pattern 3'},
 'another item': {'keys': 'key key key key', 'pattern': 'another pattern'}}

After saving the updated JSON file, we can load it again and see the changes.

In [None]:
jd.save_to_file()

In [None]:
jd = JsonFileReader('data_dev/test.json')
pprint(jd.d)

{'another item': {'keys': 'key key key key', 'pattern': 'another pattern'},
 'item 1': {'keys': 'key key key key', 'pattern': 'pattern 1'},
 'item 2': {'keys': 'key key key key', 'pattern': 'pattern 2'},
 'item 3': {'keys': 'key key key key', 'pattern': 'pattern 3'}}


In [None]:
# | hide
initial_json = {
    'item 1': {'keys': 'key key key key', 'pattern': 'pattern 1'},
    'item 2': {'keys': 'key key key key', 'pattern': 'pattern 2'},
    'item 3': {'keys': 'key key key key', 'pattern': 'pattern 3'}
    }
with open('data_dev/test.json', 'w') as fp:
    json.dump(initial_json, fp, indent=4)

# Base Classes

## File Readers

Base classes to be extended in order to create readers for specific file formats.

In [28]:
#| export
class TextFileBaseReader:
    """Iterator going through a text file by chunks of `nlines` lines. Iterator can be reset to file start.
    
    The class is mainly intented to be extended, as it is for handling sequence files of various formats such as `FastaFileReader`.
    """

    def __init__(
        self,
        path: str|Path,  # path to the file
        nlines: int=1,   # number of lines on one chunk
    ):
        self.path = safe_path(path)
        self.nlines = nlines
        self.fp = None
        self.reset_iterator()
        
        # Attributes related to metadata parsing
        # Currently assumes the iterator generates a dictionary with key/values
        # TODO: extend to iterator output as simple string.
        self.text_to_parse_key = None
        project_root = ProjectFileSystem().project_root
        self.parsing_rules_json = project_root / 'default_parsing_rules.json'
        # self.parsing_rules_json = Path(f"{PACKAGE_ROOT}/default_parsing_rules.json")
        self.re_rule_name = None
        self.re_pattern = None       # regex pattern to use to parse text
        self.re_keys = None          # keys (re group names) to parse text

    def reset_iterator(self):
        """Reset the iterator to point to the first line in the file."""
        if self.fp is not None:
            self.fp.close()
        self.fp = open(self.path, 'r')
        
    def __iter__(self):
        return self

    def _safe_readline(self):
        """Read a new line and handle end of file tasks."""
        line = self.fp.readline()
        if line == '':
            self.fp.close()
            raise StopIteration()
        return line

    def __next__(self):
        """Return one chunk of `nlines` text lines at the time"""
        lines = []
        for i in range(self.nlines):
            lines.append(self._safe_readline())
        return ''.join(lines)
    
    def print_first_chunks(
        self, 
        nchunks:int=3,  # number of chunks to print
    ):
        """Print the first `nchunk` chunks of text from the file.

        After printing, the iterator is reset again to its first line.
        """
        self.reset_iterator()
        for i, chunk in enumerate(self.__iter__()):
            if i > nchunks-1: break
            print(f"{self.nlines}-line chunk {i+1}")
            print(chunk)
        self.reset_iterator()
            
    def _parse_text_fn(
        self,
        txt:str,         # text to parse
        pattern:str,     # regex pattern to apply to parse the text
        keys:list[str],  # list of keys: keys are both the regex match group names and the corresponding output dict keys
    )-> dict:        # parsed metadata in key/value format
        """Basic parser function parsing metadata from string, using regex pattern. Return a metadata dictionary."""
        
        matches = re.match(pattern, txt)
        metadata = {}
        if matches is not None:
            for g in sorted(keys):
                m = matches.group(g)
                metadata[g] = m.replace('\t', ' ').strip() if m is not None else None
        
        else: 
            # TODO: review hack below, to avoid error when missing metadata such as 'species name'.
            # Current code tries to recover by saving the entire line in the fist key, expected to be the seqid or refseid
            # if txt:
            #     metadata[keys[0]] = txt.replace('\t', ' ').strip() if txt is not None else None
            # else:
            raise ValueError(f"No match on this line")
        return metadata

    def parse_text(
        self,
        txt:str,                    # text to parse
        pattern:str=None,           # If None, uses standard regex pattern to extract metadata, otherwise, uses passed regex
        keys:list[str]=None,        # If None, uses standard regex list of keys, otherwise, uses passed list of keys (str)
    )-> dict:                       # parsed metadata in key/value format
        """Parse text using regex pattern and keys. Return a metadata dictionary.
        
        The passed text is parsed using the regex pattern. The method return a dictionary in the format:

            {
                'key_1': 'metadata 1',
                'key_2': 'metadata 2',
                ...
            }
        
        """
        if pattern is None and keys is None:
            if self.re_pattern is not None and self.re_keys is not None:
                return self._parse_text_fn(txt, self.re_pattern, self.re_keys)
            else:
                raise ValueError('attribute re_pattern and re_keys are still None')
        elif pattern is None or keys is None:
            raise ValueError('pattern and keys must be either both None or both have a value')
        else:
            return self._parse_text_fn(txt, pattern, keys)
        
        
    def set_parsing_rules(
        self,
        pattern: str|bool=None,   # regex pattern to apply to parse the text, search in parsing rules json if None
        keys: list[str]=None,     # list of keys/group for regex, search in parsing rules json if None
        verbose: bool=False       # when True, provides information on each rule
    )-> None:
        """Set the standard regex parsing rule for the file.
        
        Rules can be set:
        
        1. manually by passing specific custom values for `pattern` and `keys`
        2. automatically, by testing all parsing rules saved in `parsing_rule.json` 
        
        Automatic selection of parsing rules works by testing each rule saved in `parsing_rule.json` on the first 
        definition line of the fasta file, and selecting the one rule that generates the most metadata matches.
        
        Rules consists of two parameters:
        
        - The regex pattern including one `group` for each metadata item, e.g `(?P<group_name>regex_code)`
        - The list of keys, i.e. the list with the name of each regex groups, used as key in the metadata dictionary
        
        This method updates the three following class attributes: `re_rule_name`, `re_pattern`, `re_keys`
      
        """
        # get the first definition line in the file to test the pattern
        # in base class, text_to_parse_key is not defined and automatic rule selection cannot be used
        # this must be handled in children classes
        if self.text_to_parse_key is None:
            msg = """
            `text_to_parse_key` is not defined in this class. 
            It is not possible to set a parsing rule.
            """
            warnings.warn(msg, category=UserWarning)
            return

        self.reset_iterator()
        first_output = next(self)
        text_to_parse = first_output[self.text_to_parse_key]
        divider_line = f"{'-'*80}"

        if pattern is not None and keys is not None:
            try:
                metadata_dict = self.parse_text(text_to_parse, pattern, keys)
                self.re_rule_name = 'Custom Rule'
                self.re_pattern = pattern
                self.re_keys = keys
                if verbose:
                    print(divider_line)
                    print(f"Custom rule was set for this instance.")
            except Exception as err: 
                raise ValueError(f"The pattern generates the following error:\n{err}")
                
        else:
            # Load all existing rules from json file
            with open(self.parsing_rules_json, 'r') as fp:
                parsing_rules = json.load(fp)
                
            # test all existing rules and keep the one with highest number of matches
            max_nbr_matches = 0
            for k, v in parsing_rules.items():
                re_pattern = v['pattern']
                re_keys = v['keys'].split(' ')
                try:
                    metadata_dict = self.parse_text(text_to_parse, re_pattern, re_keys)
                    nbr_matches = len(metadata_dict)
                    if verbose:
                        print(divider_line)
                        print(f"Rule <{k}> generated {nbr_matches:,d} matches")
                        print(divider_line)
                        print(re_pattern)
                        print(re_keys)

                    if len(metadata_dict) > max_nbr_matches:
                        self.re_pattern = re_pattern
                        self.re_keys = re_keys
                        self.re_rule_name = k    
                except Exception as err:
                    if verbose:
                        print(divider_line)
                        print(f"Rule <{k}> generated an error")
                        print(err)
                    else:
                        pass
            if self.re_rule_name is None:
                msg = """
        None of the saved parsing rules were able to extract metadata from the first line in this file.
        You must set a custom rule (pattern + keys) before parsing text, by using:
            `self.set_parsing_rules(custom_pattern, custom_list_of_keys)`
                """
                warnings.warn(msg, category=UserWarning)
            
            if verbose:
                print(divider_line)
                print(f"Selected rule with most matches: {self.re_rule_name}")

            # We used the iterator, now we need to reset it to make all lines available
            self.reset_iterator()


Once initialized, the iterator runs over each chunk of line(s) in the text file, sequentially.

In [29]:
p2textfile = Path('data_dev/train_short')
it = TextFileBaseReader(path=p2textfile, nlines=3)

one_iteration = next(it)

print(one_iteration)

TCAAAATAATCAGAAATGTTGAACCTAGGGTTGGACACATAATGACCAGC	76	0
ATTGTTTAACAATTTGTGCTCGTCCCGGTCACCCGCATCCAATCTTGATG	4	9
AATCTTGTCCTATCCTACCCGCAGGGGAATTGATGATAGANGTGCTTTTA	181	0



Let's create a new instance of the file reader, and get several iterations.

In [30]:
it = TextFileBaseReader(path=p2textfile, nlines=3)

one_iteration = next(it)
print(one_iteration)

TCAAAATAATCAGAAATGTTGAACCTAGGGTTGGACACATAATGACCAGC	76	0
ATTGTTTAACAATTTGTGCTCGTCCCGGTCACCCGCATCCAATCTTGATG	4	9
AATCTTGTCCTATCCTACCCGCAGGGGAATTGATGATAGANGTGCTTTTA	181	0



In [31]:
# Each further call to `next` moves on to the following 3-line chunk of the file
another_iteration = next(it)
print(another_iteration)
one_more_iteration = next(it)
print(one_more_iteration)

GGAGCGGAGCCAACCCCTATGCTCACTTGCAACCCAAGGGGCGTTCCAGT	74	3
TGGATCCTGCGCGGGACGTCCTTTGTCTACGTCCCGTCGGCGCATCCCGC	60	3
GAGAGACTTACTAAAAAGCTGGCACTTACCATCAGTGTTTCACCTACATG	44	0

ACACACGACACTAGAGATAATGTGTCAGTGGATTATAAACAAACCAAGTT	43	7
TTGTAGCATAAGAACTGGTCTTCGCTGAAATTCTTGTCTTGATCTCATCT	35	2
TGGCCCTGCGGTCTGGGGCCCAGAAGCATATGTCAAGTCCTTTGAGAAGT	73	4



If we want to access the start of the file again, we need to re-initialize the file handle.

In [32]:
# Render the documentation (signature and docstring) of `reset_iterator`
show_doc(TextFileBaseReader.reset_iterator)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/core.py#L245){target="_blank" style="float:right; font-size:smaller"}

### TextFileBaseReader.reset_iterator

>      TextFileBaseReader.reset_iterator ()

Reset the iterator to point to the first line in the file.

In [33]:
# After the reset, iteration starts over: the first two chunks of the file are returned again
it.reset_iterator()
one_iteration = next(it)
print(one_iteration)
another_iteration = next(it)
print(another_iteration)

TCAAAATAATCAGAAATGTTGAACCTAGGGTTGGACACATAATGACCAGC	76	0
ATTGTTTAACAATTTGTGCTCGTCCCGGTCACCCGCATCCAATCTTGATG	4	9
AATCTTGTCCTATCCTACCCGCAGGGGAATTGATGATAGANGTGCTTTTA	181	0

GGAGCGGAGCCAACCCCTATGCTCACTTGCAACCCAAGGGGCGTTCCAGT	74	3
TGGATCCTGCGCGGGACGTCCTTTGTCTACGTCCCGTCGGCGCATCCCGC	60	3
GAGAGACTTACTAAAAAGCTGGCACTTACCATCAGTGTTTCACCTACATG	44	0



In [34]:
# Render the documentation (signature, docstring, parameter table) of `print_first_chunks`
show_doc(TextFileBaseReader.print_first_chunks)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/core.py#L269){target="_blank" style="float:right; font-size:smaller"}

### TextFileBaseReader.print_first_chunks

>      TextFileBaseReader.print_first_chunks (nchunks:int=3)

Print the first `nchunk` chunks of text from the file.

After printing, the iterator is reset again to its first line.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| nchunks | int | 3 | number of chunks to print |

In [35]:
# New reader instance returning 3-line chunks
it = TextFileBaseReader(path=p2textfile, nlines=3)

# Print the first three chunks, each preceded by a "3-line chunk <i>" header (see output below)
it.print_first_chunks(nchunks=3)

3-line chunk 1
TCAAAATAATCAGAAATGTTGAACCTAGGGTTGGACACATAATGACCAGC	76	0
ATTGTTTAACAATTTGTGCTCGTCCCGGTCACCCGCATCCAATCTTGATG	4	9
AATCTTGTCCTATCCTACCCGCAGGGGAATTGATGATAGANGTGCTTTTA	181	0

3-line chunk 2
GGAGCGGAGCCAACCCCTATGCTCACTTGCAACCCAAGGGGCGTTCCAGT	74	3
TGGATCCTGCGCGGGACGTCCTTTGTCTACGTCCCGTCGGCGCATCCCGC	60	3
GAGAGACTTACTAAAAAGCTGGCACTTACCATCAGTGTTTCACCTACATG	44	0

3-line chunk 3
ACACACGACACTAGAGATAATGTGTCAGTGGATTATAAACAAACCAAGTT	43	7
TTGTAGCATAAGAACTGGTCTTCGCTGAAATTCTTGTCTTGATCTCATCT	35	2
TGGCCCTGCGGTCTGGGGCCCAGAAGCATATGTCAAGTCCTTTGAGAAGT	73	4



In [36]:
# Render the documentation (signature, docstring, parameter table) of `parse_text`
show_doc(TextFileBaseReader.parse_text)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/core.py#L308){target="_blank" style="float:right; font-size:smaller"}

### TextFileBaseReader.parse_text

>      TextFileBaseReader.parse_text (txt:str, pattern:str=None,
>                                     keys:list[str]=None)

Parse text using regex pattern and keys. Return a metadata dictionary.

The passed text is parsed using the regex pattern. The method return a dictionary in the format:

    {
        'key_1': 'metadata 1',
        'key_2': 'metadata 2',
        ...
    }

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| txt | str |  | text to parse |
| pattern | str | None | If None, uses standard regex pattern to extract metadata, otherwise, uses passed regex |
| keys | list | None | If None, uses standard regex list of keys, otherwise, uses passed list of keys (str) |
| **Returns** | **dict** |  | **parsed metadata in key/value format** |

In [37]:
# Sample text to parse
text = '>2591237:ncbi:1'
# Regex with one named group per metadata item: `id`, `source` and `nb`
pattern = r"^>(?P<id>\d+):(?P<source>ncbi):(?P<nb>\d*)"
# Keys of the resulting metadata dictionary; they match the names of the regex groups above
keys = "id source nb".split(' ')

# Returns the metadata as a dictionary {key: matched text} (see output below)
it.parse_text(text, pattern, keys)

{'id': '2591237', 'nb': '1', 'source': 'ncbi'}

## Extending the base class

`TextFileBaseReader` is a base class, intended to be extended into specific file format readers.

The following methods will typically be extended to match the format of data files and other structured text files:

- `__next__` method in order to customize how the iterator parses files into "elements". For instance, in a FASTA file, one element consists of two lines: a *"definition line"* and the *sequence* itself. Extending `TextFileBaseReader` allows reading pairs of lines sequentially and returning each element as a dictionary. For instance, `FastaFileReader` iterates over each pair of lines in a FASTA file and returns each pair as a dictionary as follows:

```text
    {
    'definition line': '>2591237:ncbi:1 [MK211378]\t2591237\tncbi\t1 [MK211378] '
                       '2591237\tCoronavirus BtRs-BetaCoV/YN2018D\t\tscientific '
                       'name\n',
    'sequence':        'TATTAGGTTTTCTACCTACCCAGGA'
    }
```
- Methods for parsing metadata from the file. For instance, the `parse_file` method will handle how the reader iterates over the full file and returns a dictionary for the entire file.
- Extended classes will also define specific attributes (`text_to_parse_key`, `re_pattern`, `re_keys`, ...)

In [38]:
# Render the documentation (signature, docstring, parameter table) of `set_parsing_rules`
show_doc(TextFileBaseReader.set_parsing_rules)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/core.py#L336){target="_blank" style="float:right; font-size:smaller"}

### TextFileBaseReader.set_parsing_rules

>      TextFileBaseReader.set_parsing_rules (pattern:str|bool=None,
>                                            keys:list[str]=None,
>                                            verbose:bool=False)

Set the standard regex parsing rule for the file.

Rules can be set:

1. manually by passing specific custom values for `pattern` and `keys`
2. automatically, by testing all parsing rules saved in `parsing_rule.json` 

Automatic selection of parsing rules works by testing each rule saved in `parsing_rule.json` on the first 
definition line of the fasta file, and selecting the one rule that generates the most metadata matches.

Rules consists of two parameters:

- The regex pattern including one `group` for each metadata item, e.g `(?P<group_name>regex_code)`
- The list of keys, i.e. the list with the name of each regex groups, used as key in the metadata dictionary

This method updates the three following class attributes: `re_rule_name`, `re_pattern`, `re_keys`

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| pattern | str \| bool | None | regex pattern to apply to parse the text, search in parsing rules json if None |
| keys | list | None | list of keys/group for regex, search in parsing rules json if None |
| verbose | bool | False | when True, provides information on each rule |
| **Returns** | **None** |  |  |

> **Important Note to Developers**
>
> Method `set_parsing_rules` is there to allow `TextFileBaseReader`'s descendant classes to automatically select parsing rule by applying rules saved in a json file to a string extracted from the first element in the file.
>
> It assumes that the iterator returns its elements as dictionaries `{section_name:section, ...}` and not as a pure string. The key `self.text_to_parse_key` will then be used to extract the text to parse for testing the rules.
> The base class iterator returns a simple string and `self.text_to_parse_key` is set to `None`.
>
> To make it possible to set up a default parsing rule for the reader instance, the iterator must return a dictionary and `self.text_to_parse_key` must be set to the key in the dictionary corresponding to the text to parse.
>
> See implementation in `FastaFileReader`.
>
> Calling `set_parsing_rules` on a class that does not satisfy these requirements will do nothing and emit a warning.

In [39]:
# `it` is a plain `TextFileBaseReader`, for which `text_to_parse_key` is not defined:
# the call only emits the warning shown below and no parsing rule is set.
it.set_parsing_rules()

            `text_to_parse_key` is not defined in this class. 
            It is not possible to set a parsing rule.
            


# Deprecated Items

When any of the following classes and functions is called, it will raise an exception with an error message indicating how to handle the required code refactoring.

Example:

```python
DeprecationWarning                        Traceback (most recent call last)
Input In [140], in <cell line: 1>()
----> 1 TextFileBaseIterator(p2textfile)

Input In [139], in TextFileBaseIterator.__init__(self, *args, **kwargs)
      4 def __init__(self, *args, **kwargs):
      5     msg = """
      6     `TextFileBaseIterator` is deprecated. 
      7     Use `TextFileBaseReader` instead, with same capabilities and more."""
----> 8     raise DeprecationWarning(msg)

DeprecationWarning: 
        `TextFileBaseIterator` is deprecated. 
        Use `TextFileBaseReader` instead, with same capabilities and more.
```

In [139]:
#| export
class TextFileBaseIterator:
    """Deprecated class, superseded by `TextFileBaseReader`.

    The class is kept only so that legacy code fails with an explicit message:
    any attempt to instantiate it raises a `DeprecationWarning` pointing to
    `TextFileBaseReader`.
    """
    def __init__(self, *args, **kwargs):
        # Arguments are accepted (and ignored) so that any legacy call reaches the error below.
        raise DeprecationWarning(
            "\n"
            "        `TextFileBaseIterator` is deprecated. \n"
            "        Use `TextFileBaseReader` instead, with same capabilities and more."
        )

In [136]:
#| hide
# NOTE(review): `p2textfile` is not passed to `test_fail` below, so it is unused by this check —
# confirm whether this assignment is still needed.
p2textfile = Path('data_dev/train_short')
# Check that instantiating the deprecated class fails, with an error text containing the deprecation notice
test_fail(TextFileBaseIterator, msg="Should generate DeprecationWarning", contains="`TextFileBaseIterator` is deprecated.")

In [42]:
#| hide
# Run the nbdev export step; presumably writes the `#| export` cells to the `core` module
# (see `#| default_exp core` at the top of the notebook) — confirm against the nbdev docs.
nbdev_export()