# Exploring the reference database - Kmer counting
Let's start by counting the kmers :) 

## TODO
 - count all .fna not only the first one
 - split into multiple windows
 - split into slidable windows (smaller windows ?)
 - splitin 10k

## Structure
In `genome/`, there's multiple sub-folder, we will start with `Bacteria`
It then contains all recorded species/strands in individual folders


## Content of each species/strand folder
In each folder there's:
- .ASN with 
 - `taxname "Acetobacter pasteurianus IFO 3283-32"`
 - `db "taxon", tag id 634457`
 - `genus "Acetobacter", species "pasteurianus"`
 - `mod { {subtype strain, subname "IFO 3283" }, { subtype substrain, subname "IFO 3283-32" } },`
 - `lineage "Bacteria; Proteobacteria; Alphaproteobacteria; Rhodospirillales; Acetobacteraceae; Acetobacter",`
- .FAA
 - with multiple ">gi|384064451|ref|YP_005479409.1| hypothetical protein APA32_44160 [Acetobacter pasteurianus IFO 3283-32]"
 - and probably the amino-acid sequence for each of these proteins
- .FFN
 - multiple ">gi|384064450|ref|NC_017102.1|:c562-116 Acetobacter pasteurianus IFO 3283-32 plasmid pAPA32-040, complete sequence"
 - probably DNA sequence
- .FNA
 - Also DNA
- .GBK : Human readable format with most info !
 - have an identifier `/db_xref="taxon:634457"`
- .GFF with `##species http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=634457`
- .RPT
 - seem good with simple Python INI config file format: 
   - `DNA  length = 3035`
   - `Taxname: Acetobacter pasteurianus IFO 3283-32`
   - `Taxid: 634457`


http://defindit.com/readme_files/ncbi_file_extension_format.html

What we need is the taxo id, name, and the DNA, which can be found in:
 - .gbk for the taxo and name
 - .fna for the sequence

#### File marker
https://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/?report=objectonly <br>
`NC_	Genomic	Complete genomic molecule, usually reference assembly`

#### Status
https://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_status_codes/?report=objectonly <br>
in `COMMENT` : VALIDATED > REVIEWED > PROVISIONAL > ...


## Coding
### Import and Paths

In [2]:
import os
import pandas as pd
import configparser
import pickle
import traceback
from joblib import Parallel, delayed
from multiprocessing import cpu_count
from Bio import SeqIO
from time import time
from tqdm import tqdm_notebook as tqdm
import re

In [3]:
path_ref_db = "/home/ubuntu/Data/NCBI/20190704/refseq/"
path_kmer_freq = "/home/ubuntu/Data/kmer_freq/"

In [4]:
folder_kmers = "4mer/V4"   # Supposed to have 5242 files at the end

In [5]:
n_cores = cpu_count()

## Functions

Counting kmer frequencies

In [6]:
nucleotides = "ACGT"

In [7]:
def read_fna(file_path):
    with open(file_path) as f:
        rec = f.readlines()
        return "".join(rec[1:]).replace("\n", "")

In [8]:
def combinaisons(combi, n, instances=nucleotides):
    if n == 1:
        return combi
    else:
        return [f"{a}{n}" for a in combinaisons(combi, n-1) for n in instances]

In [9]:
def kmers_dic(n, choice=nucleotides):
    return {a:0 for a in combinaisons(choice, n)}

In [10]:
col_kmer = set(kmers_dic(4))

In [11]:
def count_kmers(seq, kmer_template, n, bacteria_name, fna, w=100):
    """ Count all kmers, ignore kmers with N or other undecided nucleotides 
        Return a list of dict for each window (w=100)
    """
    res = []
    current_split = 0
    next_split = current_split + w
    tmp_counts = kmer_template.copy()
    tmp_counts["start"] = current_split
    
    try:
        for i, kmer in enumerate(window(seq, n)):
            try:
                tmp_counts[kmer] += 1
            except:
                pass
            # To lower the computational need to split into windows
            if i == next_split:
                res.append(tmp_counts)
                current_split = next_split
                next_split += w
                tmp_counts = kmer_template.copy()
                tmp_counts["start"] = current_split
                
        return i+1, res
    except Exception as e:
        print("type error: " + str(e))
        print(traceback.format_exc())
        return i, res

In [12]:
def window(fseq, window_size=4):
    for i in range(len(fseq) - window_size + 1):
        yield fseq[i:i+window_size]

!! Need to add if it's the genome itself or a plasmid !!

In [13]:
p = "/home/ubuntu/Data/NCBI/20190704/refseq/bacteria/GCF_002334625.1/GCF_002334625.1_ASM233462v1_genomic.gff"

In [14]:
with open(p) as f:
    head = [next(f) for i in range(9)][-1]

In [15]:
description = head[-1]
identificator = 'Taxonomy/Browser/wwwtax.cgi?id='
taxo_start = description.find(identificator)
taxo = description[taxo_start+len(identificator):
                   taxo_start+description[taxo_start:].find('\n')]

In [16]:
taxo

''

In [24]:
def kmer_pkl_path(kmer_folder, fna_path, taxo_ext="gff"):
    """ Return a file name based on the taxonomy id instead of the file name.
        We retrieve the taxo id from the .gff file.
        To avoid re-reading file, taxo id is stored into <bac>.taxon
    """
    assert taxo_ext in ("gbk", "gff"), "Only extensions .gbk and .gff are implemented"
    
#     bacteria_name = os.path.split(os.path.split(fna_path)[0])[1]
    fna_name      = os.path.split(os.path.splitext(fna_path)[0])[1]
    
    taxo = ""
    path_taxon = fna_path.replace(".fna", ".taxon")
    if os.path.isfile(path_taxon):
        with open(path_taxon) as f:
            taxo = f.read()
    
    if not str.isdigit(taxo):
        path_gbk = fna_path.replace(".fna", f".{taxo_ext}")
        assert os.path.isfile(path_gbk), f"{fna_path} DOESN'T have a .{taxo_ext} file ??"

        with open(path_gbk) as gbk:
            description = [next(gbk) for i in range(9)][-1]

        if taxo_ext == "gbk":
            identificator = 'db_xref="taxon:'
        elif taxo_ext == "gff":
            identificator = 'Taxonomy/Browser/wwwtax.cgi?id='
        taxo_start = description.find(identificator)
        taxo = description[taxo_start+len(identificator):
                           taxo_start+description[taxo_start:].find('\n')]
            
        assert 1 <= len(taxo) <= 8, f"The taxo id search failed, found an id of length {len(taxo)}, \n" \
                               f"for the file: {path_gbk} \n" \
                               f"found string : {taxo[:min(50, len(taxo))]} ..."
        
        with open(path_taxon, "w") as f:
            f.write(taxo)
    
    query = taxo_table[(taxo_table.taxo == int(taxo)) & (taxo_table.class_name == "scientific name")]
    assert query.shape[0] == 1, f"Found {query.shape[0]} matches for the scientific name of taxo {taxo}. Display the taxo table: \n" \
                                f"{taxo_table[taxo_table.taxo == int(taxo)]}"
    bacteria_name = query.name.iat[0]
    
    formatted_bacteria = re.sub('[^A-Za-z0-9]+', '_', bacteria_name)
    out_path = os.path.join(path_kmer_freq, kmer_folder, f"{taxo}__{fna_name}__{formatted_bacteria}.pd")
    return taxo, bacteria_name, fna_name, out_path

In [18]:
# TODO: test if working !! added the genome/plasmid differentiator
def to_pandas(bac_kmers, bac):
    df = pd.DataFrame(bac_kmers)
    # put the file, fna and start at the beginning
    df_cols = df.columns.to_list()
    df = df.reindex(columns=df_cols[-4:] + df_cols[:-4])   # put the 4 non nucleotide col in the front bacteria, fna, len_genome, start
    # add label if genome or plasmid (longest genome size)
    genome_length = df[df.bacteria == bac].len_genome.unique().max()
    df["genome_plasmid"] = "NaN"
    df.loc[(df.bacteria == bac) & (df.len_genome == genome_length), 
               "genome_plasmid"] = "genome"
    df.loc[(df.bacteria == bac) & (df.len_genome != genome_length), 
               "genome_plasmid"] = "plasmid"
    # Try to reduce size of these files
    df.bacteria = df.bacteria.astype("category")
    df.fna = df.fna.astype("category")
    for col in col_kmer:
        df[col] = df[col].astype("uint16")
    return df

## Check taxonomy table (from Kraken)

In [19]:
path_taxo_names = "/home/ubuntu/Disks/SSD500/Segmentation/Kraken_10_clusters_V1/Kraken2_building/taxonomy/names.dmp"

In [21]:
taxo_table = pd.read_csv(path_taxo_names, sep="\t|\t")

  """Entry point for launching an IPython kernel.


In [22]:
taxo_table.head()

Unnamed: 0,1,|,all,|.1,Unnamed: 4,|.2,synonym,|.3
0,1,|,root,|,,|,scientific name,|
1,2,|,Bacteria,|,Bacteria <prokaryotes>,|,scientific name,|
2,2,|,Monera,|,Monera <Bacteria>,|,in-part,|
3,2,|,Procaryotae,|,Procaryotae <Bacteria>,|,in-part,|
4,2,|,Prokaryota,|,Prokaryota <Bacteria>,|,in-part,|


In [26]:
taxo_table.drop(["|", "|.1", "|.2", "|.3"], inplace=True, axis=1)

In [28]:
taxo_table.rename(columns={'1': 'taxo', 'all': 'name', 'Unnamed: 4': 'unique_name', 'synonym': 'class_name'}, inplace=True)

In [31]:
taxo_table.sample(5)

Unnamed: 0,taxo,name,unique_name,class_name
862977,562494,Ribautiana,,scientific name
1896551,1526535,Ceratobasidium sp. Rh 80,,scientific name
2740817,2304353,"Aloe capitata var. cipolinicola H.Perrier, 1926",,authority
2715526,2280178,Megaselia sp. BIOUG27050-H04,,scientific name
1594516,1273922,Pseudomonas sp. 110623_PC_C1_B7,,scientific name


In [35]:
taxo_table.to_pickle("/home/ubuntu/Disks/SSD500/NCBI/taxo_names.pd")

Wanted to add the missing taxon, but they id doesn't match with existing species in the NCBI database... <br>
1049581 <br>
1819728 <br>
1743172 <br>

In [None]:
rows_to_add = {
    "taxo": [1049581, 1819728, 1743172],
    "name": ["Bacillus altitudinis", "Polynucleobacter paneuropaeus", ""],
    "unique_name": [None, None, None],
    "class_name": ["scientific name", "scientific name", "scientific name", ],
}

In [None]:
taxo_table.append(pd.DataFrame(rows_to_add), ignore_index=True).tail(5)

In [22]:
taxo_table.tail(5)

Unnamed: 0,taxo,name,unique_name,class_name
3007869,2592274,"Mespilodaphne Nees & Mart. ex Nees, 1833",,authority
3007870,2592283,Mespilodaphne cymbarum,,scientific name
3007871,2592283,"Mespilodaphne cymbarum (Kunth) Trofimov, 2019",,authority
3007872,2592283,Ocotea cymbarum,,synonym
3007873,2592283,"Ocotea cymbarum Kunth, 1816",,authority


In [23]:
taxo_table[taxo_table.taxo == 2527775]

Unnamed: 0,taxo,name,unique_name,class_name
2954307,2527775,CIP 111323,CIP 111323 <type strain>,type material
2954308,2527775,DSM 103454,DSM 103454 <type strain>,type material
2954309,2527775,Polynucleobacter paneuropaeus,,scientific name
2954310,2527775,Polynucleobacter paneuropaeus Hoetzinger et al...,,authority
2954311,2527775,Polynucleobacter sp. FUKU-NW-11,,includes
2954312,2527775,Polynucleobacter sp. MG-25-Pas1-D2,,includes
2954313,2527775,Polynucleobacter sp. MWH-CNW20-3,,includes
2954314,2527775,Polynucleobacter sp. MWH-Creno-4B4,,includes
2954315,2527775,Polynucleobacter sp. MWH-UK1W16,,includes
2954316,2527775,Polynucleobacter sp. UB-Kaiv-W7,,includes


In [19]:
taxo_table = pd.read_pickle("/home/ubuntu/Disks/SSD500/NCBI/taxo_names.pd")

## Loop through all bacteria and retrieve the kmer spectrum

In [20]:
def extract_folder(folder, dic_template, ):

    if not os.path.isdir(folder): return
    files = [f for f in os.scandir(folder) if f.name.endswith(".fna") 
#                      and (f.name.startswith("NC_") or f.name.startswith("AC_"))
             and "multiisoloate" not in f.path and "multispecies" not in f.path]
    if len(files) == 0: return

    # Looping through each file for a single bacteria (multiple chromosomes or alternative genomes ?)
    bac_kmers = []
    for file_i in files:
        try:
            # Check if already done
            taxo, bacteria_name, fna_name, kmer_freq_path = \
                kmer_pkl_path(folder_kmers, file_i.path, taxo_ext="gff")
            if os.path.isfile(kmer_freq_path):
                return   # Already done for this folder

            # Count
            rec = read_fna(file_i)    # go through all files
            dic_template["bacteria"] = bacteria_name
            dic_template["fna"] = fna_name
            dic_template["len_genome"] = len(rec)
            success_n, kmer_counts = \
                count_kmers(rec, dic_template, k, bacteria_name, fna_name, w=window)
            succ_fail = "Success" if len(rec)-3 == success_n else "Fail   "
            print(f"{succ_fail} -> Bacteria: {bacteria_name},\t file: {fna_name},\t len: {len(rec)}")
            nucleotides.append(success_n)

            bac_kmers.extend(kmer_counts)
        except Exception as e:
            print("type error: " + str(e))
            print(traceback.format_exc())
            print(file_i.path)

    if len(bac_kmers) > 0:
        # Pandas
        df = to_pandas(bac_kmers, bacteria_name)
        # Save to a file
        df.to_pickle(kmer_freq_path)
        n += 1


In [21]:
def count_all(folder_kmers, scanning=path_ref_db, k=4, window=1000, stop=3, skip={}):
    start = time()
    n = 0
    nucleotides = []
    dic_template = {"bacteria": "", "fna": "", "start": None,}
    dic_template.update(kmers_dic(k))
    
    # Looping through each family folder
    for genera in tqdm(os.scandir(scanning), desc="Genera", total=len(os.listdir(scanning))):
        if stop > 0 and n > stop:  # 5400
            break
        # Looping through each bacterial folder
#         results = Parallel(n_jobs=n_cores)(delayed(extract_folder)(folder, dic_template, ) \
#             for folder in tqdm(os.scandir(genera), desc="Species", total=len(os.listdir(genera)), leave=False))
    
        for folder in tqdm(os.scandir(genera), desc=genera.name, total=len(os.listdir(genera)), leave=False):
            if stop > 0 and n > stop:  # 5400
                break
            if genera.name in skip: continue
            
            if not os.path.isdir(folder): continue
            files = [f for f in os.scandir(folder) if f.name.endswith(".fna") 
#                      and (f.name.startswith("NC_") or f.name.startswith("AC_"))
                     and "multiisoloate" not in f.path and "multispecies" not in f.path]
            if len(files) == 0: continue

            # Looping through each file for a single bacteria (multiple chromosomes or alternative genomes ?)
            bac_kmers = []
            for file_i in files:
                try:
                    # Check if already done
                    taxo, bacteria_name, fna_name, kmer_freq_path = \
                        kmer_pkl_path(folder_kmers, file_i.path, taxo_ext="gff")
                    if os.path.isfile(kmer_freq_path):
                        continue   # Already done for this folder

                    # Count
                    rec = read_fna(file_i)    # go through all files
                    dic_template["bacteria"] = bacteria_name
                    dic_template["fna"] = fna_name
                    dic_template["len_genome"] = len(rec)
                    success_n, kmer_counts = \
                        count_kmers(rec, dic_template, k, bacteria_name, fna_name, w=window)
                    succ_fail = "Success" if len(rec)-3 == success_n else "Fail   "
#                     print(f"{succ_fail} -> Bacteria: {bacteria_name},\t file: {fna_name},\t len: {len(rec)}")
                    nucleotides.append(success_n)

                    bac_kmers.extend(kmer_counts)
                except Exception as e:
                    print("type error: " + str(e))
#                     print(traceback.format_exc())
                    print(file_i.path)

            if len(bac_kmers) > 0:
                # Pandas
                df = to_pandas(bac_kmers, bacteria_name)
                # Save to a file
                df.to_pickle(kmer_freq_path)
                n += 1


                    
    elapsed_time = time() - start
    total = sum(nucleotides)
    print(f"\n{n} folders have been scanned\n"
          f"Took {elapsed_time:,.1f}s / {elapsed_time/60:.1f}min  to complete. {total/elapsed_time:,.0f} bp/s")
    return nucleotides

In [25]:
to_skip = {'archaea', 'bacteria', 'fungi', 'invertebrate', 'plant', 'protozoa', 
           'vertebrate_mammalian', 'vertebrate_other'}

In [26]:
total_counted = count_all(folder_kmers, stop=-1, window=10000, skip=to_skip)

HBox(children=(IntProgress(value=0, description='Genera', max=8, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='archaea', max=311, style=ProgressStyle(description_width='ini…

HBox(children=(IntProgress(value=0, description='bacteria', max=15922, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='fungi', max=55, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='invertebrate', max=24, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='plant', max=64, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='protozoa', max=40, style=ProgressStyle(description_width='ini…

HBox(children=(IntProgress(value=0, description='vertebrate_other', max=48, style=ProgressStyle(description_wi…

HBox(children=(IntProgress(value=0, description='viral', max=9291, style=ProgressStyle(description_width='init…

type error: Found 0 matches for the scientific name of taxo 10658. Display the taxo table: 
Empty DataFrame
Columns: [taxo, name, unique_name, class_name]
Index: []
/home/ubuntu/Data/NCBI/20190704/refseq/viral/GCF_000837025.1/GCF_000837025.1_ViralProj14062_genomic.fna
type error: Found 0 matches for the scientific name of taxo 114416. Display the taxo table: 
Empty DataFrame
Columns: [taxo, name, unique_name, class_name]
Index: []
/home/ubuntu/Data/NCBI/20190704/refseq/viral/GCF_000841645.1/GCF_000841645.1_ViralProj14331_genomic.fna
type error: Found 0 matches for the scientific name of taxo 354260. Display the taxo table: 
Empty DataFrame
Columns: [taxo, name, unique_name, class_name]
Index: []
/home/ubuntu/Data/NCBI/20190704/refseq/viral/GCF_000843105.1/GCF_000843105.1_ViralProj14281_genomic.fna
type error: Found 0 matches for the scientific name of taxo 227859. Display the taxo table: 
Empty DataFrame
Columns: [taxo, name, unique_name, class_name]
Index: []
/home/ubuntu/Data/NCBI/20

In [27]:
with open("/home/ubuntu/Disks/SSD500/TMP/total_counted.pkl", "wb") as f:
    pickle.dump(total_counted, f)

In [38]:
[genera.name for genera in os.scandir(path_ref_db)]

['archaea',
 'bacteria',
 'fungi',
 'invertebrate',
 'plant',
 'protozoa',
 'vertebrate_mammalian',
 'vertebrate_other',
 'viral']

In [21]:
taxo_table[taxo_table.taxo == 38]

Unnamed: 0,taxo,name,unique_name,class_name
173,38,"""Angiococcus disciformis"" (Thaxter 1904) Jahn ...",,authority
174,38,"""Cystobacter disciformis"" (Thaxter 1904) Brock...",,authority
175,38,ATCC 33172,ATCC 33172 <type strain>,type material
176,38,Angiococcus disciformis,,synonym
177,38,Angiococcus disciformis (Thaxter 1904) Hook et...,,authority
178,38,Archangium disciforme,,scientific name
179,38,Archangium disciforme (Thaxter 1904) Lang et a...,,authority
180,38,Cystobacter disciformis,,synonym
181,38,DSM 17051,DSM 17051 <type strain>,type material
182,38,DSM 52716,DSM 52716 <type strain>,type material


In [14]:
total_counted = count_all(folder_kmers, stop=-1, window=10000)

Success -> Bacteria: multiisoloate_uid216090, file: NC_022107, len: 1042853
Success -> Bacteria: Chlamydia_trachomatis_E_SW3_uid167483, file: NC_012631, len: 7502
Success -> Bacteria: Chlamydia_trachomatis_F_SW4_uid167484, file: NC_012625, len: 7493
Success -> Bacteria: Chlamydia_trachomatis_F_SW5_uid167485, file: NC_012626, len: 7471
Success -> Bacteria: multispecies_uid212977, file: NC_021821, len: 1861334
Success -> Bacteria: multispecies_uid212977, file: NC_021822, len: 1806995
Success -> Bacteria: multispecies_uid212977, file: NC_021834, len: 1702398
Success -> Bacteria: multispecies_uid212977, file: NC_021847, len: 3362228
Success -> Bacteria: multispecies_uid212977, file: NC_021848, len: 3331580
Success -> Bacteria: Chlamydophila_pneumoniae_AR39_uid57809, file: NC_002179, len: 1229853
Success -> Bacteria: Comamonas_testosteroni_CNB_2_uid62961, file: NC_010935, len: 91181
Success -> Bacteria: Desulfovibrio_hydrothermalis_AM13___DSM_14728_uid184831, file: NC_019953, len: 5328
Succ

In [21]:
sum(total_counted)

9578234988

In [13]:
if input("confirm : ") == "y":
    names = []
    files = []
    counts = []
    path_4mer = os.path.join(path_kmer_freq, folder_kmers)
    for f in os.scandir(path_4mer):
        if f.name.endswith(".pd") and not f.name.startswith("_") \
                and not "multiisoloate" in f.path and not "multispecies" in f.path:
            files.append(f.path)
            names.append(os.path.splitext(f.name)[0])
    print(f"{len(names)} files")

    if True:
        df_concat = []
        for file in tqdm(files):
            df_concat.append(pd.read_pickle(file))

        all_df = pd.concat(df_concat)

        all_df.bacteria = all_df.bacteria.astype("category")
        all_df.fna = all_df.fna.astype("category")

    path_all = os.path.join(path_4mer, "_all_bacteria_4mers.largepd")
    all_df.to_pickle(path_all)
    print(f"Dataset saved to {path_all} ")

confirm : y
2782 files

Dataset saved to /home/sjriondet/Data/Kmer_frequencies/4_V3/_all_bacteria_4mers.largepd 




### End of the script.
Sylvain @GIS

## Tests

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

In [76]:
test = "ATCGCATACTGATCGATACTGATCGATGAATCCCGGT"

In [100]:
vocabulary = set(kmers_dic(4))

In [102]:
cv = CountVectorizer(analyzer="char", lowercase=False, ngram_range=(4,4),
                    vocabulary=vocabulary)

In [103]:
data = cv.fit_transform([test]).toarray()

In [104]:
np.array(data)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [105]:
cv.get_feature_names()

['AAAA',
 'AAAC',
 'AAAG',
 'AAAT',
 'AACA',
 'AACC',
 'AACG',
 'AACT',
 'AAGA',
 'AAGC',
 'AAGG',
 'AAGT',
 'AATA',
 'AATC',
 'AATG',
 'AATT',
 'ACAA',
 'ACAC',
 'ACAG',
 'ACAT',
 'ACCA',
 'ACCC',
 'ACCG',
 'ACCT',
 'ACGA',
 'ACGC',
 'ACGG',
 'ACGT',
 'ACTA',
 'ACTC',
 'ACTG',
 'ACTT',
 'AGAA',
 'AGAC',
 'AGAG',
 'AGAT',
 'AGCA',
 'AGCC',
 'AGCG',
 'AGCT',
 'AGGA',
 'AGGC',
 'AGGG',
 'AGGT',
 'AGTA',
 'AGTC',
 'AGTG',
 'AGTT',
 'ATAA',
 'ATAC',
 'ATAG',
 'ATAT',
 'ATCA',
 'ATCC',
 'ATCG',
 'ATCT',
 'ATGA',
 'ATGC',
 'ATGG',
 'ATGT',
 'ATTA',
 'ATTC',
 'ATTG',
 'ATTT',
 'CAAA',
 'CAAC',
 'CAAG',
 'CAAT',
 'CACA',
 'CACC',
 'CACG',
 'CACT',
 'CAGA',
 'CAGC',
 'CAGG',
 'CAGT',
 'CATA',
 'CATC',
 'CATG',
 'CATT',
 'CCAA',
 'CCAC',
 'CCAG',
 'CCAT',
 'CCCA',
 'CCCC',
 'CCCG',
 'CCCT',
 'CCGA',
 'CCGC',
 'CCGG',
 'CCGT',
 'CCTA',
 'CCTC',
 'CCTG',
 'CCTT',
 'CGAA',
 'CGAC',
 'CGAG',
 'CGAT',
 'CGCA',
 'CGCC',
 'CGCG',
 'CGCT',
 'CGGA',
 'CGGC',
 'CGGG',
 'CGGT',
 'CGTA',
 'CGTC',
 'CGTG',
 

In [10]:
def kmer_freq_to_file(kmer_dic, freq_path):
    with open(freq_path, 'wb') as f_out:
        pickle.dump(kmer_dic, f_out)

### Speed Tests

In [None]:
os.chdir(path_ref_db)

In [None]:
os.chdir("Acetobacter_pasteurianus_IFO_3283_32_uid158375")

In [None]:
rec = read_fna("NC_017102.fna")
len(rec)

### 4-mer

In [None]:
kmer_4 = kmers_dic(4)

In [None]:
kmer_4

In [None]:
%%timeit
count_kmers(rec, kmer_4, 4)

In [None]:
%%timeit
count_kmers(rec, kmer_4, 4)

In [None]:
%%timeit
success_n = count_kmers(rec, kmer_4, 4)

In [None]:
%%timeit
success_n, counts = count_kmers(rec, kmer_4, 4)

In [None]:
%%timeit
success_n, counts = count_kmers(rec, kmer_4, 4, "test", "fna", w=100)

In [None]:
%%timeit
success_n, counts = count_kmers(rec, kmer_4, 4, "test", "fna", w=100)

In [None]:
%%timeit
kmer_4[max(kmer_4, key=kmer_4.get)]

In [None]:
%%timeit
max(kmer_4.values())

#### Checking paths 

In [None]:
fna_path = "/mnt/genomeDB/ncbi/genomes/Bacteria/Aciduliprofundum_boonei_T469_uid43333/NC_013926.gbk"
path_gbk = fna_path.replace(".fna", ".gbk")
with open(path_gbk) as gbk:
    description = gbk.read()
identificator = 'db_xref="taxon:'
taxo_start = description.find(identificator)
taxo = description[taxo_start+len(identificator):
                   taxo_start+description[taxo_start:].find('"\n')]

#### Testing deepcopy speed

In [None]:
from copy import deepcopy

In [None]:
dic_template = kmers_dic(4)

In [None]:
%%timeit
new_dic = deepcopy(dic_template)

In [None]:
%%timeit
new_dic = dic_template.copy()

#### First attempts on reading sequence

In [None]:
for f in os.scandir():
    if f.name.endswith("fna"):
        print(f"{f.name}\t{os.path.getsize(f):>10,d} bytes")

In [None]:
%%timeit
rec = SeqIO.read("NC_017102.fna", "fasta")

In [None]:
rec

In [None]:
mer2 = {f"{a}{b}":0 for a in nucleotides for b in nucleotides}

In [None]:
mer2 

#### DataFrame manipulation

In [None]:
df2 = df.copy()

In [None]:
for col in df2.columns:
    if col not in ["bacteria", "fna", "start"]:
        print(col)
        df2[col] = df2[col].astype("uint8")

In [None]:
df2

In [None]:
df.memory_usage()

In [None]:
df2.memory_usage()