# Exploring the reference database - Kmer counting
Let's start by counting the kmers :) 

## TODO
 - count all .fna not only the first one
 - split into multiple windows
 - split into slidable windows (smaller windows ?)

## Structure
In `genome/`, there are multiple sub-folders; we will start with `Bacteria`.
It contains all recorded species/strains, each in its own folder.


## Content of each species/strain folder
In each folder there are:
- .ASN with 
 - `taxname "Acetobacter pasteurianus IFO 3283-32"`
 - `db "taxon", tag id 634457`
 - `genus "Acetobacter", species "pasteurianus"`
 - `mod { {subtype strain, subname "IFO 3283" }, { subtype substrain, subname "IFO 3283-32" } },`
 - `lineage "Bacteria; Proteobacteria; Alphaproteobacteria; Rhodospirillales; Acetobacteraceae; Acetobacter",`
- .FAA
 - with multiple ">gi|384064451|ref|YP_005479409.1| hypothetical protein APA32_44160 [Acetobacter pasteurianus IFO 3283-32]"
 - and probably the amino-acid sequence for each of these proteins
- .FFN
 - multiple ">gi|384064450|ref|NC_017102.1|:c562-116 Acetobacter pasteurianus IFO 3283-32 plasmid pAPA32-040, complete sequence"
 - probably DNA sequence
- .FNA
 - Also DNA
- .GBK : Human readable format with most info !
 - has an identifier `/db_xref="taxon:634457"`
- .GFF with `##species http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=634457`
- .RPT
 - seems good, with a simple format similar to Python's INI config files: 
   - `DNA  length = 3035`
   - `Taxname: Acetobacter pasteurianus IFO 3283-32`
   - `Taxid: 634457`


http://defindit.com/readme_files/ncbi_file_extension_format.html

What we need is the taxo id, name, and the DNA, which can be found in:
 - .gbk for the taxo and name
 - .fna for the sequence

#### File marker
https://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/?report=objectonly <br>
`NC_	Genomic	Complete genomic molecule, usually reference assembly`

#### Status
https://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_status_codes/?report=objectonly <br>
in `COMMENT` : VALIDATED > REVIEWED > PROVISIONAL > ...


## Coding
### Import and Paths

In [1]:
import os
import pandas as pd
import configparser
import pickle
import traceback
from Bio import SeqIO
from time import time
from tqdm import tqdm_notebook as tqdm

In [2]:
# Root folder scanned by count_all(): one sub-folder per species/strain.
path_ref_db = "/mnt/genomeDB/ncbi/genomes/Bacteria/"
# Root folder under which kmer_pkl_path() builds the output file paths.
path_kmer_freq = "/home/sjriondet/Data/Kmer_frequencies/"

## Functions

Counting kmer frequencies

In [3]:
# Alphabet used as the default when enumerating k-mers (combinaisons / kmers_dic).
nucleotides = "ACGT"

In [4]:
def read_fna(file_path):
    """Return the sequence stored in a FASTA (.fna) file as a single string.

    Every header line (starting with ``>``) is skipped, wherever it occurs,
    so a multi-record file yields the concatenation of its sequences instead
    of a sequence polluted with header text. Surrounding whitespace,
    including ``\\r`` from Windows line endings, is stripped from each line.

    Args:
        file_path: Path (str or path-like, e.g. ``os.DirEntry``) of the file.

    Returns:
        The concatenated sequence; ``""`` for an empty file.
    """
    chunks = []
    with open(file_path) as f:
        for line in f:
            if line.startswith(">"):
                continue
            chunks.append(line.strip())
    return "".join(chunks)

In [5]:
def combinaisons(combi, n, instances=nucleotides):
    """Enumerate every word of length ``n`` over the given alphabet.

    The first letter of each word is taken from ``combi`` and every following
    letter from ``instances``.

    Args:
        combi: Iterable of the possible first letters.
        n: Word length, must be >= 1.
        instances: Iterable of the letters appended at each further position.

    Returns:
        ``combi`` itself when ``n == 1``, otherwise a list of strings in
        nested-loop order (the last letter varies fastest).

    Raises:
        ValueError: If ``n`` is smaller than 1 (this used to recurse until
            RecursionError).
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")
    if n == 1:
        return combi
    # `instances` is forwarded so that a custom alphabet is honoured at every
    # depth, not only for the last letter.
    return [f"{prefix}{letter}"
            for prefix in combinaisons(combi, n - 1, instances)
            for letter in instances]

In [6]:
def kmers_dic(n, choice=nucleotides):
    """Build a counter dict mapping every k-mer of length ``n`` to 0.

    The k-mers are those produced by ``combinaisons(choice, n)``.
    """
    return dict.fromkeys(combinaisons(choice, n), 0)

In [7]:
def count_kmers(seq, kmer_template, n, bacteria_name, fna, w=100):
    """Count the k-mers of ``seq`` in consecutive windows of ``w`` k-mers.

    K-mers that are not integer-valued keys of ``kmer_template`` (e.g. those
    containing ``N`` or another undecided nucleotide) are ignored.

    Args:
        seq: Sequence to scan (sliceable, typically a str).
        kmer_template: Dict copied for every window. Its integer-valued
            entries are the counters; other entries (metadata such as
            "bacteria" / "fna") are copied through untouched.
        n: K-mer length.
        bacteria_name: Unused, kept for interface compatibility.
        fna: Unused, kept for interface compatibility.
        w: Number of consecutive k-mer start positions per window.

    Returns:
        ``(processed, windows)``: the number of k-mers scanned and one dict
        per *complete* window, each with its first k-mer index under
        ``"start"``. A trailing incomplete window is not emitted. A sequence
        shorter than ``n`` gives ``(0, [])``.
    """
    res = []
    # Only integer-valued template entries are counters; this keeps metadata
    # keys from ever being treated as a k-mer.
    countable = {key for key, value in kmer_template.items()
                 if isinstance(value, int) and not isinstance(value, bool)}
    current_split = 0
    next_split = current_split + w
    tmp_counts = kmer_template.copy()
    tmp_counts["start"] = current_split
    processed = 0   # defined up-front so short/empty sequences return cleanly

    try:
        for i in range(len(seq) - n + 1):
            kmer = seq[i:i + n]
            if kmer in countable:
                tmp_counts[kmer] += 1
            processed = i + 1
            # Close the window once exactly `w` k-mers went into it, so window
            # j covers k-mer indices [j*w, (j+1)*w) and "start" matches.
            if processed == next_split:
                res.append(tmp_counts)
                current_split = next_split
                next_split += w
                tmp_counts = kmer_template.copy()
                tmp_counts["start"] = current_split
    except Exception as e:
        # Best effort: report the problem and return what was counted so far.
        print("type error: " + str(e))
        print(traceback.format_exc())
    return processed, res

In [8]:
def window(fseq, window_size=53):
    """Yield every slice of length ``window_size`` from ``fseq``, left to right.

    Slices start at consecutive offsets; nothing is yielded when ``fseq`` is
    shorter than ``window_size``.
    """
    last_start = len(fseq) - window_size
    start = 0
    while start <= last_start:
        yield fseq[start:start + window_size]
        start += 1

In [9]:
def kmer_pkl_path(kmer_folder, fna_path):
    """Return a file name based on the taxonomy id instead of the file name.

    The taxo id is looked for in the associated .gbk file (same path as
    ``fna_path`` with the extension swapped), as the text between
    ``db_xref="taxon:`` and the closing quote at the end of that line.

    Args:
        kmer_folder: Sub-folder of ``path_kmer_freq`` receiving the output.
        fna_path: Path (str) of the .fna file.

    Returns:
        ``(taxo, bacteria_name, fna_name, out_path)`` where ``bacteria_name``
        is the name of the folder containing the .fna file, ``fna_name`` the
        file name without extension and ``out_path`` the output file path.

    Raises:
        AssertionError: If the .gbk file is missing or no valid taxo id is
            found. Raised explicitly (same type as the former ``assert``
            statements) so the checks also run under ``python -O``.
    """
    # Swap only the extension; str.replace would also rewrite any ".fna"
    # occurring elsewhere in the path (e.g. in a folder name).
    path_gbk = os.path.splitext(fna_path)[0] + ".gbk"
    if not os.path.isfile(path_gbk):
        raise AssertionError(f"{fna_path} DOESN'T have a .gbk file ??")

    with open(path_gbk) as gbk:
        description = gbk.read()

    identificator = 'db_xref="taxon:'
    _, marker, after_marker = description.partition(identificator)
    taxo, closing_quote, _ = after_marker.partition('"\n')
    # Both delimiters must be present and the id must be a short number;
    # previously a missing closing quote produced an empty id that passed.
    if not (marker and closing_quote and taxo.isdigit() and len(taxo) < 10):
        raise AssertionError(
            f"The taxo id search failed, found an id of length {len(taxo)}...")

    bacteria_name = os.path.basename(os.path.dirname(fna_path))
    fna_name = os.path.basename(os.path.splitext(fna_path)[0])

    out_path = os.path.join(path_kmer_freq, kmer_folder, f"{taxo}__{bacteria_name}.pd")

    return taxo, bacteria_name, fna_name, out_path

In [10]:
def kmer_freq_to_file(kmer_dic, freq_path):
    """Pickle ``kmer_dic`` into the file ``freq_path`` (created or overwritten)."""
    with open(freq_path, 'wb') as sink:
        pickle.Pickler(sink).dump(kmer_dic)

In [15]:
def to_pandas(bac_kmers):
    """Turn the per-window count dicts into a compact DataFrame.

    The metadata columns ("bacteria", "fna", "start") are placed first,
    followed by the k-mer columns in their existing order. "bacteria" and
    "fna" become categoricals. The k-mer columns are stored in the smallest
    unsigned integer dtype that holds the largest count: "uint8" as before
    when everything fits, wider otherwise instead of silently wrapping.

    Args:
        bac_kmers: List of dicts, one per window.

    Returns:
        The resulting pandas DataFrame.
    """
    meta = ("bacteria", "fna", "start")
    df = pd.DataFrame(bac_kmers)

    # Select the metadata columns by name: their position depends on how the
    # installed pandas orders dict keys (sorted vs. insertion order).
    meta_cols = [col for col in meta if col in df.columns]
    kmer_cols = [col for col in df.columns if col not in meta]
    df = df.reindex(columns=meta_cols + kmer_cols)

    for col in ("bacteria", "fna"):
        if col in df.columns:
            df[col] = df[col].astype("category")

    if kmer_cols and len(df) > 0:
        largest = df[kmer_cols].max().max()
        for dtype, limit in (("uint8", 0xFF), ("uint16", 0xFFFF),
                             ("uint32", 0xFFFFFFFF)):
            if largest <= limit:
                break
        else:
            dtype = "uint64"
        df = df.astype({col: dtype for col in kmer_cols})
    return df

In [12]:
def normalise_counts(kmer_count):
    """Normalise each sub-dict of a nested dict by its own maximum, in place.

    Every value of a sub-dict is divided by the largest value of that
    sub-dict. Sub-dicts that are empty or whose maximum is 0 are left
    untouched (they used to raise ValueError / ZeroDivisionError).

    Args:
        kmer_count: Dict mapping a window key to a dict of numeric counts.

    Returns:
        None; ``kmer_count`` is modified in place.
    """
    for counts in kmer_count.values():
        if not counts:
            continue
        max_val = max(counts.values())
        if max_val == 0:
            continue
        for key in counts:
            counts[key] /= max_val

## Loop through all bacteria and retrieve the kmer spectrum

In [48]:
def count_all(scanning=path_ref_db, k=4, window=200):
    """Count windowed k-mers for every species folder and pickle one DataFrame each.

    For each sub-folder of ``scanning``, every ``NC_*.fna`` / ``AC_*.fna``
    file is read and counted with ``count_kmers``. The windows of all files
    of the folder are merged with ``to_pandas`` and pickled to the path given
    by ``kmer_pkl_path``. Files whose output path already exists are skipped.

    Args:
        scanning: Folder holding one sub-folder per species/strain.
        k: K-mer length.
        window: Window size forwarded to ``count_kmers`` as ``w``.

    Returns:
        List with the number of k-mers processed for each counted .fna file.
    """
    start = time()
    n = 0
    # Per-file k-mer totals. Formerly named `nucleotides`, which shadowed the
    # module-level alphabet.
    counted = []
    failed = 0              # never incremented: there is no per-species error handling
    folder_kmers = "4_V2"   # Supposed to have 5242 files at the end
    dic_template = {"bacteria": "", "fna": "", "start": None}
    dic_template.update(kmers_dic(k))

    # Looping through each bacterial folder
    for folder in tqdm(os.scandir(scanning), desc="Species", total=len(os.listdir(scanning))):
        if not folder.is_dir():
            continue
        files = [f for f in os.scandir(folder)
                 if f.name.endswith(".fna") and f.name.startswith(("NC_", "AC_"))]
        if not files:
            continue

        # Looping through each file for a single bacteria (multiple chromosomes or alternative genomes ?)
        bac_kmers = []
        for file_i in files:
            # Check if already done
            taxo, bacteria_name, fna_name, kmer_freq_path = kmer_pkl_path(folder_kmers, file_i.path)
            if os.path.isfile(kmer_freq_path):
                continue

            # Count
            rec = read_fna(file_i)
            dic_template["bacteria"] = bacteria_name
            dic_template["fna"] = fna_name
            success_n, kmer_counts = \
                count_kmers(rec, dic_template, k, bacteria_name, fna_name, w=window)
            # A sequence of length L holds L - k + 1 k-mers (was hard-coded
            # as `len(rec) - 3`, only valid for k=4).
            expected_n = max(len(rec) - k + 1, 0)
            succ_fail = "Success" if expected_n == success_n else "Fail   "
            print(f"{succ_fail} -> Bactera: {bacteria_name}, file: {fna_name}, len: {len(rec)}")
            counted.append(success_n)

            # No need to normalise yet
            bac_kmers.extend(kmer_counts)

        if bac_kmers:
            # kmer_freq_path is the one computed for the last file of the folder
            df = to_pandas(bac_kmers)
            df.to_pickle(kmer_freq_path)
            n += 1

        if n + failed > 5400:  # hard cap on the number of species processed
            break

    elapsed_time = time() - start
    # Sum the local per-file totals; the previous code read the global
    # `total_counted`, which does not exist until a first run has returned.
    total = sum(counted)
    print(f"\n{n+failed} species have been scanned\n Success: {n}, failed: {failed} \n"
          f"Took {elapsed_time:,.1f}s / {elapsed_time/60:.1f}min  to complete. {total/elapsed_time:,.0f} bp/s")
    return counted

In [47]:
# Run the scan with the default arguments and keep the list it returns.
total_counted = count_all()

Success -> Bactera: Acholeplasma_laidlawii_PG_8A_uid58901, file: NC_010163, len: 1496992
Success -> Bactera: Achromobacter_xylosoxidans_A8_uid59899, file: NC_014640, len: 7013095
Success -> Bactera: Achromobacter_xylosoxidans_A8_uid59899, file: NC_014641, len: 98156
Success -> Bactera: Achromobacter_xylosoxidans_A8_uid59899, file: NC_014642, len: 247895
Success -> Bactera: Achromobacter_xylosoxidans_NBRC_15126_uid232243, file: NC_023061, len: 6683584
Success -> Bactera: Achromobacter_xylosoxidans_uid205255, file: NC_021285, len: 6916670
Success -> Bactera: Acidaminococcus_fermentans_DSM_20731_uid43471, file: NC_013740, len: 2329769
Success -> Bactera: Acidaminococcus_intestini_RyC_MR95_uid74445, file: NC_016077, len: 2487765
Success -> Bactera: Acidianus_hospitalis_W1_uid66875, file: NC_015518, len: 2137654
Success -> Bactera: Acidilobus_saccharovorans_345_15_uid51395, file: NC_014374, len: 1496453
Success -> Bactera: Acidimicrobidae_bacterium_YM16_304_uid193703, file: NC_020520, len: 



### End of the script.
Sylvain @GIS

## Tests

### Speed Tests

In [None]:
os.chdir(path_ref_db)

In [None]:
os.chdir("Acetobacter_pasteurianus_IFO_3283_32_uid158375")

In [None]:
rec = read_fna("NC_017102.fna")
len(rec)

### 4-mer

In [None]:
kmer_4 = kmers_dic(4)

In [None]:
kmer_4

In [None]:
%%timeit
count_kmers(rec, kmer_4, 4)

In [None]:
%%timeit
count_kmers(rec, kmer_4, 4)

In [None]:
%%timeit
success_n = count_kmers(rec, kmer_4, 4)

In [None]:
%%timeit
success_n, counts = count_kmers(rec, kmer_4, 4)

In [None]:
%%timeit
success_n, counts = count_kmers(rec, kmer_4, 4, "test", "fna", w=100)

In [None]:
%%timeit
success_n, counts = count_kmers(rec, kmer_4, 4, "test", "fna", w=100)

In [None]:
%%timeit
kmer_4[max(kmer_4, key=kmer_4.get)]

In [None]:
%%timeit
max(kmer_4.values())

#### Checking paths 

In [None]:
# NOTE: despite its name, `fna_path` already ends in ".gbk", so the replace()
# below leaves it unchanged.
fna_path = "/mnt/genomeDB/ncbi/genomes/Bacteria/Aciduliprofundum_boonei_T469_uid43333/NC_013926.gbk"
path_gbk = fna_path.replace(".fna", ".gbk")
with open(path_gbk) as gbk:
    description = gbk.read()
# Same extraction as in kmer_pkl_path: the taxo id is the text between the
# marker and the first closing quote followed by a newline.
identificator = 'db_xref="taxon:'
taxo_start = description.find(identificator)
taxo = description[taxo_start+len(identificator):
                   taxo_start+description[taxo_start:].find('"\n')]

#### Testing deepcopy speed

In [None]:
from copy import deepcopy

In [None]:
dic_template = kmers_dic(4)

In [None]:
%%timeit
new_dic = deepcopy(dic_template)

In [None]:
%%timeit
new_dic = dic_template.copy()

#### First attempts on reading sequence

In [None]:
for f in os.scandir():
    if f.name.endswith("fna"):
        print(f"{f.name}\t{os.path.getsize(f):>10,d} bytes")

In [None]:
%%timeit
rec = SeqIO.read("NC_017102.fna", "fasta")

In [None]:
rec

In [None]:
mer2 = {f"{a}{b}":0 for a in nucleotides for b in nucleotides}

In [None]:
mer2 

#### DataFrame manipulation

In [None]:
# NOTE(review): no top-level cell shown here defines `df` (it is only a local
# of count_all) — presumably left over from an interactive session; confirm
# before re-running.
df2 = df.copy()

In [None]:
for col in df2.columns:
    if col not in ["bacteria", "fna", "start"]:
        print(col)
        df2[col] = df2[col].astype("uint8")

In [None]:
df2

In [None]:
df.memory_usage()

In [None]:
df2.memory_usage()