# Exploring the reference database
Let's see what properties we can find :)

## Structure
In `genomes/` there are multiple sub-folders; we will start with `Bacteria`.
It contains all recorded species/strains, each in its own folder.


## Content of each species/strain folder
In each folder there's:
- .ASN with 
 - `taxname "Acetobacter pasteurianus IFO 3283-32"`
 - `db "taxon", tag id 634457`
 - `genus "Acetobacter", species "pasteurianus"`
 - `mod { {subtype strain, subname "IFO 3283" }, { subtype substrain, subname "IFO 3283-32" } },`
 - `lineage "Bacteria; Proteobacteria; Alphaproteobacteria; Rhodospirillales; Acetobacteraceae; Acetobacter",`
- .FAA
 - with multiple ">gi|384064451|ref|YP_005479409.1| hypothetical protein APA32_44160 [Acetobacter pasteurianus IFO 3283-32]"
 - and probably the amino-acid sequence for each of these proteins
- .FFN
 - multiple ">gi|384064450|ref|NC_017102.1|:c562-116 Acetobacter pasteurianus IFO 3283-32 plasmid pAPA32-040, complete sequence"
 - probably DNA sequence
- .FNA
 - Also DNA
- .GBK: human-readable format with most of the information!
 - has an identifier `/db_xref="taxon:634457"`
- .GFF with `##species http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=634457`
- .RPT
 - looks like a simple INI-style config file (the kind Python's `configparser` reads):
   - `DNA  length = 3035`
   - `Taxname: Acetobacter pasteurianus IFO 3283-32`
   - `Taxid: 634457`


http://defindit.com/readme_files/ncbi_file_extension_format.html

What we need is the taxonomy id, the name, and the DNA sequence, which can be found in:
 - .gbk for the taxonomy id and name
 - .fna for the sequence

#### File marker
https://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/?report=objectonly <br>
`NC_	Genomic	Complete genomic molecule, usually reference assembly`

#### Status
https://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_status_codes/?report=objectonly <br>
in `COMMENT` : VALIDATED > REVIEWED > PROVISIONAL > ...


## Coding
### Import and Paths

In [1]:
import os
import pandas as pd
import numpy as np
import configparser
import pickle
from Bio import SeqIO
from time import time
from tqdm import tqdm_notebook as tqdm

In [2]:
# Folder of the local NCBI reference database that is explored in this notebook.
path_ref_db = "/mnt/genomeDB/ncbi/genomes/Bacteria/"
# Base folder of the k-mer frequency files; sub-folders and .pkl paths are built from it.
path_kmer_freq = "/home/sjriondet/Data/Kmer_frequencies/"

In [3]:
# Make the reference database folder the current working directory.
os.chdir(path_ref_db)

## Functions

In [11]:
def normalise_counts(kmer_count):
    """Scale every inner dict of a nested dict by its own maximum, in place.

    Args:
        kmer_count: nested mapping ``{outer_key: {key: count}}``. Each inner
            dict is normalised independently, so its largest value becomes 1.

    Returns:
        None. ``kmer_count`` is modified in place.

    Inner dicts that are empty, or whose maximum is 0, are left untouched
    instead of raising ``ValueError`` / ``ZeroDivisionError``.
    """
    for counts in kmer_count.values():
        # ``default`` keeps ``max`` from raising on an empty inner dict.
        max_val = max(counts.values(), default=0)
        if max_val == 0:
            # Nothing to scale by; dividing would raise ZeroDivisionError.
            continue
        for key in counts:
            counts[key] /= max_val

## Tests

In [4]:
# "4_V2/" sub-folder of the k-mer frequency folder.
path_4mer = os.path.join(path_kmer_freq, "4_V2/")

In [24]:
# Collect the path and the extension-less name of every ``*.pd`` file found
# directly inside ``path_4mer``. Nothing is loaded here, so the files are not
# opened and ``counts`` stays empty.
names = []
files = []
counts = []
with os.scandir(path_4mer) as entries:
    for entry in entries:
        if entry.name.endswith(".pd"):
            files.append(entry.path)
            names.append(os.path.splitext(entry.name)[0])
print(f"{len(names)} files")

In [26]:
# Load the first file that was found.
# NOTE: unpickling can execute arbitrary code; only read files you trust.
df = pd.read_pickle(files[0])

In [67]:
# Largest ``start`` value recorded for each ``fna``; used as the "length" of
# that sequence (it is the last window start, not an exact base count).
genomes_max = df[["fna", "start"]].groupby(by=["fna"]).max()
# One vectorised lookup instead of re-filtering the whole frame per sequence.
df["len_genome"] = df["fna"].map(genomes_max["start"])

In [61]:
# Integer bucket of the start position: rows whose ``start`` falls in the same
# block of 1000 share the same ``pack1000`` value.
df["pack1000"] = df.start // 1000

In [68]:
df

Unnamed: 0,bacteria,fna,start,AAAA,AAAC,AAAG,AAAT,AACA,AACC,AACG,...,TTGA,TTGC,TTGG,TTGT,TTTA,TTTC,TTTG,TTTT,len_genome,pack1000
0,Chamaesiphon_minutus_PCC_6605_uid183005,NC_019697,0,0,1,1,1,1,2,0,...,0,1,2,2,1,1,0,1,6283800,0
1,Chamaesiphon_minutus_PCC_6605_uid183005,NC_019697,200,0,1,3,1,2,1,1,...,0,1,1,0,1,0,1,2,6283800,0
2,Chamaesiphon_minutus_PCC_6605_uid183005,NC_019697,400,1,1,1,2,1,0,1,...,2,0,1,0,1,2,2,1,6283800,0
3,Chamaesiphon_minutus_PCC_6605_uid183005,NC_019697,600,5,1,4,2,0,1,1,...,2,0,1,2,3,1,2,0,6283800,0
4,Chamaesiphon_minutus_PCC_6605_uid183005,NC_019697,800,0,0,0,1,1,1,2,...,1,0,1,0,0,0,1,0,6283800,0
5,Chamaesiphon_minutus_PCC_6605_uid183005,NC_019697,1000,3,0,3,0,0,0,1,...,1,1,1,0,4,1,0,4,6283800,1
6,Chamaesiphon_minutus_PCC_6605_uid183005,NC_019697,1200,3,0,2,2,0,0,1,...,0,1,0,0,0,0,0,0,6283800,1
7,Chamaesiphon_minutus_PCC_6605_uid183005,NC_019697,1400,0,0,0,1,0,0,1,...,1,2,1,0,1,1,2,4,6283800,1
8,Chamaesiphon_minutus_PCC_6605_uid183005,NC_019697,1600,2,1,2,5,0,0,1,...,2,1,0,1,1,1,0,0,6283800,1
9,Chamaesiphon_minutus_PCC_6605_uid183005,NC_019697,1800,2,1,2,0,0,1,0,...,0,2,1,2,0,1,4,3,6283800,1


In [27]:
# k-mer columns are those whose name is made only of nucleotide letters.
# Selecting by name instead of by position (previously ``[3:]``) keeps helper
# columns such as ``len_genome`` / ``pack1000`` out of the list, whatever the
# order in which the cells are run.
nucleotides = set("ACGT")
kmers_list = [
    col for col in df.columns
    if isinstance(col, str) and col and set(col) <= nucleotides
]

In [104]:
# Columns that are not k-mer counts, in sorted order.
cols_spe = sorted(set(df.columns) - set(kmers_list))

In [107]:
# Reorder the columns: non-k-mer columns first, then the k-mer counts.
df = df.reindex(columns=cols_spe + kmers_list)

In [109]:
df.head(10)

Unnamed: 0,bacteria,fna,len_genome,pack1000,start,AAAA,AAAC,AAAG,AAAT,AACA,...,TTCG,TTCT,TTGA,TTGC,TTGG,TTGT,TTTA,TTTC,TTTG,TTTT
0,Chamaesiphon_minutus_PCC_6605_uid183005,NC_019697,6283800,0,0,0,1,1,1,1,...,0,0,0,1,2,2,1,1,0,1
1,Chamaesiphon_minutus_PCC_6605_uid183005,NC_019697,6283800,0,200,0,1,3,1,2,...,1,0,0,1,1,0,1,0,1,2
2,Chamaesiphon_minutus_PCC_6605_uid183005,NC_019697,6283800,0,400,1,1,1,2,1,...,2,2,2,0,1,0,1,2,2,1
3,Chamaesiphon_minutus_PCC_6605_uid183005,NC_019697,6283800,0,600,5,1,4,2,0,...,2,3,2,0,1,2,3,1,2,0
4,Chamaesiphon_minutus_PCC_6605_uid183005,NC_019697,6283800,0,800,0,0,0,1,1,...,1,0,1,0,1,0,0,0,1,0
5,Chamaesiphon_minutus_PCC_6605_uid183005,NC_019697,6283800,1,1000,3,0,3,0,0,...,1,0,1,1,1,0,4,1,0,4
6,Chamaesiphon_minutus_PCC_6605_uid183005,NC_019697,6283800,1,1200,3,0,2,2,0,...,1,1,0,1,0,0,0,0,0,0
7,Chamaesiphon_minutus_PCC_6605_uid183005,NC_019697,6283800,1,1400,0,0,0,1,0,...,1,1,1,2,1,0,1,1,2,4
8,Chamaesiphon_minutus_PCC_6605_uid183005,NC_019697,6283800,1,1600,2,1,2,5,0,...,3,0,2,1,0,1,1,1,0,0
9,Chamaesiphon_minutus_PCC_6605_uid183005,NC_019697,6283800,1,1800,2,1,2,0,0,...,1,1,0,2,1,2,0,1,4,3


In [110]:
# Sum the rows of each (fna, pack1000) group.
# NOTE(review): no column selection is made here, so every non-key column of
# ``df`` is passed to ``sum`` — not only the k-mer counts.
df1000 = df.groupby(["fna", "pack1000"]).sum()

In [122]:
# Select one (fna, pack1000) row by label. Comparing the MultiIndex to a tuple
# with ``==`` gave a single ``False`` here, which is what raised the former
# ``KeyError: False``; ``.loc`` with a list of tuples does the label lookup.
df1000.loc[[("NC_019697", 5231)]]

In [115]:
# Sum of the k-mer counts only, per (fna, pack1000) group.
df1000 = df.groupby(["fna", "pack1000"])[kmers_list].sum()

In [120]:
# Preview the first rows (the bare ``df1000.`` was an incomplete expression).
df1000.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,AAAA,AAAC,AAAG,AAAT,AACA,AACC,AACG,AACT,AAGA,AAGC,...,TTCG,TTCT,TTGA,TTGC,TTGG,TTGT,TTTA,TTTC,TTTG,TTTT
fna,pack1000,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
NC_019697,0,6.0,4.0,9.0,7.0,5.0,5.0,5.0,3.0,9.0,8.0,...,6.0,5.0,5.0,2.0,6.0,4.0,6.0,4.0,6.0,4.0
NC_019697,1,10.0,2.0,9.0,8.0,0.0,1.0,4.0,3.0,5.0,8.0,...,7.0,3.0,4.0,7.0,3.0,3.0,6.0,4.0,6.0,11.0
NC_019697,2,8.0,4.0,6.0,9.0,5.0,3.0,6.0,5.0,8.0,2.0,...,2.0,3.0,5.0,3.0,2.0,4.0,11.0,4.0,5.0,12.0
NC_019697,3,4.0,0.0,3.0,6.0,2.0,0.0,3.0,2.0,1.0,6.0,...,2.0,2.0,5.0,4.0,7.0,6.0,9.0,1.0,4.0,3.0
NC_019697,4,11.0,5.0,5.0,10.0,6.0,7.0,3.0,8.0,3.0,4.0,...,5.0,4.0,3.0,8.0,10.0,1.0,8.0,4.0,10.0,7.0
NC_019697,5,3.0,4.0,7.0,9.0,1.0,3.0,3.0,2.0,0.0,6.0,...,7.0,3.0,7.0,5.0,5.0,2.0,7.0,8.0,7.0,8.0
NC_019697,6,9.0,6.0,5.0,9.0,2.0,2.0,5.0,3.0,5.0,4.0,...,4.0,9.0,6.0,7.0,1.0,9.0,10.0,8.0,11.0,18.0
NC_019697,7,12.0,8.0,6.0,15.0,4.0,6.0,5.0,4.0,1.0,10.0,...,6.0,9.0,8.0,4.0,5.0,4.0,5.0,7.0,4.0,4.0
NC_019697,8,11.0,5.0,7.0,12.0,6.0,4.0,2.0,4.0,5.0,4.0,...,3.0,2.0,4.0,6.0,3.0,2.0,4.0,3.0,6.0,4.0
NC_019697,9,7.0,2.0,3.0,9.0,1.0,4.0,2.0,2.0,4.0,2.0,...,6.0,7.0,10.0,5.0,4.0,7.0,3.0,5.0,2.0,3.0


In [None]:
for file in files:
    # TODO: the body of this loop was never written; ``pass`` keeps the cell
    # syntactically valid until the per-file processing is added.
    pass

In [None]:
# Build a table from ``counts`` with the file names as row labels.
# NOTE(review): ``counts`` needs one entry per name; confirm it has been
# filled before running this cell.
df = pd.DataFrame(counts, index=names)

In [None]:
df

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [None]:
# Fit a 20-component PCA on the table.
pca = PCA(n_components=20)
pca.fit(df)

In [None]:
# Cumulative explained variance per number of components, with a leading 0
# for "no component".
explained = pca.explained_variance_ratio_
cumulative = np.insert(explained.cumsum(), 0, 0)
plt.scatter(x=range(len(explained) + 1), y=cumulative)
plt.show()

# Share of the variance captured by the fitted components, in percent.
captured = sum(explained) * 100
print(f"captured by PCA: {captured:0.1f}%")

In [None]:
# Refit the PCA with two components only.
pca = PCA(n_components=2)
pca.fit(df)

In [None]:
# Project the rows of the table with the fitted PCA.
t2 = pca.transform(df)

In [None]:
# Wrap the projected coordinates in a table labelled with the file names.
df_2 = pd.DataFrame(t2, index=names, columns=["pca_1", "pca_2"])

In [None]:
df_2

In [None]:
# Scatter plot of the two PCA coordinates.
df_2.plot.scatter(x="pca_1", y="pca_2")
plt.show()

## Machine Learning classification

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

In [None]:
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import ElasticNet
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
def scale_minmax(df, single_col=False):
    """Apply ``log2(x + 1)`` to the data, then min-max scale it.

    Args:
        df: pandas object holding the values to scale.
        single_col: when True, the logged values are reshaped to a single
            column (``reshape(-1, 1)``) before being scaled.

    Returns:
        The output of ``MinMaxScaler().fit_transform`` on the logged values.
    """
    logged = (df + 1).apply(np.log2)
    to_scale = logged.values.reshape(-1, 1) if single_col else logged
    return MinMaxScaler().fit_transform(to_scale)

In [None]:
def error_and_corr(model, display=True, re_val=False):
    """Evaluate ``model`` on the global test split (``X_test`` / ``y_test``).

    Args:
        model: object exposing ``predict``.
        display: print the two metrics; has no effect when ``re_val`` is True.
        re_val: return the metrics instead of the predictions.

    Returns:
        ``(pearson, mean_squared_error)`` when ``re_val`` is True, otherwise
        the predictions made on ``X_test``.
    """
    predicted = model.predict(X_test)
    pearson = np.corrcoef(predicted, y_test)[0, 1]
    mean_square_err = mean_squared_error(y_test, predicted)

    if re_val:
        return pearson, mean_square_err

    if display:
        print(
            f"Pearson correlation\t: *{pearson:.3f}*",
            f"Mean squared error\t: {mean_square_err:.3f}",
            sep="\n",
        )
    return predicted

In [None]:
def some_predictions(pred):
    """Print the first ten values of the global ``y_test`` and of ``pred``.

    Args:
        pred: predictions to show next to the expected values.
    """
    def first_ten(values):
        # Tab-separated, two decimals.
        return "\t".join([f"{n:.2f}" for n in values[:10]])

    print("Expected values \t: " + first_ten(y_test))
    print("Predicted values \t: " + first_ten(pred))

In [None]:
# Hold out 10% of the rows as a test set, with a fixed seed.
# NOTE(review): ``ic50`` is not defined in the visible part of this notebook;
# confirm which target is meant here.
X_train, X_test, y_train, y_test = train_test_split(df, ic50, test_size=0.1, random_state=0)

In [None]:
# When True, each model-building helper prints its name before fitting.
verbose = True

In [None]:
def linReg():
    """Fit a linear regression on the global training split.

    Returns:
        The fitted ``LinearRegression`` estimator.
    """
    if verbose:
        print("Linear Regression")
    model = linear_model.LinearRegression()
    # ``fit`` returns the estimator itself.
    return model.fit(X_train, y_train)

In [None]:
# Fit the linear regression, print its test metrics and a few predictions.
l_regr = linReg()
pred = error_and_corr(l_regr)
some_predictions(pred)

In [None]:
def randForest():
    """Fit a random forest regressor on the global training split.

    Returns:
        The fitted ``RandomForestRegressor`` (100 trees, depth capped at 20).
    """
    if verbose:
        print("Random Forest Regression")
    forest = RandomForestRegressor(
        n_estimators=100,
        max_depth=20,
        n_jobs=6,
        random_state=0,
    )
    # ``fit`` returns the estimator itself.
    return forest.fit(X_train, y_train)

In [None]:
# Fit the random forest, print its test metrics and a few predictions.
rf_regr = randForest()
rf_pred = error_and_corr(rf_regr)
some_predictions(rf_pred)

In [None]:
def svr():
    """Fit an RBF-kernel support vector regressor on the global training split.

    Returns:
        The fitted ``SVR`` estimator.
    """
    if verbose:
        print("Support Vector Machine regression")
    model = SVR(kernel='rbf', gamma='auto', cache_size=1000)
    # ``fit`` returns the estimator itself.
    return model.fit(X_train, y_train)

In [None]:
# Fit the support vector regressor, print its test metrics and a few predictions.
svr_rbf = svr()
svr_pred = error_and_corr(svr_rbf)
some_predictions(svr_pred)

In [None]:
def k_neigh():
    """Fit a 20-nearest-neighbours regressor on the global training split.

    Returns:
        The fitted ``KNeighborsRegressor`` estimator.
    """
    if verbose:
        print("K neighbours")
    model = KNeighborsRegressor(n_neighbors=20, n_jobs=6)
    # ``fit`` returns the estimator itself.
    return model.fit(X_train, y_train)

In [None]:
# Fit the nearest-neighbours regressor, print its test metrics and a few predictions.
neigh = k_neigh()
n_pred = error_and_corr(neigh)
some_predictions(n_pred)

In [None]:
def elas_net():
    """Fit an elastic-net regression on the global training split.

    Returns:
        The fitted ``ElasticNet`` estimator (``l1_ratio=0.5``).
    """
    if verbose:
        print("Elastic Net")
    model = ElasticNet(l1_ratio=0.5, random_state=0)
    # ``fit`` returns the estimator itself.
    return model.fit(X_train, y_train)

In [None]:
# Fit the elastic net, print its test metrics and a few predictions.
e_net = elas_net()
e_pred = error_and_corr(e_net)
some_predictions(e_pred)

In [None]:
def nn():
    """Fit a multi-layer perceptron regressor on the global training split.

    Returns:
        The fitted ``MLPRegressor`` (two hidden layers of 100 units).
    """
    if verbose:
        print("Neural Network")
    network = MLPRegressor(
        hidden_layer_sizes=(100, 100),
        tol=1e-4,
        verbose=False,
    )
    # ``fit`` returns the estimator itself.
    return network.fit(X_train, y_train)

In [None]:
# Fit the neural network, print its test metrics and a few predictions.
nn_m = nn()
nn_pred = error_and_corr(nn_m)
some_predictions(nn_pred)

In [None]:
# Model-building helpers to compare; each one fits and returns an estimator.
models = [linReg, randForest, svr, k_neigh, elas_net, nn]

In [None]:
# Fit every model and store its test-set Pearson correlation and mean squared
# error, keyed by the name of the helper that built it.
results = {}
for model in tqdm(models):
    print("******************************************")
    m = model()
    # This call prints the metrics and returns the predictions.
    pred = error_and_corr(m)
    # some_predictions(pred)
    # This call returns the metrics without printing them.
    # NOTE(review): the model predicts on the test set a second time here.
    pearson, err = error_and_corr(m, re_val=True)
    results[model.__name__] = {"pearson": pearson, "err": err}

In [None]:
# Fit a fresh random forest (``models[1]``) and get its test-set predictions
# without printing the metrics.
pred = error_and_corr(models[1](), display=False)

In [None]:
pd.DataFrame([pred, y_test.values])

In [None]:
from datetime import datetime
# CSV file the results are appended to.
# ``osp`` was never imported in this notebook; ``os.path.join`` is what the
# other cells use.
# NOTE(review): ``folder`` is not defined in the visible part of this
# notebook; confirm where the results should be written.
file_results = "results.csv"
file_results = os.path.join(folder, file_results)

In [None]:
# Append one block per run: timestamp and parameters, model names, Pearson
# correlations, then mean squared errors (one comma-separated line each).
with open(file_results, "a") as f:
    stamp_line = str(datetime.now())[:16] + "," + str(param) + "," + "\n"
    names_line = ",".join(results.keys()) + "\n"
    pearson_line = ",".join([f"{v['pearson']:.3f}" for v in results.values()]) + "\n"
    err_line = ",".join([f"{v['err']:.3f}" for v in results.values()]) + "\n"
    f.writelines([stamp_line, names_line, pearson_line, err_line])
    print(f"Results written in {file_results}")



### End of the script.
Sylvain @GIS

## Keep other methods

In [None]:
def window(fseq, window_size=53):
    """Yield each contiguous slice of ``fseq`` of length ``window_size``.

    The window moves one position at a time. Nothing is yielded when ``fseq``
    is shorter than ``window_size``.

    Args:
        fseq: sliceable sequence supporting ``len``.
        window_size: length of every yielded slice.
    """
    last_start = len(fseq) - window_size
    start = 0
    while start <= last_start:
        yield fseq[start:start + window_size]
        start += 1

In [None]:
def kmer_pkl_path(k, fna_path):
    """Return the pickle path for a genome, named after its taxonomy id.

    The id is read from the first ``db_xref="taxon:<id>"`` entry of the
    GenBank file that sits next to the ``.fna`` file (same path with ``.fna``
    replaced by ``.gbk``).

    Args:
        k: k-mer size; its string form is the sub-folder name.
        fna_path: path of the ``.fna`` file.

    Returns:
        ``os.path.join(path_kmer_freq, str(k), "<taxon id>.pkl")``.

    Raises:
        AssertionError: if the ``.gbk`` file does not exist, if no taxon id is
            found, or if the id found is 10 characters or longer. The checks
            raise explicitly rather than through ``assert`` so they still run
            under ``python -O``; the exception type is unchanged.
    """
    path_gbk = fna_path.replace(".fna", ".gbk")
    if not os.path.isfile(path_gbk):
        raise AssertionError(f"{fna_path} DOESN'T have a .gbk file ??")

    identificator = 'db_xref="taxon:'
    taxo = ""
    # Read line by line and stop at the first match, rather than loading the
    # whole GenBank file in memory to find a single entry.
    with open(path_gbk) as gbk:
        for line in gbk:
            _, found, tail = line.partition(identificator)
            if found:
                # Everything up to the closing quote is the id; this also
                # works when the line ends with "\r\n".
                taxo = tail.split('"', 1)[0].strip()
                break

    if not taxo:
        # Previously a missing entry produced a garbage or empty id.
        raise AssertionError(f"No taxon id found in {path_gbk}")
    if len(taxo) >= 10:
        raise AssertionError(
            f"The taxo id search failed, found an id of length {len(taxo)}...")

    # TODO: ADD full path of the original file in the file name, or maybe in the .pkl

    return os.path.join(path_kmer_freq, str(k), taxo + ".pkl")

In [None]:
def kmer_freq_to_file(kmer_dic, freq_path):
    """Pickle ``kmer_dic`` into ``freq_path``, creating its folder if needed.

    Args:
        kmer_dic: picklable object to store.
        freq_path: destination file; overwritten if it already exists.
    """
    parent = os.path.dirname(freq_path)
    if parent:
        # The destination sub-folder may not exist yet; without this, ``open``
        # raises FileNotFoundError.
        os.makedirs(parent, exist_ok=True)
    with open(freq_path, 'wb') as f_out:
        pickle.dump(kmer_dic, f_out)