In [1]:
import pandas as pd
import os
import os.path as osp
from sklearn.cluster import MiniBatchKMeans

In [2]:
# Helpers for nucleotide k-mer manipulation
nucleotides = "ACGT"

def kmers_dic(n, choice=nucleotides):
    """Return a dict mapping every k-mer of length ``n`` to ``0.0``.

    Args:
        n: k-mer length, must be >= 1.
        choice: alphabet the k-mers are built from. It is used for every
            position of the k-mer (previously it only applied to the first
            position, the remaining ones silently fell back to "ACGT").

    Returns:
        dict with one ``0.0`` entry per k-mer, in the order produced by
        ``combinaisons``.

    Raises:
        ValueError: if ``n`` is smaller than 1.
    """
    return {a: 0.0 for a in combinaisons(choice, n, instances=choice)}

def combinaisons(combi, n, instances=nucleotides):
    """Build all strings of ``n`` symbols: a seed from ``combi`` followed by
    ``n - 1`` symbols taken from ``instances``.

    Args:
        combi: iterable of seed strings (a plain string is iterated
            character by character).
        n: total number of symbols per result, must be >= 1.
        instances: symbols appended at each additional position.

    Returns:
        A list of strings. The last position varies fastest, e.g.
        ``combinaisons("ACGT", 2)`` starts with ``["AA", "AC", "AG", "AT", "CA"]``.
        A list is returned for ``n == 1`` as well, so callers always get the
        same type.

    Raises:
        ValueError: if ``n`` is smaller than 1 (the recursive version never
            terminated in that case).
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n!r}")

    kmers = list(combi)
    # Extend every prefix by one symbol, n - 1 times.
    for _ in range(n - 1):
        kmers = [f"{prefix}{symbol}" for prefix in kmers for symbol in instances]
    return kmers

In [3]:
# CSV file of k-mer counts that the pd.read_csv cells below load.
path = "/mnt/data/Segmentation/k3_s50000/kmer_counts/all-counts.k3_s50000_oplant-vertebrate.csv"

In [4]:
# Column -> dtype mapping passed to pd.read_csv: the metadata columns first,
# then one float entry per k-mer column.
dtype = dict(
    taxon=int,
    category=str,
    start=int,
    end=int,
    name=str,
    description=str,
    fna_path=str,
)
# NOTE(review): k-mers of length 4 are registered here, whereas `path` points
# to a k3 file and `col_kmers` uses length 3 -- confirm which one is intended.
for kmer in combinaisons(nucleotides, 4):
    dtype.update({kmer: float})

In [14]:
# Names of all 3-mer columns; used as `usecols` when reading the CSV.
col_kmers = combinaisons(nucleotides, 3)

In [30]:
# Number of k-means clusters, and the row count used both as CSV chunk size
# and as MiniBatchKMeans batch size.
n_clusters, chunksize = 10, 100_000

In [33]:
# Reference run: load the k-mer columns in one go and fit on the whole table.
df = pd.read_csv(
    path,
    usecols=col_kmers,
    dtype=dtype,
)
ml_model = MiniBatchKMeans(
    n_clusters=n_clusters,
    batch_size=chunksize,
    max_iter=100,
    random_state=3,
)
ml_model.fit(df)
# Last expression of the cell: show the fitted centroids.
ml_model.cluster_centers_

array([[ 256.46865989,  490.24555789,  474.97327184,  266.64086442,
         387.72323787,  658.48490361, 1254.05935146,  248.5027098 ,
         369.64079649, 1042.79182078,  560.28870337,  248.34764723,
         161.83997826,  921.96710497,  591.30288643,  267.09322776,
         416.3720128 ,  772.77837744,  860.88774324,  592.27045938,
         569.91280325,  645.36647243, 1953.95136698,  558.83460395,
        1942.23868148, 3069.27787926, 1949.0111034 , 1253.84604701,
         139.65006567,  744.41352033,  862.78635588,  471.88883018,
         745.71564439, 1024.19874398,  746.15561359,  921.43227759,
        1143.20151417, 1839.62139008, 3066.92074395, 1043.1488957 ,
         584.7992331 , 1838.29288507,  643.01700608,  656.85185157,
         260.99570508, 1023.10768255,  771.89961655,  490.47216226,
          69.77322202,  261.55201461,  139.05296569,  161.85643333,
         541.47109796,  584.58938573, 1939.44115427,  368.25395148,
         540.82901073, 1142.528034  ,  570.64699

In [32]:
# Same estimator settings, but trained incrementally: the CSV is streamed in
# chunks of `chunksize` rows and each chunk is fed to partial_fit.
ml_model = MiniBatchKMeans(
    n_clusters=n_clusters,
    batch_size=chunksize,
    max_iter=100,
    random_state=3,
)
for partial_df in pd.read_csv(
    path,
    usecols=col_kmers,
    dtype=dtype,
    chunksize=chunksize,
):
    ml_model.partial_fit(partial_df)
# Last expression of the cell: show the centroids after the streaming pass.
ml_model.cluster_centers_

array([[2572.06545024,  852.54628882, 1262.14412778, 1848.38832822,
         826.25449432,  410.53222725,  376.92644922,  655.28074708,
        1112.46000861,  634.23740513,  585.80868185,  813.63917595,
        1518.99071532,  788.77656225, 1074.36909952, 1840.98016578,
        1079.31530222,  362.84382905,  582.36292319,  798.59867323,
         502.84814845,  174.20785564,  203.73580386,  371.68616449,
         381.70399376,  222.38671349,  238.04110824,  363.79163303,
         627.2397734 ,  314.99648797,  485.15718015,  876.27864794,
        1302.21852629,  361.89968513,  521.81018892, 1019.4227488 ,
         648.09791969,  256.33968459,  280.20357662,  581.9636552 ,
         644.64478713,  310.46602347,  310.78935895,  568.19129124,
         811.92285645,  319.50464234,  506.9558776 ,  921.2696862 ,
        1581.54942408,  691.70665805,  779.83640131, 1556.69682168,
         845.91672049,  411.39696701,  345.05822434,  694.73496959,
        1066.53919748,  599.51597233,  699.45734

In [None]:
# Keep updating the existing `ml_model` with smaller chunks.
chunksize = 1000
# `usecols=col_kmers` is required here: without it each chunk also carries the
# text metadata columns (category, name, description, fna_path), which
# partial_fit cannot convert to numbers, and the feature count would no longer
# match the k-mer columns the model was previously fitted on.
for partial_df in pd.read_csv(path, chunksize=chunksize, dtype=dtype, usecols=col_kmers):
    ml_model.partial_fit(partial_df)
ml_model.cluster_centers_

In [29]:
# Peek at one randomly drawn row of the last chunk that was read.
partial_df.sample(n=1)

Unnamed: 0,AAAA,AAAC,AAAG,AAAT,AACA,AACC,AACG,AACT,AAGA,AAGC,...,TTCG,TTCT,TTGA,TTGC,TTGG,TTGT,TTTA,TTTC,TTTG,TTTT
990067,370,333,293,349,406,291,258,208,254,195,...,302,314,325,259,134,322,336,517,370,569


In [14]:
# Inspect the dtypes of the first ten columns of the last chunk.
partial_df.dtypes.head(10)

taxon           int64
category       object
start           int64
end             int64
name           object
description    object
fna_path       object
AAAA            int64
AAAC            int64
AAAG            int64
dtype: object

# Iterate through all .pd files

In [52]:
from pathlib import Path
from tqdm.notebook import tqdm

In [35]:
# Root directory that counts_buffer searches recursively for pickled count tables (*.pd).
path_counts = "/mnt/data/Segmentation/k3_s50000/kmer_counts/counts.k3_s50000/"

In [104]:
def counts_buffer(path_counts, chunksize=10000):
    """Stream the pickled count tables under ``path_counts`` as fixed-size chunks.

    Every ``*.pd`` file matched by ``Path(path_counts).rglob("*/*.pd")`` is
    loaded and appended to a buffer; whenever the buffer holds at least
    ``chunksize`` rows it is concatenated and sliced into chunks.

    Args:
        path_counts: root directory to search.
        chunksize: number of rows per yielded DataFrame, must be >= 1.

    Yields:
        DataFrames with a fresh 0-based index. All chunks have exactly
        ``chunksize`` rows, except the last one which holds whatever is left
        (the leftover rows used to be dropped silently).

    Raises:
        ValueError: if ``chunksize`` is smaller than 1 (raised on first iteration,
            since this is a generator).
    """
    if chunksize < 1:
        raise ValueError(f"chunksize must be >= 1, got {chunksize!r}")

    buffer = []
    rows = 0

    for path in Path(path_counts).rglob("*/*.pd"):
        # NOTE: unpickling can execute arbitrary code; only read trusted files.
        df = pd.read_pickle(path)
        rows += df.shape[0]
        buffer.append(df)

        if rows >= chunksize:
            df_concat = pd.concat(buffer, ignore_index=True)
            # A single large file may complete several chunks at once; emit
            # all of them so the backlog never grows beyond one chunk.
            full_chunks = rows // chunksize
            for k in range(full_chunks):
                chunk = df_concat.iloc[k * chunksize:(k + 1) * chunksize, :]
                yield chunk.reset_index(drop=True)
            rows -= full_chunks * chunksize
            buffer = [df_concat.iloc[full_chunks * chunksize:, :]] if rows else []

    # Flush the rows that did not fill a complete chunk.
    if rows:
        yield pd.concat(buffer, ignore_index=True)

In [101]:
def counts_buffer(path_counts, chunksize=10000):
    """Stream the pickled count tables under ``path_counts`` as fixed-size chunks.

    Variant that splits each incoming table *before* concatenating, so only
    the rows belonging to the current chunk are ever concatenated.

    Args:
        path_counts: root directory searched with ``rglob("*/*.pd")``.
        chunksize: number of rows per yielded DataFrame, must be >= 1.

    Yields:
        DataFrames with a fresh 0-based index. All chunks have exactly
        ``chunksize`` rows, except the last one which holds whatever is left
        (the leftover rows used to be dropped silently).

    Raises:
        ValueError: if ``chunksize`` is smaller than 1 (raised on first iteration,
            since this is a generator).
    """
    if chunksize < 1:
        raise ValueError(f"chunksize must be >= 1, got {chunksize!r}")

    buffer = []
    rows = 0  # rows currently held in `buffer`; always < chunksize between files

    for path in Path(path_counts).rglob("*/*.pd"):
        # NOTE: unpickling can execute arbitrary code; only read trusted files.
        df = pd.read_pickle(path)

        # Peel off every chunk this table can complete. Looping (instead of a
        # single `if`) keeps `split_row` positive even when one table is
        # larger than `chunksize`; the old code computed a negative split in
        # that case and produced wrong-sized chunks.
        while rows + df.shape[0] >= chunksize:
            split_row = chunksize - rows
            buffer.append(df.iloc[:split_row, :])
            yield pd.concat(buffer, ignore_index=True)
            df = df.iloc[split_row:, :]
            buffer = []
            rows = 0

        if df.shape[0]:
            buffer.append(df)
            rows += df.shape[0]

    # Flush the rows that did not fill a complete chunk.
    if rows:
        yield pd.concat(buffer, ignore_index=True)

In [95]:
# Rows per chunk passed to counts_buffer in the cells below.
chunksize = 10000

In [105]:
# %%timeit
# Pull chunks from the generator and stop once more than five have been read;
# `df` keeps the last chunk and `i` the number of chunks consumed.
i = 0
for i, df in enumerate(tqdm(counts_buffer(path_counts, chunksize=chunksize)), start=1):
    if i > 5:
        break

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

In [106]:
# Display the last chunk fetched by the preview loop above.
df

Unnamed: 0,taxon,category,start,end,name,description,fna_path,AAA,AAC,AAG,...,TCG,TCT,TGA,TGC,TGG,TGT,TTA,TTC,TTG,TTT
0,395960,complete genome,5250000,5300000,NC_011004.1,|kraken:taxid|395960|s:5250000-e:5299999|NC_01...,/ssd1500/NCBI/20190704/refseq/bacteria/GCF_000...,302,548,540,...,1709,508,672,961,703,429,144,789,611,303
1,395960,complete genome,5300000,5350000,NC_011004.1,|kraken:taxid|395960|s:5300000-e:5349999|NC_01...,/ssd1500/NCBI/20190704/refseq/bacteria/GCF_000...,273,511,576,...,1701,449,612,1015,684,380,91,699,539,248
2,395960,complete genome,5350000,5400000,NC_011004.1,|kraken:taxid|395960|s:5350000-e:5399999|NC_01...,/ssd1500/NCBI/20190704/refseq/bacteria/GCF_000...,254,532,538,...,1722,427,674,958,700,389,76,699,542,230
3,395960,complete genome,5400000,5450000,NC_011004.1,|kraken:taxid|395960|s:5400000-e:5449999|NC_01...,/ssd1500/NCBI/20190704/refseq/bacteria/GCF_000...,218,476,465,...,1726,443,668,1037,716,389,88,641,535,209
4,395960,complete genome,5450000,5500000,NC_011004.1,|kraken:taxid|395960|s:5450000-e:5499999|NC_01...,/ssd1500/NCBI/20190704/refseq/bacteria/GCF_000...,191,470,413,...,1823,537,665,1070,759,476,87,808,732,249
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,429009,complete genome,0,50000,NC_013385.1,|kraken:taxid|429009|s:0-e:49999|NC_013385.1 A...,/ssd1500/NCBI/20190704/refseq/bacteria/GCF_000...,661,527,1178,...,601,683,549,812,1287,274,425,826,548,707
9996,429009,complete genome,50000,100000,NC_013385.1,|kraken:taxid|429009|s:50000-e:99999|NC_013385...,/ssd1500/NCBI/20190704/refseq/bacteria/GCF_000...,693,446,1105,...,570,774,472,930,1417,331,495,857,657,934
9997,429009,complete genome,100000,150000,NC_013385.1,|kraken:taxid|429009|s:100000-e:149999|NC_0133...,/ssd1500/NCBI/20190704/refseq/bacteria/GCF_000...,791,505,1175,...,614,751,596,798,1132,358,480,897,633,970
9998,429009,complete genome,150000,200000,NC_013385.1,|kraken:taxid|429009|s:150000-e:199999|NC_0133...,/ssd1500/NCBI/20190704/refseq/bacteria/GCF_000...,725,487,1170,...,557,657,500,822,1252,276,471,858,553,1002


In [102]:
# %%timeit
# Pull chunks from the generator and stop once more than five have been read;
# `df` keeps the last chunk and `i` the number of chunks consumed.
i = 0
for i, df in enumerate(tqdm(counts_buffer(path_counts, chunksize=chunksize)), start=1):
    if i > 5:
        break

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

In [103]:
# Display the last chunk fetched by the preview loop above.
df

Unnamed: 0,taxon,category,start,end,name,description,fna_path,AAA,AAC,AAG,...,TCG,TCT,TGA,TGC,TGG,TGT,TTA,TTC,TTG,TTT
0,395960,complete genome,5250000,5300000,NC_011004.1,|kraken:taxid|395960|s:5250000-e:5299999|NC_01...,/ssd1500/NCBI/20190704/refseq/bacteria/GCF_000...,302,548,540,...,1709,508,672,961,703,429,144,789,611,303
1,395960,complete genome,5300000,5350000,NC_011004.1,|kraken:taxid|395960|s:5300000-e:5349999|NC_01...,/ssd1500/NCBI/20190704/refseq/bacteria/GCF_000...,273,511,576,...,1701,449,612,1015,684,380,91,699,539,248
2,395960,complete genome,5350000,5400000,NC_011004.1,|kraken:taxid|395960|s:5350000-e:5399999|NC_01...,/ssd1500/NCBI/20190704/refseq/bacteria/GCF_000...,254,532,538,...,1722,427,674,958,700,389,76,699,542,230
3,395960,complete genome,5400000,5450000,NC_011004.1,|kraken:taxid|395960|s:5400000-e:5449999|NC_01...,/ssd1500/NCBI/20190704/refseq/bacteria/GCF_000...,218,476,465,...,1726,443,668,1037,716,389,88,641,535,209
4,395960,complete genome,5450000,5500000,NC_011004.1,|kraken:taxid|395960|s:5450000-e:5499999|NC_01...,/ssd1500/NCBI/20190704/refseq/bacteria/GCF_000...,191,470,413,...,1823,537,665,1070,759,476,87,808,732,249
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,429009,complete genome,0,50000,NC_013385.1,|kraken:taxid|429009|s:0-e:49999|NC_013385.1 A...,/ssd1500/NCBI/20190704/refseq/bacteria/GCF_000...,661,527,1178,...,601,683,549,812,1287,274,425,826,548,707
9996,429009,complete genome,50000,100000,NC_013385.1,|kraken:taxid|429009|s:50000-e:99999|NC_013385...,/ssd1500/NCBI/20190704/refseq/bacteria/GCF_000...,693,446,1105,...,570,774,472,930,1417,331,495,857,657,934
9997,429009,complete genome,100000,150000,NC_013385.1,|kraken:taxid|429009|s:100000-e:149999|NC_0133...,/ssd1500/NCBI/20190704/refseq/bacteria/GCF_000...,791,505,1175,...,614,751,596,798,1132,358,480,897,633,970
9998,429009,complete genome,150000,200000,NC_013385.1,|kraken:taxid|429009|s:150000-e:199999|NC_0133...,/ssd1500/NCBI/20190704/refseq/bacteria/GCF_000...,725,487,1170,...,557,657,500,822,1252,276,471,858,553,1002
