In [49]:
import glob, os, time, sys
import json
import numpy as np
from collections import defaultdict
import argparse

from dataset.genome import GenomeDataset
from dataset.utils import load_meta_reads, create_document
import utils.utils as utils
from debug.visualize import get_group_label, visualize
from utils.metrics import genome_acc, group_precision_recall

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

from multiprocessing import Pool, cpu_count

from sklearn.cluster import KMeans

In [50]:
cores = cpu_count()
cores

8

In [31]:
sys.path.append('.')

DATASET_DIR = '../../Data/reads/'                   # Raw fasta data dir
BIMETAOUT_DIR = '../../Data/bimetaout/20210904/'    # bimeta output dir
DATASET_NAME = 'S1'                                 # Specifc fasta dataset or all of them
RESULT_DIR = '../../Data/doc2vecbimetaout/'    # Result dir

In [56]:
# Hyperparameters
KMERS = [4]
VECTOR_SIZE = 10
WINDOW_SIZE = 8
EPOCHS = 20
WORKERS = 2

In [27]:
# Mapping of dataset and its corresponding number of clusters
with open('config/dataset_metadata.json', 'r') as f:
    n_clusters_mapping = json.load(f)['datasets']

In [28]:
# Get some parameters
dataset_file = os.path.join(DATASET_DIR, DATASET_NAME + '.fna')
dataset_name = os.path.basename(dataset_file).split('.fna')[0]

print("-------------------------------------------------------")
print('Processing dataset: ', dataset_name)

-------------------------------------------------------
Processing dataset:  S1


In [29]:
n_clusters = n_clusters_mapping[dataset_name]
print('Prior number of clusters: ', n_clusters)

Prior number of clusters:  2


In [32]:
t0 = time.time()
# Load group file (phase 1 of bimeta) according to dataset_name
groups, seeds = utils.load_groups_seeds(BIMETAOUT_DIR, dataset_name)
print('Total number of groups: ', len(groups))
print('Time to load groups: ', (time.time() - t0))

Total number of groups:  152
Time to load groups:  0.01984119415283203


In [33]:
# Read fasta dataset
t1 = time.time()
reads, labels = load_meta_reads(dataset_file, type='fasta')
print('Total number of reads: ', len(labels))
print('Time to load reads: ', (time.time() - t1))

Total number of reads:  96367
Time to load reads:  5.307618618011475


In [34]:
t2 = time.time()
# Creating document from reads...
dict, docs = create_document(reads, KMERS)

# Tokenization of each document
#tokenized_docs = []
#for doc in docs:
#    tokenized_docs.append(word_tokenize(doc.lower()))

# Convert tokenized document into gensim formated tagged data
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(docs)]

print('Time to create docs from reads: ', (time.time() - t2))

Time to create docs from reads:  3.607961893081665


In [57]:
t3 = time.time()
# Train doc2vec model
model = Doc2Vec(tagged_data, 
                vector_size = VECTOR_SIZE, 
                window = WINDOW_SIZE, 
                workers = WORKERS, 
                epochs = EPOCHS)

# Save trained doc2vec model
model_file = os.path.join(RESULT_DIR, dataset_name + '.doc2vec.model')
model.save(model_file)

# Load saved doc2vec model
#model= Doc2Vec.load(model_file)

print('Doc2Vec model training time: ', (time.time() - t3))

Doc2Vec model training time:  37.13066864013672


In [58]:
t4 = time.time()
print('Compute doc2vec feature ...')
ndocs = len(docs)
doc2vec = np.zeros((ndocs, VECTOR_SIZE))
for i in range(ndocs):
    doc2vec[i] = model.infer_vector(docs[i])

print('Compute doc2vec feature time: ', (time.time() - t4))

Compute doc2vec feature ...
Compute doc2vec feature time:  72.98422932624817


In [61]:
t5 = time.time()
print('Compute group feature ...')
ngroups = len(seeds)
doc2vec_group_features = np.zeros((ngroups, VECTOR_SIZE))
for i in range(ngroups):
    doc2vec_group_features[i] = np.mean([doc2vec[idx] for idx in seeds[i]])
print('Compute group feature time: ', (time.time() - t5))

Compute group feature ...
Compute group feature time:  0.03279876708984375


In [62]:
# Clustering groups
t6 = time.time()
print('Clustering ...')
kmeans = KMeans(
    init="random",
    n_clusters=n_clusters,
    n_init=100,
    max_iter=200,
    random_state=20210905)
kmeans.fit(X=doc2vec_group_features, y=labels)
y_pred_kmeans = kmeans.predict(X=doc2vec_group_features)
print('Clustering time: ', (time.time() - t6))

Clustering ...
Clustering time:  0.1766681671142578


In [63]:
# Map read to group and compute F-measure
t6 = time.time()
groupPrec = group_precision_recall(labels, groups, n_clusters)[0]
f1 = genome_acc(groups, y_pred_kmeans, labels, n_clusters)[2]
print('Compute measures: ')
print('Group precision: ', groupPrec)
print('F1-score: ', f1)
print('Total time: ', (time.time() - t0))

Compute measures: 
Group precision:  0.9898824286322081
F1-score:  0.7120773447036292
Total time:  2809.8840293884277
