# Set up environment

In [None]:
!pip install biopython
!git clone https://github.com/networkx/networkx-metis.git &> /dev/null
%cd networkx-metis
!python setup.py build &> /dev/null
!python setup.py install &> /dev/null

/content/networkx-metis


In [None]:
# Mount Google Drive under /content/drive; force_remount=True asks for a
# remount rather than reusing an existing one.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import os
os.environ['PYTHONPATH'] = ('PYTHONPATH:/content/drive/MyDrive/metagenomic-binning/nlp-bimeta')
!export | grep PYTHONPATH

declare -x PYTHONPATH="PYTHONPATH:/content/drive/MyDrive/metagenomic-binning/nlp-bimeta"


In [2]:
import os
# Display the kernel's current working directory.
os.getcwd()

'/content'

In [3]:
#%cd ..
%cd drive/MyDrive/metagenomic-binning/nlp-bimeta

/content/drive/MyDrive/metagenomic-binning/nlp-bimeta


# Begin lda-bimeta

In [4]:
import glob, os, time, sys
import json
from collections import defaultdict
import argparse

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

from dataset.genome import GenomeDataset
from dataset.utils import load_meta_reads, create_document, create_corpus
import utils.utils as utils
from debug.visualize import get_group_label, visualize
from utils.metrics import genome_acc, group_precision_recall

In [5]:
# Make modules in the current working directory importable.
sys.path.append('.')

DATASET_DIR = '../data/input/'                   # Raw fasta data dir
BIMETAOUT_DIR = '../data/output/bimetaout/'    # bimeta output dir
DATASET_NAME = 'S1'                                 # Fasta dataset to process (file name without '.fna')
RESULT_DIR = '../data/output/ldabimetaout/'    # Result dir

In [6]:
# Hyperparameters
KMERS = [4]  # passed to create_document (presumably k-mer lengths)
IS_TFIDF = False  # forwarded to create_corpus(is_tfidf=...)
MALLET_BINARY = '../mallet/bin/mallet'  # only referenced by the commented-out Mallet trainer
N_TOPICS = [3]  # for test
#N_TOPICS = [10, 15, 20, 25, 30]  # for production

In [7]:
# Mapping of dataset and its corresponding number of clusters
with open('config/dataset_metadata.json', 'r') as f:
    n_clusters_mapping = json.load(f)['datasets']

In [8]:
# Get some parameters
dataset_file = os.path.join(DATASET_DIR, DATASET_NAME + '.fna')
dataset_name = os.path.basename(dataset_file).split('.fna')[0]

print("-------------------------------------------------------")
print('Processing dataset: ', dataset_name)

-------------------------------------------------------
Processing dataset:  S1


In [9]:
# Number of clusters configured for this dataset in the metadata mapping.
n_clusters = n_clusters_mapping[dataset_name]
print('Prior number of clusters: ', n_clusters)

Prior number of clusters:  2


In [10]:
# Read the groups and seeds written by phase 1 of bimeta for this dataset.
# t0 is also the reference point for the total-time report in the final cell.
t0 = time.time()
groups, seeds = utils.load_groups_seeds(BIMETAOUT_DIR, dataset_name)
n_groups_loaded = len(groups)
print('Total number of groups: ', n_groups_loaded)
load_groups_seconds = time.time() - t0
print('Time to load groups: ', load_groups_seconds)

Total number of groups:  152
Time to load groups:  0.04249691963195801


In [11]:
# Read fasta dataset
t1 = time.time()
reads, labels = load_meta_reads(dataset_file, type='fasta')
print('Total number of reads: ', len(labels))
print('Time to load reads: ', (time.time() - t1))

Total number of reads:  96367
Time to load reads:  6.665858268737793


In [12]:
t2 = time.time()

# Step 1: turn the reads into documents plus the dictionary describing them.
dictionary, documents = create_document(reads, KMERS)

# Step 2: build the corpus from the dictionary and documents.
corpus = create_corpus(dictionary, documents, is_tfidf=IS_TFIDF)

corpus_seconds = time.time() - t2
print('Time to create corpus from reads: ', corpus_seconds)

Time to create corpus from reads:  17.134848594665527


In [13]:
# Train one LDA model per candidate topic count, score each with a coherence
# value, remember the best-scoring candidate, and save every model's
# document-topic distribution as CSV under RESULT_DIR.
t3 = time.time()
print('LDA training ...')
n_topics_choices = len(N_TOPICS)
lda_models = [None] * n_topics_choices
top_dist_arrays = [None] * n_topics_choices
# Start from -inf: the observed coherence is already negative and nothing here
# bounds it below at -1, so a fixed floor of -1 could reject every model and
# leave max_index at -1 (which would silently select the last entry).
max_coherence = float('-inf')
max_index = -1
# np.savetxt does not create missing directories.
os.makedirs(RESULT_DIR, exist_ok=True)
print('Number of topic choices: {}'.format(n_topics_choices))
for i, n_topics in enumerate(N_TOPICS):
    print('LDA training for {} topics ...'.format(n_topics))
    # Time each model on its own; measuring from t3 would accumulate the cost
    # of all previous iterations.
    t_train = time.time()
    # Alternative trainer (Mallet), kept for reference:
    # lda_models[i] = utils.do_LDA_Mallet(MALLET_BINARY, corpus, dictionary,
    #     n_topics=n_topics, n_workers=2, n_passes=10, max_iters=200)
    lda_models[i] = utils.do_LDA_Multicore(
        corpus, dictionary,
        n_topics=n_topics, n_workers=2, n_passes=1, max_iters=50)
    print('LDA model training time for {} topics: {}'.format(n_topics, (time.time() - t_train)))

    # get coherence value
    print('Compute coherence value ...')
    coherence = utils.getCoherenceScore(lda_models[i], corpus)
    print('Coherence value for {} topics: {}'.format(n_topics, coherence))

    if max_coherence < coherence:
        max_index = i
        max_coherence = coherence

    t4 = time.time()
    print('Getting document-topics ...')
    top_dist_arrays[i] = utils.getDocTopicDist(lda_models[i], corpus)
    lda_topics_file = os.path.join(
        RESULT_DIR, dataset_name + '.' + str(n_topics) + '.topics.csv')
    np.savetxt(lda_topics_file, top_dist_arrays[i], delimiter=',')
    print('Saving LDA topics to ', lda_topics_file)
    print('Getting document-topics time: {}'.format((time.time() - t4)))

# Fail loudly instead of letting later cells index with -1 when no candidate
# was selected (empty N_TOPICS, or scores that never compare greater, e.g. NaN).
if max_index < 0:
    raise RuntimeError('No LDA model was selected: check N_TOPICS and the coherence scores')

LDA training ...
Number of topic choices: 1
LDA training for 3 topics ...
LDA model training time for 3 topics: 123.52559566497803
Compute coherence value ...
Coherence value for 3 topics: -0.37462000138136603
Getting document-topics ...
Saving LDA topics to  ../data/output/ldabimetaout/S1.3.topics.csv
Getting document-topics time: 91.54539823532104


In [14]:
# The document-topic distributions were already extracted from the models,
# so the models themselves can be released.
del lda_models

# Report which candidate topic count scored the highest coherence.
best_n_topics = N_TOPICS[max_index]
print('The best number of topics for {}: {}'.format(dataset_name, best_n_topics))
print('Max coherence score for {}: {}'.format(dataset_name, max_coherence))


The best number of topics for S1: 3
Max coherence score for S1: -0.37462000138136603


In [15]:
# Build one feature vector per group: the mean of the best model's
# document-topic rows selected by that group's member indices.
t5 = time.time()
print('Compute LDA feature ...')
ngroups = len(groups)
best_topic_dist = top_dist_arrays[max_index]
temp_group_features = []
for group_idx in range(ngroups):
    member_rows = best_topic_dist.iloc[groups[group_idx], :]
    temp_group_features.append(member_rows.mean())
# Release the per-model distributions once the group means are collected.
del best_topic_dist
del top_dist_arrays
lda_group_features = np.array(temp_group_features)
del temp_group_features
print('Compute LDA feature time: ', (time.time() - t5))


Compute LDA feature ...
Compute LDA feature time:  0.13124465942382812


In [16]:
# Clustering groups
t6 = time.time()
print('Clustering ...')
kmeans = KMeans(
    init="random",
    n_clusters=n_clusters,
    n_init=100,
    max_iter=200,
    random_state=20210905)
kmeans.fit(X=lda_group_features, y=labels)
y_pred_kmeans = kmeans.predict(X=lda_group_features)
print('Clustering time: ', (time.time() - t6))

Clustering ...
Clustering time:  0.1956636905670166


In [17]:
# Map read to group and compute F-measure
t7 = time.time()
print('Compute F-measure ...')
groupPrec = group_precision_recall(labels, groups, n_clusters)[0]
f1 = genome_acc(groups, y_pred_kmeans, labels, n_clusters)[2]
print('Group precision: ', groupPrec)
print('F1-score: ', f1)
print('Total time: ', (time.time() - t0))

Compute F-measure ...
Group precision:  0.9898824286322081
F1-score:  0.700545087530471
Total time:  674.6191530227661


In [None]:
# Toy input for experimenting: one list per document, each holding
# (topic id, score) pairs.
doc_topics = [[(0, 0.8), (1, 0.2)],[(0, 0.7),(1, 0.3)],[(0, 0.6),(1, 0.4)]]

In [None]:
# Keep only the scores: one row per document, one column per position in the
# document's topic list. Rows are built as concrete lists so the DataFrame
# constructor receives a plain list of lists, and the unused topic id gets a
# throwaway name instead of shadowing the builtin `id`.
doc_topics_list = pd.DataFrame(
    [[score for _topic_id, score in doc_topic] for doc_topic in doc_topics]
)

In [None]:
# Display the resulting frame.
doc_topics_list

Unnamed: 0,0,1
0,0.8,0.2
1,0.7,0.3
2,0.6,0.4
