# Set up environment

In [2]:
!pip install biopython
!git clone https://github.com/networkx/networkx-metis.git &> /dev/null
%cd networkx-metis
!python setup.py build &> /dev/null
!python setup.py install &> /dev/null

/content/networkx-metis


In [2]:
# Mount Google Drive into the Colab VM so the project files under
# /content/drive are reachable from the notebook.
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,
)

Mounted at /content/drive


In [3]:
import os
os.environ['PYTHONPATH'] = ('PYTHONPATH:/content/drive/MyDrive/metagenomic-binning/nlp-bimeta')
!export | grep PYTHONPATH

declare -x PYTHONPATH="PYTHONPATH:/content/drive/MyDrive/metagenomic-binning/nlp-bimeta"


In [4]:
import os
# Display the kernel's current working directory; as the last expression of
# the cell, the returned path is rendered as the cell output.
os.getcwd()

'/content/networkx-metis'

In [5]:
%cd ..
%cd drive/MyDrive/metagenomic-binning/nlp-bimeta

/content
/content/drive/MyDrive/metagenomic-binning/nlp-bimeta


# Begin bimeta

In [6]:
import glob, os, time, sys
import json
import numpy as np
from collections import defaultdict
import argparse

from dataset.genome import GenomeDataset
from utils.utils import load_genomics 
from debug.visualize import get_group_label, visualize
from utils.metrics import genome_acc, group_precision_recall

from sklearn.cluster import KMeans

In [7]:
# Make the current directory importable. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate '.' entry each time.
if '.' not in sys.path:
    sys.path.append('.')

# Directory containing the raw FASTA input files.
DATASET_DIR = '../data/test/input/'
# Name of the specific FASTA dataset to process (file name without '.fna').
DATASET_NAME = 'S1_test'
# Directory where the serialized graph JSON for each dataset is read/written.
RESULT_DIR = '../data/test/output/bimetaout/'

In [8]:
# Hyperparameters
# Follows metaprob
# Forwarded to load_genomics as `kmers` (presumably the k-mer sizes used for
# the feature vectors -- confirm in utils.utils.load_genomics).
KMERS = [4]
# Forwarded to load_genomics as `lmer`.
LMER = 30
# Pair of shared-read thresholds: index 1 is used when the dataset name
# contains 'R', index 0 otherwise.
NUM_SHARED_READS = (5, 45)
# Forwarded to load_genomics as `only_seed`.
ONLY_SEED = True
# Forwarded to load_genomics as `maximum_seed_size`.
MAXIMUM_SEED_SIZE = 9000

In [9]:
# Read the dataset metadata file and keep its 'datasets' table, which is later
# indexed by dataset name to obtain the prior number of clusters.
with open('config/dataset_metadata.json', 'r') as metadata_file:
    dataset_metadata = json.load(metadata_file)
n_clusters_mapping = dataset_metadata['datasets']

In [10]:
# Resolve the input file and derive the per-dataset settings from its name.
dataset_file = os.path.join(DATASET_DIR, DATASET_NAME + '.fna')
fasta_basename = os.path.basename(dataset_file)
dataset_name = fasta_basename.split('.fna')[0]

print("-------------------------------------------------------")
print('Processing dataset: ', dataset_name)

# Dataset names containing 'R' take the second shared-reads threshold; every
# other dataset takes the first one.
if 'R' in dataset_name:
    num_shared_read = NUM_SHARED_READS[1]
else:
    num_shared_read = NUM_SHARED_READS[0]

# If a serialized graph already exists for this dataset it can be loaded
# instead of being rebuilt.
serialized_graph_path = os.path.join(RESULT_DIR, dataset_name + '.json')
is_deserialize = os.path.exists(serialized_graph_path)

-------------------------------------------------------
Processing dataset:  S1_test


In [11]:
# Look up the prior number of clusters for the current dataset in the metadata
# mapping. Assuming the mapping is a plain dict as loaded from JSON, a dataset
# name that is missing from it raises KeyError here.
n_clusters = n_clusters_mapping[dataset_name]
print('Prior number of clusters: ', n_clusters)

Prior number of clusters:  2


In [12]:
# Bimeta phase 1: build (or load from cache) the seed k-mer features and groups.
t0 = time.time()

# Arguments shared by the cached attempt and the rebuild fallback.
graph_file = os.path.join(RESULT_DIR, dataset_name + '.json')
load_kwargs = dict(
    kmers=KMERS,
    lmer=LMER,
    maximum_seed_size=MAXIMUM_SEED_SIZE,
    num_shared_reads=num_shared_read,
    is_normalize=True,
    only_seed=ONLY_SEED,
    graph_file=graph_file,
)

try:
    # Serialize only when nothing was deserialized. This must be a logical
    # `not`: bitwise `~` on a bool yields -1 or -2, which are both truthy, so
    # the previous `~is_deserialize` requested serialization on every run.
    seed_kmer_features, labels, groups, seeds = load_genomics(
        dataset_file,
        is_deserialize=is_deserialize,
        is_serialize=not is_deserialize,
        **load_kwargs
    )
except Exception as load_error:
    # Without a cache the fallback below would be the identical call, so there
    # is nothing different to retry: surface the original error.
    if not is_deserialize:
        raise
    # The cached graph could not be used: report why, then rebuild from the
    # raw reads and overwrite the cache. Catching Exception (rather than a
    # bare except) lets KeyboardInterrupt stop the cell as expected.
    print('Deserialization failed, rebuilding from raw reads: ', repr(load_error))
    seed_kmer_features, labels, groups, seeds = load_genomics(
        dataset_file,
        is_deserialize=False,
        is_serialize=True,
        **load_kwargs
    )

print('Total number of reads: ', len(labels))
print('Total number of groups: ', len(groups))
print('Bimeta phase 1 time: ', (time.time() - t0))

Serializing data to... ../data/test/output/bimetaout/S1_test.json
Total number of reads:  400
Total number of groups:  396
Bimeta phase 1 time:  0.9831626415252686


  result = np.column_stack(sparse2full(doc, num_terms) for doc in corpus)


In [13]:
# Bimeta phase 2: cluster the seed features with k-means and score the result.
t1 = time.time()

# Random initialisation with a fixed random_state for repeatable runs.
kmeans_params = dict(
    init="random",
    n_clusters=n_clusters,
    n_init=100,
    max_iter=200,
    random_state=20210903,
)
kmeans = KMeans(**kmeans_params)
# `y` is accepted by KMeans.fit for API consistency only; it is not used.
kmeans.fit(X=seed_kmer_features, y=labels)
y_pred_kmeans = kmeans.predict(X=seed_kmer_features)

# First element of the group metrics and third element of the genome metrics.
group_scores = group_precision_recall(labels, groups, n_clusters)
groupPrec = group_scores[0]
genome_scores = genome_acc(groups, y_pred_kmeans, labels, n_clusters)
f1 = genome_scores[2]

print('Group precision: ', groupPrec)
print('F1-score: ', f1)
print('Clustering time: ', (time.time() - t1))
# NOTE: t0 was set in the phase 1 cell, so this is wall-clock time since that
# cell started and includes any idle time between running the two cells.
print('Total time: ', (time.time() - t0))

Group precision:  1.0
F1-score:  0.8275
Clustering time:  0.64451003074646
Total time:  26.905873775482178
