In [1]:
"""Timer to compare runtimes."""
from timeit import default_timer

class Timer(object):
    """Stopwatch built on ``timeit.default_timer``.

    Can be driven manually with ``start()`` / ``stop()`` or used as a context
    manager (``with Timer() as t: ...``). After ``stop()``:

    * ``end`` holds the clock reading taken by ``stop()``.
    * ``interval`` holds ``end`` minus the reading taken by ``start()``.

    The same instance can be started and stopped repeatedly; each ``stop()``
    overwrites ``end`` and ``interval``.
    """

    def __init__(self):
        self._timer = default_timer
        # The start reading lives under a private name so that it never
        # shadows the ``start`` method on the instance.
        self._started_at = None
        self.end = None
        self.interval = None

    def __enter__(self):
        """Start timing and return the timer itself."""
        self.start()
        return self

    def __exit__(self, *args):
        """Stop timing; returns None, so exceptions from the body propagate."""
        self.stop()

    def start(self):
        """Start the timer."""
        self._started_at = self._timer()

    def stop(self):
        """Stop the timer. Calculate the interval in seconds.

        Raises:
            RuntimeError: If ``start()`` has not been called on this instance.
        """
        if self._started_at is None:
            raise RuntimeError("Timer.stop() called before Timer.start().")
        self.end = self._timer()
        self.interval = self.end - self._started_at

In [2]:
"""Helper functions for managing NumPy / Pandas inputs."""
import pandas as pd
import numpy as np
import os

def load_numpy(filename):
    """Load embeddings from a NumPy ``.npy`` file.

    Args:
        filename: Path to an existing file whose name ends in ``.npy``.

    Returns:
        The object returned by ``np.load(filename)``.

    Raises:
        AssertionError: If the path does not exist or does not end in
            ``.npy``.
    """
    # Checked separately so the message says which condition actually failed.
    assert os.path.exists(filename), "File doesnt exist."
    assert filename.endswith(".npy"), "File must have a .npy extension."
    return np.load(filename)

def load_pandas(filename):
    """Load embeddings from a ``.npy`` file into a Pandas DataFrame.

    The array is read with ``load_numpy`` and must be 2-D. Row ``r`` of the
    array becomes row ``r`` of the frame, and column ``i`` is named ``ft<i>``,
    so the frame has the same (rows, columns) shape as the array.

    Args:
        filename: Path to the ``.npy`` file; validated by ``load_numpy``.

    Returns:
        A ``pd.DataFrame`` with columns ``ft0`` .. ``ft<n-1>``.
    """
    data = load_numpy(filename)
    # One column per array column; iterating over shape[0] would transpose
    # the data and turn every row into a column.
    return pd.DataFrame(data, columns=['ft%d' % i for i in range(data.shape[1])])

def pd_to_np(x):
    """Turn a Pandas DataFrame into a NumPy array.

    Anything that is not a ``pd.DataFrame`` is handed back unchanged after a
    warning is printed.
    """
    is_frame = isinstance(x, pd.DataFrame)
    if not is_frame:
        # Not a frame: warn and pass the input straight through.
        print("Warning: Input is not a Pandas DataFrame.")
        return x
    return np.array(x)

def np_to_pd(x):
    """Converts NumPy to Pandas.

    A 2-D ``np.ndarray`` is wrapped in a DataFrame of the same (rows, columns)
    shape whose columns are named ``ft0`` .. ``ft<n-1>``. Any other input is
    returned unchanged after a warning is printed.

    Args:
        x: The object to convert; arrays must be 2-D.

    Returns:
        A ``pd.DataFrame`` for ndarray input, otherwise ``x`` itself.
    """
    # ``np.array`` is a factory function, not a type, so it cannot be used in
    # isinstance(); ``np.ndarray`` is the class of every array it produces.
    if isinstance(x, np.ndarray):
        # One column per array column keeps the array's rows as frame rows.
        return pd.DataFrame(x, columns=['ft%d' % i for i in range(x.shape[1])])
    print("Warning: Input is not a NumPy array.")
    return x

In [3]:
"""SKLearn KMeans helper function."""
%%time
from sklearn.cluster import KMeans as skKM

def sklearn_kmeans(embeddings, params):
    """Computes SKLearn KMeans.

    Args:
        embeddings: Data passed unchanged to ``KMeans.fit``.
        params: Dict providing the keys ``n_clusters``, ``n_init``,
            ``max_iter``, ``tol``, ``precompute_distances``, ``verbose``,
            ``random_state``, ``copy_x``, ``n_jobs`` and ``algorithm``.
            Other keys (e.g. ``init``) are ignored.

    Returns:
        Tuple ``(labels_, inertia_)`` taken from the fitted estimator.

    Raises:
        KeyError: If one of the keys listed above is missing from ``params``.
    """
    # NOTE(review): ``precompute_distances`` and ``n_jobs`` were removed from
    # KMeans in scikit-learn 1.0 — confirm the scikit-learn version in use.
    km = skKM(n_clusters=params['n_clusters'],
              n_init=params['n_init'],
              max_iter=params['max_iter'],
              tol=params['tol'],
              precompute_distances=params['precompute_distances'],
              verbose=params['verbose'],
              random_state=params['random_state'],
              copy_x=params['copy_x'],
              n_jobs=params['n_jobs'],
              algorithm=params['algorithm'])
    km.fit(embeddings)
    return (km.labels_, km.inertia_)

In [4]:
"""cuML KMeans helper function."""
%%time
import cuml.KMeans as cuKM
import cudf

def cuml_kmeans(df, params):
    """Computes cuML KMeans.

    Args:
        df: Pandas DataFrame; converted with ``cudf.from_pandas`` before
            fitting.
        params: Dict providing the keys ``n_clusters``, ``max_iter``, ``tol``,
            ``verbose``, ``random_state``, ``precompute_distances``, ``init``,
            ``n_init``, ``algorithm``, ``n_gpus`` and ``gpu_id``.

    Returns:
        Tuple ``(labels_, inertia_)`` taken from the fitted estimator.

    Raises:
        KeyError: If one of the keys listed above is missing from ``params``.
    """
    # convert pandas input to cudf
    embeddings = cudf.from_pandas(df)

    # cluster with cuml kmeans
    km = cuKM(n_clusters=params['n_clusters'],
              max_iter=params['max_iter'],
              tol=params['tol'],
              verbose=params['verbose'],
              random_state=params['random_state'],
              precompute_distances=params['precompute_distances'],
              init=params['init'],
              n_init=params['n_init'],
              algorithm=params['algorithm'],
              n_gpus=params['n_gpus'],
              gpu_id=params['gpu_id'])
    # Fit on the converted cuDF frame; fitting on ``df`` would leave the
    # conversion above unused.
    km.fit(embeddings)
    return (km.labels_, km.inertia_)

ImportError: No module named 'cuml'

In [None]:
"""KMeans wrapper for SKLearn / cuML KMeans."""
from sklearn.metrics import silhouette_score

def kmeans_wrapper(params):
    """Computes inertia and silhouette scores. Writes clusters to files.

    Runs ``sklearn_kmeans`` and ``cuml_kmeans`` on the same embeddings, prints
    the inertia and silhouette score of each, then appends every sentence to
    ``sk<label>.txt`` and ``cu<label>.txt`` inside the output directory,
    using the label each implementation assigned to it.

    Args:
        params: Dict with the keys ``emb_file`` (``.npy`` embeddings path),
            ``sent_file`` (text file, one sentence per line), ``out_dir``,
            ``sklearn_kmeans`` and ``cuml_kmeans`` (per-implementation
            parameter dicts).

    Raises:
        AssertionError: If the number of sentences differs from the number of
            embedding rows.
    """
    # ensure each embedding corresponds to a sentence
    embeddings = load_numpy(params['emb_file'])
    with open(params['sent_file'], 'r') as sent_fh:
        sentences = sent_fh.read().splitlines()
    assert embeddings.shape[0] == len(sentences), "Count of sentences and embeddings must match."

    # compute sklearn kmeans
    print("Computing skLearn KMeans...")
    sk_labels, sk_inertia = sklearn_kmeans(embeddings, params['sklearn_kmeans'])
    sk_sil = silhouette_score(embeddings, sk_labels)
    print("Inertia: %.6f \n 'Silhouette Score: %.6f\n"
          % (sk_inertia, sk_sil))

    # compute cuml kmeans
    print("Computing cuML KMeans...")
    cu_labels, cu_inertia = cuml_kmeans(np_to_pd(embeddings), params['cuml_kmeans'])
    # NOTE(review): cu_labels is whatever cuML exposes as ``labels_``; confirm
    # that silhouette_score and integer indexing accept it without conversion.
    cu_sil = silhouette_score(embeddings, cu_labels)
    print("Inertia: %.6f \n 'Silhouette Score: %.6f\n"
          % (cu_inertia, cu_sil))

    # check for valid output directory
    out_dir = params['out_dir']
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    elif len(os.listdir(out_dir)) > 0:
        print("Warning: Non-empty directory. Outputs may be corrupted.")

    # write clustered sentences to output, each implementation with its own labels
    _append_clusters(out_dir, 'sk', sk_labels, sentences)
    _append_clusters(out_dir, 'cu', cu_labels, sentences)


def _append_clusters(out_dir, prefix, labels, sentences):
    """Append each sentence to ``<out_dir>/<prefix><label>.txt`` for its label."""
    # Group first so every output file is opened once instead of once per
    # sentence; sentence order within a file is preserved.
    grouped = {}
    for idx, sentence in enumerate(sentences):
        grouped.setdefault(str(labels[idx]), []).append(sentence)
    for label, members in grouped.items():
        with open(os.path.join(out_dir, prefix + label + '.txt'), 'a') as out_fh:
            out_fh.writelines(member + '\n' for member in members)

In [None]:
"""Handles user input for files."""
import argparse

def main(params):
    """Parses command-line arguments and runs ``kmeans_wrapper``.

    Recognised options are ``--out_dir``, ``--emb_file`` and ``--sent_file``.
    Any option that is absent (or given without a value) falls back to a path
    under the current working directory. The resolved values are stored in
    ``params`` under the same names before ``kmeans_wrapper(params)`` is
    called.

    Args:
        params: Parameter dict; updated in place with ``out_dir``,
            ``emb_file`` and ``sent_file``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--out_dir', nargs='?', type=str, default=None, help='Output directory for KMeans.')
    parser.add_argument('--emb_file', nargs='?', type=str, default=None, help='.npy file of embeddings.')
    parser.add_argument('--sent_file', nargs='?', type=str, default=None, help='.txt file of sentences.')
    # parse_known_args() skips arguments this parser does not define. Inside a
    # Jupyter kernel sys.argv carries the kernel's own ``-f <connection file>``
    # pair, which makes parse_args() exit with "unrecognized arguments".
    args, _unrecognized = parser.parse_known_args()

    cwd = os.getcwd()
    fallbacks = (
        ('out_dir', os.path.join(cwd, 'rapids')),
        ('emb_file', os.path.join(cwd, 'data', 'medica-s.npy')),
        ('sent_file', os.path.join(cwd, 'data', 'medica-s.txt')),
    )
    for key, fallback in fallbacks:
        supplied = getattr(args, key)
        params[key] = fallback if supplied is None else supplied

    kmeans_wrapper(params)

In [None]:
"""Parameters for KMeans clustering."""
# default kmeans parameters, one sub-dict per implementation
params = {
    # read by sklearn_kmeans()
    'sklearn_kmeans': {
        'n_clusters': 8,
        # scikit-learn spells this option 'k-means++'; 'kmeans++' is rejected
        # by sklearn.cluster.KMeans.
        'init': 'k-means++',
        'n_init': 10,
        'max_iter': 300,
        'tol': 1e-4,
        'precompute_distances': 'auto',
        'verbose': 0,
        'random_state': None,
        'copy_x': True,
        'n_jobs': None,
        'algorithm': 'auto'
    },

    # read by cuml_kmeans()
    'cuml_kmeans': {
        'n_clusters': 8,
        'max_iter': 300,
        'tol': 1e-4,
        'verbose': 0,
        'random_state': 1,
        'precompute_distances': 'auto',
        'init': 'kmeans++',
        'n_init': 1,
        'algorithm': 'auto',
        'n_gpus': 1,
        'gpu_id': 0
    }
}

# Entry point: launch the full pipeline only when this code is the top-level
# program, not when it is imported.
if __name__ == "__main__":
    main(params)