# **Memory profiling of clusTCR and other TCR clustering methods**
This notebook compares the memory usage of *clusTCR* with existing TCR clustering methods.


In [35]:
%matplotlib inline
import os

import matplotlib.pyplot as plt
from memory_profiler import memory_usage

from clustcr.clustering.clustering import Clustering
from clustcr.input.datasets import metarepertoire
from clustcr.modules.gliph2.gliph2 import GLIPH2
from clustcr.modules.ismart.ismart import iSMART
from clustcr.modules.tcrdist.pw_tcrdist import tcrdist

# Directory handed to metarepertoire() as its `directory` argument.
# The default is the original author's machine-specific location; set the
# EMERSON_DIR environment variable to point at the data on another machine
# instead of editing the notebook.
EMERSON = os.environ.get('EMERSON_DIR') or '/home/max/Documents/bio/emerson-2017-natgen'

def memory(f, *args, **kwargs):
    """Run ``f(*args, **kwargs)`` under memory_profiler and return its peak usage.

    Parameters
    ----------
    f : callable
        The function to profile.
    *args
        Positional arguments forwarded to ``f``.
    **kwargs
        Keyword arguments forwarded to ``f`` (optional; omitting them keeps
        the original positional-only behaviour).

    Returns
    -------
    The value produced by ``memory_usage(..., max_usage=True,
    include_children=True)``: the maximum usage observed while ``f`` runs,
    with child processes included. memory_profiler documents the unit as MiB.

    NOTE(review): the figure appears to be the total memory of the process
    (data already loaded in the kernel included), not an increment over a
    baseline -- confirm before comparing methods measured in one session.
    """
    return memory_usage((f, args, kwargs), max_usage=True, include_children=True)


def meta(n, out='CDR3'):
    """Call ``metarepertoire`` on the ``EMERSON`` directory for ``n`` sequences.

    Parameters
    ----------
    n : passed through as ``n_sequences``.
    out : passed through as ``out_format`` (defaults to ``'CDR3'``).

    The input format is fixed to ``'immuneaccess'``. Returns whatever
    ``metarepertoire`` returns for these options.
    """
    options = dict(
        directory=EMERSON,
        data_format='immuneaccess',
        out_format=out,
        n_sequences=n,
    )
    return metarepertoire(**options)


In [None]:
# Number of sequences requested from meta() for each benchmark round.
sample_sizes = [5000, 20000, 100000]

# Peak-memory measurements per method, one entry appended per sample size.
# Key order is kept as-is because it drives the printed dict and the legend.
result = {method: [] for method in ('GLIPH2', 'iSMART', 'clusTCR', 'tcrdist3*')}

for size in sample_sizes:
    # Each method gets its input in the output format requested for it.
    cdr3 = meta(size)
    cdr3_gliph2 = meta(size, 'GLIPH2')
    cdr3_tcrdist = meta(size, 'TCRDIST')

    # (result key, callable to profile, positional arguments), listed in the
    # order the measurements are taken.
    runs = (
        ('tcrdist3*', tcrdist, (cdr3_tcrdist,)),
        ('GLIPH2', GLIPH2, (cdr3_gliph2,)),
        ('iSMART', iSMART, (cdr3,)),
        ('clusTCR', lambda: Clustering(n_cpus='all').fit(cdr3), ()),
    )
    for method, func, func_args in runs:
        peak = memory(func, *func_args)
        result[method].append(peak)

    print(size, result)

  cdr3 = meta(size)
  cdr3_gliph2 = meta(size, 'GLIPH2')
  cdr3_tcrdist = meta(size, 'TCRDIST')
  cdr3_tcrdist = meta(size, 'TCRDIST')
  cdr3 = meta(size)
  cdr3 = meta(size)
  cdr3_gliph2 = meta(size, 'GLIPH2')
  cdr3_tcrdist = meta(size, 'TCRDIST')


pw dist calculations for 5000 sequences with tcrdist.
CREATED /90039445f09e/ FOR HOLDING DISTANCE OUT OF MEMORY
RETURNING scipy.sparse csr_matrix w/dims (4999, 4999)
CLEANING UP 90039445f09e
Elapsed time: 19.82661724090576 seconds.
Clustering 5000 sequences with GLIPH2.
Elapsed time: 4.890369653701782 seconds.
Clustering 5000 sequences with iSMART.
Elapsed time: 1.889692783355713 seconds.
5000 {'GLIPH2': [1652.63671875], 'iSMART': [1650.2890625], 'clusTCR': [7599.4765625], 'tcrdist3*': [2402.33984375]}
pw dist calculations for 20000 sequences with tcrdist.
CREATED /20c3cd515a49/ FOR HOLDING DISTANCE OUT OF MEMORY


## **Plot results**

In [None]:
# Plot the recorded peak memory of every method against the sample size.
# If the benchmark cell was interrupted, some methods hold fewer measurements
# than there are sample sizes; plotting against the full `sample_sizes` list
# would then fail on mismatched x/y lengths, so each series is paired only
# with the sizes it actually has values for.
plt.figure()
for name, values in result.items():
    measured_sizes = sample_sizes[:len(values)]
    # Markers keep a series visible even when it has a single data point.
    plt.plot(measured_sizes, values, marker='o', label=name)
plt.xlabel('Number of sequences')
# memory_profiler reports memory in MiB, and memory() records the maximum.
plt.ylabel('Peak memory used (MiB)')
plt.legend(loc='best')
plt.show()