# **Memory Profiling Analysis**
Using the *memory_profiler* pip package, we compare ClusTCR's memory usage with that of iSMART, GLIPH2 and tcrdist3.


In [34]:
%matplotlib inline
import os

import matplotlib.pyplot as plt
from memory_profiler import memory_usage

from clustcr.clustering.clustering import Clustering
from clustcr.input.datasets import metarepertoire
from clustcr.modules.gliph2.gliph2 import GLIPH2
from clustcr.modules.ismart.ismart import iSMART
from clustcr.modules.tcrdist.pw_tcrdist import tcrdist


Set the path to a folder containing immuneaccess data files (such as those from Emerson et al.).

In [35]:
# Folder holding the immuneaccess data files. Set the IMMUNEACCESS_DIR
# environment variable to point the notebook at your own copy without editing
# this cell; the fallback is the path that was originally hard-coded here.
IMMUNEACCESS = os.environ.get('IMMUNEACCESS_DIR',
                              '/home/max/Documents/bio/emerson-2017-natgen')


We define some helper functions to profile the memory usage and retrieve data from the above folder.

In [36]:
def memory(f, *args, **kwargs):
    """Return the peak memory reading taken while running ``f(*args, **kwargs)``.

    The call is delegated to ``memory_profiler.memory_usage`` with
    ``include_children=True`` and ``max_usage=True``, so the reading covers
    child processes as well and only the maximum is reported instead of the
    whole series of samples.

    Parameters
    ----------
    f : callable
        Function to profile.
    *args
        Positional arguments forwarded to ``f``.
    **kwargs
        Keyword arguments forwarded to ``f``. Omitting them behaves exactly
        like the previous positional-only helper.

    Returns
    -------
    The value produced by ``memory_usage`` for ``max_usage=True``; this
    notebook later plots it on an axis labelled 'MB used'.
    """
    # memory_usage accepts the target as a (callable, args, kwargs) tuple.
    return memory_usage((f, args, kwargs), include_children=True, max_usage=True)


def meta(n, out='CDR3'):
    """Request a metarepertoire of ``n`` sequences from the IMMUNEACCESS folder.

    Thin wrapper around ``metarepertoire`` that pins the directory to the
    module-level ``IMMUNEACCESS`` path and the data format to 'immuneaccess'.

    Parameters
    ----------
    n : forwarded as ``n_sequences``.
    out : forwarded as ``out_format``; this notebook uses 'CDR3' (default),
        'GLIPH2' and 'TCRDIST'.

    Returns
    -------
    Whatever ``metarepertoire`` returns for the requested output format.
    """
    options = {
        'directory': IMMUNEACCESS,
        'data_format': 'immuneaccess',
        'out_format': out,
        'n_sequences': n,
    }
    return metarepertoire(**options)

In [37]:
# Repertoire sizes to benchmark, plus one list of peak-memory readings per
# method (one entry is appended for every size processed).
sample_sizes = [2000, 4000, 8000, 20000, 60000]
result = {
    method: []
    for method in ('GLIPH2', 'iSMART', 'clusTCR', 'clusTCR (n_cpus=1)', 'tcrdist3*')
}

for size in sample_sizes:
    # One dataset per input format, each requested with the same size.
    cdr3 = meta(size)
    cdr3_gliph2 = meta(size, 'GLIPH2')
    cdr3_tcrdist = meta(size, 'TCRDIST')

    # tcrdist3 is only profiled below 10000 sequences; larger sizes record
    # None as a placeholder so the list stays aligned with sample_sizes.
    result['tcrdist3*'].append(
        None if size >= 10000 else memory(tcrdist, cdr3_tcrdist)
    )

    # GLIPH2 and iSMART take their input directly as the profiled argument.
    result['GLIPH2'].append(memory(GLIPH2, cdr3_gliph2))
    result['iSMART'].append(memory(iSMART, cdr3))

    # ClusTCR needs construct-then-fit, so each variant is wrapped in a lambda:
    # first with n_cpus='all', then with the default constructor arguments.
    result['clusTCR'].append(
        memory(lambda: Clustering(n_cpus='all').fit(cdr3))
    )
    result['clusTCR (n_cpus=1)'].append(
        memory(lambda: Clustering().fit(cdr3))
    )

    # Progress report: the size just finished and everything measured so far.
    print(size, result)

  cdr3 = meta(size)
  cdr3 = meta(size)
  cdr3_gliph2 = meta(size, 'GLIPH2')
  cdr3_gliph2 = meta(size, 'GLIPH2')
  cdr3_tcrdist = meta(size, 'TCRDIST')
  cdr3_tcrdist = meta(size, 'TCRDIST')
  self.deduplicate()
  cdr3 = meta(size)


pw dist calculations for 2000 sequences with tcrdist.
CREATED /a4984d985d93/ FOR HOLDING DISTANCE OUT OF MEMORY
RETURNING scipy.sparse csr_matrix w/dims (2000, 2000)
CLEANING UP a4984d985d93
Elapsed time: 4.324952602386475 seconds.
2000 {'GLIPH2': [1941.4296875], 'iSMART': [1921.86328125], 'clusTCR': [9338.97265625], 'clusTCR (n_cpus=1)': [1873.72265625], 'tcrdist3*': [2852.0859375]}


KeyboardInterrupt: 

## **Plot results**

In [None]:
# Plot the recorded memory reading of every method against repertoire size.
fig, ax = plt.subplots()
for name, values in result.items():
    print(name, values)
    # If the profiling loop was interrupted, a method may have fewer readings
    # than there are sample sizes; plot only the sizes actually measured so
    # the x and y lengths always match.
    ax.plot(sample_sizes[:len(values)], values, label=name)
ax.set_xlabel('Number of sequences')
ax.set_ylabel('MB used')
ax.legend(loc='best')
# Save before showing: once plt.show() has displayed the figure, a later
# pyplot-level savefig would write an empty canvas.
fig.savefig('memory.png')
plt.show()