## measuring memory latency 

In this notebook we will investigate the distribution of latency times for different size arrays.

In [None]:
# Capture the working directory reported by the shell. `!pwd` yields a list of output
# lines, and the trailing-comma unpacking extracts its single element. Then display it.
pwd, = !pwd
pwd

In [30]:
%%writefile random_poke.py
from numpy import *
import time
from os.path import isfile,isdir
from os import mkdir,chdir
import os

from lib.measureRandomAccess import measureRandomAccess
#from lib.create_file import create_file,tee

## Remember the path for home and log directories
home_base='/tmp/'
log_root=home_base+'logs/'
# makedirs(exist_ok=True) creates missing parents and tolerates a directory left by an earlier run.
os.makedirs(log_root,exist_ok=True)

# Sizes (in bytes) to measure, expressed as powers of ten, with one display label per exponent.
# m_legend is derived from the same exponents so labels and sizes cannot drift apart.
size_labels={1:'10B',3:'1KB',6:'1MB',9:'1GB',10:'10GB'}
size_exponents=[1,3,6]
m_list=[int(10**i) for i in size_exponents]
m_legend=[size_labels[i] for i in size_exponents]
L=len(m_list)
k=10000 # number of pokes
print('m_list=',m_list)

TimeStamp=str(int(time.time()))
# timestamp is used so that multiple runs can be done on the same computer.
log_dir=os.path.join(log_root,TimeStamp)
# exist_ok avoids a crash when the script is restarted within the same second.
os.makedirs(log_dir,exist_ok=True)
chdir(log_dir)

# Result arrays; the first axis selects the medium (0: using disk, 1: using memory),
# the second axis indexes m_list.
_mean=zeros([2,L])
_std=zeros([2,L])
_block_no=zeros([L])      # number of blocks written for each size
_block_size=zeros([L])    # size of each written block
T=zeros([2,L,k])          # the k raw measurements returned by measureRandomAccess
    
def create_file(n,m,filename='DataBlock'):
    """Create a scratch file made of m zero-filled blocks of n bytes each.

    :param n: size of block
    :param m: number of blocks
    :param filename: desired filename
    :returns: time to allocate block of size n, time to write a file of size m*n
    :rtype: tuple

    """
    t1=time.time()
    A=bytearray(n)
    t2=time.time()
    # The with-statement closes the file even if a write fails; the write time
    # below still includes the close, as before.
    with open(filename,'wb') as file:
        for i in range(m):
            file.write(A)
            # Progress indicator, refreshed in place every 100 blocks.
            if i % 100 == 0:
                print('\r',i,",", end=' ')
    t3=time.time()
    # `tee` is not available in this script (its import is commented out), so report with print.
    print('\r              \ncreating %d byte block: %f sec, writing %d blocks %f sec' % (n,t2-t1,m,t3-t2))
    return (t2-t1,t3-t2)

Random_pokes=[]
Min_Block_size=1000000
for m_i,m in enumerate(m_list):

    # Write the file as whole blocks of Min_Block_size bytes; sizes smaller than
    # one block are written as m single-byte blocks instead.
    blocks=m//Min_Block_size
    if blocks==0:
        _block_size[m_i]=1
        _block_no[m_i]=m
    else:
        _block_size[m_i]=Min_Block_size
        _block_no[m_i]=blocks
    data_file='BlockData'+str(m)
    (t_mem,t_disk) = create_file(int(_block_size[m_i]),int(_block_no[m_i]),filename=data_file)

    # Index 0: pokes into the file just written.
    (_mean[0,m_i],_std[0,m_i],T[0,m_i]) = measureRandomAccess(m,filename=data_file,k=k)
    T[0,m_i]=sorted(T[0,m_i])
    print('\rFile pokes _mean='+str(_mean[0,m_i])+', file _std='+str(_std[0,m_i]))

    # Index 1: same measurement with an empty filename (the in-memory case).
    (_mean[1,m_i],_std[1,m_i],T[1,m_i]) = measureRandomAccess(m,filename='',k=k)
    T[1,m_i]=sorted(T[1,m_i])
    print('\rMemory pokes _mean='+str(_mean[1,m_i])+', Memory _std='+str(_std[1,m_i]))

    # Keep the summary statistics plus the 1000 largest (sorted) measurements of each kind.
    Random_pokes.append({'m_i':m_i,
                        'm':m,
                        'memory__mean': _mean[1,m_i],
                        'memory__std': _std[1,m_i],
                        'memory_largest': T[1,m_i][-1000:],
                        'file__mean': _mean[0,m_i],
                        'file__std': _std[0,m_i],
                        'file_largest': T[0,m_i][-1000:]
                })
print('='*50)

stats = {'Random_pokes':Random_pokes}
import pickle as pk
# The file handle gets its own name so it no longer overwrites the `stats` dict.
# The pickled payload is still the Random_pokes list, so existing readers of
# stats.pkl see the same format.
with open('stats.pkl','wb') as stats_file:
    pk.dump(Random_pokes,stats_file,protocol=pk.HIGHEST_PROTOCOL)


import time
import numpy as np
Consec=[]
Line='### Consecutive Memory writes:'
print(Line);
n=1000
# One byte per element, so that an array of n*m elements really occupies n*m bytes,
# which is what the "size (MB)" and "per byte" columns report.
r=(np.arange(n)%256).astype(np.uint8)
Header="""
|   size (MB) | Average time per byte |
| ---------: | --------------: | """
print(Header)
for m in [1,1000,1000000]:
    t1=time.time()
    A=np.repeat(r,m)
    t2=time.time()
    # Use the array's real footprint rather than assuming the element size.
    n_bytes=A.nbytes
    Consec.append((n,m,float(n_bytes)/1000000,(t2-t1)/float(n_bytes)))
    print("| %6.3f | %4.2g |" % (float(n_bytes)/1000000,(t2-t1)/float(n_bytes)))
# Drop the references so the large array can be freed.
A=[];r=[]


#read files and sum entries
from glob import glob

def read_and_sum(path):
    """Read a file as an array of bytes and sum its entries, timing both steps.

    :param path: file to read
    :returns: number of bytes read, read time, summation time, sum of the entries
    :rtype: tuple
    """
    t0=time.time()
    M=np.fromfile(path,dtype=np.byte,count=-1)
    t1=time.time()
    total=np.sum(M)
    t2=time.time()
    return (M.shape[0],t1-t0,t2-t1,total)

# The script has already changed into this run's log directory, so the data files
# are in the current directory. Plain Python replaces the %cd / !ls notebook magics,
# which are not valid inside a .py file. The list is sorted to give a stable order.
files=sorted(glob('Block*'))

for file in files:
    (n_read,read_time,calc_time,_total)=read_and_sum(file)
    # Guard the per-byte division against an empty file.
    n_bytes=max(1,n_read)
    print('%20s %10d read time=%5.3f per byte=%3.1g\t sum time=%5.3f per byte=%3.1g'%(file,n_bytes,read_time,read_time/n_bytes,calc_time,calc_time/n_bytes))


Writing random_poke.py


### Specify which OS you are using
Uncomment the line corresponding to your OS. Comment all of the rest.

In [12]:
# Platform selector: each OS-specific cell below runs its body only when brand_name
# equals its own label, and the value also becomes the first line of the report.
brand_name = "brand: Macbook"
#brand_name = "brand: Linux"
#brand_name = "brand: Windows"

### For Mac users

The next cell only needs to be run by Mac OS users. If run on other OS platforms, it will throw an error.

In [13]:
if brand_name== "brand: Macbook":
    # To get all available information use !sysctl -a
    os_info = !sysctl kernel.osrelease kernel.osrevision kernel.ostype kernel.osversion
    cpu_info = !sysctl machdep.cpu.brand_string machdep.cpu.cache.L2_associativity machdep.cpu.cache.linesize machdep.cpu.cache.size machdep.cpu.core_count
    cache_info = !sysctl kern.procname hw.memsize hw.cpufamily hw.activecpu hw.cachelinesize hw.cpufrequency hw.l1dcachesize hw.l1icachesize hw.l2cachesize hw.l3cachesize hw.cputype 

In [15]:
# Display the CPU lines captured by the cell above.
cpu_info

['machdep.cpu.brand_string: Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz',
 'machdep.cpu.cache.L2_associativity: 8',
 'machdep.cpu.cache.linesize: 64',
 'machdep.cpu.cache.size: 256',
 'machdep.cpu.core_count: 4']

### For Linux OS users

In [17]:
import os
def run(command):
    """Run a shell command and return everything it wrote to standard output.

    :param command: command line handed to os.popen
    :returns: the command's standard output
    :rtype: str
    """
    # The context manager closes the pipe even if read() raises.
    with os.popen(command) as stream:
        return stream.read()

### Pricing

https://aws.amazon.com/ec2/pricing/on-demand/

https://aws.amazon.com/ec2/spot/pricing/

In [20]:
# (name, shell command) pairs that describe a Linux machine; each command string can be
# executed with run(command). The cache_* entries together make up the cache report and
# the cpu_* entries the CPU report.
commands=[
    ("os_info","sysctl kernel.ostype kernel.osrelease"),
    ("os_version","lsb_release -r"),
    ("memory_size","cat /proc/meminfo | grep 'MemTotal'"),

    ("cache_L1i","lscpu | grep 'L1i'"),
    ("cache_L1d","lscpu | grep 'L1d'"),
    ("cache_L2","lscpu | grep 'L2'"),
    ("cache_L3","lscpu | grep 'L3'"),

    ("cpu_type","lscpu | grep 'CPU family'"),
    ("cpu_brand","cat /proc/cpuinfo | grep -m 1 'model name'"),
    ("cpu_frequency","lscpu | grep 'CPU MHz'"),
    ("cpu_core_count","lscpu | grep 'CPU(s)'"),
]



machdep.cpu.brand_string: Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz
machdep.cpu.cache.L2_associativity: 8
machdep.cpu.cache.linesize: 64
machdep.cpu.cache.size: 256
machdep.cpu.core_count: 4



In [None]:
# Collect Linux hardware details through shell commands; the body is skipped unless
# brand_name selects Linux.
if brand_name == "brand: Linux":
    # Kernel type/release, distribution release and the MemTotal line are merged into os_info.
    os_info = !sysctl kernel.ostype kernel.osrelease 
    os_version = !lsb_release -r
    memory_size = !cat /proc/meminfo | grep 'MemTotal'
    os_info += os_version + memory_size

    # lscpu lines mentioning each cache level, concatenated into cache_info.
    cache_L1i = !lscpu | grep 'L1i'
    cache_L1d = !lscpu | grep 'L1d'
    cache_L2 = !lscpu | grep 'L2'
    cache_L3 = !lscpu | grep 'L3'
    cache_info = cache_L1i + cache_L1d + cache_L2 + cache_L3

    # CPU family, model name, clock and core-count lines, concatenated into cpu_info.
    # NOTE(review): the 'CPU(s)' pattern may match more than one lscpu line — confirm the output is what is wanted.
    cpu_type = !lscpu | grep 'CPU family'
    cpu_brand = !cat /proc/cpuinfo | grep -m 1 'model name'
    cpu_frequency = !lscpu | grep 'CPU MHz'
    cpu_core_count = !lscpu | grep 'CPU(s)'
    cpu_info = cpu_type + cpu_brand + cpu_frequency + cpu_core_count

###  For Windows users

In [None]:
if brand_name =="brand: Windows":
    os_release  = !ver
    os_type     = !WMIC CPU get  SystemCreationClassName
    memory      = !WMIC ComputerSystem get TotalPhysicalMemory
    os_info     = os_release + os_type

    cpu_core_count  = !WMIC CPU get NumberOfCores
    cpu_speed       = !WMIC CPU get CurrentClockSpeed
    cpu_model_name  = !WMIC CPU get name
    cpu_info        = cpu_core_count + cpu_speed + cpu_model_name

    l2cachesize = !WMIC CPU get L2CacheSize
    l3cachesize = !WMIC CPU get L3CacheSize
    cache_info  = l2cachesize + l3cachesize

In [None]:
# Print collected information
description=[brand_name] + os_info + cache_info + cpu_info
print("Main Harware Parameters:\n")
print('\n'.join(description))

### Summary of  Macbook Pro hardware parameters
*   Intel four cores
* Clock Rate: `2.50GHz` (0.4ns per clock cycle)
<img alt="" src="Figures/MacBookProMemorySizes.png" style="width:500px" />

In [None]:
# Write all the necessary information into a pickle file.
import pickle
with open(home_base+'/memory_report.pkl', mode='wb') as pickle_file:
    pickle.dump(
        dict(description=description, Consec=Consec, Random_pokes=Random_pokes),
        pickle_file,
    )

## Observations

* making measurements in the wild allows you to measure the performance of your hardware with your software.
* Measuring in the wild you discover unexpected glitches: 
  * timer resolution is 1$\mu$sec
  * roughly once in every ~10,000 zero-time pokes there is a $10^{-5}$ sec delay. Maybe a context switch?

* Latencies typically have long tails - Use loglog graphs.

* Memory latency varies from $10^{-9}$sec to $10^{-6}$sec depending on access pattern.

* SSD latency for random access varies from $10^{-5}$sec to $10^{-1}$sec.

* When reading or writing large blocks, we care about **throughput** or **byte-rate** not **latency**

* Typical throughputs:  **Memory:** 100MB/sec   **SSD:** 1GB/sec   **Disk:** (fill in)

### Impact on Big Data Analytics

* Clock rate is stuck at around 3GHz, and is likely to be stuck there for the foreseeable future.

* **Faster** computers / disks / networks are **expensive** 

* **Focus on data access:** The main bottleneck in big data computation is moving data around, **NOT** calculation.

* The cost-effective solution is often a cluster of many cheap computers, each with many cores, with the data broken up so that each computer holds only a small fraction of it.

* Data-Centers and the "Cloud"

* I invite you to use this notebook on your computer to get a better understanding of its memory access latency.

* If you are interested in ways to make more accurate measurements of latency, try notebook 3.

* See you next time.

## Clean-Up
This notebook generates large logs that can be deleted.

A summary of the results is placed in the file `memory_report.pkl`

In [None]:
# Change into home_base and recursively delete the logs directory found there.
# NOTE(review): home_base is assigned only inside the random_poke.py script written above;
# this cell assumes it is also defined in the kernel — confirm.
%cd $home_base

!rm -rf logs