# Tutorial 7 (Enhanced): Performance — Latency and Throughput (Ops/sec)

We measure latency and throughput of key tensor operations on CPU and GPU, and compare read latency across storage backends (file, in-memory, and Tensorus).

In [None]:
# Install
import sys, subprocess, pkgutil
# Install any tutorial dependency that is not importable in this environment.
# importlib.util.find_spec replaces pkgutil.find_loader, which is deprecated
# since Python 3.12 and removed in 3.14.
import importlib.util
for p in ['numpy', 'torch', 'matplotlib', 'seaborn', 'requests', 'pandas', 'psutil']:
    if importlib.util.find_spec(p) is None:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', p])
        # Let the running kernel see the freshly installed package.
        importlib.invalidate_caches()
print('✅ Dependencies ready')

In [None]:
# Setup
import time, numpy as np, torch, requests, pandas as pd, psutil, gc
import matplotlib.pyplot as plt, seaborn as sns
# Plot styling shared by every chart in this notebook.
sns.set_theme(style='whitegrid')

# Base URL used for all Tensorus HTTP requests below.
API = 'http://127.0.0.1:7860'
def server_ok():
    """Return True when GET {API}/health answers with HTTP 200 within 2 seconds.

    Any request-level failure (connection refused, timeout, invalid response)
    is reported as False so the notebook can fall back to demo mode.
    """
    try:
        return requests.get(f'{API}/health', timeout=2).status_code == 200
    except requests.RequestException:
        # Only network/HTTP errors mean "server unavailable"; a bare except
        # would also swallow KeyboardInterrupt and SystemExit.
        return False
# Probe the server once; later cells branch on this flag.
SERVER = server_ok()
if SERVER:
    print('📡 Tensorus:', '✅ Connected')
else:
    print('📡 Tensorus:', '⚠️ Demo Mode')
def timeit(fn, n=1):
    """Call ``fn()`` ``n`` times back to back and return the mean seconds per call.

    The whole loop is timed with ``time.perf_counter`` and the total is
    divided by ``n``.
    """
    start = time.perf_counter()
    for _ in range(n):
        fn()
    stop = time.perf_counter()
    return (stop - start) / n

## Operation Benchmarks and Throughput

In [None]:
# Tensor shapes to benchmark; the last entry is one-dimensional.
sizes = [
    (1024, 1024),
    (2048, 2048),
    (4096,),
]
# Operation names understood by bench().
ops = [
    'creation',
    'matmul',
    'elementwise',
    'reduction',
]
def bench(op, sz, device):
    """Time one tensor operation and return ``(latency_ms, ops_per_sec)``.

    Args:
        op: One of 'creation', 'matmul', 'elementwise', 'reduction'.
        sz: Shape tuple for the random inputs. For 'matmul' a 1-D shape
            ``(k,)`` is benchmarked as a ``k x k`` by ``k x k`` product.
        device: Device passed to ``torch.randn`` (e.g. 'cpu' or 'cuda').

    Returns:
        Mean latency per call in milliseconds and its reciprocal in calls
        per second (``inf`` if the measured time is zero).

    Raises:
        KeyError: If ``op`` is not one of the supported operation names.
    """
    dev = torch.device(device)
    on_cuda = dev.type == 'cuda'

    def creation():
        return torch.randn(*sz, device=device)

    def matmul():
        if len(sz) < 2:
            # A 1-D size has no second dimension; use square matrices.
            a = torch.randn(sz[0], sz[0], device=device)
            b = torch.randn(sz[0], sz[0], device=device)
        else:
            a = torch.randn(*sz, device=device)
            b = torch.randn(sz[-1], sz[-2], device=device)
        # The product must run on both branches; previously it was only
        # executed (and returned) for sizes with two or more dimensions.
        return a @ b

    def elementwise():
        a = torch.randn(*sz, device=device)
        b = torch.randn(*sz, device=device)
        return a * b + torch.sin(a) - torch.cos(b)

    def reduction():
        x = torch.randn(*sz, device=device)
        return x.sum() + x.mean() + x.std()

    f = {'creation': creation, 'matmul': matmul,
         'elementwise': elementwise, 'reduction': reduction}[op]

    def run():
        out = f()
        if on_cuda:
            # CUDA kernels are launched asynchronously; wait for completion
            # so the timer covers the actual work, not just the launch.
            torch.cuda.synchronize(dev)
        return out

    # Untimed warm-up so one-time costs (e.g. CUDA context creation, kernel
    # loading, allocator growth) are not charged to the measurement.
    run()
    dt = timeit(run, n=3)
    th = 1.0 / dt if dt > 0 else float('inf')
    return dt * 1000, th
# Run every (size, op) pair on the CPU and, when CUDA is available, on the
# GPU as well; collect one result row per measurement.
rows = []
for sz in sizes:
    for op in ops:
        dt, th = bench(op, sz, 'cpu')
        rows.append({
            'size': str(sz),
            'op': op,
            'system': 'CPU',
            'ms': dt,
            'ops_per_sec': th,
        })
        if not torch.cuda.is_available():
            continue
        torch.cuda.empty_cache()
        dt, th = bench(op, sz, 'cuda')
        torch.cuda.synchronize()
        rows.append({
            'size': str(sz),
            'op': op,
            'system': 'GPU',
            'ms': dt,
            'ops_per_sec': th,
        })
df = pd.DataFrame(rows)
df.head()

In [None]:
# One bar chart per metric, grouped by operation and coloured by system.
for column, title in (
    ('ms', 'Latency (ms) by Operation and System'),
    ('ops_per_sec', 'Throughput (ops/sec) by Operation and System'),
):
    plt.figure(figsize=(10, 4))
    sns.barplot(data=df, x='op', y=column, hue='system')
    plt.title(title)
    plt.show()

## Storage Benchmarks (File/Memory/Tensorus)

In [None]:
def bench_tensorus(tensor, n=30):
    """Measure GET latency for a tensor stored through the Tensorus HTTP API.

    The tensor is uploaded once (as nested lists) with POST
    ``{API}/api/v1/tensors``, fetched ``n`` times by the returned
    ``tensor_id``, and deleted afterwards.

    Args:
        tensor: Tensor to upload; must support ``.tolist()``.
        n: Number of GET requests to time.

    Returns:
        Elapsed seconds for each GET that returned HTTP 200, or ``[]`` when
        the upload yields no ``tensor_id`` or any request fails.
    """
    times = []
    base = f'{API}/api/v1/tensors'
    try:
        r = requests.post(base, json={'tensor_data': tensor.tolist()}, timeout=5)
        payload = r.json()
        tid = payload.get('tensor_id') if isinstance(payload, dict) else None
        if not tid:
            return []
        try:
            for _ in range(n):
                t0 = time.perf_counter()
                rr = requests.get(f'{base}/{tid}', timeout=5)
                elapsed = time.perf_counter() - t0
                # Record every successful read; previously this check sat
                # outside the loop, so at most one sample was collected.
                if rr.status_code == 200:
                    times.append(elapsed)
        finally:
            # Always remove the uploaded tensor, even if a read failed.
            requests.delete(f'{base}/{tid}', timeout=5)
    except (requests.RequestException, ValueError):
        # Best-effort benchmark: network errors or a non-JSON reply mean
        # "no measurements" rather than a crashed notebook.
        return []
    return times
def bench_file(tensor, n=30):
    """Measure ``torch.load`` latency for a tensor saved to a temporary file.

    Args:
        tensor: Object to serialize once with ``torch.save``.
        n: Number of timed ``torch.load`` calls.

    Returns:
        List of ``n`` elapsed times in seconds, one per load.
    """
    import tempfile, os
    times = []
    # A temporary directory avoids holding an open NamedTemporaryFile handle
    # and guarantees the file is removed even if a load raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = os.path.join(tmp_dir, 'tensor.pt')
        torch.save(tensor, path)
        for _ in range(n):
            t0 = time.perf_counter()
            _ = torch.load(path)
            times.append(time.perf_counter() - t0)
    return times
def bench_memory(tensor, n=30):
    """Measure in-memory copy latency by timing ``n`` ``clone()`` calls.

    A private copy of ``tensor`` is made first; each timed step clones that
    copy. Returns the list of elapsed seconds, one entry per clone.
    """
    source = tensor.clone()

    def timed_clone():
        started = time.perf_counter()
        _ = source.clone()
        return time.perf_counter() - started

    return [timed_clone() for _ in range(n)]
# Benchmark payload shared by all three storage backends.
tensor = torch.randn(1024, 1024)

file_t = bench_file(tensor)
mem_t = bench_memory(tensor)
# Tensorus is only measured when the health check succeeded.
if SERVER:
    tus_t = bench_tensorus(tensor)
else:
    tus_t = []
def ms(x):
    """Return the mean of ``x`` scaled by 1000 (seconds to ms), or None if ``x`` is empty."""
    if not x:
        return None
    return np.mean(x) * 1000
# Summary line; a backend with no samples prints None.
print(
    'Mean ms — File:', ms(file_t),
    'Memory:', ms(mem_t),
    'Tensorus:', ms(tus_t),
)