# Tutorial 7: Performance — Benchmarks and Analysis

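Benchmark core tensor operations and storage systems. When a live Tensorus API is reachable, the storage comparison includes it; otherwise the notebook runs in demo mode and only the local (file and in-memory) storage benchmarks are measured.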

In [None]:
# Lightweight install cell: pip-install every third-party package the
# notebook imports that is not already importable in this kernel.
import sys, subprocess, pkgutil
import importlib.util
# 'pandas' is listed because the Step 1 plotting cell imports it.
for p in ['numpy','torch','matplotlib','seaborn','requests','psutil','pandas']:
    # importlib.util.find_spec replaces pkgutil.find_loader, which was
    # deprecated in Python 3.12 and removed in 3.14.
    if importlib.util.find_spec(p) is None:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', p])
print('✅ Dependencies ready')

In [None]:
# Setup
import time, json, requests, numpy as np, torch, psutil, gc
import matplotlib.pyplot as plt, seaborn as sns
sns.set_theme(style='whitegrid')
# Base URL of the Tensorus server probed by this notebook.
API = 'http://127.0.0.1:7860'


def server_ok():
    """Return True if GET ``{API}/health`` answers with HTTP 200 within 2 seconds.

    Any request-level failure (connection refused, timeout, ...) is treated
    as "server not available" and yields False.
    """
    try:
        response = requests.get(f'{API}/health', timeout=2)
    except requests.RequestException:
        # Only network/HTTP failures mean "demo mode"; KeyboardInterrupt and
        # programming errors are no longer swallowed as a bare except would.
        return False
    return response.status_code == 200


# Probed once; later cells use this flag to decide whether to hit the API.
SERVER = server_ok()
print('📡 Tensorus:', '✅ Connected' if SERVER else '⚠️ Demo Mode')

def timeit(fn):
    """Call ``fn`` once with no arguments and return the elapsed time in seconds.

    The interval is measured with ``time.perf_counter``; the return value of
    ``fn`` is discarded.
    """
    started = time.perf_counter()
    fn()
    return time.perf_counter() - started


## Step 1 — Tensor Operation Benchmarks (CPU/GPU)

In [None]:
# Tensor shapes to benchmark; the last entry is one-dimensional.
sizes = [
    (1024, 1024),
    (2048, 2048),
    (4096,),
]
# Operation names, in the order they are timed for each shape.
ops = ['creation', 'matmul', 'elementwise', 'reduction']
# One dict per timed run is appended here by the benchmark loop.
results = []
def bench_creation(sz, device):
    """Create and return a ``torch.randn`` tensor of shape ``sz`` on ``device``."""
    shape = tuple(sz)
    return torch.randn(*shape, device=device)
def bench_matmul(sz, device):
    """Multiply two random tensors derived from ``sz`` on ``device``.

    For a shape with at least two dimensions the left operand has shape
    ``sz`` and the right operand has shape ``(sz[-1], sz[-2])``. For a
    one-dimensional shape both operands are square ``(sz[0], sz[0])``.
    Returns the matrix product.
    """
    if len(sz) >= 2:
        lhs_shape = tuple(sz)
        rhs_shape = (sz[-1], sz[-2])
    else:
        side = sz[0]
        lhs_shape = rhs_shape = (side, side)
    # Left operand is drawn first, then the right one.
    lhs = torch.randn(*lhs_shape, device=device)
    rhs = torch.randn(*rhs_shape, device=device)
    return lhs @ rhs
def bench_elementwise(sz, device):
    """Return ``a*b + sin(a) - cos(b)`` for two random tensors of shape ``sz`` on ``device``."""
    left = torch.randn(*sz, device=device)
    right = torch.randn(*sz, device=device)
    product_plus_sin = left * right + torch.sin(left)
    return product_plus_sin - torch.cos(right)
def bench_reduction(sz, device):
    """Return ``sum + mean + std`` of a random tensor of shape ``sz`` on ``device``."""
    values = torch.randn(*sz, device=device)
    total = values.sum()
    return total + values.mean() + values.std()
# Maps each operation name in `ops` to the function that performs it.
dispatch = {
    'creation': bench_creation,
    'matmul': bench_matmul,
    'elementwise': bench_elementwise,
    'reduction': bench_reduction,
}
# Time every (shape, operation) pair once on CPU and, when CUDA is
# available, once on GPU. Each run appends a dict to `results`.
for sz in sizes:
    for op in ops:
        bench = dispatch[op]
        # CPU
        gc.collect()
        dt = timeit(lambda: bench(sz, 'cpu'))
        results.append({'size': sz, 'op': op, 'system': 'CPU', 'ms': dt * 1000})
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            # Drain any previously queued GPU work so it is not billed to this run.
            torch.cuda.synchronize()

            def _gpu_run():
                bench(sz, 'cuda')
                # CUDA kernels launch asynchronously; wait for completion
                # INSIDE the timed call, otherwise only launch overhead is measured.
                torch.cuda.synchronize()

            dt = timeit(_gpu_run)
            results.append({'size': sz, 'op': op, 'system': 'GPU', 'ms': dt * 1000})
print('Samples:', len(results))
# Plot: one line per (operation, system) pair; the x axis is the index of
# the run within that pair, i.e. the order in which the shapes were tested.
import pandas as pd
df = pd.DataFrame(results)
plt.figure(figsize=(10, 5))
for op in ops:
    sub = df[df['op'] == op]
    # The loop variable is deliberately NOT called `sys`: that name would
    # overwrite the `sys` module imported by the install cell and break it on re-run.
    for system in sub['system'].unique():
        ssub = sub[sub['system'] == system]
        plt.plot(range(len(ssub)), ssub['ms'], '-o', label=f'{op}-{system}')
plt.legend(); plt.ylabel('time (ms)'); plt.title('Operation Benchmarks (ordered by test)'); plt.show()

## Step 2 — Storage Benchmarks (Tensorus vs File vs Memory)

In [None]:
def bench_tensorus(tensor, n=50):
    """Time ``n`` GET round-trips for ``tensor`` stored through the Tensorus HTTP API.

    The tensor is uploaded once with POST, fetched ``n`` times, and then
    deleted. Only fetches answered with HTTP 200 contribute a sample.

    Args:
        tensor: object exposing ``tolist()``; that list form is sent as JSON.
        n: number of GET requests to issue.

    Returns:
        List of per-request latencies in seconds. Empty when the upload
        response carries no ``tensor_id`` or when any request fails.
    """
    times = []
    try:
        r = requests.post(f'{API}/api/v1/tensors', json={'tensor_data': tensor.tolist()}, timeout=5)
        payload = r.json()
        # Guard against a JSON body that is not an object (e.g. a list or string).
        tid = payload.get('tensor_id') if isinstance(payload, dict) else None
        if not tid:
            return []
        try:
            for _ in range(n):
                t0 = time.perf_counter()
                rr = requests.get(f'{API}/api/v1/tensors/{tid}', timeout=5)
                if rr.status_code == 200:
                    times.append(time.perf_counter() - t0)
        finally:
            # Remove the uploaded tensor even if a fetch raised, so a failed
            # run does not leave benchmark data behind on the server.
            requests.delete(f'{API}/api/v1/tensors/{tid}', timeout=5)
    except (requests.RequestException, ValueError):
        # Network/HTTP errors and undecodable JSON mean "no measurement".
        return []
    return times

def bench_file(tensor, n=50):
    """Time ``n`` ``torch.load`` calls of ``tensor`` saved to a temporary file.

    The tensor is written once with ``torch.save`` and then loaded ``n``
    times. The temporary storage is removed when the function exits, whether
    it returns normally or raises.

    Args:
        tensor: object to serialize with ``torch.save``.
        n: number of loads to time.

    Returns:
        List of ``n`` per-load durations in seconds.
    """
    import tempfile, os
    times = []
    # A TemporaryDirectory guarantees cleanup on every exit path; the
    # previous unlink-at-the-end leaked the file if save or load raised.
    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, 'tensor.pt')
        torch.save(tensor, path)
        for _ in range(n):
            t0 = time.perf_counter()
            _ = torch.load(path)
            times.append(time.perf_counter() - t0)
    return times

def bench_memory(tensor, n=50):
    """Time ``n`` in-memory ``clone()`` calls on a private copy of ``tensor``.

    Returns a list of ``n`` per-clone durations in seconds.
    """
    snapshot = tensor.clone()
    durations = []
    for _ in range(n):
        began = time.perf_counter()
        # Bound to `_` so the copy stays alive until the loop rebinds the
        # name, i.e. it is released outside the timed span.
        _ = snapshot.clone()
        durations.append(time.perf_counter() - began)
    return durations

# Payload used by all three storage benchmarks.
tensor = torch.randn(1024, 1024)
# Size in bytes, derived from the tensor's dtype instead of assuming 4-byte elements.
data_size = tensor.numel() * tensor.element_size()
file_times = bench_file(tensor, n=40)
mem_times = bench_memory(tensor, n=40)
# The Tensorus benchmark only runs when the health probe succeeded.
tus_times = bench_tensorus(tensor, n=40) if SERVER else []
print('Ops collected: file',len(file_times),'mem',len(mem_times),'tensorus',len(tus_times))
def ms(x):
    """Return the mean of ``x`` multiplied by 1000, or None when ``x`` is empty."""
    if not x:
        return None
    return np.mean(x) * 1000
print('Mean ms — File:',ms(file_times),'Memory:',ms(mem_times),'Tensorus:',ms(tus_times))
# Bar chart of mean retrieval latency; Tensorus is only plotted when it
# produced at least one sample.
series = [('File', file_times), ('Memory', mem_times)]
if tus_times:
    series.append(('Tensorus', tus_times))
labels = [name for name, _samples in series]
means = [ms(_samples) for _name, _samples in series]
palette = ['gray', 'orange', 'tab:blue'][:len(labels)]
plt.figure()
plt.bar(labels, means, color=palette)
plt.ylabel('ms')
plt.title('Storage Retrieval Mean Latency')
plt.show()

## Step 3 — Summary and Takeaways

- CPU vs GPU differences are highlighted across operations.
- Comparing file, in-memory, and (when connected) Tensorus retrieval latency shows the storage tradeoffs.
- Use these patterns to size workloads and choose the right execution mode.