# Tutorial 10 (Enhanced): Production — HA/DR, Monitoring, SLO Alerts

This tutorial simulates a deployment, a high-availability (HA) failover, a disaster-recovery (DR) restore, and a short monitoring run with SLO-based alerts, and then produces a status report. It works without a live API.

In [None]:
# Install any third-party packages this tutorial needs that are not yet importable.
import sys, subprocess, pkgutil
import importlib.util

# importlib.util.find_spec replaces pkgutil.find_loader, which is deprecated
# since Python 3.12 and removed in 3.14. For a top-level module name it
# returns None when the module cannot be found.
for p in ['numpy','matplotlib','seaborn','requests','psutil']:
    if importlib.util.find_spec(p) is None:
        # Use the running interpreter's pip so the package lands in this environment.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', p])
print('✅ Dependencies ready')

In [None]:
from tutorial_utils import (
    ping_server,
    ensure_dataset,
    ingest_tensor,
    fetch_dataset,
    summarize_records,
    tensor_addition,
    pretty_json,
)
# Base URL of the Tensorus API this notebook probes (loopback address, port 7860).
API = "http://127.0.0.1:7860"
# NOTE(review): ping_server is imported from tutorial_utils and is not shown here;
# presumably it reports whether the API at this URL is reachable — confirm.
SERVER = ping_server(API)
print(f"📡 Tensorus server available: {SERVER}")

In [None]:
# Setup
import time, json, threading, requests, psutil, numpy as np
from dataclasses import dataclass, field
from enum import Enum
from datetime import datetime
import matplotlib.pyplot as plt, seaborn as sns
# Select seaborn's 'whitegrid' style for the plots drawn later in the notebook.
sns.set_theme(style='whitegrid')
class Env(Enum):
    """Deployment environments; each member's value is its full lowercase name."""

    DEV = 'development'
    STAGE = 'staging'
    PROD = 'production'
class Sev(Enum):
    """Alert severity levels; each member's value is the label stored on an alert."""

    INFO = 'info'
    WARN = 'warning'
    ERR = 'error'
    CRIT = 'critical'
@dataclass
class Metrics:
    """A single monitoring sample; fields are positional in the order declared."""

    # Time at which the sample was taken.
    ts: datetime
    # CPU utilisation, as a percentage.
    cpu: float
    # Memory utilisation, as a percentage.
    mem: float
    # Disk utilisation, as a percentage.
    disk: float
    # Request rate (presumably requests per second, going by the name).
    rps: float
    # Error rate, compared against the 'err' alert threshold.
    err: float
    # Response time in milliseconds.
    rt_ms: float
class Monitor:
    """Collect metric samples, raise threshold alerts, and summarise health.

    Attributes:
        hist: every sample returned by collect(), oldest first.
        alerts: dicts with keys 'ts', 'sev' and 'msg', appended by _alerts().
        thres: alert/warning thresholds keyed by 'cpu', 'mem', 'err' and 'rt'.
    """

    def __init__(self):
        self.hist = []
        self.alerts = []
        self.thres = {'cpu': 80.0, 'mem': 85.0, 'err': 5.0, 'rt': 800.0}

    def collect(self):
        """Take one sample, store it in hist, evaluate alerts, and return it.

        CPU, memory and disk figures come from psutil; request rate, error
        rate and response time are random draws, so no live API is needed.
        """
        m = Metrics(
            datetime.now(),
            psutil.cpu_percent(0.1),
            psutil.virtual_memory().percent,
            psutil.disk_usage('/').percent,
            np.random.uniform(80, 200),    # simulated request rate
            np.random.uniform(0, 6),       # simulated error rate
            np.random.uniform(40, 1200),   # simulated response time (ms)
        )
        self.hist.append(m)
        self._alerts(m)
        return m

    def _alerts(self, m):
        """Append one alert per threshold in thres that sample m exceeds."""
        def add(sev, msg):
            self.alerts.append({'ts': datetime.now(), 'sev': sev.value, 'msg': msg})

        if m.cpu > self.thres['cpu']:
            add(Sev.WARN, 'High CPU')
        if m.mem > self.thres['mem']:
            add(Sev.WARN, 'High MEM')
        if m.err > self.thres['err']:
            add(Sev.ERR, 'High error rate')
        if m.rt_ms > self.thres['rt']:
            add(Sev.WARN, 'Slow response')

    def run(self, samples=6, interval=1.0):
        """Collect `samples` samples, pausing `interval` seconds between them."""
        for i in range(samples):
            # Sleep only between samples; a pause after the last one would
            # delay the caller without gathering any more data.
            if i:
                time.sleep(interval)
            self.collect()

    def report(self):
        """Summarise the most recent samples (at most ten).

        Returns:
            {'status': 'no_data'} when nothing has been collected; otherwise
            {'status': ..., 'avg': {'cpu', 'mem', 'err'}} where status is
            'critical', 'warning' or 'healthy'.
        """
        if not self.hist:
            return {'status': 'no_data'}

        recent = self.hist[-10:]

        def avg(k):
            return float(np.mean([getattr(x, k) for x in recent]))

        cpu, mem, err = avg('cpu'), avg('mem'), avg('err')
        if cpu > 90 or mem > 95 or err > 10:
            # The critical tier uses fixed limits.
            status = 'critical'
        elif cpu > self.thres['cpu'] or mem > self.thres['mem'] or err > self.thres['err']:
            # The warning tier follows the same thresholds that drive the
            # alerts, so customising thres keeps the report and alerts in step.
            status = 'warning'
        else:
            status = 'healthy'
        return {'status': status, 'avg': {'cpu': cpu, 'mem': mem, 'err': err}}
class Deployment:
    """Simulated deployment with one active and one standby region.

    Every method only prints progress, sleeps briefly and returns a small
    dict; no real infrastructure is touched.
    """

    def __init__(self, env=Env.PROD):
        self.env = env
        self.active_region = 'us-central'
        self.standby_region = 'us-east'

    def deploy(self):
        """Print the numbered rollout steps and return the env and active region."""
        steps = ['Validate', 'Provision', 'Deploy', 'LB', 'Monitoring', 'Health']
        total = str(len(steps))
        for position, step in enumerate(steps, start=1):
            print(str(position) + '/' + total, step + '...')
        time.sleep(0.4)
        print('✅ Deployed')
        return {'env': self.env.value, 'region': self.active_region}

    def failover(self):
        """Swap the active and standby regions and return the new active one."""
        print('⚠️ Failover from', self.active_region, 'to', self.standby_region)
        previous_active = self.active_region
        self.active_region = self.standby_region
        self.standby_region = previous_active
        return {'active_region': self.active_region}

    def dr_restore(self):
        """Announce a DR restore into the current standby region."""
        print('🛠️ DR restore into', self.standby_region)
        time.sleep(0.3)
        return {'dr': 'ok'}


## Deploy & Monitor

In [None]:
# Run a deployment, then take six monitoring samples at a 1.0 s interval.
dep = Deployment()
info = dep.deploy()
mon = Monitor()
mon.run(samples=6, interval=1.0)
rep = mon.report()
# Bare tuple as the final expression so the notebook displays both results.
info, rep

## Visualize CPU/MEM and Alerts

In [None]:
# Pull the CPU and memory series out of the collected samples.
cpus = [sample.cpu for sample in mon.hist]
mems = [sample.mem for sample in mon.hist]

plt.figure(figsize=(8, 3))
plt.plot(cpus, '-o', label='CPU')
plt.plot(mems, '-o', label='MEM')
plt.legend()
plt.title('System Metrics')
plt.show()

# Final expression: the notebook displays the five most recent alerts.
mon.alerts[-5:]

## HA/DR Simulation

In [None]:
# Fail over to the standby region, run a DR restore, then redeploy.
dep.failover()
dep.dr_restore()
# Final expression: the notebook displays deploy()'s returned dict.
dep.deploy()