# Tutorial 9 (Enhanced): Multi-Modal — Cross-Modal (Text↔Image, Tensor→Text)

We insert text, image, and tensor items, build relationships between them, and run cross-modal searches, including a tensor→text demo.

In [None]:
# Install any missing third-party dependencies into the interpreter that is
# running this notebook (sys.executable), so the packages land in the kernel's
# own environment.
import sys, subprocess, pkgutil
import importlib.util
# Each entry is (pip distribution name, importable module name). The two differ
# for Pillow, which is installed as 'pillow' but imported as 'PIL'; probing for
# a module called 'pillow' would never succeed and would re-run pip every time.
for p, module in [
    ('numpy', 'numpy'),
    ('torch', 'torch'),
    ('matplotlib', 'matplotlib'),
    ('seaborn', 'seaborn'),
    ('requests', 'requests'),
    ('pillow', 'PIL'),
]:
    # importlib.util.find_spec replaces the deprecated pkgutil.find_loader;
    # for a top-level name it returns None when the module cannot be found.
    if importlib.util.find_spec(module) is None:
        # check_call raises CalledProcessError if pip exits with a non-zero status.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', p])
print('✅ Dependencies ready')

In [None]:
from tutorial_utils import (
    ping_server,
    ensure_dataset,
    ingest_tensor,
    fetch_dataset,
    summarize_records,
    tensor_addition,
    pretty_json,
)
# Base URL of the Tensorus API server; 127.0.0.1 is the loopback address,
# so the server is expected on this machine at port 7860.
API = "http://127.0.0.1:7860"
# ping_server comes from tutorial_utils and is not shown here; presumably it
# returns an availability indicator for the server -- TODO confirm.
SERVER = ping_server(API)
print(f"📡 Tensorus server available: {SERVER}")

In [None]:
# Setup
import numpy as np, torch, requests
from dataclasses import dataclass, field
from typing import Dict, Any, List
from PIL import Image
import matplotlib.pyplot as plt, seaborn as sns
sns.set_theme(style='whitegrid')
def embed_text(text:str):
    """Return a deterministic pseudo-random 512-dim float32 embedding for *text*.

    This is a stand-in for a real text encoder: the text only selects the seed
    of a NumPy generator, and the embedding is 512 draws from a standard
    normal distribution. Equal strings always map to equal vectors.

    The seed is derived from a SHA-256 digest rather than the builtin
    ``hash()``, because ``hash()`` of a str is salted per interpreter process
    and would yield different embeddings after every kernel restart.
    """
    import hashlib

    # 'surrogatepass' keeps this total over every str value, as hash() was.
    digest = hashlib.sha256(text.encode('utf-8', 'surrogatepass')).digest()
    # The first four bytes give a 32-bit seed, the same range as the former
    # abs(hash(text)) % 2**32.
    seed = int.from_bytes(digest[:4], 'big')
    rng = np.random.default_rng(seed)
    return torch.tensor(rng.normal(size=512), dtype=torch.float32)
def embed_image(arr:np.ndarray):
    """Return a 512-dim float32 embedding made of the image's channel means.

    The array is averaged over its first two axes (rows and columns), leaving
    one mean per remaining entry (three values for an H x W x 3 image). The
    result is flattened, cut to at most 512 values, and zero-padded up to
    exactly 512, so every embedding has the same length.

    Args:
        arr: Image array with at least two dimensions.

    Returns:
        A 1-D ``torch.float32`` tensor of length 512.
    """
    # flatten() turns the 0-d result of a 2-D (grayscale) input into a 1-D
    # tensor; torch.cat cannot concatenate zero-dimensional tensors.
    v = torch.tensor(arr.mean(axis=(0, 1)), dtype=torch.float32).flatten()
    if v.numel() >= 512:
        # Truncate so oversized inputs still produce a 512-dim vector.
        return v[:512]
    return torch.cat([v, torch.zeros(512 - v.numel())])
def embed_tensor(t:torch.Tensor):
    """Flatten *t* into a 512-dim float32 vector.

    Tensors with more than 512 elements keep only their first 512 values (in
    flattened order); smaller ones are padded with trailing zeros up to 512.
    """
    flat = t.flatten()
    count = flat.numel()
    if count > 512:
        flat = flat[:512]
    else:
        # When count is exactly 512 this appends an empty tensor.
        flat = torch.cat([flat, torch.zeros(512 - count)])
    return flat.float()
def cosine(a:torch.Tensor,b:torch.Tensor):
    """Return the cosine similarity of two 1-D tensors as a Python float.

    Both inputs are cast to float32 first. A small epsilon (1e-8) is added to
    the denominator so that an all-zero vector gives 0.0 instead of a
    division by zero.
    """
    lhs, rhs = a.float(), b.float()
    denom = torch.norm(lhs) * torch.norm(rhs) + 1e-8
    return float(torch.dot(lhs, rhs) / denom)
@dataclass
class MMItem:
    """A single stored item of any modality together with its embedding."""

    # Identifier of the item; the notebook uses it as the key into `store`.
    data_id: str
    # Modality label; the notebook uses 'text', 'image' and 'tensor'.
    modality: str
    # The raw payload: a string, an image array, or a tensor in this notebook.
    content: Any
    # Embedding vector compared with cosine similarity in the search cells.
    embedding: torch.Tensor
    # Free-form metadata; default_factory gives every item its own dict.
    meta: Dict[str, Any] = field(default_factory=dict)
# In-memory registry of every inserted item, keyed by its data_id.
store: Dict[str, MMItem] = {}
# Ids of the inserted items grouped by modality label.
by_mod = {'text': [], 'image': [], 'tensor': []}
def random_image(w=128,h=128):
    """Return an h x w x 3 uint8 array of uniform random noise.

    Values are drawn from NumPy's global random state, scaled by 255 and
    truncated to unsigned 8-bit integers.
    """
    noise = np.random.rand(h, w, 3)
    return (noise * 255).astype('uint8')

## Insert Data

In [None]:
# Seed the in-memory store with two items per modality. Every record is a
# (data_id, modality, content, metadata) tuple.
texts = [
    ('doc1', 'text', 'CNNs for images', {'domain': 'vision'}),
    ('doc2', 'text', 'Speech models with attention', {'domain': 'audio'}),
]
imgs = [
    ('img1', 'image', random_image(), {'domain': 'vision'}),
    ('img2', 'image', random_image(), {'domain': 'medical'}),
]
tensors = [
    ('ten1', 'tensor', torch.randn(64, 64), {'domain': 'vision'}),
    ('ten2', 'tensor', torch.randn(32, 128), {'domain': 'nlp'}),
]
# Pair each batch with the embedding function for its modality so a single
# loop replaces three copy-pasted ones.
for batch, embed in ((texts, embed_text), (imgs, embed_image), (tensors, embed_tensor)):
    for pid, mod, content, meta in batch:
        store[pid] = MMItem(pid, mod, content, embed(content), meta)
        # Re-running this cell overwrites the entry in `store`, so the id must
        # not be appended to the per-modality index a second time; duplicates
        # there would show up as repeated candidates in the search cells.
        if pid not in by_mod[mod]:
            by_mod[mod].append(pid)
# Last expression of the cell: the notebook displays the item count per modality.
{k: len(v) for k, v in by_mod.items()}

## Relationships

In [None]:
# Record cross-modal links under each item's 'related' metadata key: doc1
# points at img1 and ten1, and each of those points back at doc1.
links = {'doc1': ['img1', 'ten1'], 'img1': ['doc1'], 'ten1': ['doc1']}
for src, targets in links.items():
    store[src].meta['related'] = list(targets)
# Last expression of the cell, shown by the notebook as a completion marker.
'ok'

## Text → Images

In [None]:
# Embed the text query and rank every stored image by cosine similarity.
q = 'convolutional networks for images'
qe = embed_text(q)
cand = [store[i] for i in by_mod['image']]
sc = [(c, cosine(qe, c.embedding)) for c in cand]
# Highest similarity first; keep at most the top three matches.
sc.sort(key=lambda pair: pair[1], reverse=True)
sc = sc[:3]
for it, score in sc:
    print(it.data_id, round(score, 3), it.meta.get('domain'))
# Show the top-ranked image.
best = sc[0][0]
plt.figure()
plt.imshow(best.content)
plt.title('Best match: ' + best.data_id)
plt.axis('off')
plt.show()

## Tensor → Text

In [None]:
# Use the stored tensor's embedding as the query and rank all text items.
qe = store['ten1'].embedding
cand = [store[i] for i in by_mod['text']]
sc = [(c, cosine(qe, c.embedding)) for c in cand]
# Highest similarity first.
sc.sort(key=lambda pair: pair[1], reverse=True)
# Last expression of the cell: (data_id, rounded score) pairs for display.
[(it.data_id, round(s, 3)) for it, s in sc]