# GIF search with MiniLM-L6 and CLIP embeddings
Using the Tumblr GIF (TGIF) dataset: https://github.com/raingo/TGIF-Release

In [None]:
!pip install requests indexify

In [None]:
# Indexify SDK: IndexifyClient is the handle used for every call below;
# Document wraps a piece of text (plus labels) for ingestion.
from indexify import IndexifyClient, Document
# NOTE(review): constructed with no arguments, so this presumably targets the
# SDK's default server address — confirm for your deployment.
client = IndexifyClient()

In [None]:
# Print the name of each extractor the client reports, one per line.
for extractor in client.extractors():
    print(extractor.name)

# Add policies

In [None]:
# Register the CLIP extractor under the policy name "clip-gif".
# NOTE(review): labels_eq="content:image" presumably restricts this policy to
# content labelled content=image (the label the ingest loop attaches to each
# GIF) — confirm against the Indexify docs.
client.add_extraction_policy("tensorlake/clip-extractor", name="clip-gif", labels_eq="content:image")

In [None]:
# Register the MiniLM-L6 extractor under the policy name "minilm-description".
# No label filter is passed here, unlike the CLIP policy.
client.add_extraction_policy("tensorlake/minilm-l6", name="minilm-description")

In [None]:
# Show the indexes known to the client. The searches later in this notebook
# query "minilm-description.embedding" and "clip-gif.embedding", so those names
# are presumably what the two policies produce — verify they appear here.
client.indexes()

In [None]:
import requests

# Download the TGIF annotation file; each row is "<gif url>\t<description>".
res = requests.get(
    "https://raw.githubusercontent.com/raingo/TGIF-Release/master/data/tgif-v1.0.tsv",
    timeout=30,  # do not hang the notebook forever on a stalled connection
)
# Fail here on an HTTP error instead of parsing an error page as dataset rows.
res.raise_for_status()
# split("\n") leaves an empty string after the final newline; drop blank rows
# so every entry in `items` is an actual dataset line.
items = [line for line in res.text.split("\n") if line.strip()]

# Ingest data

In [None]:
# Ingest the first 1000 rows. Each GIF is registered as a remote file labelled
# content=image, and its description is added as a text document; both carry
# the GIF URL as a label so search hits can be mapped back to the GIF.
for item in items[0:1000]:
    # partition() splits on the first tab only, so a blank row or a description
    # containing a tab no longer raises ValueError on unpacking.
    url, _, text = item.partition("\t")
    url, text = url.strip(), text.strip()
    if not url or not text:
        print("skipping malformed row", repr(item))
        continue
    # validate image
    try:
        # stream=True defers the body download: only the headers are needed
        # here, so the GIF itself is not fetched just to be discarded.
        with requests.get(url, stream=True, timeout=10) as r:
            # Compare only the media type, ignoring case and any parameters.
            media_type = r.headers.get("Content-Type", "").split(";")[0].strip().lower()
            is_gif = r.ok and media_type == "image/gif"
    except requests.RequestException:
        # An unreachable host is treated like a removed image, not a crash.
        is_gif = False
    if not is_gif:
        print("image removed", url)
        continue
    print("ingesting", url)
    client.ingest_remote_file(url, "image/gif", {"url":url, "content":"image"})
    client.add_documents(Document(text=text, labels={"url": url}))

# Search Data

In [None]:
# Free-text search query.
query = "cats being curious"
# Number of hits requested from each index; also the iteration bound when
# the two result lists are merged.
max_results = 10

In [None]:
# Run the same text query against both indexes: the MiniLM index built from
# the GIF descriptions and the CLIP index built from the GIFs themselves.
# The `query` variable is passed instead of a hard-coded string so that
# editing the parameters cell actually changes what is searched for.
minilm_results = client.search_index("minilm-description.embedding", query, max_results)
clip_results = client.search_index("clip-gif.embedding", query, max_results)

#### Merge results

In [None]:
from itertools import zip_longest

# Merge the two result lists into one de-duplicated list of GIF URLs,
# interleaving them rank by rank (MiniLM hit first, then the CLIP hit at the
# same rank). A list is used so the ranking order survives; `seen` provides the
# O(1) duplicate check.
results = []
seen = set()
# zip_longest pads the shorter list with None, so an index that returns fewer
# than max_results hits no longer raises IndexError.
for minilm_hit, clip_hit in zip_longest(minilm_results[:max_results], clip_results[:max_results]):
    for hit in (minilm_hit, clip_hit):
        if hit is None:
            continue
        url = hit.get("labels", {}).get("url")
        if url and url not in seen:
            seen.add(url)
            results.append(url)

In [None]:
# Display the merged, de-duplicated GIF URLs as a list.
list(results)