<a href="https://colab.research.google.com/github/Achuttarsing/Slow-Fast-pytorch-implementation/blob/master/semantic_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
def url_to_image(url):
    """Download the resource at *url* and decode it as a colour image.

    Args:
        url: Address handed to ``urllib.request.urlopen``.

    Returns:
        Whatever ``cv2.imdecode(..., cv2.IMREAD_COLOR)`` produces for the
        downloaded bytes.
    """
    # Read the payload inside a context manager so the HTTP response (and
    # its socket) is released even if reading fails.
    with urllib.request.urlopen(url) as resp:
        payload = resp.read()

    # OpenCV decodes from a flat uint8 buffer.
    image = np.asarray(bytearray(payload), dtype="uint8")
    return cv2.imdecode(image, cv2.IMREAD_COLOR)

import ast
def read_csv_shot(path):
    """Load a shots CSV and turn its list-valued columns into Python objects.

    The first CSV column becomes the index. For each known list column that
    is present, missing cells are replaced by ``"[]"`` and every cell is
    parsed with ``ast.literal_eval``.

    Args:
        path: Location of the CSV file, as accepted by ``pandas.read_csv``.

    Returns:
        The resulting ``pandas.DataFrame``.
    """
    list_columns = (
        'detected_things',
        'detected_things_more',
        'talking_persons_in_scene',
        'talking_persons_in_shot',
        'detected_people',
    )
    frame = pd.read_csv(path, index_col=0)
    for column in (c for c in list_columns if c in frame.columns):
        # Empty cells become the literal "[]" so they parse to an empty list.
        frame[column] = frame[column].fillna("[]").apply(ast.literal_eval)
    return frame

import clip
import torch

import datetime
from IPython.core.display import HTML
import time

def search_video(search_query, return_urls=False, top_n=10):
    """Rank the indexed rows of ``VIDEO_FEATURES`` against a text query.

    The query is encoded with the module-level CLIP ``model``, compared with
    every row of ``VIDEO_FEATURES``, and the best matches are grouped by
    movie using the cumulative offsets in ``LENS``.

    Args:
        search_query: Text handed to ``clip.tokenize``.
        return_urls: When truthy, additionally return one image URL and one
            score per hit.
        top_n: Maximum number of hits. It is capped to the number of indexed
            rows so that asking for more than exist does not raise.

    Returns:
        ``(EXPORT, computing_time)`` when ``return_urls`` is falsy, otherwise
        ``(EXPORT, computing_time, URLS, PROBAS)``. ``EXPORT`` maps a movie
        id to ``{'shot_ids': [...], 'similarity_scores': [...]}``;
        ``computing_time`` is the time in seconds spent encoding and ranking,
        rounded to 4 decimals; ``URLS`` and ``PROBAS`` are ordered by
        decreasing similarity.
    """
    start = time.time()

    # Encode and normalize the search query using CLIP
    with torch.no_grad():
        text_features = model.encode_text(clip.tokenize(search_query).to(device))
        text_features /= text_features.norm(dim=-1, keepdim=True)

    # The stored features are cast to float32 and may live on another device
    # than the model (they are loaded on CPU); align the query embedding so
    # the matrix product is valid on GPU as well.
    text_features = text_features.to(device=VIDEO_FEATURES.device, dtype=torch.float32)

    # Compute the similarity between the search query and each row using the Cosine similarity
    similarities = (100.0 * VIDEO_FEATURES.float() @ text_features.T)
    # topk raises if k exceeds the number of rows, so never ask for more.
    k = min(top_n, similarities.shape[0])
    values, best_photo_idx = similarities.topk(k, dim=0)
    best_photo_idx = [x.item() for x in best_photo_idx]

    computing_time = round(time.time() - start, 4)
    print("computing time =",computing_time,"sec")

    EXPORT = {}
    URLS = []
    PROBAS = []
    for sim, frame_id in zip(values, best_photo_idx):
        # LENS holds cumulative row counts starting at 0: the last offset
        # that is <= frame_id belongs to the movie containing this row.
        movie_pos = int((LENS <= frame_id).sum()) - 1
        movie_id = MOVIE_IDS[movie_pos]
        score = sim.item()
        # Shot ids are 1-based within their movie.
        shot_id = int(frame_id - LENS[movie_pos]) + 1

        entry = EXPORT.setdefault(movie_id, {'shot_ids': [], 'similarity_scores': []})
        entry['shot_ids'].append(shot_id)
        entry['similarity_scores'].append(score)

        if return_urls:
            URLS.append('https://ateliernumerique.ensci.com/oucipo/data/PROCESSED_MOVIES/'+MOVIE_TITLES[movie_pos]+'/image_shots/shot-'+str(shot_id).zfill(4)+'-01.jpg')
            PROBAS.append(score)

    if return_urls:
        return EXPORT, computing_time, URLS, PROBAS
    return EXPORT, computing_time

# Use the GPU when PyTorch reports one, otherwise the CPU, then load the
# CLIP "ViT-B/32" model and its preprocessing transform onto that device.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

import pandas as pd
from PIL import Image
from ftplib import FTP
from tqdm import tqdm
import urllib.request
import numpy as np
import os, json, random

# NOTE(review): the FTP credentials are hard-coded here; consider moving
# them to configuration or environment variables.
ftp = FTP(host='ateliernumerique.ensci.com', user='ftp-oucipo-data', passwd='oucipo')  # connect to host, default port
ftp.cwd('/data/PROCESSED_MOVIES')
PROCESSED_MOVIES = ftp.nlst()

VIDEO_FEATURES = {}
MOVIE_TITLES = []
# Folder name per movie id. Keeping titles keyed by id guarantees that
# MOVIE_TITLES stays aligned with MOVIE_IDS even if two folders report the
# same movie id (the later folder then replaces the earlier one in both).
_TITLE_BY_MOVIE_ID = {}
for FOLDER in tqdm(PROCESSED_MOVIES, desc='Loading clip encodings'):
    try:
        ftp.cwd('/data/PROCESSED_MOVIES/'+FOLDER)

        # load shots; the file is closed (and flushed) before it is read back
        with open('shots.csv', 'wb') as shots_file:
            ftp.retrbinary("RETR shots.csv", shots_file.write)
        shot = read_csv_shot('shots.csv')
        MOVIE_ID = str(shot.at[1,'movie_id']).zfill(7)

        with open('clip_encoding', 'wb') as encoding_file:
            ftp.retrbinary("RETR clip_encoding", encoding_file.write)
        # NOTE(review): torch.load unpickles the downloaded file; only use
        # it with a server whose content is trusted.
        FEATURES = torch.load("clip_encoding",map_location=torch.device('cpu'))
    except Exception as exc:
        # Best effort: skip folders that cannot be loaded, but say why.
        print("cant process :",FOLDER,"-",repr(exc))
        continue

    VIDEO_FEATURES[MOVIE_ID] = FEATURES
    _TITLE_BY_MOVIE_ID[MOVIE_ID] = FOLDER

if not VIDEO_FEATURES:
    raise RuntimeError("no clip encoding could be loaded from the FTP server")

MOVIE_IDS = list(VIDEO_FEATURES)
MOVIE_TITLES = [_TITLE_BY_MOVIE_ID[x] for x in MOVIE_IDS]
# Cumulative row offsets: LENS[i] is the first global row of movie i.
LENS = np.array([0] + [VIDEO_FEATURES[x].shape[0] for x in MOVIE_IDS]).cumsum()
VIDEO_FEATURES = torch.cat([VIDEO_FEATURES[x] for x in MOVIE_IDS])

MOVIE_TITLES

In [None]:
from flask import Flask, render_template, request
from flask import Flask
import simplejson

# Flask application object; the @app.route handlers in this cell register on it.
app = Flask(__name__)

@app.route("/")
def home():
    """Return the HTML page holding the search form.

    The form posts a single ``text_input`` field to ``/resultat``.
    """
    page = '''<!doctype html>
	<html lang="fr">
		<head>
			<meta charset="utf-8">
			<title>Le formulaire</title>
		</head>
		<body>
			<form action="/resultat" method="post">
					<label>text input</label> : <input type="text" name="text_input" />
					<input type="submit" value="Envoyer" />
			</form>
		</body>
	</html>'''
    return page
  

@app.route('/resultat',methods = ['POST'])
def resultat():
    """Run the posted query through ``search_video`` and return the hits as JSON text.

    Reads the ``text_input`` field of the submitted form, asks for up to
    1000 hits and serializes the per-movie result dictionary with
    ``simplejson`` (NaN values are written as ``null``).
    """
    text_input = request.form['text_input']
    print("REQUEST =",text_input)
    # Only the grouped hits are serialized, so do not build the per-hit
    # URLs and scores. An HTML rendering with thumbnails would need
    # return_urls=True instead.
    EXPORT, _ = search_video(text_input, return_urls=False, top_n=1000)
    return simplejson.dumps(EXPORT, ignore_nan=True)


# Start serving the Flask application with its default settings.
app.run()