In [None]:
!pip install tmdbv3api pandas numpy scikit-learn sentence_transformers gradio ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git
!pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu117

import gradio as gr
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import torch
import clip
import requests
from PIL import Image
from io import BytesIO
from tmdbv3api import TMDb, Movie


import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.preprocessing import MultiLabelBinarizer


from textblob import TextBlob


# Fetch the NLTK corpora used for stop-word removal and lemmatisation.
for corpus_name in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(corpus_name)


# Initialise the TMDB client.
import os

tmdb = TMDb()
# Take the key from the TMDB_API_KEY environment variable when it is set, so
# the secret does not have to be pasted into the notebook; otherwise keep the
# original empty placeholder (requests will fail until a key is supplied).
# NOTE(review): tmdbv3api appears to store the key in this same environment
# variable, so an unconditional '' would overwrite a configured key - confirm.
tmdb.api_key = os.environ.get('TMDB_API_KEY', '')
tmdb.language = 'en'

# Fetch movies with pagination
def fetch_movies(num_movies=200):
    """Collect up to *num_movies* popular movies from TMDB into a DataFrame.

    Pages of the "popular" listing are requested one after another until
    enough movies were gathered or a page comes back empty. For every movie
    a details request is made to obtain its genre names; a movie whose
    details lookup (or record construction) raises is logged and skipped.

    Args:
        num_movies: Maximum number of rows to return.

    Returns:
        A DataFrame with the columns ``title``, ``overview``, ``poster_url``,
        ``genres`` (list of genre names) and ``vote_average``. The columns
        are present even when no movie could be fetched, so callers can
        index them without a KeyError.
    """
    columns = ['title', 'overview', 'poster_url', 'genres', 'vote_average']
    movie_api = Movie()
    all_movies = []
    page = 1
    while len(all_movies) < num_movies:
        movies = movie_api.popular(page=page)
        if not movies:
            # An empty page means the listing is exhausted.
            break

        for movie in movies:
            if len(all_movies) >= num_movies:
                break
            try:
                details = movie_api.details(movie.id)
                all_movies.append({
                    'title': movie.title,
                    'overview': movie.overview or "",
                    'poster_url': f"https://image.tmdb.org/t/p/w500{movie.poster_path}" if movie.poster_path else None,
                    'genres': [g['name'] for g in details.genres],
                    'vote_average': movie.vote_average
                })
            except Exception as e:
                # Best effort: one bad movie must not abort the whole fetch.
                print(f"Skipping movie {movie.id}: {str(e)}")
        page += 1
    # Passing the column list pins the schema even for an empty result.
    return pd.DataFrame(all_movies, columns=columns)

# Text cleaning with lemmatisation
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')
# Filled on the first clean_text() call so the stop-word list and the
# lemmatizer are built once instead of once per cleaned row.
_CLEAN_TEXT_RESOURCES = {}


def clean_text(text):
    """Normalise *text* for the text-similarity models.

    The input is converted with ``str()``, lower-cased, stripped of every
    character that is not an ASCII letter or whitespace, split on
    whitespace, filtered against the NLTK English stop-word list and
    lemmatised word by word with ``WordNetLemmatizer``.

    Args:
        text: Any value; it is passed through ``str()`` first.

    Returns:
        The remaining lemmatised words joined by single spaces (possibly an
        empty string).
    """
    if not _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    stop_words = _CLEAN_TEXT_RESOURCES['stop_words']
    lemmatize = _CLEAN_TEXT_RESOURCES['lemmatizer'].lemmatize

    letters_only = _NON_LETTER_PATTERN.sub('', str(text).lower())
    return ' '.join(
        lemmatize(word)
        for word in letters_only.split()
        if word not in stop_words
    )


# Load the movie table and derive the text features used by the text models
df = fetch_movies(200)
df['cleaned_overview'] = df['overview'].map(clean_text)
df['combined_features'] = df['cleaned_overview'] + ' ' + df['genres'].map(' '.join)


# Every distinct genre name, in alphabetical order
all_genres = sorted(set().union(*df['genres']))

# TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_features'])

# Sentence Embeddings
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedding_model.encode(df['combined_features'])

# CLIP: load the model and compute one image embedding per movie poster.
# If anything in this section raises (including the model load), the outer
# handler at the bottom sets clip_model to None and uses an all-zero matrix.
device = "cuda" if torch.cuda.is_available() else "cpu"
try:
    clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)
    clip_text_embeddings = np.zeros((len(df), 512))  # Placeholder; not read anywhere in the visible code
    # Build the poster embeddings, one row per movie in df order
    poster_embeddings = []
    for url in df['poster_url']:
        try:
            if url:
                response = requests.get(url, timeout=10)
                img = Image.open(BytesIO(response.content))
                with torch.no_grad():
                    image = clip_preprocess(img).unsqueeze(0).to(device)
                    emb = clip_model.encode_image(image).cpu().numpy()
            else:
                # No poster URL: use a zero row so positions stay aligned with df.
                # NOTE(review): 512 is presumably the ViT-B/32 embedding width - confirm.
                emb = np.zeros((1, 512))
        except Exception as e:
            # Any failure for a single poster is logged and replaced by a zero row.
            print(f"Error processing poster: {str(e)}")
            emb = np.zeros((1, 512))
        poster_embeddings.append(emb)
    # Stack the per-movie rows into a single matrix
    poster_embeddings = np.vstack(poster_embeddings)
except Exception as e:
    print(f"CLIP initialization failed: {e}")
    clip_model = None
    poster_embeddings = np.zeros((len(df), 512))


# Hybrid search
def _min_max_normalize(values):
    """Rescale *values* to start at 0; the epsilon avoids division by zero."""
    return (values - values.min()) / (np.ptp(values) + 1e-8)


def hybrid_search(query, use_text=True, use_poster=True):
    """Rank every movie in ``df`` against *query*.

    The query is first spell-corrected with TextBlob (best effort: on failure
    the original query is used). Up to three per-movie scores are then
    computed and min-max normalised:

    * TF-IDF cosine similarity (when *use_text*),
    * sentence-embedding cosine similarity (when *use_text*),
    * CLIP text-to-poster cosine similarity (when *use_poster* and the CLIP
      model loaded successfully).

    Args:
        query: Free-text search string.
        use_text: Include the TF-IDF and sentence-embedding scores.
        use_poster: Include the CLIP poster score and use the poster-aware
            weighting.

    Returns:
        ``(indices, scores)``: row positions into ``df`` sorted from best to
        worst match, and the combined scores in that same order.
    """
    # Start from zero so a disabled signal contributes nothing
    movie_count = len(df)
    tfidf_scores = np.zeros(movie_count)
    emb_scores = np.zeros(movie_count)
    clip_scores = np.zeros(movie_count)

    # Spell-correct the query
    try:
        corrected_query = str(TextBlob(query).correct())
        print(f"Original: '{query}' → Corrected: '{corrected_query}'")
        query = corrected_query
    except Exception as e:
        print(f"Spell correction failed: {e}")

    if use_text:
        # TF-IDF
        tfidf_scores = _min_max_normalize(
            cosine_similarity(tfidf.transform([query]), tfidf_matrix).flatten()
        )
        # Sentence embeddings
        emb_scores = _min_max_normalize(
            cosine_similarity(embedding_model.encode([query]), embeddings).flatten()
        )

    if use_poster and clip_model is not None:
        # CLIP
        with torch.no_grad():
            # truncate=True: clip.tokenize raises RuntimeError on text longer
            # than its context length, which a long user query can exceed.
            text_input = clip.tokenize([query], truncate=True).to(device)
            text_emb = clip_model.encode_text(text_input).cpu().numpy()
            clip_scores = cosine_similarity(text_emb, poster_embeddings).flatten()
        clip_scores = _min_max_normalize(clip_scores)

    # Halve the text contribution only when both text signals matched something
    both_text_signals = use_text and tfidf_scores.sum() > 0 and emb_scores.sum() > 0
    text_weight = 0.5 if both_text_signals else 1.0
    if use_poster:
        combined_scores = (0.5 * text_weight * tfidf_scores + 0.5 * text_weight * emb_scores +
                           1 * clip_scores)
    else:
        combined_scores = 0.7 * tfidf_scores + 0.3 * emb_scores

    # Return the results sorted by descending score
    all_indices = np.argsort(combined_scores)[::-1]
    return all_indices, combined_scores[all_indices]


# Rendering
def display_results(query, page, use_text, use_poster):
    """Render one page of search results as HTML.

    Args:
        query: Search string; a blank query yields an empty result view.
        page: Requested 1-based page. Floats (as delivered by a number
            input) are truncated to int, unusable values fall back to 1,
            and the value is clamped to the valid page range.
        use_text: Forwarded to ``hybrid_search``.
        use_poster: Forwarded to ``hybrid_search``.

    Returns:
        A 4-tuple ``(results_html, pagination_info, pagination_update, page)``
        on every path: the card grid, the "Showing ..." summary, a
        ``gr.update`` toggling the pagination row, and the clamped page.
    """
    import html  # local import: escape external text before embedding it in HTML

    results_per_page = 6
    if not query or not query.strip():
        # Same four slots as the normal return so Gradio can map the outputs.
        return "", "", gr.update(visible=False), 1

    indices, scores = hybrid_search(query, use_text=use_text, use_poster=use_poster)
    total_results = len(indices)

    # Pagination arithmetic (page may arrive as float or None from the UI)
    try:
        page = int(page)
    except (TypeError, ValueError):
        page = 1
    total_pages = max(1, (total_results + results_per_page - 1) // results_per_page)
    page = max(1, min(page, total_pages))
    start_idx = (page - 1) * results_per_page
    end_idx = min(start_idx + results_per_page, total_results)

    # Build the cards for the requested page
    results_html = ['<div style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 15px; margin-bottom: 20px;">']

    for idx, score in zip(indices[start_idx:end_idx], scores[start_idx:end_idx]):
        movie = df.iloc[idx]
        # Titles, overviews and genres come from an external API: escape them.
        poster_src = html.escape(movie['poster_url'] or 'https://via.placeholder.com/150x225?text=No+Poster')
        title = html.escape(str(movie['title']))
        genres = ', '.join(html.escape(str(g)) for g in movie['genres'][:3])
        overview = html.escape(str(movie['overview'])[:100])
        # class="movie-card" hooks the cards up to the hover rule defined in
        # the interface CSS.
        results_html.append(f"""
        <div class="movie-card" style="border:1px solid #ddd; padding:10px; border-radius:8px;">
            <img src="{poster_src}"
                 width="100%" style="border-radius:5px; margin-bottom:8px;">
            <div>
                <h4 style="margin:0; font-size:0.9em">{title}</h4>
                <div style="color:#ff6b6b; font-size:0.8em">⭐ {movie['vote_average']} | Relevance: {score:.3f}</div>
                <p style="font-size:0.8em; margin:5px 0"><b>Genres:</b> {genres}</p>
                <p style="font-size:0.8em; margin:0">{overview}...</p>
            </div>
        </div>
        """)

    results_html.append('</div>')

    # Avoid "Showing 1-0 of 0" when there is nothing to show.
    first_shown = start_idx + 1 if total_results else 0
    pagination_info = f"Showing {first_shown}-{end_idx} of {total_results} results (Page {page}/{total_pages})"
    show_pagination = total_results > results_per_page

    return "".join(results_html), pagination_info, gr.update(visible=show_pagination), page


# Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), css="""
    .gradio-container {max-width: 1200px !important}
    .movie-card {
        transition: transform 0.2s;
    }
    .movie-card:hover {
        transform: scale(1.02);
        box-shadow: 0 4px 8px rgba(0,0,0,0.1);
    }
    .genre-btn {
        margin: 2px;
        min-width: 80px;
        padding: 3px 6px;
        font-size: 0.8em;
        height: 28px;
        flex: 1 0 30%;
        box-sizing: border-box;
    }
    .genre-row {
        display: flex;
        flex-wrap: wrap;
        gap: 5px;
    }
""") as app:
    gr.Markdown("# TMDB Movie Search")

    # Per-session state: current page, last query, (use_text, use_poster)
    current_page = gr.State(1)
    current_query = gr.State("")
    current_weights = gr.State((True, True))

    def _render_page(query, page, weights):
        """Return exactly the four values the result outputs expect.

        The outputs are (results_display, pagination_info, pagination_row,
        current_page). Blank queries and non-integer page values are handled
        here so every event handler below yields the right number of values.
        """
        if not query or not query.strip():
            return "", "", gr.update(visible=False), 1
        try:
            page = int(page)  # gr.Number may deliver a float or None
        except (TypeError, ValueError):
            page = 1
        return display_results(query, page, weights[0], weights[1])

    with gr.Row():
        with gr.Column(scale=4):
            search_input = gr.Textbox(label="Search Movie", placeholder="Type the name of a movie or try the genre options")
            search_btn = gr.Button("Search", variant="primary")

            results_display = gr.HTML()
            pagination_info = gr.Markdown()

            # Pagination controls (hidden until a search needs them)
            with gr.Row(visible=False) as pagination_row:
                prev_btn = gr.Button("← Previous")
                page_number = gr.Number(1, label="Current Page", interactive=True)
                next_btn = gr.Button("Next →")

        with gr.Column(scale=1):
            with gr.Accordion("Search Options", open=False):
                text_search = gr.Checkbox(value=True, label="Text Search")
                poster_search = gr.Checkbox(value=True, label="Poster Search")
            gr.Markdown("### Browse by Genre")
            with gr.Row(elem_classes="genre-row"):
                for genre in all_genres:
                    btn = gr.Button(genre, elem_classes="genre-btn")
                    # g=genre binds the current genre; a plain closure would
                    # see only the last value of the loop variable.
                    btn.click(
                        fn=lambda g=genre: g,
                        outputs=search_input
                    ).then(
                        # Two outputs, so two values: the query and the weights tuple.
                        fn=lambda g=genre: (g, (True, True)),
                        outputs=[current_query, current_weights]
                    ).then(
                        fn=lambda q, w: _render_page(q, 1, w),
                        inputs=[current_query, current_weights],
                        outputs=[results_display, pagination_info, pagination_row, current_page]
                    )

    def update_weights_info(tfidf_w, emb_w, clip_w):
        """Format the three weights, normalised to sum to 1, as a status line."""
        total = tfidf_w + emb_w + clip_w
        normalized = [w/total for w in [tfidf_w, emb_w, clip_w]] if total > 0 else [0.33, 0.33, 0.34]
        return f"Normalized weights: TF-IDF {normalized[0]:.2f}, Embeddings {normalized[1]:.2f}, CLIP {normalized[2]:.2f}"

    # Event handlers
    def perform_search(query, use_text, use_poster):
        """Run a new search from page 1.

        Returns six values: the query and weights to store in state, followed
        by the four result outputs.
        """
        weights = (use_text, use_poster)
        return (query, weights, *_render_page(query, 1, weights))

    def change_page(direction, query, weights, current_page):
        """Move *direction* pages from *current_page* and re-render.

        Returns the four result outputs; the page in the last slot is the
        clamped page actually shown, so the stored page never drifts out of
        range.
        """
        return _render_page(query, (current_page or 1) + direction, weights)

    search_input.submit(
        perform_search,
        [search_input, text_search, poster_search],
        [current_query, current_weights, results_display, pagination_info, pagination_row, current_page]
    )
    search_btn.click(
        perform_search,
        [search_input, text_search, poster_search],
        [current_query, current_weights, results_display, pagination_info, pagination_row, current_page]
    )

    def search_by_genre(genre):
        """Return *genre* unchanged (identity helper)."""
        return genre

    prev_btn.click(
        lambda q, w, p: change_page(-1, q, w, p),
        [current_query, current_weights, current_page],
        [results_display, pagination_info, pagination_row, current_page]
    )

    next_btn.click(
        lambda q, w, p: change_page(1, q, w, p),
        [current_query, current_weights, current_page],
        [results_display, pagination_info, pagination_row, current_page]
    )

    # Jump to the page typed into the number box
    page_number.change(
        lambda p, q, w: _render_page(q, p, w),
        [page_number, current_query, current_weights],
        [results_display, pagination_info, pagination_row, current_page]
    )



app.launch()

Collecting tmdbv3api
  Downloading tmdbv3api-1.9.0-py3-none-any.whl.metadata (8.0 kB)
Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading tmdbv3api-1.9.0-py3-none-any.whl (25 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy, tmdbv3api
Successfully installed ftfy-6.3.1 tmdbv3api-1.9.0
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-k9br_c4r
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-k9br_c4r
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

100%|███████████████████████████████████████| 338M/338M [00:04<00:00, 77.7MiB/s]


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://3abf0116f46480de71.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


