In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langchain.text_splitter import RecursiveCharacterTextSplitter
import streamlit as st
import numpy as np


In [None]:
# URL to CSV file
url = 'https://raw.githubusercontent.com/datum-oracle/netflix-movie-titles/main/titles.csv'

# Read the CSV file directly from the URL
df = pd.read_csv(url)

# Clean the DataFrame.
# Missing values are filled with one frame-level fillna whose result is
# assigned back. Calling `df[col].fillna(..., inplace=True)` operates on a
# column selection: it is chained assignment, which warns on pandas 2.x and
# does not update `df` once Copy-on-Write is active.
# The mean/median fill values are computed from the columns as loaded,
# before any filling happens.
df = df.fillna({
    'description': 'No description available',
    'age_certification': 'Not Rated',
    'seasons': 0,
    'imdb_id': 'Unknown',
    'imdb_score': df['imdb_score'].mean(),
    'imdb_votes': df['imdb_votes'].median(),
    'tmdb_popularity': df['tmdb_popularity'].mean(),
    'tmdb_score': df['tmdb_score'].mean(),
})

# Normalise dtypes in one pass. `release_year` and `runtime` are not filled
# above, so the int casts raise if those columns contain missing values.
df = df.astype({
    'seasons': int,
    'release_year': int,
    'runtime': int,
    'imdb_score': float,
    'imdb_votes': int,
    'tmdb_popularity': float,
    'tmdb_score': float,
})

# Create a combined features column: the listed columns rendered as strings
# and joined with single spaces, in this order.
df['combined_features'] = (
    df['title'].astype(str) + " " +
    df['type'].astype(str) + " " +
    df['description'].astype(str) + " " +
    df['genres'].astype(str) + " " +
    df['production_countries'].astype(str) + " " +
    df['age_certification'].astype(str) + " " +
    df['release_year'].astype(str)
)


In [None]:
# Helper that breaks one document string into chunks
def chunk_data(doc, chunk_size=800, chunk_overlap=50):
    """Split ``doc`` with a ``RecursiveCharacterTextSplitter``.

    Args:
        doc: Text to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever the splitter's ``split_text`` returns for ``doc``.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_text(doc)

# Chunk every row's combined feature text using the default sizes
df['chunks'] = df['combined_features'].map(chunk_data)


In [None]:
# Load the Hugging Face model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


def _embed_chunk_lists(chunk_lists):
    """Embed every chunk in one ``model.encode`` call and regroup per row.

    Args:
        chunk_lists: A list with one entry per DataFrame row, each entry
            being that row's list of text chunks.

    Returns:
        A list aligned with ``chunk_lists``; each entry is a list holding
        one embedding vector per chunk of that row, in chunk order.
    """
    # Flatten so the model receives all chunks at once and can batch them,
    # rather than being invoked once per chunk from a Python loop.
    flat_chunks = [chunk for chunks in chunk_lists for chunk in chunks]
    flat_embeddings = model.encode(flat_chunks)

    # Walk the flat result with a cursor to hand each row its own slice back.
    grouped = []
    cursor = 0
    for chunks in chunk_lists:
        stop = cursor + len(chunks)
        grouped.append(list(flat_embeddings[cursor:stop]))
        cursor = stop
    return grouped


# Generate embeddings for each chunk.
# Built as an object-dtype Series so each cell keeps a plain list of vectors.
df['chunk_embeddings'] = pd.Series(
    _embed_chunk_lists(df['chunks'].tolist()),
    index=df.index,
    dtype=object,
)


In [None]:
# Function to search movies based on user query
def search_movies(query, top_k=5):
    """Return the rows of ``df`` that best match ``query``.

    Each row is scored by its best-matching chunk (highest cosine similarity
    between the query embedding and any of the row's chunk embeddings). Rows
    are returned in descending score order, each at most once.

    Args:
        query: Free-text search string; encoded with the module-level ``model``.
        top_k: Maximum number of distinct rows to return. Values <= 0 yield an
            empty result.

    Returns:
        A DataFrame with up to ``top_k`` rows of ``df``, best match first.
        Empty (same columns) when there are no chunk embeddings to compare.
    """
    # Generate the embedding for the user's query
    query_embedding = model.encode(query).reshape(1, -1)

    # Flatten all chunk embeddings, remembering which row label owns each one.
    owner_labels = []
    chunk_vectors = []
    for idx, chunk_embeds in df['chunk_embeddings'].items():
        for chunk_embed in chunk_embeds:
            owner_labels.append(idx)
            chunk_vectors.append(np.asarray(chunk_embed).ravel())

    if top_k <= 0 or not chunk_vectors:
        return df.iloc[0:0]

    # One vectorised similarity computation instead of one call per chunk.
    scores = cosine_similarity(query_embedding, np.vstack(chunk_vectors)).ravel()

    # Stable descending order: ties keep their original chunk order.
    ranking = np.argsort(-scores, kind='stable')

    # Keep the first occurrence of each row so a title with several strong
    # chunks is listed once and does not crowd out other titles.
    top_labels = []
    seen = set()
    for position in ranking:
        label = owner_labels[position]
        if label in seen:
            continue
        seen.add(label)
        top_labels.append(label)
        if len(top_labels) == top_k:
            break

    # The collected values are index labels (from Series.items()), so select
    # with .loc rather than positional .iloc.
    return df.loc[top_labels]


In [None]:
# Streamlit front end: page title, a free-text query box, and result cards
st.title('Movie Recommendation Engine')

# Query typed by the user; the empty-string default means no search runs yet
user_query = st.text_input('Enter movie description, genre, etc.', '')

if user_query:
    # Look up the best-matching titles for the query
    results = search_movies(user_query)

    # Render each match as labelled lines followed by a separator
    for _, row in results.iterrows():
        for label, column in (
            ('Title', 'title'),
            ('Description', 'description'),
            ('Genres', 'genres'),
            ('IMDB Score', 'imdb_score'),
            ('Release Year', 'release_year'),
        ):
            st.write(f"**{label}:** {row[column]}")
        st.write("---")

