In [1]:
# a python library to tokenize and use english text
!python -m spacy download en_core_web_sm

# installing open AI so we can use its text generation
!pip install openai==0.27.7

# its an interface that allows users to ask questions and get responses easily
!pip install gradio

# installing sentence transformers to convert sentences into numbers and bm25 that was a ranking algorithm
!pip install -U sentence-transformers rank_bm25

# installing FAISS for fast retrieval of embeddings
!pip install faiss-cpu

!pip install datasets

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting openai==0.27.7
  Downloading openai-0.27.7-py3-none-any.whl.metadata (13 kB)
Downloading openai-0.27.7-py3-none-any.whl (71 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found

In [2]:
import numpy as np
import pandas as pd
import json
import time # measures how long it takes to process the data
import spacy # it deals with lannguage processing
from spacy.lang.en.stop_words import STOP_WORDS # import stop words like the/a/and etc
from string import punctuation # import eg .,!?:;
from collections import Counter # count the frequency of words etc
from heapq import nlargest #retrieves n largest numbers from a list
import nltk # used for text processing tasks
import re
import gzip
import os
import torch
from tqdm import tqdm # it adds a process bar to loop
from sentence_transformers import SentenceTransformer, CrossEncoder, util # to create sentence embeddings, converting sentences to numbers
# import tiktoken
from openai.embeddings_utils import get_embedding, cosine_similarity # cosine measures the similarity bw 2 vectors
from rank_bm25 import BM25Okapi
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

  from tqdm.autonotebook import tqdm, trange


In [3]:
from google.colab import files
uploaded = files.upload()
# this sets the encoding to UTF-8 which helps when dealing with diff languages, to process the text properly
import locale
def getpreferredencoding(do_setlocale=True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

Saving tmdb_5000_movies.csv to tmdb_5000_movies.csv
Saving tmdb_5000_credits.csv to tmdb_5000_credits.csv


In [4]:
# connectinng to drive to import the file
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Preparation

In [5]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

movies_full = movies.merge(credits, on='title')
movies2 = movies_full[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew', 'popularity', 'vote_average', 'production_companies', 'production_countries', 'revenue', 'runtime']]
movies2.dropna(inplace=True)
movies2.isnull().sum()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies2.dropna(inplace=True)


Unnamed: 0,0
movie_id,0
title,0
overview,0
genres,0
keywords,0
cast,0
crew,0
popularity,0
vote_average,0
production_companies,0


In [6]:
import ast

def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

def convert3(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            L.append(i['name'])
            counter+=1
        else:
            break
    return L

def fetch_director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

movies2.genres = movies2['genres'].apply(convert)
movies2.keywords = movies2.keywords.apply(convert)
movies2.cast = movies2.cast.apply(convert3)
movies2.overview = movies2.overview.apply(lambda x:x.split())
movies2.crew = movies2.crew.apply(fetch_director)
movies2.production_companies = movies2['production_companies'].apply(convert)
movies2.production_countries = movies2['production_countries'].apply(convert)

movies2.genres = movies2.genres.apply(lambda x:[i.replace(" ", "") for i in x])
movies2.keywords = movies2.keywords.apply(lambda x:[i.replace(" ", "") for i in x])
movies2.cast = movies2.cast.apply(lambda x:[i.replace(" ", "") for i in x])
movies2.crew = movies2.crew.apply(lambda x:[i.replace(" ", "") for i in x])
movies2.production_companies = movies2.production_companies.apply(lambda x:[i.replace(" ", "") for i in x])
movies2.production_countries = movies2.production_countries.apply(lambda x:[i.replace(" ", "") for i in x])


# Create a proxy "popularity-based sentiment score" using revenue and popularity.
# Normalize the Revenue and Popularity
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
movies2[['revenue_normalized', 'popularity_normalized']] = scaler.fit_transform(movies2[['revenue', 'popularity']])

# Compute the Sentiment Score
def compute_sentiment_score(row):
    return (row['revenue_normalized'] * 0.5) + (row['popularity_normalized'] * 0.5)

movies2['sentiment_score'] = movies2.apply(compute_sentiment_score, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies2.genres = movies2['genres'].apply(convert)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies2.keywords = movies2.keywords.apply(convert)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies2.cast = movies2.cast.apply(convert3)
A value is trying to be set on a copy of a slice from a Data

In [7]:
movies2.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,popularity,vote_average,production_companies,production_countries,revenue,runtime,revenue_normalized,popularity_normalized,sentiment_score
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],150.437577,7.2,"[IngeniousFilmPartners, TwentiethCenturyFoxFil...","[UnitedStatesofAmerica, UnitedKingdom]",2787965087,162.0,1.0,0.171815,0.585907
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],139.082615,6.9,"[WaltDisneyPictures, JerryBruckheimerFilms, Se...",[UnitedStatesofAmerica],961000000,169.0,0.344696,0.158846,0.251771


In [8]:
top_10_movies = movies2[['title','vote_average', 'revenue', 'popularity', 'sentiment_score']].sort_values(by='sentiment_score', ascending=False).head(10)
top_10_movies

Unnamed: 0,title,vote_average,revenue,popularity,sentiment_score
546,Minions,6.4,1156730962,875.581305,0.707451
0,Avatar,7.2,2787965087,150.437577,0.585907
95,Interstellar,8.1,675120017,724.247784,0.534659
28,Jurassic World,6.5,1513528810,418.708552,0.510543
788,Deadpool,7.4,783112979,514.569956,0.43429
94,Guardians of the Galaxy,7.9,773328629,481.098624,0.413421
25,Titanic,7.5,1845034188,100.025899,0.388012
16,The Avengers,7.4,1519557910,144.448633,0.355008
7,Avengers: Age of Ultron,7.3,1405403694,134.279229,0.328728
44,Furious 7,7.3,1506249360,102.322217,0.328565


In [9]:
# Sort by the 'vote_average' column in descending order
top_movies = movies2[['title', 'vote_average']].sort_values(by='vote_average', ascending=False).head(5)

# Display the top 5 movies with the highest ratings
print(top_movies)

                      title  vote_average
4252  Me You and Five Bucks          10.0
4668         Little Big Top          10.0
4050  Dancer, Texas Pop. 81          10.0
3522       Stiff Upper Lips          10.0
3997              Sardaarji           9.5


# Creating a Search Engine

In [10]:
movies2.to_pickle('/content/drive/MyDrive/entire_data.pkl')
df = pd.read_pickle('/content/drive/MyDrive/entire_data.pkl') #to load entire_data.pkl back to the dataframe df

In [11]:
df.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,popularity,vote_average,production_companies,production_countries,revenue,runtime,revenue_normalized,popularity_normalized,sentiment_score
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],150.437577,7.2,"[IngeniousFilmPartners, TwentiethCenturyFoxFil...","[UnitedStatesofAmerica, UnitedKingdom]",2787965087,162.0,1.0,0.171815,0.585907


In [12]:
df['overview'] = df['overview'].apply(lambda x: re.sub('[^a-zA-Z0-9\s]','',str(x)))

df["combined"] = (
    "Movie name is " + df.title.str.strip() +
    "; Overview: " + df['overview'].apply(lambda x: ''.join(x)) +
    "; Genres: " + df['genres'].apply(lambda x: ', '.join(x)) +
    "; Keywords: " + df['keywords'].apply(lambda x: ', '.join(x)) +
    "; Cast: " + df['cast'].apply(lambda x: ', '.join(x)) +
    "; Director is " + df['crew'].apply(lambda x: ', '.join(x)) +
    "; Production Companies: " + df['production_companies'].apply(lambda x: ', '.join(x)) +
    "; Production Countries: " + df['production_countries'].apply(lambda x: ', '.join(x)) +
    "; The runtime of movie is " + df['runtime'].astype(str) + " minutes" +
    "; Rating is " + df['vote_average'].map("{:.1f}".format) +
    "; The movie popularity out of 1 is  " + df["popularity_normalized"].map("{:.3f}".format) +
    "; Sentiment Score: " + movies2["sentiment_score"].astype(str)
)




def lower_case(input_str):
    input_str = input_str.lower()
    return input_str

# Apply the cleaning function to the combined column
df['combined'] = df['combined'].apply(lambda x: lower_case(x))

In [13]:
# Data preview
df['combined'][0]

'movie name is avatar; overview: in the 22nd century a paraplegic marine is dispatched to the moon pandora on a unique mission but becomes torn between following orders and protecting an alien civilization; genres: action, adventure, fantasy, sciencefiction; keywords: cultureclash, future, spacewar, spacecolony, society, spacetravel, futuristic, romance, space, alien, tribe, alienplanet, cgi, marine, soldier, battle, loveaffair, antiwar, powerrelations, mindandsoul, 3d; cast: samworthington, zoesaldana, sigourneyweaver; director is jamescameron; production companies: ingeniousfilmpartners, twentiethcenturyfoxfilmcorporation, duneentertainment, lightstormentertainment; production countries: unitedstatesofamerica, unitedkingdom; the runtime of movie is 162.0 minutes; rating is 7.2; the movie popularity out of 1 is  0.172; sentiment score: 0.5859072573505895'

#Model Fine-tuning

In [14]:
# # this is a pre trained model in the Sentence transformers which create sentence embeddings: sentence to numbers
# embedder = SentenceTransformer('all-mpnet-base-v2')
# # GPU processes the model faster, so its moving yours to the GPU
# import torch
# if not torch.cuda.is_available():
#     print("Warning: No GPU found. Please add GPU to your notebook")
# else:
#   print("GPU Found!")
#   embedder =  embedder.to('cuda')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

GPU Found!


In [15]:
# startTime = time.time()

# # 'combined' column is turned to embeddings by the model.
# df["embedding"] = df['combined'].apply(lambda x: embedder.encode(x, convert_to_tensor=True))

# executionTime = (time.time() - startTime)
# print('Execution time in seconds: ' + str(executionTime))

KeyboardInterrupt: 

In [16]:
# import random
# from datasets import Dataset
# from sentence_transformers import SentenceTransformer, InputExample, losses
# from torch.utils.data import DataLoader

# df['genres_str'] = df['genres'].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))

# # Create similar and dissimilar movie pairs for fine-tuning
# def create_pairs(df, n_pairs=1000):
#     similar_pairs = []
#     dissimilar_pairs = []

#     # Group movies by genre
#     genre_groups = df.groupby('genres_str')

#     # Create similar pairs within the same genre
#     for genre, group in genre_groups:
#         group = group.sample(min(len(group), 10))  # Sample movies per genre
#         for i, movie1 in group.iterrows():
#             for j, movie2 in group.iterrows():
#                 if i != j:
#                     similar_pairs.append(
#                         InputExample(texts=[movie1['combined'], movie2['combined']], label=1.0)
#                     )

#     # Create dissimilar pairs between different genres
#     for _ in range(n_pairs):
#         genre1, genre2 = random.sample(list(genre_groups.groups.keys()), 2)
#         movie1 = genre_groups.get_group(genre1).sample(1).iloc[0]
#         movie2 = genre_groups.get_group(genre2).sample(1).iloc[0]
#         dissimilar_pairs.append(
#             InputExample(texts=[movie1['combined'], movie2['combined']], label=0.0)
#         )

#     # Combine and shuffle the pairs
#     all_pairs = similar_pairs + dissimilar_pairs
#     random.shuffle(all_pairs)
#     return all_pairs

# # Step 2: Generate pairs for fine-tuning
# train_examples = create_pairs(df, n_pairs=1000)

In [None]:
# import os
# os.environ["WANDB_DISABLED"] = "true"

In [None]:
# from transformers import TrainingArguments, Trainer

# # Prepare DataLoader and define the loss function for fine-tuning
# train_dataset = [InputExample(texts=[example.texts[0], example.texts[1]], label=example.label) for example in train_examples]
# train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=4)  # Reduced batch size to avoid OOM

# # Define the loss function
# train_loss = losses.CosineSimilarityLoss(model=embedder)

# # Clear GPU cache
# torch.cuda.empty_cache()

# # Training arguments: set gradient accumulation to simulate larger batch size
# training_args = TrainingArguments(
#     output_dir="./results",
#     num_train_epochs=1,  # Adjust the number of epochs as needed
#     per_device_train_batch_size=4,  # Adjust batch size based on available memory
#     gradient_accumulation_steps=4,  # Accumulate gradients to simulate larger batch size
#     evaluation_strategy="epoch",
#     logging_dir="./logs",
# )

# # Fine-tune the model with the reduced memory usage settings
# embedder.fit(
#     train_objectives=[(train_dataloader, train_loss)],
#     epochs=3,  # Reduce epochs if running out of memory
#     warmup_steps=100,
#     show_progress_bar=True
# )

In [None]:
# # Save the fine-tuned model
# output_model_path = "/content/drive/MyDrive/Movie_Recommend/fine_tuned_model"
# embedder.save(output_model_path)

In [17]:
# Loading my fine-tuned model
output_model_path = "/content/drive/MyDrive/Movie_Recommend/fine_tuned_model"
embedder = SentenceTransformer(output_model_path)
print("Fine-tuned model loaded successfully.")

Fine-tuned model loaded successfully.


In [18]:
startTime = time.time()

# 'combined' column is turned to embeddings by the model.
df["embedding"] = df['combined'].apply(lambda x: embedder.encode(x, convert_to_tensor=True))

executionTime = (time.time() - startTime)
print('Execution time in seconds: ' + str(executionTime))

Execution time in seconds: 83.33886671066284


In [19]:
# Function to extract movie titles, directors, actors, and production companies from the query
def extract_movie_entities(query, movie_titles, movie_crew, movie_cast, movie_production):
    # Normalize query and movie entities for comparison
    normalized_query = query.lower()
    mentioned_entities = set()  # Use a set to avoid duplicate entries

    # Match movie titles
    for title in movie_titles:
        if len(title) > 2:  # Ignore titles that are very short (e.g., '42', '9')
            pattern = r'\b' + re.escape(title.lower()) + r'\b'
            if re.search(pattern, normalized_query):
                mentioned_entities.add(title)

    # Match directors (movie_crew)
    for crew_list in movie_crew:  # Iterate over list of crew lists
        if isinstance(crew_list, list):
            for crew_member in crew_list:
                if crew_member and crew_member.lower() in normalized_query:
                    mentioned_entities.add(crew_member)

    # Match actors (movie_cast)
    for cast_list in movie_cast:  # Iterate over list of cast lists
        if isinstance(cast_list, list):
            for cast_member in cast_list:
                if cast_member and cast_member.lower() in normalized_query:
                    mentioned_entities.add(cast_member)

    # Match production companies (movie_production)
    for production_list in movie_production:  # Iterate over list of production company lists
        if isinstance(production_list, list):
            for production_company in production_list:
                if production_company and production_company.lower() in normalized_query:
                    mentioned_entities.add(production_company)

    return list(mentioned_entities)  # Convert the set back to a list

In [20]:
import faiss

# Normalizing function for scores
def normalize_scores(scores):
    scaler = MinMaxScaler()
    scores = np.array(scores).reshape(-1, 1)  # Ensure it's a NumPy array and Reshape for MinMaxScaler
    normalized_scores = scaler.fit_transform(scores)
    return normalized_scores.flatten()

# Create a FAISS index for fast similarity search
embedding_matrix = np.vstack([embedding.cpu().numpy() for embedding in df['embedding'].values])
index = faiss.IndexFlatL2(embedding_matrix.shape[1])  # L2 distance (Cosine similarity)
faiss.normalize_L2(embedding_matrix)  # Normalize embeddings for cosine similarity
index.add(embedding_matrix)

df['genres_str'] = df['genres'].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))

# Search function
def search(query, n=5):

    # Generate query embedding
    query_embedding = embedder.encode(query).reshape(1, -1)  # Ensure query embedding is 2D

    # Extract mentioned movies, directors, actors, and production companies from the query
    mentioned_entities = extract_movie_entities(query, df['title'], df['crew'], df['cast'], df['production_companies'])
    print("Entities mentioned in query and excluded: ", mentioned_entities)

    # BM25 search on tokenized corpus
    tokenized_corpus = [doc.split(" ") for doc in df['combined']]
    bm25 = BM25Okapi(tokenized_corpus)
    query_tokens = query.split(" ")
    bm25_scores = bm25.get_scores(query_tokens)

    # FAISS similarity search
    faiss.normalize_L2(query_embedding)  # Normalize query embedding for cosine similarity
    D, I = index.search(query_embedding, n * 3)  # Get more results initially to filter out mentioned movies

    # Add BM25 and FAISS scores to the DataFrame
    df['bm25_score'] = bm25_scores
    df['cosine_similarity'] = np.zeros(len(df))
    for idx, score in zip(I[0], D[0]):
        df.at[idx, 'cosine_similarity'] = 1 - score  # FAISS returns L2 distance, convert it to similarity

    # Normalize BM25, cosine similarity scores, and vote average
    df['bm25_score_normalized'] = normalize_scores(df['bm25_score'])
    df['cosine_similarity_normalized'] = normalize_scores(df['cosine_similarity'])
    df['rating_normalized'] = normalize_scores(df['vote_average'])

    # Combine scores with more weight to metadata (director, actor, production) if found
    df['combined_score'] = (
        (df['cosine_similarity_normalized'] * 0.35) +
        (df['bm25_score_normalized'] * 0.35) +
        (df['sentiment_score'] * 0.15) +
        (df['rating_normalized'] * 0.15)
    )

    # Sort by combined score and get the top 'n' results
    results = df.sort_values(by='combined_score', ascending=False).head(n)
    print(f"Initial results: {len(results)}")

    # Filter out movies that were mentioned in the query, as well as directors, actors, or production companies
    results = results[~results['title'].isin(mentioned_entities)].head(n)
    print(f"Results after filtering mentioned entities: {len(results)}")

    # Ensure we still get 'n' results even after filtering
    results = results.head(n)

    resultlist = []

    # Display results in a concise format
    hlist = []
    for r in results.index:
        if results.title[r] not in hlist:
            # Get the row corresponding to the specific movie
            smalldf = results.loc[results.title == results.title[r]]

            # Combine directors and cast in one line
            combined_cast_crew = ', '.join(smalldf.cast.iloc[0]) + '; Directed by: ' + ', '.join(smalldf.crew.iloc[0])

            resultlist.append(
                {
                    "Movie Name": smalldf.title.iloc[0],
                    "Score": smalldf.combined_score.iloc[0],
                    "Rating": smalldf.vote_average.iloc[0],
                    "Genre": ', '.join(smalldf.genres.iloc[0]),  # Include the genres
                    "Cast & Directors": combined_cast_crew,  # Combining cast and director
                    "Production Companies": ', '.join(smalldf.production_companies.iloc[0]),
                    "Movie overview": smalldf.overview.iloc[0],
                    "Sentiment Score": smalldf.sentiment_score.iloc[0]
                }
            )
            hlist.append(smalldf.title.iloc[0])

    return resultlist

In [21]:
query = "I'm in the mood for a light-hearted romantic comedy, something similar to '10 Things I Hate About You' but with a bit more drama."
search(query)

Entities mentioned in query and excluded:  ['ARTE', 'Arte', 'RTE', '10 Things I Hate About You']
Initial results: 5
Results after filtering mentioned entities: 5


[{'Movie Name': 'Niagara',
  'Score': 0.6572385173220704,
  'Rating': 6.7,
  'Genre': 'Crime, Drama, Thriller',
  'Cast & Directors': 'MarilynMonroe, JosephCotten, JeanPeters; Directed by: HenryHathaway',
  'Production Companies': 'TwentiethCenturyFoxFilmCorporation',
  'Movie overview': 'Rose Loomis and her older gloomier husband George are vacationing at a cabin in Niagara Falls NY The couple befriend Polly and Ray Cutler who are honeymooning in the area Polly begins to suspect that something is amiss between Rose and George and her suspicions grow when she sees Rose in the arms of another man While Ray initially thinks Polly is overreacting things between George and Rose soon take a shockingly dark turn',
  'Sentiment Score': 0.00718007599939596},
 {'Movie Name': 'Silver Linings Playbook',
  'Score': 0.6491633206531392,
  'Rating': 6.9,
  'Genre': 'Drama, Comedy, Romance',
  'Cast & Directors': 'BradleyCooper, JenniferLawrence, RobertDeNiro; Directed by: DavidO.Russell',
  'Producti

In [22]:
query = "Can you recommend a really good science fiction movie, preferably with mind-bending twists like 'Inception' or 'Interstellar'?"
search(query)

Entities mentioned in query and excluded:  ['Inception', 'Good', 'Interstellar']
Initial results: 5
Results after filtering mentioned entities: 5


[{'Movie Name': 'Mars Attacks!',
  'Score': 0.6421239481264758,
  'Rating': 6.1,
  'Genre': 'Comedy, Fantasy, ScienceFiction',
  'Cast & Directors': 'JackNicholson, GlennClose, AnnetteBening; Directed by: TimBurton',
  'Production Companies': 'TimBurtonProductions',
  'Movie overview': 'We come in peace is not what those green men from Mars mean when they invade our planet armed with irresistible weapons and a cruel sense of humor This star studded cast must play victim to the aliens fun and games in this comedy homage to science fiction films of the 50s and 60s',
  'Sentiment Score': 0.04335797330995941},
 {'Movie Name': 'Truth or Dare',
  'Score': 0.589938564877054,
  'Rating': 5.7,
  'Genre': 'Horror, Thriller, Mystery',
  'Cast & Directors': 'LiamBoyle, JackGordon, FlorenceHall; Directed by: RobertHeath',
  'Production Companies': 'CoronaPictures',
  'Movie overview': 'A group of college friends celebrate the end of term with a party to end all parties During a drink and drugfuelle

In [23]:
query = "Recommend a superhero movie with action."
search(query)

Entities mentioned in query and excluded:  ['Superhero Movie']
Initial results: 5
Results after filtering mentioned entities: 4


[{'Movie Name': 'Special',
  'Score': 0.7991081489378656,
  'Rating': 6.6,
  'Genre': 'Drama, Fantasy, ScienceFiction',
  'Cast & Directors': 'MichaelRapaport, JoshPeck, RobertBaker; Directed by: HalHaberman',
  'Production Companies': 'RivalPictures',
  'Movie overview': 'A lonely metermaid has a psychotic reaction to his medication and becomes convinced hes a superhero A very select group of people in life are truly gifted Special is a movie about everyone else',
  'Sentiment Score': 0.0007209929191046786},
 {'Movie Name': 'Birdman',
  'Score': 0.7935294394491882,
  'Rating': 7.4,
  'Genre': 'Drama, Comedy',
  'Cast & Directors': 'MichaelKeaton, EmmaStone, ZachGalifianakis; Directed by: AlejandroGonzálezIñárritu',
  'Production Companies': 'WorldviewEntertainment, NewRegencyPictures, TSGEntertainment, LeGrisbiProductions, MProductions',
  'Movie overview': 'A fading actor best known for his portrayal of a popular superhero attempts to mount a comeback by appearing in a Broadway play 

In [24]:
query = "I want to watch a science fiction movie with a mind-bending plot, like 'Inception' or 'The Matrix'."
search(query)

Entities mentioned in query and excluded:  ['Inception', 'The Matrix']
Initial results: 5
Results after filtering mentioned entities: 5


[{'Movie Name': 'Truth or Dare',
  'Score': 0.6381086387969728,
  'Rating': 5.7,
  'Genre': 'Horror, Thriller, Mystery',
  'Cast & Directors': 'LiamBoyle, JackGordon, FlorenceHall; Directed by: RobertHeath',
  'Production Companies': 'CoronaPictures',
  'Movie overview': 'A group of college friends celebrate the end of term with a party to end all parties During a drink and drugfuelled evening an innocent game of Truth or Dare has a very sore loser sparking a terrifying sequence of events and a whole new twist on the game of truth or dare  where the truth can kill you',
  'Sentiment Score': 0.003814582359087715},
 {'Movie Name': 'Mars Attacks!',
  'Score': 0.6303689647680557,
  'Rating': 6.1,
  'Genre': 'Comedy, Fantasy, ScienceFiction',
  'Cast & Directors': 'JackNicholson, GlennClose, AnnetteBening; Directed by: TimBurton',
  'Production Companies': 'TimBurtonProductions',
  'Movie overview': 'We come in peace is not what those green men from Mars mean when they invade our planet arm

#Building the API

In [25]:
from google.colab import userdata
import openai
openai.api_key = 'sk-jQ7alNFPPrjHlZEMzVHeZCcxTSel0G8aJgjq0YgxzUT3BlbkFJSFeKNtjNxxniKBr_74AfsiVqZ4RiOujhYOsC41-U0A'

In [26]:
def generate_answer(query):

    # Call the search function to get the top movies for the query
    top_movies = search(query)

    # Join the movie names into a bullet list format for the context
    movie_list = "\n".join([f"- **{movie}**" for movie in top_movies])

    prompt = f"""
    You are a warm, creative, and thoughtful movie expert who loves helping people find the perfect films.
    Based on the user's query, recommend three standout movies. For each one, highlight why it's a great fit,
    making connections between the movie's themes, characters, and the user's preferences.

    Your response should feel like an engaging conversation with a friend at a cozy film club. Use vivid language to describe
    the essence of each movie and what makes it special, bringing out the best features in a delightful and personable way.

    Be insightful, charming, and bring an element of surprise or excitement to the recommendations.

    ###########
    Query:
    "{query}"

    ########

    Context:
    "{movie_list}"

    Please provide thoughtful explanations for why these movies resonate with the user's interests. Be concise yet captivating.
    #####

    Return the response in Markdown format with each movie name **highlighted**.
    """

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]

    # Call the OpenAI API to generate the response
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        max_tokens=1500,
        n=1,
        stop=None,
        temperature=0.7, # Adjust temperature for creativity and warmth
        messages=messages
    )

    # Extract the generated response from the API response
    generated_text = response.choices[0].message['content'].strip()

    return generated_text

In [27]:
import markdown
from IPython.display import display, HTML

def render_markdown(md_text):
    # Convert Markdown to HTML
    html = markdown.markdown(md_text)
    # Display the HTML
    display(HTML(html))

txt=generate_answer("I'm in the mood for a light-hearted romantic comedy, something similar to '10 Things I Hate About You' but with a bit more drama.")
render_markdown(txt)

Entities mentioned in query and excluded:  ['ARTE', 'Arte', 'RTE', '10 Things I Hate About You']
Initial results: 5
Results after filtering mentioned entities: 5


In [None]:
txt=generate_answer("I want to watch a science fiction movie with a mind-bending plot, like 'Inception' or 'The Matrix'.")
render_markdown(txt)

Entities mentioned in query and excluded:  ['The Matrix', 'Inception']
Initial results: 5
Results after filtering mentioned entities: 5


#Gradio

In [None]:
import gradio as gr

def greet(query):
   bm25 = generate_answer(query)
   return bm25

iface = gr.Interface(
    fn=greet,
    inputs=gr.Textbox(lines=2, placeholder="Enter your query here..."),
    outputs="text",
    title="Movie Maven",
    description="Find the perfect Movies based on your needs.",
)

iface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ee7b18076923e8c76e.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


