## Getting Movie Titles

In [2]:
from bs4 import BeautifulSoup

import requests, re

imdb_url = "https://www.imdb.com/chart/top/?ref_=nv_mv_250"

hdr = {'User-Agent': 'Mozilla/5.0'}

resp = requests.get(imdb_url, headers=hdr ).content

pattern = r'\d+\.'

soup = BeautifulSoup(resp, 'html.parser')

movies = soup.find_all('h3', class_='ipc-title__text')[1:-1]

top_rated = [re.sub(pattern, '', movie.text) for movie in movies]

titles = [title.strip() for title in top_rated]


## Getting IDs

In [3]:
def get_cast(cast_list):
    cast = []

    for member in cast_list:
        cast.append(
            {
                "name": member['name'].strip(),
                "role": member['character'].strip(),
                "department": member['known_for_department'].strip()
            }
        )

    return cast    

In [4]:
def get_providers(watch_providers):
    
    providers = []

    keys = watch_providers.keys()

    for key in keys:

        temp = watch_providers[key]

        if 'flatrate' in temp.keys():
            for provider in temp['flatrate']:
                providers.append(provider['provider_name'])

        elif 'buy' in temp.keys():
            for provider in temp['buy']:
                providers.append(provider['provider_name'])

        elif 'rent' in temp.keys():
            for provider in temp['rent']:
                providers.append(provider['provider_name'])

        else:
            pass                
    

    providers = [provider.strip() for provider in providers]

    providers = list(set(providers))

    substrings = ["prime", "max", "netflix", "hulu", "peacock", "youtube", "disney"]

    def contains_substring(element):
        return any(re.search(sub, element, re.IGNORECASE) for sub in substrings)


    providers = [element for element in providers if contains_substring(element)]

    providers = [provider for provider in providers if len(provider.split()) <= 3 ] 

    return providers   

In [5]:
def get_details(title, headers):

    base_url = "https://api.themoviedb.org/3/movie/"

    details = {}

    title_url = f"https://api.themoviedb.org/3/search/movie?query={title}&include_adult=true&language=en-US&page=1"

    resp = requests.get(title_url, headers=headers)

    if resp.status_code == 200:

        data = resp.json()

        if data['results']:
            movie_id = data['results'][0]['id']

            details['id'] = movie_id

            details_url = f"{base_url}{movie_id}?language=en-US"

            resp = requests.get(details_url, headers=headers)

            if resp.status_code == 200:
                data = resp.json()

                details['title'] = data['title']
                
                details['release_date'] = data['release_date']
                
                details['runtime'] = data['runtime']
                                
                details['rating'] = data['vote_average']
                
                details['overview'] = data['overview']
                
                details['tagline'] = data['tagline']

                details['genres'] = [genre['name'] for genre in data['genres']]

                details['imdb_id'] = data['imdb_id']

            cast_url = f"{base_url}{movie_id}/credits?language=en-US"

            resp = requests.get(cast_url, headers=headers)

            if resp.status_code == 200:
                data = resp.json()

                cast = data['cast']

                details['cast'] = get_cast(cast)

            keywords_url = f"{base_url}{movie_id}/keywords"

            resp = requests.get(keywords_url, headers=headers)

            if resp.status_code == 200:

                data = resp.json()

                details['keywords'] = [keyword['name'] for keyword in data['keywords']]    

            providers_url = f"{base_url}{movie_id}/watch/providers"

            resp = requests.get(providers_url, headers=headers)

            if resp.status_code == 200:

                data = resp.json()

                details['providers'] = get_providers(data['results'])  

            images_url = f"{base_url}{movie_id}/images"

            base_image_url = "https://image.tmdb.org/t/p/w500"

            resp = requests.get(images_url, headers=headers)

            if resp.status_code == 200:

                data = resp.json()

                keys = ['backdrops', 'logos', 'posters']

                details['images'] = []

                for key in keys:

                    if data[key]:

                        urls = [ f"{base_image_url}{image['file_path']}" for image in data[key]]

                        details['images'].extend(urls)
    return details    

In [6]:
from pymongo import MongoClient

from langchain_openai import OpenAIEmbeddings

from langchain_community.vectorstores import MongoDBAtlasVectorSearch

from langchain.text_splitter import RecursiveCharacterTextSplitter

import sys, os, certifi

from dotenv import load_dotenv

from tqdm import tqdm

load_dotenv(dotenv_path="../.env.local")

os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY")


In [7]:
from langchain_community.document_loaders import WebBaseLoader

def get_docs(details):

    imdb_url = f"https://www.imdb.com/title/{details['imdb_id']}"

    tmdb_url = f"https://www.themoviedb.org/movie/{details['id']}"

    lbxd_name = details['title'].replace(' ', '-').lower()

    lbxd_url = f"https://letterboxd.com/film/{lbxd_name}/"

    loader = WebBaseLoader(
        [
            imdb_url,
            tmdb_url,
            lbxd_url
        ]
    )

    docs = loader.load()

    return docs



In [8]:
headers = {
    "accept": "application/json",
    "Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiIwYTBmYmYzYzRmZDdhZWVlMjZiNTc4MGUyOGU4YTdmZiIsInN1YiI6IjY2NmRkMWM5MjA2NGRmMzI3MGRmOTBiMCIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.qWFPmYvwpJ4NgJkvlq-3P--69tDxaomyUvSkz8aXdZs"
}

In [8]:
url = "https://api.themoviedb.org/3/movie/top_rated?language=en-US&page=1"

response = requests.get(url, headers=headers)

data = response.json()

In [27]:
cnt = 500

titles = []

for idx in tqdm(range(1, cnt+1)):
    url = f"https://api.themoviedb.org/3/movie/top_rated?language=en-US&page={idx}"

    response = requests.get(url, headers=headers)

    data = response.json()

    titles.extend(
    [movie['title'] for movie in data['results'] if movie['original_language'] == 'en']
    )


  0%|          | 0/500 [00:00<?, ?it/s]

100%|██████████| 500/500 [13:40<00:00,  1.64s/it]


In [28]:
len(titles)

7513

In [30]:
with open('movies.txt', 'w', encoding="utf-8") as f:
    for title in titles:
        f.write(f"{title}\n")

In [9]:
ca = certifi.where()

MONGO_URI = os.environ.get("MONGO_URI")

client = MongoClient(MONGO_URI, tlsCAFile=ca)
# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("\nPinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)


Pinged your deployment. You successfully connected to MongoDB!


In [11]:
os.environ.get("DETAILS_DB")

In [14]:
movie_db = client[os.getenv("CINELENS_DB")]

movie_coll = movie_db['the-shawshank-redemption-details']

In [13]:
movie_coll.find_one()['images']

TypeError: 'NoneType' object is not subscriptable

In [10]:
movie_db = client[os.environ.get("DETAILS_DB")]

movie_coll = movie_db['the-shawshank-redemption-details']

movie_coll.insert_one(get_details(titles[0], headers))

InsertOneResult(ObjectId('666fab0230ad78c122ab886e'), acknowledged=True)

In [10]:
docs = get_docs(get_details(titles[0], headers))

In [12]:
vs_db = client[os.environ.get("VS_DB")]

vs_coll = vs_db['the-shawshank-redemption-vs']

text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, 
                                                chunk_overlap=100)
chunks = text_splitter.split_documents(docs)

In [13]:
embeddings = OpenAIEmbeddings(openai_api_key=os.environ.get("OPENAI_API_KEY"), 
                            disallowed_special=())

# Create embeddings in atlas vector store
vector_search = MongoDBAtlasVectorSearch.from_documents( 
                                documents=chunks, 
                                embedding= embeddings, 
                                collection=vs_coll,)

In [11]:
vector_search = MongoDBAtlasVectorSearch.from_connection_string(
    os.environ.get("MONGO_URI"),
    os.environ.get("VS_DB") + "." + f"the-shawshank-redemption-vs",
    OpenAIEmbeddings(openai_api_key=os.environ.get("OPENAI_API_KEY"), 
                            disallowed_special=()),
)

In [12]:
retriever = vector_search.as_retriever(
            search_type = "similarity",
            search_kwargs = {"k": 3}
            )

In [13]:
from langchain_openai.chat_models import ChatOpenAI

from langchain_core.prompts import PromptTemplate

from langchain_core.runnables import RunnablePassthrough

from langchain_core.output_parsers import StrOutputParser, JsonOutputParser

In [14]:
template = """Answer the question: {question} based only on the following context:
context: {context}
"""

output_parser = JsonOutputParser()

prompt = PromptTemplate.from_template(template = template,
                    input_varaibles = ["context", "question"],
                    output_variables = ["answer"],)

output_parser = StrOutputParser()

In [15]:
model = ChatOpenAI(api_key="sk-no-key-required", 
            model_name = 'LLaMA_CPP',
            base_url="http://127.0.0.1:8080/v1",
            temperature=0.3)

In [16]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

retrieval_chain = (
    {"context": retriever | format_docs,  "question": RunnablePassthrough()}
    | prompt 
    | model 
    | output_parser
)

In [17]:
query = f"""
Which movie is being described with the following details: 

Plot: A successful banker is wrongfully convicted of murdering his wife and her lover and is sentenced to life imprisonment in a harsh penitentiary. 
Over the decades, he forms an unlikely friendship with a fellow inmate, a seasoned contraband smuggler. 
Despite the brutality and corruption within the prison walls, he maintains hope and dignity, using his financial skills to assist 
the prison staff, ultimately orchestrating a daring and ingenious escape plan. The narrative explores themes of hope, resilience,
and the enduring human spirit in the face of adversity.

Genre: Drama

Cast: Morgan Freeman

Keywords: prison, hope

"""

response = retrieval_chain.invoke(query)

In [18]:
response

'The movie described in the given text is "The Shawshank Redemption" (1994), a drama film directed by Frank Darabont. The plot revolves around a successful banker named Andy Dufresne, who is wrongfully convicted of murdering his wife and her lover and sentenced to life imprisonment in a harsh penitentiary. The movie explores themes of hope, resilience, and the enduring human spirit in the face of adversity. The film features Morgan Freeman in the lead role and explores the themes of prison, hope, and the enduring human spirit. The movie is a classic and has won numerous awards, including an Academy Award for Best Picture.</s>'