In [33]:
import os
import json
import requests
from dotenv import load_dotenv
from langchain.prompts import PromptTemplate
from langchain_community.llms import Ollama

# === Load API keys ===
load_dotenv()
OMDB_KEY = '2774b611'  # Replace if needed
llm = Ollama(model="llama3")

# === Functions ===
def fetch_omdb_data(title: str) -> dict:
    url = f"http://www.omdbapi.com/?t={title}&apikey={OMDB_KEY}"
    res = requests.get(url)
    if res.status_code == 200:
        data = res.json()
        if data.get("Response") == "True":
            return data
    return {"Error": f"No data found for {title}"}

def generate_tags(plot_and_genre: str) -> str:
    prompt = PromptTemplate.from_template("""
Generate 3–5 lowercase tags (no hashtags or punctuation), separated by '|'.
Do not explain or include extra text.

Plot and Genre:
{plot_and_genre}

Tags:
""")
    _input = prompt.format(plot_and_genre=plot_and_genre)
    result = llm.predict(_input).strip()
    lines = result.splitlines()
    tags = lines[-1].strip()
    if ":" in tags:
        tags = tags.split(":")[-1].strip()
    return tags.replace('"', '').strip('.')

# === Read movie titles ===
with open("movies.txt", "r", encoding="utf-8") as f:
    movie_titles = [line.strip() for line in f if line.strip()]

# === Output file (JSONL) ===
output_file = "movies.jsonl"

with open(output_file, "w", encoding="utf-8") as outfile:
    for title in movie_titles:
        print(f"🎬 Processing: {title}")
        movie_data = fetch_omdb_data(title)
        if "Error" in movie_data:
            print(f"❌ Skipped {title}: {movie_data['Error']}")
            continue

        plot = movie_data.get("Plot", "")
        genre = movie_data.get("Genre", "").split(",")[0].strip()
        tag_input = f"{plot} Genre: {genre}"
        tags = generate_tags(tag_input)

        json_record = {
            "title": movie_data.get("Title", ""),
            "plot": plot,
            "director": movie_data.get("Director", ""),
            "genre": genre,
            "rating": movie_data.get("imdbRating", ""),
            "release_date": movie_data.get("Released", ""),
            "language": "|".join(lang.strip() for lang in movie_data.get("Language", "").split(",")),
            "country": "|".join(c.strip() for c in movie_data.get("Country", "").split(",")),
            "cast": "|".join(actor.strip() for actor in movie_data.get("Actors", "").split(",")),
            "tags": tags
        }

        outfile.write(json.dumps(json_record, ensure_ascii=False) + "\n")

print(f"✅ Data written to {output_file}")


🎬 Processing: Ala Vaikunthapurramuloo
🎬 Processing: Arjun Reddy
🎬 Processing: F2: Fun and Frustration
🎬 Processing: Maharshi
🎬 Processing: Bheeshma
🎬 Processing: Saaho
🎬 Processing: Kabir Singh
🎬 Processing: Geetha Govindam
🎬 Processing: Mahanati
🎬 Processing: Rangasthalam
🎬 Processing: Sye Raa Narasimha Reddy
🎬 Processing: Nenu Sailaja
🎬 Processing: Pelli Choopulu
🎬 Processing: Dookudu
🎬 Processing: Mirchi
🎬 Processing: Businessman
🎬 Processing: Autonagar Surya
🎬 Processing: Premam
🎬 Processing: Gentleman
🎬 Processing: Kalyana Vaibhogame
🎬 Processing: Srimanthudu
🎬 Processing: Attarintiki Daredi
🎬 Processing: Baahubali 2: The Conclusion
🎬 Processing: Baahubali
🎬 Processing: Ye Maaya Chesave
🎬 Processing: Eega
🎬 Processing: Dilwala
🎬 Processing: Brahma
🎬 Processing: Maheshinte Prathikaaram
🎬 Processing: Aagadu
🎬 Processing: Jai Lava Kusa
🎬 Processing: Taxiwala
🎬 Processing: RX100
🎬 Processing: Kabali
🎬 Processing: Janatha Garage
🎬 Processing: Gaddalakonda Ganesh
🎬 Processing: A..Aa
🎬 P

In [None]:
import os
import requests
from langchain.prompts import PromptTemplate
from langchain_community.llms import Ollama
from dotenv import load_dotenv

load_dotenv()
OMDB_KEY = '2774b611'  # or os.getenv("OMDB_KEY")
llm = Ollama(model="llama3")

# === Function to fetch basic OMDb validation ===
def omdb_title_exists(title: str) -> bool:
    url = f"http://www.omdbapi.com/?t={title}&apikey={OMDB_KEY}"
    res = requests.get(url)
    data = res.json()
    return data.get("Response") == "True"

# === Agent to generate titles ===
def generate_movie_titles(user_prompt: str, count: int) -> list:
    prompt = PromptTemplate.from_template("""
Generate a list of {count} real movie titles based on this request:

"{user_prompt}"

Only list the movie titles, each on a new line. Do not include extra descriptions or numbering.
""")
    formatted_prompt = prompt.format(user_prompt=user_prompt, count=count)
    raw_output = llm.predict(formatted_prompt)

    # Clean and split titles
    titles = [line.strip().strip('"') for line in raw_output.splitlines() if line.strip()]
    return titles

# === Main method ===
def populate_movies_txt(user_prompt: str, count: int, output_file="movies.txt"):
    print(f"🎬 Generating movie titles for: {user_prompt}")
    titles = generate_movie_titles(user_prompt, count)

    # Load existing titles
    existing_titles = set()
    if os.path.exists(output_file):
        with open(output_file, "r", encoding="utf-8") as f:
            existing_titles = set(line.strip() for line in f if line.strip())

    print(f"🔍 Validating titles via OMDb...")
    valid_titles = []
    for title in titles:
        if title in existing_titles:
            print(f"⏩ Skipping duplicate: {title}")
            continue
        if omdb_title_exists(title):
            valid_titles.append(title)
            existing_titles.add(title)  # Add to avoid future duplication
        if len(valid_titles) >= count:
            break

    if not valid_titles:
        print("⚠️ No new valid titles found.")
        return

    with open(output_file, "a", encoding="utf-8") as f:
        for title in valid_titles:
            f.write(title + "\n")

    print(f"✅ {len(valid_titles)} new movie titles appended to {output_file}")

# === Example usage ===
if __name__ == "__main__":
    prompt = "Top Hollywood movies, recent ones have higher priority, but maintain some old classical movies as well"
    populate_movies_txt(prompt, count=5000)  # Start with 100 during testing


🎬 Generating movie titles for: Top Tollywood (Telugu) movies, recent ones have higher priority
🔍 Validating titles via OMDb...
⏩ Skipping duplicate: Ala Vaikunthapurramuloo
⏩ Skipping duplicate: Arjun Reddy
⏩ Skipping duplicate: Maharshi
⏩ Skipping duplicate: Rangasthalam
⏩ Skipping duplicate: Nenu Sailaja
⏩ Skipping duplicate: Geetha Govindam
⏩ Skipping duplicate: F2: Fun and Frustration
⏩ Skipping duplicate: Mahanati
⏩ Skipping duplicate: Aagadu
⏩ Skipping duplicate: Srimanthudu
⏩ Skipping duplicate: Gentleman
⏩ Skipping duplicate: Maheshinte Prathikaaram
⏩ Skipping duplicate: Sarrainodu
⏩ Skipping duplicate: Eega
⏩ Skipping duplicate: Aagadu
⏩ Skipping duplicate: Nenu Sailaja
⏩ Skipping duplicate: Mahanati
⏩ Skipping duplicate: Arjun Reddy
⏩ Skipping duplicate: Maharshi
⏩ Skipping duplicate: Srimanthudu
⏩ Skipping duplicate: Khaidi No.150
⏩ Skipping duplicate: Okkadu
⏩ Skipping duplicate: Gharshana
⏩ Skipping duplicate: Aadukalam
⏩ Skipping duplicate: Jabardasth
⏩ Skipping duplicate

In [27]:
import requests
from bs4 import BeautifulSoup

def fetch_imdb_list_titles(url, max_count=500):
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "html.parser")
    titles = []

    for item in soup.select(".lister-item-header a"):
        if len(titles) >= max_count:
            break
        titles.append(item.text.strip())

    return titles

def save_to_movies_txt(titles, filename="movies.txt"):
    with open(filename, "w", encoding="utf-8") as f:
        for title in titles:
            f.write(title + "\n")
    print(f"✅ Saved {len(titles)} movies to {filename}")


# Example usage
url = "https://www.imdb.com/india/top-rated-telugu-movies/"  # Replace with your chosen IMDb list
titles = fetch_imdb_list_titles(url)
save_to_movies_txt(titles)


✅ Saved 0 movies to movies.txt


In [1]:
%pip install langchain openai requests python-dotenv

Collecting langchain
  Downloading langchain-0.3.26-py3-none-any.whl (1.0 MB)
     ---------------------------------------- 0.0/1.0 MB ? eta -:--:--
     --------- ------------------------------ 0.2/1.0 MB 7.3 MB/s eta 0:00:01
     ---------------- ----------------------- 0.4/1.0 MB 6.8 MB/s eta 0:00:01
     ---------------------------- ----------- 0.7/1.0 MB 6.6 MB/s eta 0:00:01
     ---------------------------------------  1.0/1.0 MB 7.0 MB/s eta 0:00:01
     ---------------------------------------- 1.0/1.0 MB 6.4 MB/s eta 0:00:00
Collecting openai
  Downloading openai-1.93.0-py3-none-any.whl (755 kB)
     ---------------------------------------- 0.0/755.0 kB ? eta -:--:--
     ------------ ------------------------- 256.0/755.0 kB 7.9 MB/s eta 0:00:01
     ----------------------- -------------- 471.0/755.0 kB 7.3 MB/s eta 0:00:01
     -------------------------------------  747.5/755.0 kB 6.7 MB/s eta 0:00:01
     -------------------------------------- 755.0/755.0 kB 6.0 MB/s eta 0:00


[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [23]:
%pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Collecting beautifulsoup4
  Downloading beautifulsoup4-4.13.4-py3-none-any.whl (187 kB)
     ---------------------------------------- 0.0/187.3 kB ? eta -:--:--
     -------------------------------------- 187.3/187.3 kB 3.8 MB/s eta 0:00:00
Collecting soupsieve>1.2
  Downloading soupsieve-2.7-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.13.4 bs4-0.0.2 soupsieve-2.7
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
