In [1]:
from langchain_ollama import ChatOllama

# Shared Ollama chat model; the same instance is handed to every prompt chain below.
# temperature and keep_alive are forwarded to ChatOllama unchanged
# (keep_alive presumably controls how long Ollama keeps the model loaded — see ChatOllama docs).
llm = ChatOllama(model="llama3.1:8b", temperature=0.9, keep_alive="20m")

In [2]:
def read_prompt(prompt_path : str) -> str:
    """Return the full text of the prompt file at ``prompt_path``.

    Args:
        prompt_path: Location of the prompt file on disk.

    Returns:
        The file's entire content, decoded as UTF-8.
    """
    with open(prompt_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

In [4]:

from langchain_core.messages import AIMessage
from langchain_core.prompts import ChatPromptTemplate

def create_prompt_chain(prompt_path, llm):
    """Build a chain that renders the prompt stored in a file and pipes it into ``llm``.

    Args:
        prompt_path: Path of the prompt template file; its text is loaded with
            ``read_prompt`` and used as the system message template.
        llm: Chat model (or any runnable) that receives the rendered prompt.

    Returns:
        The composed runnable ``prompt_template | llm``.
    """
    prompt_text = read_prompt(prompt_path)
    # from_messages takes a list of (role, template) pairs. A flat list of two
    # strings is read as two separate human messages (a bare string is
    # shorthand for a human message), so the role and the template must be
    # wrapped together in a single tuple.
    prompt_template = ChatPromptTemplate.from_messages([("system", prompt_text)])
    return prompt_template | llm

# One chain per question type, all sharing the same llm. The prompt paths are
# listed in the same order as the names they are unpacked into.
(
    actor_chain,
    director_chain,
    categories_chain,
    description_chain,
    countries_chain,
) = (
    create_prompt_chain(prompt_file, llm)
    for prompt_file in (
        "prompt/gen_question/actor_prompt.md",
        "prompt/gen_question/director_prompt.md",
        "prompt/gen_question/categories_prompt.md",
        "prompt/gen_question/description_prompt.md",
        "prompt/gen_question/countries_prompt.md",
    )
)


In [5]:
import pandas as pd

# Load the film metadata table that the question-generation loop iterates over.
ml1m_data = pd.read_csv(filepath_or_buffer="movies_data.csv")

In [6]:
# Show the first record to check which columns the loaded table provides.
ml1m_data.head(1)

Unnamed: 0,title,category,plot,actors,director,country
0,4K Lãng Quên - Oblivion,"['T16', 'Phim Âu Mỹ', 'Hành động & Phiêu lưu',...",,"['Morgan Freeman', '', 'Olga Kurylenko']",['Joseph Kosinski'],['United States of America']


In [12]:
# Empty result table: the film description sent to the model plus one
# generated question per column.
new_df = pd.DataFrame(
    columns=[
        "film_info",
        "q_actor",
        "q_director",
        "q_genres",
        "q_country",
        "q_plot",
    ]
)

In [13]:
from tqdm import tqdm
import itertools

# Upper bound on how many films are sent to the model in one run.
MAX_FILMS = 200
# Columns of ml1m_data that are interpolated into the film description.
FILM_FIELDS = ("title", "category", "actors", "director", "country", "plot")

# Never announce more iterations to tqdm than the frame can deliver; with a
# hard-coded total the bar would stop short of 100% on a smaller dataset.
films_to_process = min(MAX_FILMS, len(ml1m_data))

# Limit to MAX_FILMS rows and add tqdm progress bar
for index, row in tqdm(
    itertools.islice(ml1m_data.iterrows(), films_to_process),
    total=films_to_process,
    desc="Processing films",
):
    # Empty CSV cells are loaded as NaN and would otherwise be written into the
    # prompt as the literal text "nan"; render them as an empty value instead.
    fields = {name: ("" if pd.isna(row[name]) else row[name]) for name in FILM_FIELDS}

    film_info = f"""
    Tên phim : {fields["title"]}
    Thể loại : {fields["category"]}
    Diễn viên : {fields["actors"]}
    Đạo diễn : {fields["director"]}
    Quốc gia : {fields["country"]}
    Mô tả : {fields["plot"]}
    """

    # Generate questions: every chain receives the same film description.
    prompt_input = {"film_information": film_info}
    question_actor = actor_chain.invoke(prompt_input)
    question_director = director_chain.invoke(prompt_input)
    question_categories = categories_chain.invoke(prompt_input)
    question_description = description_chain.invoke(prompt_input)
    question_countries = countries_chain.invoke(prompt_input)

    # Store the row immediately so that rows finished before an interruption
    # stay available in new_df. The value order follows new_df's columns:
    # film_info, q_actor, q_director, q_genres, q_country, q_plot.
    new_df.loc[index] = [
        film_info,
        question_actor.content,
        question_director.content,
        question_categories.content,
        question_countries.content,
        question_description.content,
    ]


Processing films:  84%|████████▍ | 168/200 [08:11<01:33,  2.93s/it]


In [15]:
# Persist the generated questions; index=False leaves the row labels out of the file.
new_df.to_csv(path_or_buf="question_gen_tv360.csv", index=False)