# Implementing an LLM-powered recommendation system

In [71]:
import os
os.chdir('/Users/bytedance/Documents/GitHub/Building-LLM-powered-Solutions/')


## Data Preprocessing

In [72]:
# 导入 pandas 库，用于数据分析和处理，并将其简称为 pd
import pandas as pd

# 使用 read_csv 函数从名为 'movies_metadata.csv' 的 CSV 文件中读取数据，
# 并将数据存储在名为 md 的 DataFrame 对象中
md = pd.read_csv('movies_metadata.csv')

# 使用 head() 方法显示 DataFrame 的前 5 行数据
md.head()

  md = pd.read_csv('movies_metadata.csv')


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [73]:
import ast  # 导入 ast 库，用于处理 Python 抽象语法树

# 将字符串表示的字典转换为实际的字典
md['genres'] = md['genres'].apply(ast.literal_eval)

# 对 'genres' 列进行转换
md['genres'] = md['genres'].apply(lambda x: [genre['name'] for genre in x])

md.head()


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [74]:
# 定义一个函数，用于计算加权评分（IMDb 公式）
def calculate_weighted_rate(vote_average, vote_count, min_vote_count=10):
    """
    计算加权评分。

    参数：
        vote_average: 平均评分。
        vote_count: 评分人数。
        min_vote_count: 最低评分人数，用于防止评分人数过少导致结果偏差。

    返回值：
        加权评分。
    """
    return (vote_count / (vote_count + min_vote_count)) * vote_average + (min_vote_count / (vote_count + min_vote_count)) * 5.0

# 获取非空评分人数，并转换为整数类型
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')

# 计算评分人数的 95% 分位数，作为最低评分人数
min_vote_count = vote_counts.quantile(0.95)

# 创建一个新的列 'weighted_rate'，用于存储加权评分
md['weighted_rate'] = md.apply(lambda row: calculate_weighted_rate(row['vote_average'], row['vote_count'], min_vote_count), axis=1)

md.head()


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,weighted_rate
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,7.499658
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,6.610362
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,5.262357
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,5.079915
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,5.199506


In [75]:
# 删除 DataFrame 中包含缺失值的行
md = md.dropna()

In [76]:
# 创建一个新的 DataFrame，只保留指定的列，并重置索引
md_final = md[['genres', 'title', 'overview', 'weighted_rate']].reset_index(drop=True)

# 显示新的 DataFrame 的前 5 行
md_final.head()

Unnamed: 0,genres,title,overview,weighted_rate
0,"[Adventure, Action, Thriller]",GoldenEye,James Bond must unmask the mysterious head of ...,6.173464
1,[Comedy],Friday,Craig and Smokey are two guys in Los Angeles h...,6.083421
2,"[Horror, Action, Thriller, Crime]",From Dusk Till Dawn,Seth Gecko and his younger brother Richard are...,6.503176
3,[Comedy],Blue in the Face,"Auggie runs a small tobacco shop in Brooklyn, ...",5.109091
4,"[Action, Adventure, Science Fiction, Family, F...",Mighty Morphin Power Rangers: The Movie,Power up with six incredible teens who out-man...,5.052129


In [77]:
# 创建一个新的列 'combined_info'，将 'title', 'overview' 和 'genre' 合并
md_final['combined_info'] = md_final.apply(lambda row: f"Title: {row['title']}. Overview: {row['overview']} Genres: {', '.join(row['genres'])}. Rating: {row['weighted_rate']}", axis=1)

# 显示 'combined_info' 列的第 10 个元素
md_final['combined_info'][9]

'Title: Jurassic Park. Overview: A wealthy entrepreneur secretly creates a theme park featuring living dinosaurs drawn from prehistoric DNA. Before opening day, he invites a team of experts and his two eager grandchildren to experience the park and help calm anxious investors. However, the park is anything but amusing as the security systems go off-line and the dinosaurs escape. Genres: Adventure, Science Fiction. Rating: 7.39064935064935'

## Embeddings

In [17]:
# import pandas as pd
# import tiktoken

# # embedding model parameters
# embedding_model = "text-embedding-ada-002"
# embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
# max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

# encoding = tiktoken.get_encoding(embedding_encoding)

# # omit reviews that are too long to embed
# md_final["n_tokens"] = md_final.combined_info.apply(lambda x: len(encoding.encode(x)))
# md_final = md_final[md_final.n_tokens <= max_tokens]
# len(md_final)

693

In [80]:
md_final.head()

len(md_final)

693

In [79]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

from dotenv import load_dotenv
load_dotenv()
os.environ["GOOGLE_API_KEY"]

# 使用 Google 的 embedding-001 模型创建嵌入模型
embeddings_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

md_final["embedding"] = md_final.overview.apply(lambda x: embeddings_model.embed_query(x))
md_final.head()

Unnamed: 0,genres,title,overview,weighted_rate,combined_info,embedding
0,"[Adventure, Action, Thriller]",GoldenEye,James Bond must unmask the mysterious head of ...,6.173464,Title: GoldenEye. Overview: James Bond must un...,"[0.038663853, 0.0075680837, -0.034377582, -0.0..."
1,[Comedy],Friday,Craig and Smokey are two guys in Los Angeles h...,6.083421,Title: Friday. Overview: Craig and Smokey are ...,"[0.049222328, 0.020888079, -0.07884456, -0.053..."
2,"[Horror, Action, Thriller, Crime]",From Dusk Till Dawn,Seth Gecko and his younger brother Richard are...,6.503176,Title: From Dusk Till Dawn. Overview: Seth Gec...,"[0.0686446, 0.020156348, -0.07018096, -0.03675..."
3,[Comedy],Blue in the Face,"Auggie runs a small tobacco shop in Brooklyn, ...",5.109091,Title: Blue in the Face. Overview: Auggie runs...,"[0.011032898, 0.0018526185, -0.03041921, -0.02..."
4,"[Action, Adventure, Science Fiction, Family, F...",Mighty Morphin Power Rangers: The Movie,Power up with six incredible teens who out-man...,5.052129,Title: Mighty Morphin Power Rangers: The Movie...,"[-0.013818871, 0.00094466266, -0.061664958, -0..."


In [81]:
md_final.rename(columns = {'embedding': 'vector'}, inplace = True)
md_final.rename(columns = {'combined_info': 'text'}, inplace = True)
md_final.to_pickle('movies.pkl')

## Start working with LLMs

In [82]:
from langchain.vectorstores import LanceDB

In [83]:
import pandas as pd

md = pd.read_pickle('movies.pkl')

md.head(2)

Unnamed: 0,genres,title,overview,weighted_rate,text,vector
0,"[Adventure, Action, Thriller]",GoldenEye,James Bond must unmask the mysterious head of ...,6.173464,Title: GoldenEye. Overview: James Bond must un...,"[0.038663853, 0.0075680837, -0.034377582, -0.0..."
1,[Comedy],Friday,Craig and Smokey are two guys in Los Angeles h...,6.083421,Title: Friday. Overview: Craig and Smokey are ...,"[0.049222328, 0.020888079, -0.07884456, -0.053..."


In [84]:
md['text'][0]

'Title: GoldenEye. Overview: James Bond must unmask the mysterious head of the Janus Syndicate and prevent the leader from utilizing the GoldenEye weapons system to inflict devastating revenge on Britain. Genres: Adventure, Action, Thriller. Rating: 6.173464373464373'

In [85]:
import lancedb

uri = "data/sample-lancedb"
db = lancedb.connect(uri)
table = db.create_table("movies", md)

In [86]:
docsearch = LanceDB(connection = table, embedding = embeddings_model)

In [51]:
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import LanceDB
from langchain.chains import RetrievalQA

docsearch = LanceDB(connection = table, embedding = embeddings_model)


In [103]:
# 定义查询语句
query = "I'm looking for a comedy"

# 使用相似性搜索查找相关文档
docs = docsearch.similarity_search(query)

# 打印找到的文档
# print(docs)

# 打印第一个文档的内容
print(docs[0].page_content)

Title: Life Eternal. Overview: A thriller crime comedy directed by Wolfgang Murnberger. Genres: Comedy, Crime, Thriller. Rating: 5.131533477321813


In [102]:
from langchain_google_genai import GoogleGenerativeAI
# 初始化聊天型语言模型，使用 Google 的 gemini-1.5-pro 模型
llm = GoogleGenerativeAI(
    model="gemini-1.5-pro",
    temperature=0,          # 控制生成文本的随机性，0 表示最确定性
    max_tokens=None,       # 生成的最大 token 数量，None 表示没有限制
    timeout=None,          # 请求超时时间，None 表示没有限制
    max_retries=2,         # 最大重试次数
    # 其他参数...
)

qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever(), return_source_documents=True)

query = "I'm looking for a movie with a police detective and an adventurous plot."
result = qa({"query": query})
result['result']

'Hitman: Agent 47 has an adventurous plot, but doesn\'t feature a police detective.  Basic Instinct has a police detective, but it\'s hard to say if the plot is adventurous from the short description given.  The other two options don\'t match your criteria.  Could you give me more information about what you mean by "adventurous"? \n'

In [105]:
result['source_documents'][2].page_content

'Title: Hitman: Agent 47. Overview: An assassin teams up with a woman to help her find her father and uncover the mysteries of her ancestry. Genres: Action, Crime, Thriller. Rating: 5.365800865800866'

In [106]:
# 从电影数据中筛选出喜剧类电影
df_filtered = md[md['genres'].apply(lambda x: 'Comedy' in x)]

# 创建问答系统
qa = RetrievalQA.from_chain_type(
    llm=llm,                             # 使用的语言模型
    chain_type="stuff",                 # 使用 "stuff" 链类型
    retriever=docsearch.as_retriever(  # 使用筛选后的数据创建检索器
        search_kwargs={'data': df_filtered}
    ), 
    return_source_documents=True        # 设置返回源文档
)

# 定义查询语句
query = "I'm looking for a movie with a police detective and an adventurous plot."

# 使用问答系统进行查询
result = qa({"query": query})

# 打印查询结果
result['source_documents'][2].page_content

'Title: Hitman: Agent 47. Overview: An assassin teams up with a woman to help her find her father and uncover the mysteries of her ancestry. Genres: Action, Crime, Thriller. Rating: 5.365800865800866'

In [118]:
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", 
    retriever=docsearch.as_retriever(search_kwargs={'filter': {'adult':'False'}}), return_source_documents=True)

query = "I'm looking for a movie with a police detective and an adventurous plot."
result = qa({"query": query})
result['source_documents'][2].page_content

'Title: Hitman: Agent 47. Overview: An assassin teams up with a woman to help her find her father and uncover the mysteries of her ancestry. Genres: Action, Crime, Thriller. Rating: 5.365800865800866'

In [116]:
result['source_documents'][3].page_content

'Title: We Are From The Future. Overview: Four 21st century treasure seekers are transported back into the middle of a WWII battles in Soviet Union... Genres: War, Fantasy, Action, Drama. Rating: 5.071120689655172'

### Agent

In [None]:
# import os
# os.environ["GOOGLE_API_KEY"]

# from langchain_google_genai import GoogleGenerativeAI
# # 初始化语言模型，使用 Google 的 gemini-1.5-pro 模型，并将 temperature 设置为 0
# llm = GoogleGenerativeAI(model="gemini-1.5-pro", temperature=0)

# from langchain import SerpAPIWrapper
# from langchain.agents import AgentType, initialize_agent
# from langchain.tools import BaseTool, StructuredTool, Tool, tool

# from dotenv import load_dotenv
# load_dotenv()

# os.environ["SERPAPI_API_KEY"]

# search = SerpAPIWrapper()

In [45]:
from langchain.agents.openai_functions_agent.base import OpenAIFunctionsAgent
from langchain.schema.messages import SystemMessage
from langchain.prompts import MessagesPlaceholder
from langchain.agents.openai_functions_agent.agent_token_buffer_memory import AgentTokenBufferMemory



system_message = SystemMessage(
        content=(
            "Do your best to answer the questions. "
            "if there are more than one argument for the single-input tool, reason step by step and treat them as single input. "
            "relevant information, only if neccessary"
        )
)

# This is needed for both the memory and the prompt
memory_key = "history"

memory = AgentTokenBufferMemory(memory_key=memory_key, llm=llm)

prompt = OpenAIFunctionsAgent.create_prompt(
        system_message=system_message,
        extra_prompt_messages=[MessagesPlaceholder(variable_name=memory_key)]
    )
agent_executor = create_conversational_retrieval_agent(llm=llm, tools=tools, prompt = prompt, verbose=True)

result = agent_executor({"input": "I liked a lot kung fu panda 1 and 2. Could you suggest me some similar movies?"})
result



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `movies` with `{'title': 'Kung Fu Panda'}`


[0m[36;1m[1;3m[Document(page_content='When the Valley of Peace is threatened, lazy Po the panda discovers his destiny as the "chosen one" and trains to become a kung fu hero, but transforming the unsleek slacker into a brave warrior won\'t be easy. It\'s up to Master Shifu and the Furious Five -- Tigress, Crane, Mantis, Viper and Monkey -- to give it a try.', metadata={'adult': 'False', 'genres': array(['Adventure', 'Animation', 'Family', 'Comedy'], dtype=object), 'title': 'Kung Fu Panda', 'weighted_rate': 6.675006821282402, 'n_tokens': 79, 'vector': array([-0.00345203, -0.03163854,  0.00223724, ...,  0.00216131,
        0.00014942, -0.01268781], dtype=float32), '_distance': 0.20448510348796844}), Document(page_content='Po is now living his dream as The Dragon Warrior, protecting the Valley of Peace alongside his friends and fellow kung fu masters, The Furious Five - 

{'input': 'I liked a lot kung fu panda 1 and 2. Could you suggest me some similar movies?',
 'chat_history': [HumanMessage(content='I liked a lot kung fu panda 1 and 2. Could you suggest me some similar movies?', additional_kwargs={}, example=False),
  AIMessage(content='', additional_kwargs={'function_call': {'name': 'movies', 'arguments': '{\n  "title": "Kung Fu Panda"\n}'}}, example=False),
  FunctionMessage(content='[Document(page_content=\'When the Valley of Peace is threatened, lazy Po the panda discovers his destiny as the "chosen one" and trains to become a kung fu hero, but transforming the unsleek slacker into a brave warrior won\\\'t be easy. It\\\'s up to Master Shifu and the Furious Five -- Tigress, Crane, Mantis, Viper and Monkey -- to give it a try.\', metadata={\'adult\': \'False\', \'genres\': array([\'Adventure\', \'Animation\', \'Family\', \'Comedy\'], dtype=object), \'title\': \'Kung Fu Panda\', \'weighted_rate\': 6.675006821282402, \'n_tokens\': 79, \'vector\': arr

## Prompt engineering

In [255]:
from langchain.prompts import PromptTemplate

template = """You are a movie recommender system that help users to find movies that match their preferences. 
Use the following pieces of context to answer the question at the end. 
For each question, suggest three movies, with a short description of the plot and the reason why the user migth like it.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Your response:"""


PROMPT = PromptTemplate(
    template=template, input_variables=["context", "question"])

chain_type_kwargs = {"prompt": PROMPT}
qa = RetrievalQA.from_chain_type(llm=OpenAI(), 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(),
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs)

query = "I'm looking for a funny action movie, any suggestion?"
result = qa({'query':query})
print(result['result'])


 
1. A Good Day to Die Hard: An action-packed comedy directed by John Moore, this movie follows Iconoclastic, take-no-prisoners cop John McClane as he travels to Moscow to help his wayward son Jack. With the Russian underworld in pursuit, and battling a countdown to war, the two McClanes discover that their opposing methods make them unstoppable heroes.
2. The Hidden: An alien is on the run in America and uses the bodies of anyone in its way as a hiding place. With lots of innocent people dying in the chase, this action-packed horror movie is sure to keep you laughing.
3. District B13: Set in the ghettos of Paris in 2010, this action-packed science fiction movie follows an undercover cop and ex-thug as they try to infiltrate a gang in order to defuse a neutron bomb. A thrilling comedy that will keep you laughing.


In [256]:
from langchain.prompts import PromptTemplate

template_prefix = """You are a movie recommender system that help users to find movies that match their preferences. 
Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}"""

user_info = """This is what we know about the user, and you can use this information to better tune your research:
Age: {age}
Gender: {gender}"""

template_suffix= """Question: {question}
Your response:"""

user_info = user_info.format(age = 18, gender = 'female')

COMBINED_PROMPT = template_prefix +'\n'+ user_info +'\n'+ template_suffix
print(COMBINED_PROMPT)


You are a movie recommender system that help users to find movies that match their preferences. 
Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}
This is what we know about the user, and you can use this information to better tune your research:
Age: 18
Gender: female
Question: {question}
Your response:


In [257]:
PROMPT = PromptTemplate(
    template=COMBINED_PROMPT, input_variables=["context", "question"])

chain_type_kwargs = {"prompt": PROMPT}
qa = RetrievalQA.from_chain_type(llm=OpenAI(), 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(),
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs)

query = "Can you suggest me some action movie?"
result = qa({'query':query})
result['result']


' Sure, I can suggest some action movies for you. Here are a few examples: A Good Day to Die Hard, Goldfinger, Ong Bak 2, and The Raid 2. All of these movies have high ratings and feature thrilling action elements. I hope you find something that you enjoy!'

In [258]:
result['source_documents']

[Document(page_content='Title: A Good Day to Die Hard. Overview: Iconoclastic, take-no-prisoners cop John McClane, finds himself for the first time on foreign soil after traveling to Moscow to help his wayward son Jack - unaware that Jack is really a highly-trained CIA operative out to stop a nuclear weapons heist. With the Russian underworld in pursuit, and battling a countdown to war, the two McClanes discover that their opposing methods make them unstoppable heroes. Genres: Action, Thriller. Rating: 5.178041993422717', metadata={'genres': array(['Action', 'Thriller'], dtype=object), 'title': 'A Good Day to Die Hard', 'overview': 'Iconoclastic, take-no-prisoners cop John McClane, finds himself for the first time on foreign soil after traveling to Moscow to help his wayward son Jack - unaware that Jack is really a highly-trained CIA operative out to stop a nuclear weapons heist. With the Russian underworld in pursuit, and battling a countdown to war, the two McClanes discover that the

## Content based

In [260]:
import pandas as pd

data = {
    "username": ["Alice", "Bob"],
    "age": [25, 32],
    "gender": ["F", "M"],
    "movies": [
        [("Transformers: The Last Knight", 7), ("Pokémon: Spell of the Unknown", 5)],
        [("Bon Cop Bad Cop 2", 8), ("Goon: Last of the Enforcers", 9)]
    ]
}

# Convert the "movies" column into dictionaries
for i, row_movies in enumerate(data["movies"]):
    movie_dict = {}
    for movie, rating in row_movies:
        movie_dict[movie] = rating
    data["movies"][i] = movie_dict

# Create a pandas DataFrame
df = pd.DataFrame(data)

df.head()


Unnamed: 0,username,age,gender,movies
0,Alice,25,F,"{'Transformers: The Last Knight': 7, 'Pokémon:..."
1,Bob,32,M,"{'Bon Cop Bad Cop 2': 8, 'Goon: Last of the En..."


In [261]:
template_prefix = """You are a movie recommender system that help users to find movies that match their preferences. 
Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}"""

user_info = """This is what we know about the user, and you can use this information to better tune your research:
Age: {age}
Gender: {gender}
Movies already seen alongside with rating: {movies}"""

template_suffix= """Question: {question}
Your response:"""

In [262]:
age = df.loc[df['username']=='Alice']['age'][0]
gender = df.loc[df['username']=='Alice']['gender'][0]

movies = ''
# Iterate over the dictionary and output movie name and rating
for movie, rating in df['movies'][0].items():
    output_string = f"Movie: {movie}, Rating: {rating}" + "\n"
    movies+=output_string
    #print(output_string)
user_info = user_info.format(age = age, gender = gender, movies = movies)

COMBINED_PROMPT = template_prefix +'\n'+ user_info +'\n'+ template_suffix
print(COMBINED_PROMPT)

You are a movie recommender system that help users to find movies that match their preferences. 
Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}
This is what we know about the user, and you can use this information to better tune your research:
Age: 25
Gender: F
Movies already seen alongside with rating: Movie: Transformers: The Last Knight, Rating: 7
Movie: Pokémon: Spell of the Unknown, Rating: 5

Question: {question}
Your response:


In [263]:
PROMPT = PromptTemplate(
    template=COMBINED_PROMPT, input_variables=["context", "question"])

chain_type_kwargs = {"prompt": PROMPT}
qa = RetrievalQA.from_chain_type(llm=OpenAI(), 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(),
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs)

query = "Can you suggest me some action movie based on my background?"
result = qa({'query':query})
result['result']

" Based on your age, gender, and the movies you've already seen, I would suggest the following action movies: The Raid 2 (Action, Crime, Thriller; Rating: 6.71), Ong Bak 2 (Adventure, Action, Thriller; Rating: 5.24), Hitman: Agent 47 (Action, Crime, Thriller; Rating: 5.37), and Kingsman: The Secret Service (Crime, Comedy, Action, Adventure; Rating: 7.43)."

In [264]:
result['source_documents']

[Document(page_content='Title: The Raid 2. Overview: After fighting his way through an apartment building populated by an army of dangerous criminals and escaping with his life, SWAT team member Rama goes undercover, joining a powerful Indonesian crime syndicate to protect his family and uncover corrupt members of his own force. Genres: Action, Crime, Thriller. Rating: 6.7086887835703', metadata={'genres': array(['Action', 'Crime', 'Thriller'], dtype=object), 'title': 'The Raid 2', 'overview': 'After fighting his way through an apartment building populated by an army of dangerous criminals and escaping with his life, SWAT team member Rama goes undercover, joining a powerful Indonesian crime syndicate to protect his family and uncover corrupt members of his own force.', 'weighted_rate': 6.7086887835703, 'n_tokens': 78, 'vector': array([ 0.00455619, -0.02637059, -0.0112055 , ..., -0.02318425,
        -0.00706593,  0.01398521], dtype=float32), '_distance': 0.4141116142272949}),
 Document(

In [None]:
# imports
import pandas as pd
import tiktoken
import os
import openai

openai.api_key = os.environ["OPENAI_API_KEY"]

from openai.embeddings_utils import get_embedding