# LLM 기반 추천 시스템 구현

시뮬레이션 할 시나리오: 사용자의 선호도 기록이 없는 상태에서 사용자가 추천 시스템과 처음 상호 작용하는 '콜드 스타트' 시니리오

In [1]:
# 영화 데이터베이스 활용 예정
# https://www.kaggle.com/datasets/rohan4050/movie-recommendation-data

## 데이터 전처리

In [2]:
import pandas as pd

md = pd. read_csv('movies_metadata.csv')
md.head()

  md = pd. read_csv('movies_metadata.csv')


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
# 관심을 두는 열은 Genres, Title, Overview, Vote_average, Vote_count

# 먼저, 데이터셋의 원래 딕셔너리 형식보다 처리하기 쉬운 numpy 배열로 genres 열의 형식을 지정
import ast

# 문자열로 표현된 딕셔너리를 실제 딕셔너리로 변환
md['genres'] = md['genres'].apply(ast.literal_eval)

# 'genres' 열 변환
md['genres'] = md['genres'].apply(lambda x: [genre['name'] for genre in x])


md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [4]:
# 다음으로, 투표 수에 가중치를 부여한 vote_average 와 vote_count 열을 단일 열로 병합

# 가중 평점 계산 (IMDb 공식)
def calculate_weighted_rate(vote_average, vote_count, min_vote_count=10):
    return (vote_count / (vote_count + min_vote_count)) * vote_average + (min_vote_count / (vote_count + min_vote_count)) * 5.0

# 왜곡된 결과를 방지하기 위한 최소 투표 수
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
min_vote_count = vote_counts.quantile(0.95)

# 새로운 'weighted_rate' 열을 만듦
md['weighted_rate'] = md.apply(lambda row: calculate_weighted_rate(row['vote_average'], row['vote_count'], min_vote_count), axis=1)
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,weighted_rate
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,7.499658
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,6.610362
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,5.262357
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,5.079915
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,5.199506


In [5]:
# 비정상 데이터 제거
md = md.dropna()

In [6]:
# 필요한 열만 추출
md_final = md[['genres', 'title', 'overview', 'weighted_rate']].reset_index(drop=True)
md_final.head()

Unnamed: 0,genres,title,overview,weighted_rate
0,"[Adventure, Action, Thriller]",GoldenEye,James Bond must unmask the mysterious head of ...,6.173464
1,[Comedy],Friday,Craig and Smokey are two guys in Los Angeles h...,6.083421
2,"[Horror, Action, Thriller, Crime]",From Dusk Till Dawn,Seth Gecko and his younger brother Richard are...,6.503176
3,[Comedy],Blue in the Face,"Auggie runs a small tobacco shop in Brooklyn, ...",5.109091
4,"[Action, Adventure, Science Fiction, Family, F...",Mighty Morphin Power Rangers: The Movie,Power up with six incredible teens who out-man...,5.052129


In [7]:
# 다음으로, LLM에 문맥으로 제공할 모든 요소(제목, 개요, 장르, 등급)을 병합할 combined_info 라는 새 열을 만듦

# 'title', 'overview', 'genre'를 결합하여 새로운 열을 만듦
md_final['combined_info'] = md_final.apply(lambda row: f"Title: {row['title']}. Overview: {row['overview']} Genres: {', '.join(row['genres'])}. Rating: {row['weighted_rate']}", axis=1)
md_final['combined_info'][9]

'Title: Jurassic Park. Overview: A wealthy entrepreneur secretly creates a theme park featuring living dinosaurs drawn from prehistoric DNA. Before opening day, he invites a team of experts and his two eager grandchildren to experience the park and help calm anxious investors. However, the park is anything but amusing as the security systems go off-line and the dinosaurs escape. Genres: Adventure, Science Fiction. Rating: 7.39064935064935'

## 임베딩

In [9]:
from dotenv import load_dotenv
load_dotenv(dotenv_path="../.env")

True

In [10]:
# 임베딩 하는 동안 더 나은 결과를 얻을 수 있도록 영화 combined_info를 토큰화

# 임포트
import tiktoken
from openai import OpenAI

# 임베딩 모델 매개변수
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

client = OpenAI()

encoding = tiktoken.get_encoding(embedding_encoding)


def get_embedding(text, engine=embedding_model):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=engine).data[0].embedding

# 임베딩하기에 너무 긴 리뷰는 제외
md_final["n_tokens"] = md_final.combined_info.apply(lambda x: len(encoding.encode(x)))
md_final = md_final[md_final.n_tokens <= max_tokens]
len(md_final)

693

In [11]:
md_final.head()

Unnamed: 0,genres,title,overview,weighted_rate,combined_info,n_tokens
0,"[Adventure, Action, Thriller]",GoldenEye,James Bond must unmask the mysterious head of ...,6.173464,Title: GoldenEye. Overview: James Bond must un...,59
1,[Comedy],Friday,Craig and Smokey are two guys in Los Angeles h...,6.083421,Title: Friday. Overview: Craig and Smokey are ...,52
2,"[Horror, Action, Thriller, Crime]",From Dusk Till Dawn,Seth Gecko and his younger brother Richard are...,6.503176,Title: From Dusk Till Dawn. Overview: Seth Gec...,105
3,[Comedy],Blue in the Face,"Auggie runs a small tobacco shop in Brooklyn, ...",5.109091,Title: Blue in the Face. Overview: Auggie runs...,87
4,"[Action, Adventure, Science Fiction, Family, F...",Mighty Morphin Power Rangers: The Movie,Power up with six incredible teens who out-man...,5.052129,Title: Mighty Morphin Power Rangers: The Movie...,89


In [12]:
md_final["embedding"] = md_final.overview.apply(lambda x: get_embedding(x, engine=embedding_model))
md_final.head()

Unnamed: 0,genres,title,overview,weighted_rate,combined_info,n_tokens,embedding
0,"[Adventure, Action, Thriller]",GoldenEye,James Bond must unmask the mysterious head of ...,6.173464,Title: GoldenEye. Overview: James Bond must un...,59,"[-0.023320907726883888, -0.016039660200476646,..."
1,[Comedy],Friday,Craig and Smokey are two guys in Los Angeles h...,6.083421,Title: Friday. Overview: Craig and Smokey are ...,52,"[0.001543616526760161, -0.01077578030526638, -..."
2,"[Horror, Action, Thriller, Crime]",From Dusk Till Dawn,Seth Gecko and his younger brother Richard are...,6.503176,Title: From Dusk Till Dawn. Overview: Seth Gec...,105,"[-0.008703107945621014, -0.004671914037317038,..."
3,[Comedy],Blue in the Face,"Auggie runs a small tobacco shop in Brooklyn, ...",5.109091,Title: Blue in the Face. Overview: Auggie runs...,87,"[-0.020313598215579987, -0.012282016687095165,..."
4,"[Action, Adventure, Science Fiction, Family, F...",Mighty Morphin Power Rangers: The Movie,Power up with six incredible teens who out-man...,5.052129,Title: Mighty Morphin Power Rangers: The Movie...,89,"[-0.0038929283618927, -0.03924328088760376, -0..."


In [13]:
# 마지막으로, 일부 명명 규칙과 데이터 유형을 수정

md_final.rename(columns = {'embedding': 'vector'}, inplace = True)
md_final.rename(columns = {'combined_info': 'text'}, inplace = True)
md_final.to_pickle('movies.pkl')

## 최종 데이터셋을 벡터DB에 저장

벡터 검색용 오픈소스 데이터베이스인 LanceDB를 활용하여 검색, 필터링 및 임베딩 관리를 크게 간소화하고 랭체인과의 기본 통합을 제공하고자 함

In [14]:
!pip install lancedb

Collecting lancedb
  Downloading lancedb-0.17.1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (4.7 kB)
Collecting deprecation (from lancedb)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)
INFO: pip is looking at multiple versions of lancedb to determine which version is compatible with other requirements. This could take a while.
Collecting lancedb
  Downloading lancedb-0.17.0-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (4.7 kB)
Collecting pylance==0.20.0 (from lancedb)
  Downloading pylance-0.20.0-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (7.4 kB)
Collecting overrides>=0.7 (from lancedb)
  Downloading overrides-7.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading lancedb-0.17.0-cp39-abi3-manylinux_2_28_x86_64.whl (29.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.9/29.9 MB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading pylance-0.20.0-cp39-abi3-manylinux_2_28_x86_64.whl (33.5 MB)
[2K   [90m━━━━━━━━━━━

In [15]:
from langchain.vectorstores import LanceDB

In [16]:
md = pd.read_pickle('movies.pkl')
md.head(2)

Unnamed: 0,genres,title,overview,weighted_rate,text,n_tokens,vector
0,"[Adventure, Action, Thriller]",GoldenEye,James Bond must unmask the mysterious head of ...,6.173464,Title: GoldenEye. Overview: James Bond must un...,59,"[-0.023320907726883888, -0.016039660200476646,..."
1,[Comedy],Friday,Craig and Smokey are two guys in Los Angeles h...,6.083421,Title: Friday. Overview: Craig and Smokey are ...,52,"[0.001543616526760161, -0.01077578030526638, -..."


In [17]:
# 검색 시 KeyError: 'Field "metadata" does not exist in schema' 에러를 피하기 위해 metadata 필드를 추가
md['metadata'] = md.apply(lambda row: {'genres': row['genres'], 'title': row['title'], 'overview': row['overview'], 'weighted_rate': row['weighted_rate']}, axis=1)

In [18]:
import lancedb

uri = "./movie-lancedb"
db = lancedb.connect(uri)
table = db.create_table("movies", md)

  from .autonotebook import tqdm as notebook_tqdm


## 콜드 스타트 시나리오에서 QA 추천 챗봇 구축하기

LanceDB에 임베딩을 저장한 상태임.  
이제 인덱스에 대한 질문 응답을 위해 설계된 체인 구성 요소인 랭체인 RetrievalQA 검색기를 구축해보자.

여기서는 벡터 저장소를 인덱스 검색기로 사용 예정

이 체인은 "사용자의 쿼리에 대해 코사인 유사도를 거리 메트릭(기본값)으로 사용하여 가장 유사한 상위 k개의 동영상을 반환한다"는 아이디어이다.

In [19]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import LanceDB
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA

embeddings = OpenAIEmbeddings()
docsearch = LanceDB(connection=db, embedding=embeddings, table_name="movies")

In [20]:
query = "액션 영화를 추천해줘"
docs = docsearch.similarity_search(query)
docs

[Document(metadata={'genres': ['Drama', 'Thriller'], 'overview': "After a 13-year imprisonment for the kidnap and murder of a 6 year old boy, beautiful Lee Guem-ja starts seeking revenge on the man that was really responsible for the boy's death. With the help of fellow inmates and reunited with her daughter, she gets closer and closer to her goal. But will her actions lead to the relief she seeks?", 'title': 'Sympathy for Lady Vengeance', 'weighted_rate': 6.091457286432162}, page_content="Title: Sympathy for Lady Vengeance. Overview: After a 13-year imprisonment for the kidnap and murder of a 6 year old boy, beautiful Lee Guem-ja starts seeking revenge on the man that was really responsible for the boy's death. With the help of fellow inmates and reunited with her daughter, she gets closer and closer to her goal. But will her actions lead to the relief she seeks? Genres: Drama, Thriller. Rating: 6.091457286432162"),
 Document(metadata={'genres': ['Comedy', 'Crime', 'Thriller'], 'overv

In [21]:
docs[0].page_content

"Title: Sympathy for Lady Vengeance. Overview: After a 13-year imprisonment for the kidnap and murder of a 6 year old boy, beautiful Lee Guem-ja starts seeking revenge on the man that was really responsible for the boy's death. With the help of fellow inmates and reunited with her daughter, she gets closer and closer to her goal. But will her actions lead to the relief she seeks? Genres: Drama, Thriller. Rating: 6.091457286432162"

In [23]:
# 유사도가 가장 높은 문서를 수집한 후에는 대화영 응답을 원하므로, RetrievalQA 에 결합

qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=docsearch.as_retriever(), return_source_documents=True)

query = "액션 영화를 추천해줘"
result = qa({"query": query})
result['result']

' Hitman: Agent 47 and District B13 are both action movies that have received decent ratings.'

In [25]:
result['source_documents'][2]

Document(metadata={'genres': ['Action', 'Crime', 'Thriller'], 'overview': 'An assassin teams up with a woman to help her find her father and uncover the mysteries of her ancestry.', 'title': 'Hitman: Agent 47', 'weighted_rate': 5.365800865800866}, page_content='Title: Hitman: Agent 47. Overview: An assassin teams up with a woman to help her find her father and uncover the mysteries of her ancestry. Genres: Action, Crime, Thriller. Rating: 5.365800865800866')

### MovieHarbor 시스템이 다른 변수들도 활용하도록 하려면 어떻게 해야 할까?

In [26]:
# 1. 필터 방식
#    장르가 코미디로 태그된 영화만 포함된 결과 제공

df_filtered = md[md['genres'].apply(lambda x: 'Comedy' in x)]
qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", 
    retriever=docsearch.as_retriever(search_kwargs={'data': df_filtered}), return_source_documents=True)

query = "I'm looking for a movie with animals and an adventurous plot."
result = qa({"query": query})
result

{'query': "I'm looking for a movie with animals and an adventurous plot.",
 'result': ' Ice Age and The Curse of the Were-Rabbit both have animals and an adventurous plot.',
 'source_documents': [Document(metadata={'genres': ['Comedy', 'Family'], 'overview': 'The ongoing war between the canine and feline species is put on hold when they join forces to thwart a rogue cat spy with her own sinister plans for conquest.', 'title': 'Cats & Dogs 2 : The Revenge of Kitty Galore', 'weighted_rate': 4.978057553956835}, page_content='Title: Cats & Dogs 2 : The Revenge of Kitty Galore. Overview: The ongoing war between the canine and feline species is put on hold when they join forces to thwart a rogue cat spy with her own sinister plans for conquest. Genres: Comedy, Family. Rating: 4.978057553956835'),
  Document(metadata={'genres': ['Adventure', 'Animation', 'Comedy', 'Family'], 'overview': "Cheese-loving eccentric Wallace and his cunning canine pal, Gromit, investigate a mystery in Nick Park's a

In [27]:
# 등급이 6.5 이상인 결과만 필터링
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff", 
    retriever=docsearch.as_retriever(
        search_kwargs={'filter': "weighted_rate > 6.5"}
    ),
    return_source_documents=True
)

query = "I'm looking for a movie with animals and an adventurous plot."
result = qa({"query": query})
result

{'query': "I'm looking for a movie with animals and an adventurous plot.",
 'result': ' Ice Age is an animated movie that follows a group of prehistoric animals on an adventure to return a human infant to its parents. It has elements of comedy and family-friendly themes.',
 'source_documents': [Document(metadata={'genres': ['Animation', 'Comedy', 'Family', 'Adventure'], 'overview': 'With the impending ice age almost upon them, a mismatched trio of prehistoric critters – Manny the woolly mammoth, Diego the saber-toothed tiger and Sid the giant sloth – find an orphaned infant and decide to return it to its human parents. Along the way, the unlikely allies become friends but, when enemies attack, their quest takes on far nobler aims.', 'title': 'Ice Age', 'weighted_rate': 6.892297174111213}, page_content='Title: Ice Age. Overview: With the impending ice age almost upon them, a mismatched trio of prehistoric critters – Manny the woolly mammoth, Diego the saber-toothed tiger and Sid the gia

In [28]:
# 2. 에이전트 방식

from langchain.agents.agent_toolkits import create_retriever_tool, create_conversational_retrieval_agent
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(temperature=0)
retriever = docsearch.as_retriever(return_source_documents=True)

tool = create_retriever_tool(
    retriever,
    "movies",
    "Searches and returns recommendations about movies."
)

tools = [tool]

agent_executor = create_conversational_retrieval_agent(llm, tools, verbose=True)

result = agent_executor({"input": "suggest me some action movies"})

  llm = ChatOpenAI(temperature=0)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `movies` with `{'query': 'action movies'}`


[0m[36;1m[1;3mTitle: Life Eternal. Overview: A thriller crime comedy directed by Wolfgang Murnberger. Genres: Comedy, Crime, Thriller. Rating: 5.131533477321813

Title: The Transporter Refueled. Overview: The fast-paced action movie is again set in the criminal underworld in France, where Frank Martin is known as The Transporter, because he is the best driver and mercenary money can buy. In this installment, he meets Anna and they attempt to take down a group of ruthless Russian human traffickers who also have kidnapped Frank’s father. Genres: Thriller, Action, Crime. Rating: 5.170705064548162

Title: Death Proof. Overview: Austin's hottest DJ, Jungle Julia, sets out into the night to unwind with her two friends Shanna and Arlene. Covertly tracking their moves is Stuntman Mike, a scarred rebel leering from behind the wheel of his muscle car, revving just feet away. Ge

## 프롬프트 엔지니어링

추천 시스템이라는 목표에 맞게 애플리케이션을 더욱 맞춤화할 수도 있다.  
이를 위해서는 프롬프트 엔지니어링이 필요하다.

In [29]:
# 기존 프롬프트 탐색
print(qa.combine_documents_chain.llm_chain.prompt.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:


In [30]:
# 각 사용자의 요청에대해 줄거리와 사용자가 좋아할만한 이유에 대한 간단할 설명과 함께 세 가지 체인을 반환하는 시스템을 만든다고 가정하면

from langchain.prompts import PromptTemplate

template = """You are a movie recommender system that help users to find movies that match their preferences. 
Use the following pieces of context to answer the question at the end. 
For each question, suggest three movies, with a short description of the plot and the reason why the user migth like it.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Your response:"""


PROMPT = PromptTemplate(
    template=template, input_variables=["context", "question"])

chain_type_kwargs = {"prompt": PROMPT}
qa = RetrievalQA.from_chain_type(llm=OpenAI(), 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(),
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs)

query = "I'm looking for a funny action movie, any suggestion?"
result = qa({'query':query})
print(result['result'])

 Three possible movies that match your preferences are:
1. Deadpool: This action-packed superhero film is known for its witty humor and fourth-wall breaking jokes. It follows the story of Wade Wilson, a former Special Forces operative turned mercenary, as he seeks revenge against the man who gave him mutant powers. 
2. The Nice Guys: This buddy cop film has a perfect balance of action and comedy. Set in 1970s Los Angeles, it follows a private eye and a hired enforcer who team up to solve the case of a missing girl and a dead porn star. 
3. Hot Fuzz: This action comedy follows a top London cop who is transferred to a small town where he teams up with a hapless but eager partner. Together, they uncover a sinister conspiracy in the seemingly perfect village.


In [31]:
# 프롬프트에서 구현할 수 있는 또 다른 사항은 "대화형 예비 질문"으로, 수집한 정보를 환영 페이지로 설정하는 것
# 예를 들어, 사용자가 자연어 질문을 입력하게 하기 전에 나이, 성별, 좋아하는 영화 장르를 물어보고 싶을 수 있다.
# 이를 위해 프롬프트에 입력 변수의 형식을 사용자가 공유한 변수로 지정할 수 있는 섹션을 삽입한 다음
# 이 프롬프트 청크를 체인에 전달할 최종 프롬프트에 결함할 수 있다.

from langchain.prompts import PromptTemplate

template_prefix = """You are a movie recommender system that help users to find movies that match their preferences. 
Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}"""

user_info = """This is what we know about the user, and you can use this information to better tune your research:
Age: {age}
Gender: {gender}"""

template_suffix= """Question: {question}
Your response:"""

user_info = user_info.format(age = 18, gender = 'female')

COMBINED_PROMPT = template_prefix +'\n'+ user_info +'\n'+ template_suffix
print(COMBINED_PROMPT)

You are a movie recommender system that help users to find movies that match their preferences. 
Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}
This is what we know about the user, and you can use this information to better tune your research:
Age: 18
Gender: female
Question: {question}
Your response:


In [32]:
PROMPT = PromptTemplate(
    template=COMBINED_PROMPT, input_variables=["context", "question"])

chain_type_kwargs = {"prompt": PROMPT}
qa = RetrievalQA.from_chain_type(llm=OpenAI(), 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(),
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs)

query = "Can you suggest me some action movie?"
result = qa({'query':query})
result['result']

' Based on your age and gender, I would recommend the movies Goldfinger and The Raid 2. Both of these movies have a high rating and fall under the action genre. They also have elements of adventure and thriller, which may appeal to you based on your preferences.'

In [33]:
result['source_documents']

[Document(metadata={'genres': ['Action', 'Thriller'], 'overview': 'Iconoclastic, take-no-prisoners cop John McClane, finds himself for the first time on foreign soil after traveling to Moscow to help his wayward son Jack - unaware that Jack is really a highly-trained CIA operative out to stop a nuclear weapons heist. With the Russian underworld in pursuit, and battling a countdown to war, the two McClanes discover that their opposing methods make them unstoppable heroes.', 'title': 'A Good Day to Die Hard', 'weighted_rate': 5.178041993422717}, page_content='Title: A Good Day to Die Hard. Overview: Iconoclastic, take-no-prisoners cop John McClane, finds himself for the first time on foreign soil after traveling to Moscow to help his wayward son Jack - unaware that Jack is really a highly-trained CIA operative out to stop a nuclear weapons heist. With the Russian underworld in pursuit, and battling a countdown to war, the two McClanes discover that their opposing methods make them unstop

## 콘텐츠 기반 시스템 구축하기

앞서 다뤘던 시스템이 사용자에 대해 아무것도 모르는 콜드 스타트 시나리오와 달리

때때로 추천 시스템은 이미 사용자에 대한 배경 지식을 가지고 있으며, 이러한 지식을 애플리케이션에 임베딩하는 것이 매우 유용하다.

이미 사용자 이름, 나이, 성별, 이미 시청한 영화가 포함된 딕셔너리와, 사용자가 부여한 평점이 포함된 경우를 해보자

In [34]:
# 샘플 데이터셋을 만들어보자

import pandas as pd

data = {
    "username": ["Alice", "Bob"],
    "age": [25, 32],
    "gender": ["F", "M"],
    "movies": [
        [("Transformers: The Last Knight", 7), ("Pokémon: Spell of the Unknown", 5)],
        [("Bon Cop Bad Cop 2", 8), ("Goon: Last of the Enforcers", 9)]
    ]
}

# "movies" 열을 딕셔너리로 변환
for i, row_movies in enumerate(data["movies"]):
    movie_dict = {}
    for movie, rating in row_movies:
        movie_dict[movie] = rating
    data["movies"][i] = movie_dict

# 판다스 데이터프레임 생성
df = pd.DataFrame(data)

df.head()

Unnamed: 0,username,age,gender,movies
0,Alice,25,F,"{'Transformers: The Last Knight': 7, 'Pokémon:..."
1,Bob,32,M,"{'Bon Cop Bad Cop 2': 8, 'Goon: Last of the En..."


In [35]:
# 콜드 스타트 프롬프트의 로직을 변수를 사용해 서식과 동일하게 적용

template_prefix = """You are a movie recommender system that help users to find movies that match their preferences. 
Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}"""

user_info = """This is what we know about the user, and you can use this information to better tune your research:
Age: {age}
Gender: {gender}
Movies already seen alongside with rating: {movies}"""

template_suffix= """Question: {question}
Your response:"""

In [36]:
age = df.loc[df['username']=='Alice']['age'][0]
gender = df.loc[df['username']=='Alice']['gender'][0]

movies = ''
# 딕셔너리를 순회하여 영화 제목과 평점을 출력
for movie, rating in df['movies'][0].items():
    output_string = f"Movie: {movie}, Rating: {rating}" + "\n"
    movies+=output_string
    #print(output_string)
user_info = user_info.format(age = age, gender = gender, movies = movies)

COMBINED_PROMPT = template_prefix +'\n'+ user_info +'\n'+ template_suffix
print(COMBINED_PROMPT)

You are a movie recommender system that help users to find movies that match their preferences. 
Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}
This is what we know about the user, and you can use this information to better tune your research:
Age: 25
Gender: F
Movies already seen alongside with rating: Movie: Transformers: The Last Knight, Rating: 7
Movie: Pokémon: Spell of the Unknown, Rating: 5

Question: {question}
Your response:


In [37]:
# 이제 이 프롬프트를 체인 내에서 사용해보자.

PROMPT = PromptTemplate(
    template=COMBINED_PROMPT, input_variables=["context", "question"])

chain_type_kwargs = {"prompt": PROMPT}
qa = RetrievalQA.from_chain_type(llm=OpenAI(), 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(),
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs)

query = "Can you suggest me some action movie based on my background?"
result = qa({'query':query})
result['result']

' Based on your age, gender, and the movies you have already seen and rated, I would recommend the following action movies for you: \n1. John Wick (rated 7.5) - This movie follows an ex-hitman who seeks revenge against the people who wronged him. It has intense action scenes and a strong female character. \n2. The Bourne Identity (rated 7.9) - This movie follows an amnesiac assassin who tries to uncover his past and evade his former employers. It has a strong female lead and thrilling action sequences. \n3. Atomic Blonde (rated 6.7) - This movie follows an MI6 agent who is sent to Berlin to retrieve a missing list of double agents. It has intense action scenes and a strong female protagonist. \n4. Edge of Tomorrow (rated 7.9) - This movie follows a soldier who relives the same day over and over again in a war against aliens. It has a strong female character and intense action sequences. \n5. Kill Bill: Volume 1 (rated 8.1) - This movie follows a former assassin who seeks revenge agains

In [38]:
result['source_documents']

[Document(metadata={'genres': ['Action', 'Crime', 'Thriller'], 'overview': 'After fighting his way through an apartment building populated by an army of dangerous criminals and escaping with his life, SWAT team member Rama goes undercover, joining a powerful Indonesian crime syndicate to protect his family and uncover corrupt members of his own force.', 'title': 'The Raid 2', 'weighted_rate': 6.7086887835703}, page_content='Title: The Raid 2. Overview: After fighting his way through an apartment building populated by an army of dangerous criminals and escaping with his life, SWAT team member Rama goes undercover, joining a powerful Indonesian crime syndicate to protect his family and uncover corrupt members of his own force. Genres: Action, Crime, Thriller. Rating: 6.7086887835703'),
 Document(metadata={'genres': ['Adventure', 'Action', 'Thriller'], 'overview': 'Moments from death a young man is rescued by a renowned warrior. Realizing unsurpassed physical potential in the young boy he

프로덕션 시나리오에서 처리할 과업(예: 추천 과업)과 관련된 변수를 저장하는 모범 사례는 피처 스토어를 사용하는 것입니다.

피처스토어는 머신러닝 워크플로우를 지원하도록 설계된 데이터 시스템