## 1. Install Dependencies and Import Libraries

In [1]:
import os
from typing import Dict, List, Optional, Tuple

import openai
import pinecone
import pandas as pd
import numpy as np
import textwrap

from langchain.docstore.document import Document
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Pinecone

  from tqdm.autonotebook import tqdm


## 2. Configure Your OpenAI API Key

In [2]:
# Model identifiers: one for computing embeddings, one for the chat model.
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-3.5-turbo"

# The key is read from the environment (None when the variable is unset) and
# handed to the openai module so the secret never lives in the notebook.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
openai.api_key = OPENAI_API_KEY

## 3. Configure Your Pinecone API Key and Environment

In [3]:
# Pinecone settings are all taken from the environment; each one is None when
# its variable is unset.
PINECONE_API_KEY, PINECONE_ENVIRONMENT, PINECONE_INDEX = (
    os.getenv(variable_name)
    for variable_name in (
        "PINECONE_API_KEY",
        "PINECONE_ENVIRONMENT",
        "PINECONE_INDEX",
    )
)

# Initialise the pinecone client with the credentials read above.
pinecone.init(environment=PINECONE_ENVIRONMENT, api_key=PINECONE_API_KEY)

## 4. Run Your Question-Answering Service

In [4]:
class OpenAIEmbeddingsWrapper(OpenAIEmbeddings):
    """OpenAIEmbeddings subclass that records every embedding it computes.

    ``embed_query`` and ``embed_documents`` delegate to the parent class and
    store the returned vectors keyed by their input text, so they can be
    inspected afterwards as pandas DataFrames.
    """

    # Text -> embedding maps filled in by embed_query / embed_documents.
    # Embedding the same text again overwrites its previous entry.
    # NOTE(review): these are class-level `{}` defaults; presumably the parent
    # is a pydantic model that copies field defaults per instance -- confirm,
    # otherwise the dicts are shared by all instances.
    query_text_to_embedding: Dict[str, List[float]] = {}
    document_text_to_embedding: Dict[str, List[float]] = {}

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query text and record the result.

        Args:
            text: The query text to embed.

        Returns:
            The embedding returned by the parent class.
        """
        embedding = super().embed_query(text)
        self.query_text_to_embedding[text] = embedding
        return embedding

    def embed_documents(self, texts: List[str], chunk_size: Optional[int] = 0) -> List[List[float]]:
        """Embed a batch of document texts and record each result.

        Args:
            texts: The document texts to embed.
            chunk_size: Forwarded unchanged to the parent implementation.

        Returns:
            The embeddings returned by the parent class, one per input text.
        """
        embeddings = super().embed_documents(texts, chunk_size)
        for text, embedding in zip(texts, embeddings):
            self.document_text_to_embedding[text] = embedding
        return embeddings

    @property
    def query_embedding_dataframe(self) -> pd.DataFrame:
        """Recorded query embeddings as a DataFrame (``text``, ``text_vector``)."""
        return self._convert_text_to_embedding_map_to_dataframe(self.query_text_to_embedding)

    @property
    def document_embedding_dataframe(self) -> pd.DataFrame:
        """Recorded document embeddings as a DataFrame (``text``, ``text_vector``)."""
        return self._convert_text_to_embedding_map_to_dataframe(self.document_text_to_embedding)

    @staticmethod
    def _convert_text_to_embedding_map_to_dataframe(
        text_to_embedding: Dict[str, List[float]]
    ) -> pd.DataFrame:
        """Turn a text -> embedding map into a two-column DataFrame.

        Args:
            text_to_embedding: Map from text to its embedding vector.

        Returns:
            A DataFrame with a ``text`` column and a ``text_vector`` column
            holding one numpy array per row, in the map's insertion order.
            An empty map yields an empty DataFrame with both columns.
        """
        # Read keys and values directly rather than unpacking
        # `zip(*map.items())`, which raises ValueError on an empty map (i.e.
        # whenever a dataframe property is read before anything was embedded).
        texts = list(text_to_embedding.keys())
        embedding_arrays = [np.array(embedding) for embedding in text_to_embedding.values()]
        return pd.DataFrame.from_dict(
            {
                "text": texts,
                "text_vector": embedding_arrays,
            }
        )

In [5]:
class PineconeWrapper(Pinecone):
    """Pinecone vector store subclass that records retrieval results.

    Every call to ``similarity_search_with_score`` is delegated to the parent
    class and its (document, score) results are stored under the query text,
    so they can be inspected afterwards via ``retrieval_dataframe``.
    """

    # Query text -> list of (document, score) tuples from the latest search for
    # that text. The class-level default is kept for backward compatibility;
    # __init__ shadows it with a per-instance dict so that separate wrappers do
    # not write into one shared mapping.
    query_text_to_document_score_tuples: Dict[str, List[Tuple[Document, float]]] = {}

    def __init__(self, *args, **kwargs) -> None:
        """Construct the parent store and give this instance its own record.

        All positional and keyword arguments are forwarded unchanged to the
        parent constructor.
        """
        super().__init__(*args, **kwargs)
        self.query_text_to_document_score_tuples = {}

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        filter: Optional[dict] = None,
        namespace: Optional[str] = None,
    ) -> List[Tuple[Document, float]]:
        """Run the parent similarity search and record its results.

        Args:
            query: The query text.
            k: Forwarded to the parent as ``k``.
            filter: Forwarded to the parent as ``filter``.
            namespace: Forwarded to the parent as ``namespace``.

        Returns:
            The (document, score) tuples returned by the parent class.
        """
        document_score_tuples = super().similarity_search_with_score(
            query=query,
            k=k,
            filter=filter,
            namespace=namespace,
        )
        # Searching the same query text again replaces its previous results.
        self.query_text_to_document_score_tuples[query] = document_score_tuples
        return document_score_tuples

    @property
    def retrieval_dataframe(self) -> pd.DataFrame:
        """Recorded retrievals as one row per (query, retrieved document).

        Returns:
            A DataFrame with columns ``query_text``, ``document_text``,
            ``retrieval_rank`` (0-based position in the result list) and
            ``score``. It has those columns and no rows when nothing has been
            recorded yet.
        """
        columns = ["query_text", "document_text", "retrieval_rank", "score"]
        records = [
            {
                "query_text": query_text,
                "document_text": document.page_content,
                "retrieval_rank": retrieval_rank,
                "score": score,
            }
            for query_text, document_score_tuples in self.query_text_to_document_score_tuples.items()
            for retrieval_rank, (document, score) in enumerate(document_score_tuples)
        ]
        # Passing `columns` keeps the schema stable even when `records` is empty.
        return pd.DataFrame(records, columns=columns)

In [6]:
# Number of documents requested from the index for each question.
num_retrieved_documents = 2

# Recording wrappers: the embedding wrapper is passed to the vector-store
# wrapper, which is attached to the existing index named by PINECONE_INDEX.
embeddings = OpenAIEmbeddingsWrapper(model=EMBEDDING_MODEL)
docsearch = PineconeWrapper.from_existing_index(
    embedding=embeddings,
    index_name=PINECONE_INDEX,
)

# Chat model plus a "stuff" RetrievalQA chain whose retriever is built from the
# vector store above with k = num_retrieved_documents.
llm = ChatOpenAI(model_name=GPT_MODEL)
retriever_search_kwargs = {"k": num_retrieved_documents}
chain = RetrievalQA.from_chain_type(
    retriever=docsearch.as_retriever(search_kwargs=retriever_search_kwargs),
    chain_type="stuff",
    llm=llm,
)

In [11]:
query_text = "아는사람에게 명변할 수 있어?"
# Alternative sample query:
# query_text = "유심기변상태로 단말기변경이력조회에서는 사용중/교반품신청등록으로 상태가 보여지는것으로 확정기변이 가능한가요?"

response_text = chain.run(query_text)

# Pick the recorded rows for *this* query by matching on its text rather than
# taking the last rows. The wrappers store results in dicts keyed by query
# text, and re-running an earlier query updates its entry without moving it to
# the end, so "last rows" can belong to a different query.
# NOTE(review): assumes the chain passes `query_text` verbatim to the retriever
# and to embed_query -- confirm for the chain type in use.
all_retrievals_df = docsearch.retrieval_dataframe
retrievals_df = all_retrievals_df[all_retrievals_df["query_text"] == query_text]
contexts = retrievals_df["document_text"].to_list()
scores = retrievals_df["score"].to_list()

query_embeddings_df = embeddings.query_embedding_dataframe
query_embedding = query_embeddings_df.loc[
    query_embeddings_df["text"] == query_text, "text_vector"
].iloc[-1]

print("Response")
print("========")
print()
for line in textwrap.wrap(response_text.strip(), width=80):
    print(line)
print()

for context_index, (context, score) in enumerate(zip(contexts, scores)):
    print(f"Retrieved Context {context_index}")
    print("===================")
    print()

    for line in textwrap.wrap(context.strip(), width=80):
        print(line)
    print()
    print(f"score: {score}")
    print()

print("Query Embedding")
print("===============")
print()

print(query_embedding)
print()
print(f"dimension: {len(query_embedding)}")

Response

네, 가족 간의 명의변경은 가능합니다. 하지만 개인회생 중인 경우에는 명의변경이 제한됩니다. 따라서, 개인회생 중이 아니라면 가족으로서
명의변경을 신청할 수 있을 것입니다.

Retrieved Context 0

명의변경 외에도 특별한 사례로 가족 간 명의변경이 허용될 수 있습니다. 양도인이 개인회생 중인 경우에는 명의변경이 제한됩니다. 업무 처리 시
SMS 발송:  양도인에게는 양도인이 보유한 사용 중인 SKT 회선으로 SMS가 발송됩니다. 양수인에게는 명의가 변경된 회선으로 SMS가
발송됩니다. FAQ:  2016년 3월 17일부터 친구 명의로의 변경은 불가능합니다. 가족 간만 가능합니다. 명의변경 제한 관련 내용은
'이용약관'

score: 0.828158379

Retrieved Context 1

을 통해 업무를 처리할 수 있습니다. 기타: 개인회생 중인 경우 명의변경이 제한됩니다.

score: 0.823480189

Query Embedding

[-0.00910919 -0.02915456  0.01682885 ... -0.01017065  0.00894837
 -0.01801253]

dimension: 1536
