In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# cd llm_chatgpt_study/chatgpt_plus_pdf/

/content/drive/MyDrive/intern/llm_chatgpt_study/chatgpt_plus_pdf


# 질의응답
- 참고문헌 : https://www.youtube.com/watch?v=TLf90ipMzfE
- colab 자료 : https://colab.research.google.com/drive/181BSOH6KF_1o2lFG8DQ6eJd2MZyiSBNt?usp=sharing

## 필요 모듈 임포트 및 경로지정

In [None]:
!pip install -r requirements.txt



In [None]:
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS

In [None]:
# Get your API keys from openai, you will need to create an account.
# Here is the link to get the keys: https://platform.openai.com/account/billing/overview
import os
import openai
from dotenv import load_dotenv
# .env.local holds the OpenAI API key; load it so that it is available as an
# environment variable.
load_dotenv(dotenv_path="../../.env.local")
api_key = os.environ.get("OPENAI_API_KEY")
openai.api_key = api_key
# Model name passed to ChatOpenAI in the summarization section.
MODEL = "gpt-3.5-turbo"

## 파일 로드

In [None]:
# Open the GPT4All technical report PDF (relative path under ./data) with PyPDF2's PdfReader.
reader = PdfReader('./data/2023_GPT4All_Technical_Report.pdf')

In [None]:
# Display the repr of the PdfReader object.
reader

<PyPDF2._reader.PdfReader at 0x7daf1184d9f0>

## 페이지 속 텍스트 모두 합치기

In [None]:
# Read the text of every page and concatenate it into a single string, raw_text.
# Pages whose extract_text() result is empty or None contribute nothing.
raw_text = ''.join(page.extract_text() or '' for page in reader.pages)

## raw_text 분할

In [None]:
# Preview the first 100 characters of the combined text.
raw_text[:100]

'GPT4All: Training an Assistant-style Chatbot with Large Scale Data\nDistillation from GPT-3.5-Turbo\nY'

In [None]:
# Split the text into smaller chunks so that information retrieval does not hit the token size limits.

text_splitter = CharacterTextSplitter(
    separator = "\n", # separator the text is split on
    chunk_size = 1000, # size of each chunk
    chunk_overlap  = 200, # size of the overlap between chunks
    length_function = len, # measure length with len
)
texts = text_splitter.split_text(raw_text)

In [None]:
# Number of chunks returned by split_text.
len(texts)

8

In [None]:
# Inspect the first chunk.
texts[0]

'GPT4All: Training an Assistant-style Chatbot with Large Scale Data\nDistillation from GPT-3.5-Turbo\nYuvanesh Anand\nyuvanesh@nomic.aiZach Nussbaum\nzanussbaum@gmail.com\nBrandon Duderstadt\nbrandon@nomic.aiBenjamin Schmidt\nben@nomic.aiAndriy Mulyar\nandriy@nomic.ai\nAbstract\nThis preliminary technical report describes the\ndevelopment of GPT4All, a chatbot trained\nover a massive curated corpus of assistant in-\nteractions including word problems, story de-\nscriptions, multi-turn dialogue, and code. We\nopenly release the collected data, data cura-\ntion procedure, training code, and final model\nweights to promote open research and repro-\nducibility. Additionally, we release quantized\n4-bit versions of the model allowing virtually\nanyone to run the model on CPU.\n1 Data Collection and Curation\nWe collected roughly one million prompt-\nresponse pairs using the GPT-3.5-Turbo OpenAI\nAPI between March 20, 2023 and March 26th,\n2023. To do this, we first gathered a diverse sam-'

In [None]:
# Inspect the second chunk.
texts[1]

'We collected roughly one million prompt-\nresponse pairs using the GPT-3.5-Turbo OpenAI\nAPI between March 20, 2023 and March 26th,\n2023. To do this, we first gathered a diverse sam-\nple of questions/prompts by leveraging three pub-\nlicly available datasets:\n• The unified chip2 subset of LAION OIG.\n• Coding questions with a random sub-sample\nof Stackoverflow Questions\n• Instruction-tuning with a sub-sample of Big-\nscience/P3\nWe chose to dedicate substantial attention to data\npreparation and curation based on commentary in\nthe Stanford Alpaca project (Taori et al., 2023).\nUpon collection of the initial dataset of prompt-\ngeneration pairs, we loaded data into Atlas for data\ncuration and cleaning. With Atlas, we removed all\nexamples where GPT-3.5-Turbo failed to respond\nto prompts and produced malformed output. This\nreduced our total number of examples to 806,199\nhigh-quality prompt-generation pairs. Next, we\ndecided to remove the entire Bigscience/P3 sub-'

## 텍스트 데이터 임베딩 및 검색 인덱스 생성

In [None]:
# Print the Python version of the running kernel.
import sys
print(sys.version)

3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0]


In [None]:
# Download embeddings from OpenAI => converts text data into vector form
from langchain.embeddings.openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

# Create the search index.
# FAISS: useful for searching high-dimensional data; helps manage and search
# the embedded text data efficiently.
docsearch = FAISS.from_texts(texts, embeddings)
docsearch

<langchain.vectorstores.faiss.FAISS at 0x7daf10ce13f0>

## 질문응답 체인 로드 및 질문응답 예시 실행

In [None]:
# Load the question-answering chain
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
# NOTE(review): OpenAI() is constructed with no arguments, so the MODEL
# constant is not applied to this chain — confirm that is intended.
chain = load_qa_chain(OpenAI(), chain_type="stuff")

In [None]:
# Retrieve the chunks most similar to the question from the FAISS index,
# then run the QA chain over those chunks.
query = "who are the authors of the article?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' The authors of the article are Yuvanesh Anand, Zach Nussbaum, Brandon Duderstadt, Benjamin Schmidt, and Andriy Mulyar.'

In [None]:
# Same retrieve-then-answer steps with a Korean question
# ("Who is the author of this text?").
query = "이 글을 쓴 저자가 누구야?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' Yuvanesh Anand, Zach Nussbaum, Brandon Duderstadt, Benjamin Schmidt, Andriy Mulyar'

In [None]:
# Korean question: "How much does the GPT4all model cost?"
query = "GPT4all 모델의 가격은 얼마니?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' GPT4all 모델을 훈련하는데 약 8시간의 시간과 Lambda Labs DGX A100 8x 80GB를 사용했을 때 총 비용은 $100입니다.'

In [None]:
# Ask about the training cost in English.
query = "What was the cost of training the GPT4all model?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' $100 to train the GPT4all model on a Lambda Labs DGX A100 8x 80GB.'

In [None]:
# Ask how the model was trained.
query = "How was the model trained?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' The model was trained with LoRA (Hu et al., 2021) on the 437,605 post-processed examples for four epochs.'

In [None]:
# Ask about the size of the training dataset.
query = "what was the size of the training dataset?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' The final training dataset contained 437,605 prompt-generation pairs.'

In [None]:
# Ask how this model differs from other models.
query = "How is this different from other models?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

" GPT4All is a low-rank adaptation of a large language model, based on LLaMA, and is intended for research purposes only. It is different from other models, such as Stanford Alpaca, because it has a non-commercial license and is based on OpenAI's GPT-3.5-Turbo, which has terms of use that prohibit developing models that compete commercially with OpenAI."

In [None]:
# Ask the QA chain about Google Bard, using the same retrieve-then-answer steps.
query = "What is Google Bard?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

" I don't know."

# 요약
- 참고문헌 : https://teddylee777.github.io/langchain/langchain-tutorial-07/

## 파일 로드 및 분할
- 질의응답과는 `문서 분할` 방식이 조금 다르다.
  - 질의응답에서는 `CharacterTextSplitter(...).split_text(raw_text)`를 사용했으나, 여기서는 `CharacterTextSplitter.from_tiktoken_encoder(...).split_documents(document)`를 사용한다.

In [None]:
# Load the PDF with PyPDFLoader; loader.load() returns the documents indexed below.
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("./data/2023_GPT4All_Technical_Report.pdf")
document = loader.load()
# Preview the first 200 characters of the first loaded document.
document[0].page_content[:200]

'GPT4All: Training an Assistant-style Chatbot with Large Scale Data\nDistillation from GPT-3.5-Turbo\nYuvanesh Anand\nyuvanesh@nomic.aiZach Nussbaum\nzanussbaum@gmail.com\nBrandon Duderstadt\nbrandon@nomic.a'

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Configure the splitter
# NOTE(review): from_tiktoken_encoder presumably measures chunk_size and
# chunk_overlap in tiktoken tokens rather than characters — confirm against
# the LangChain documentation.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n\n",  # split criterion
    chunk_size=3000,   # chunk size
    chunk_overlap=500, # overlap size
)

# Run the split
split_docs = text_splitter.split_documents(document)

# Total number of split documents
len(split_docs)

3

In [None]:
# Inspect the first split document.
split_docs[0]

Document(page_content='GPT4All: Training an Assistant-style Chatbot with Large Scale Data\nDistillation from GPT-3.5-Turbo\nYuvanesh Anand\nyuvanesh@nomic.aiZach Nussbaum\nzanussbaum@gmail.com\nBrandon Duderstadt\nbrandon@nomic.aiBenjamin Schmidt\nben@nomic.aiAndriy Mulyar\nandriy@nomic.ai\nAbstract\nThis preliminary technical report describes the\ndevelopment of GPT4All, a chatbot trained\nover a massive curated corpus of assistant in-\nteractions including word problems, story de-\nscriptions, multi-turn dialogue, and code. We\nopenly release the collected data, data cura-\ntion procedure, training code, and final model\nweights to promote open research and repro-\nducibility. Additionally, we release quantized\n4-bit versions of the model allowing virtually\nanyone to run the model on CPU.\n1 Data Collection and Curation\nWe collected roughly one million prompt-\nresponse pairs using the GPT-3.5-Turbo OpenAI\nAPI between March 20, 2023 and March 26th,\n2023. To do this, we first gat

In [None]:
# Length (via len) of the first split document's page_content.
len(split_docs[0].page_content)

2794

## 분할된 각 문서에 대한 요약 실행

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain

# Define the prompt used in the Map step.
# This is the prompt applied to each split document.
# The {pages} variable is filled with the split documents one at a time.
# (Korean prompt text: "The following is part of a document ... Please
# summarize the main content based on this list of documents. Answer:")
map_template = """다음은 문서 중 일부 내용입니다
{pages}
이 문서 목록을 기반으로 주요 내용을 요약해 주세요.
답변:"""

# Build the Map prompt
map_prompt = PromptTemplate.from_template(map_template)

# Define the LLMChain run in the Map step
llm = ChatOpenAI(temperature=0,
                 model_name=MODEL)
map_chain = LLMChain(llm=llm, prompt=map_prompt)

## 각 문서의 요약본에 대한 통합

In [None]:
# Define the prompt used in the Reduce step.
# (Korean prompt text: "The following is a set of summaries: ... Based on
# these, please create a consolidated summary. Answer:")
reduce_template = """다음은 요약의 집합입니다:
{doc_summaries}
이것들을 바탕으로 통합된 요약을 만들어 주세요.
답변:"""

# Build the Reduce prompt
reduce_prompt = PromptTemplate.from_template(reduce_template)

# Define the LLMChain run in the Reduce step
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

In [None]:
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains import ReduceDocumentsChain

# Takes a list of documents, combines them into a single string, and passes it to the LLMChain.
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain,
    document_variable_name="doc_summaries" # variable substituted into the Reduce prompt
)

# Combines the mapped documents and reduces them sequentially.
reduce_documents_chain = ReduceDocumentsChain(
    # The final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # Used when the documents exceed the context of the `StuffDocumentsChain`.
    collapse_documents_chain=combine_documents_chain,
    # Maximum number of tokens when grouping documents.
    # NOTE(review): confirm this leaves room within the model's context
    # window for the prompt text and the completion.
    token_max=4000,
)

## 통합체인 생성

In [None]:
from langchain.chains import MapReduceDocumentsChain

# Maps a chain over the documents, then combines the results.
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # Name of the llm_chain variable that receives the documents (the variable defined in map_template)
    document_variable_name="pages",
    # Controls whether the map-step results are returned in the output; disabled here.
    return_intermediate_steps=False,
)

## 요약본 출력

In [None]:
# Display all split documents.
# NOTE(review): this dumps every document in full; consider previewing a slice instead.
split_docs

[Document(page_content='GPT4All: Training an Assistant-style Chatbot with Large Scale Data\nDistillation from GPT-3.5-Turbo\nYuvanesh Anand\nyuvanesh@nomic.aiZach Nussbaum\nzanussbaum@gmail.com\nBrandon Duderstadt\nbrandon@nomic.aiBenjamin Schmidt\nben@nomic.aiAndriy Mulyar\nandriy@nomic.ai\nAbstract\nThis preliminary technical report describes the\ndevelopment of GPT4All, a chatbot trained\nover a massive curated corpus of assistant in-\nteractions including word problems, story de-\nscriptions, multi-turn dialogue, and code. We\nopenly release the collected data, data cura-\ntion procedure, training code, and final model\nweights to promote open research and repro-\nducibility. Additionally, we release quantized\n4-bit versions of the model allowing virtually\nanyone to run the model on CPU.\n1 Data Collection and Curation\nWe collected roughly one million prompt-\nresponse pairs using the GPT-3.5-Turbo OpenAI\nAPI between March 20, 2023 and March 26th,\n2023. To do this, we first ga

In [None]:
# Run the Map-Reduce chain
# Input: the split documents
result = map_reduce_chain.run(split_docs)
# Print the summary result
print(result)

GPT4All은 대규모 데이터를 사용하여 훈련된 챗봇으로, 다양한 상호작용을 포함한 데이터를 사용하여 훈련되었습니다. 데이터 수집 및 정제 과정에서는 GPT-3.5-Turbo OpenAI API를 사용하여 프롬프트-응답 쌍을 수집하고, 훈련에서는 여러 모델을 사용하여 파인튜닝을 진행했습니다. 이 문서는 GPT4All 모델에 대한 정보와 재현성, 비용, 평가, 사용 고려 사항에 대한 내용을 제공합니다. 또한, 다양한 언어 모델에 대한 참조도 제공됩니다.


In [None]:
#