### 金融政策決定会合のDLと要約

In [1]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
#from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.chat_models import AzureChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain
import os
import json
import requests
import PyPDF2
import boto3
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
# URL of the PDF file to download
pdf_url = 'https://www.boj.or.jp/mopo/mpmsche_minu/minu_2023/g230310.pdf'

# Download the PDF. The timeout keeps the cell from hanging indefinitely on an
# unresponsive server, and raise_for_status() stops here on an HTTP error so
# that an error page is never written to disk as if it were the PDF.
response = requests.get(pdf_url, timeout=30)
response.raise_for_status()

# Save the downloaded bytes locally; later cells read this file by name.
pdf_filename = 'document.pdf'
with open(pdf_filename, 'wb') as file:
    file.write(response.content)

In [3]:
# Settings
# NOTE(review): every value below looks like a placeholder (it repeats its own
# name) - replace with the real Azure OpenAI settings before running.

# Chat model (gpt-35) deployment settings
DEPLOYMENT_NAME_gpt35, MODEL_NAME_gpt35, OPEN_API_VERSION_gpt35 = (
    'DEPLOYMENT_NAME',
    'MODEL_NAME',
    'OPEN_API_VERSION',
)

# Embedding deployment settings
DEPLOYMENT_NAME_embeded, MODEL_NAME_embeded = 'DEPLOYMENT_NAME', 'MODEL_NAME'

# Export the connection settings and the embedding deployment name through
# environment variables.
os.environ.update({
    "OPENAI_API_TYPE": 'OPENAI_API_TYPE',
    "OPENAI_API_BASE": 'OPENAI_API_BASE',
    "OPENAI_API_KEY": 'OPENAI_API_KEY',
    "OPENAI_EMBEDDINGS_DEPLOYMENT": DEPLOYMENT_NAME_embeded,
})

In [4]:
# Language model: Azure-hosted chat model, temperature 0, replies capped at
# 1000 tokens.
llm = AzureChatOpenAI(
    deployment_name=DEPLOYMENT_NAME_gpt35,
    openai_api_version=OPEN_API_VERSION_gpt35,
    max_tokens=1000,
    temperature=0,
)

# Text splitter: split on newlines, chunk size 2000 as measured by len(),
# with no overlap between chunks.
text_splitter = CharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=0,
    separator="\n",
    length_function=len,
)

# Extract the text of a PDF file
def pdf_reader(open_pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        open_pdf_file: Passed straight to ``PyPDF2.PdfReader``; the caller in
            this file passes a filename.

    Returns:
        One string made of each page's extracted text, joined with no
        separator between pages.
    """
    read_pdf = PyPDF2.PdfReader(open_pdf_file)
    # Join once instead of growing a string with += per page. ``or ""`` keeps
    # the join safe in case a page yields no text.
    return "".join(page.extract_text() or "" for page in read_pdf.pages)

In [5]:
# Read the PDF text, split it into chunks, and wrap each chunk in a Document
state_of_the_union = pdf_reader(pdf_filename)
texts = text_splitter.split_text(state_of_the_union)
docs = [Document(page_content=chunk) for chunk in texts]

### 要約

In [6]:
# Summarization prompt. The Japanese text asks for a concise summary, in
# Japanese, of the passage substituted for {text}.
template = """
次の文章を日本語で簡潔に要約してください。
文章：{text}
"""

PROMPT = PromptTemplate(template=template, input_variables=["text"])

# map_reduce summarization chain; the same prompt is used for both the map
# step and the combine step.
chain = load_summarize_chain(
    llm,
    chain_type="map_reduce",
    map_prompt=PROMPT,
    combine_prompt=PROMPT,
    verbose=False,
)

# Run the chain over the chunked documents to produce the summary.
summary = chain.run(docs)

In [18]:
#summary

### センチメント分析

In [8]:
# Translate text with Amazon Translate (Japanese to English by default)
def translate_text(text, source_lang='ja', target_lang='en'):
    """Translate *text* with Amazon Translate and return the translated string.

    Args:
        text: The text to translate.
        source_lang: Language code of *text*; defaults to 'ja'.
        target_lang: Language code to translate into; defaults to 'en'.

    Returns:
        The ``TranslatedText`` field of the service response. Empty input is
        returned unchanged without calling the service.
    """
    # Nothing to translate: skip the network call. The Translate API documents
    # a minimum Text length of 1, so an empty request would only fail.
    if not text:
        return text

    translate = boto3.client('translate')
    response = translate.translate_text(
        Text=text,
        SourceLanguageCode=source_lang,
        TargetLanguageCode=target_lang,
    )
    return response['TranslatedText']

In [9]:
# Translate the summary to English; the result is the text passed to
# finsentiment() below.
summary_en = translate_text(summary)

In [19]:
#summary_en

In [11]:
# Sentiment analysis with ProsusAI/finbert: load the tokenizer and the
# sequence-classification model from the same checkpoint (tokenizer first).
tokenizer, model = (
    loader_cls.from_pretrained("ProsusAI/finbert")
    for loader_cls in (AutoTokenizer, AutoModelForSequenceClassification)
)

In [12]:
def finsentiment(text):
    """Classify the sentiment of *text* with the module-level FinBERT model.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: The text to score. Only the first row of the model output is
            used, so pass a single string.

    Returns:
        A tuple ``(probabilities, label)``: the softmax scores of the first
        sequence as a list of floats, and "Positive", "Negative" or "Neutral"
        for whichever of indices 0, 1, 2 scores highest.
    """
    # Local import: the notebook's import cell does not import torch itself.
    import torch

    # Index -> label lookup; replaces an if/elif chain that had no fallback.
    labels = ("Positive", "Negative", "Neutral")

    # Truncate to the model's position limit; longer inputs cannot be embedded.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    # Inference only: no need to record gradients.
    with torch.no_grad():
        outputs = model(**inputs)

    scores = outputs.logits.softmax(dim=1)[0]
    return scores.tolist(), labels[scores.argmax().item()]

In [13]:
# Score the English summary; as the last expression of the cell, the returned
# (probabilities, label) tuple is what the notebook displays.
finsentiment(summary_en)

([0.8086580634117126, 0.02010824717581272, 0.17123371362686157], 'Positive')

### QAボット構築準備

In [14]:
# Load the downloaded PDF with PyPDFLoader
loader = PyPDFLoader(pdf_filename)
documents = loader.load()

In [15]:
# Split the loaded documents (chunk_size=100, no overlap).
# NOTE: this rebinds `text_splitter` and `texts` from the summarization cells.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=100)
texts = text_splitter.split_documents(documents)

# Embed the chunks and store them in Chroma (the vector DB)
embeddings = OpenAIEmbeddings(chunk_size=1)
vectordb = Chroma.from_documents(embedding=embeddings, documents=texts)

# Retrieval QA over the vector DB, answered by the gpt-35 deployment
qa = RetrievalQA.from_chain_type(
    llm=AzureChatOpenAI(
        deployment_name=DEPLOYMENT_NAME_gpt35,
        openai_api_version=OPEN_API_VERSION_gpt35,
        temperature=0,
    ),
    retriever=vectordb.as_retriever(),
    chain_type="stuff",
)

### Q&A

In [16]:
# Prompt definition for the QA chain: answer from the retrieved {context},
# admit when the answer is unknown, and reply in Japanese.
# NOTE(review): in the visible code this PROMPT is never passed to `qa`
# (RetrievalQA.from_chain_type is called without it), so the "Answer in
# Japanese" instruction does not appear to take effect. Confirm, and wire it
# in if intended (presumably via chain_type_kwargs={"prompt": PROMPT}).
# NOTE: this also rebinds the PROMPT name used by the summarization cell.
prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Answer in Japanese:"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [20]:
# Question for the QA chain. The Japanese text asks whether, from an
# inflation-analysis viewpoint, the document points to low or high inflation.
query = "インフレ分析の観点でこの文書から低インフレ、高インフレのどっちかを評価して。"
#print(qa.run(query))