<a href="https://colab.research.google.com/github/yenjannn/projects/blob/master/Demo_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 用ChatGPT輕鬆掌握外資對台積電法說會的看法
[參考影片連結](https://youtu.be/pydzRA9Kyfo)     
[GitHub程式碼](https://github.com/ywchiu/largitdata/blob/master/code/Course_223.ipynb)     

[其他參考連結(多個資訊來源)](https://betterprogramming.pub/how-to-build-your-own-custom-chatgpt-with-custom-knowledge-base-4e61ad82427e)

### 安裝`langchain`和`llama_index`等必要套件

In [None]:
!pip install langchain

In [None]:
!pip install llama_index

In [None]:
!pip install pypdf

In [None]:
import nltk
# Download the 'punkt' tokenizer data.
# NOTE(review): presumably needed by the NLTKTextSplitter used below — confirm.
nltk.download('punkt')

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import NLTKTextSplitter

from llama_index.node_parser import SimpleNodeParser
from llama_index import Document
from llama_index import GPTVectorStoreIndex

import os
from getpass import getpass

# SECURITY: a literal API key used to be hard-coded on this line. Any key that
# has been committed to a notebook must be treated as leaked and revoked.
# Reuse a key that is already in the environment; otherwise prompt for it so
# the secret is never stored in the notebook.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

In [None]:
!pip install gradio

In [7]:
import gradio as gr

### 讀取外資PDF報告

In [None]:
!gdown https://drive.google.com/uc?id=1Z_ww5ZASdIq0uZrg8jt_d7E_PzFQdYH-

In [None]:
!unzip /content/TSMC2023Q1.zip

### Sample其中一份

In [None]:
# from langchain.document_loaders import PyPDFLoader
# Load one sample broker report; `pages` is the list of loaded documents,
# each exposing `.page_content` (used by the following cells).
loader = PyPDFLoader('TSMC2023Q1/20230330 TSMC Citi.pdf')
pages = loader.load_and_split()

In [None]:
# from langchain.text_splitter import NLTKTextSplitter
# Split the text of the first loaded document into chunks
# (chunk_size=1000, chunk_overlap=200) to preview the splitter's output.
splitter = NLTKTextSplitter(chunk_size = 1000, chunk_overlap = 200 )
chunks = splitter.split_text(pages[0].page_content)

In [None]:
# Print every chunk followed by a 50-character '=' rule so the chunk
# boundaries are easy to see.
divider = '=' * 50
for chunk in chunks:
    print(chunk, divider, sep='\n')

In [None]:
# from llama_index.node_parser import SimpleNodeParser
# Node parser that reuses the NLTK splitter configured above.
parser = SimpleNodeParser(text_splitter = splitter)

#### 建立 Document 與 Nodes

In [None]:
from llama_index import Document
# Wrap the text of pages[3] in a llama_index Document and parse it into nodes.
# NOTE(review): the chunk preview earlier used pages[0] while this cell uses
# pages[3]; this also raises IndexError if fewer than four entries were
# loaded — confirm which page is intended.
docs = Document(pages[3].page_content)
nodes = parser.get_nodes_from_documents([docs])

In [None]:
# Attach a source label to every node through its `extra_info` attribute.
for node in nodes:
    node.extra_info = {'Document': 'citi'}

#### 將 Nodes 插入 Index 中

In [None]:
# from llama_index import GPTSimpleVectorIndex
# import os
# os.environ['OPENAI_API_KEY'] = '<YOUR_OPENAI_API_KEY>'  # never commit a real key

In [None]:
# Start from an empty vector index and insert the sample nodes into it.
# GPTVectorStoreIndex is the index class this notebook actually imports; the
# name GPTSimpleVectorIndex is never imported (its import is commented out),
# so using it here raised NameError.
index = GPTVectorStoreIndex([])
index.insert_nodes(nodes)

In [None]:
nodes[0]

#### 查詢外資報告資料

In [None]:
# Ask the sample index for Citi's view, requesting Traditional Chinese output.
# Queries go through a query engine, the same way the later cells of this
# notebook do (`index.as_query_engine()` followed by `.query(...)`).
# NOTE(review): `index.query()` is believed to be the older llama_index API and
# unavailable on GPTVectorStoreIndex — confirm against the installed version.
aa = index.as_query_engine().query('#zh-tw 請問citi 對台積電的看法? 請用正體中文')

In [None]:
aa

In [None]:
print(aa.response)

### 將所有外資報告鍵入索引中

In [None]:
def build_nodes(f, page_numbers=(0,)):
    """Build index nodes from selected entries of one PDF report.

    Args:
        f: Path of the PDF file to load.
        page_numbers: Non-negative, zero-based indices into the list returned
            by ``PyPDFLoader(f).load_and_split()``. Defaults to ``(0,)``,
            i.e. only the first entry, which is what this function has always
            used. Pass ``None`` to use every entry.

    Returns:
        A list of nodes, each tagged with ``extra_info = {'Document': f}``.
        The list is empty when the PDF yields no entries or none of the
        requested indices exist.
    """
    pages = PyPDFLoader(f).load_and_split()

    if page_numbers is None:
        selected = pages
    else:
        # Ignore indices that do not exist so a PDF with no extractable text
        # (or fewer entries than requested) does not raise IndexError.
        selected = [pages[i] for i in page_numbers if 0 <= i < len(pages)]
    if not selected:
        return []

    splitter = NLTKTextSplitter(chunk_size=1000, chunk_overlap=200)
    parser = SimpleNodeParser(text_splitter=splitter)

    # One Document per selected entry; with the default this is a single
    # Document built from pages[0], exactly as before.
    docs = [Document(page.page_content) for page in selected]
    nodes = parser.get_nodes_from_documents(docs)
    for node in nodes:
        # Record the source file so each node can be traced back to its report.
        node.extra_info = {'Document': f}
    return nodes

In [None]:
import glob

In [None]:
# List every entry found directly under the TSMC2023Q1 directory.
for f in glob.glob('TSMC2023Q1/*'):
    print(f)

In [None]:
# Rebuild the index from scratch: start empty, then insert the nodes built
# from every file matched under TSMC2023Q1/.
index = GPTVectorStoreIndex([]) # initialise an empty index
for f in glob.glob('TSMC2023Q1/*'):
    nodes = build_nodes(f)
    index.insert_nodes(nodes)

#### 加入CTBC年報(110年度英文版)

In [26]:
# Load the CTBC annual report; this rebinds the notebook-level `loader` and
# `pages` used by the cells that follow.
loader = PyPDFLoader('/content/CTBC_AR_Y110_en.pdf')
pages = loader.load_and_split()

In [None]:
# # 看一下長相
# print("整頁長相:\n", pages[3])
# print('-'*30)
# print("內容長度:\n", len(pages[3].page_content))
# print('-'*30)
# print("整頁內容:\n",pages[3].page_content)

In [27]:
# Same splitter settings as for the broker reports; `chunks` previews how
# pages[3] of the annual report is split.
splitter = NLTKTextSplitter(chunk_size = 1000, chunk_overlap = 200 )
chunks = splitter.split_text(pages[3].page_content)

In [None]:
# for chunk in chunks:
#     print(chunk)
#     print('=' * 50 )

In [28]:
parser = SimpleNodeParser(text_splitter = splitter)

In [29]:
# Build nodes from pages[3] of the annual report only.
# NOTE(review): a single entry of the report is indexed here — confirm that
# this is intended rather than the whole document.
docs = Document(pages[3].page_content)
nodes = parser.get_nodes_from_documents([docs])

In [30]:
# Label the annual-report nodes with their source via `extra_info`.
for node in nodes:
    node.extra_info = {'Document': 'CTBC'}

In [31]:
index.insert_nodes(nodes)

### 讀入已經建立好的向量資料庫(台積電報告及中信年報)
記得先上傳storage資料夾中的.json檔案們

In [33]:
# # save index to disk
# index.set_index_id("vector_index")
# index.storage_context.persist('./storage')

In [8]:
from llama_index import load_index_from_storage, StorageContext

In [9]:
# Reload a previously persisted index instead of rebuilding it.
# The id "vector_index" matches the one used in the commented-out
# `index.set_index_id("vector_index")` save cell.
# rebuild storage context
storage_context = StorageContext.from_defaults(persist_dir='storage')
# load index
index = load_index_from_storage(storage_context, index_id="vector_index")

### 查詢外資對台積電的看法

In [None]:
# 查詢模型及max_token的神祕入口(但還沒找到)
# /usr/local/lib/python3.10/dist-packages/llama_index/indices/response/refine.py

In [14]:
# Shared query engine; the query cell and the Gradio callbacks below call
# `query_engine.query(...)`.
query_engine = index.as_query_engine()

In [24]:
# Ask for HSBC's view on TSMC; the prompt requests a Traditional Chinese answer.
response = query_engine.query("請問HSBC對台積電的看法? 請用正體中文")
print(response)


HSBC對台積電持保留評級，並將目標價格從TWD 560提高至TWD 643。HSBC認為1Q23收入未能達到預期，並預期2Q23也將繼續疲弱，但不排除可能出現投資計劃和擴張的減緩。


In [None]:
# query_engine = index.as_query_engine()
# response = query_engine.query("What did the author do growing up?")
# print(response)

# response = query_engine.query("Write an email to the user given their background information.")
# print(response)

In [None]:
# Read an ad-hoc question from the user and append the instruction to answer
# in Traditional Chinese.
question = input('Input:')
prompt = str(question) + '請用正體中文'
# Go through the shared `query_engine` created earlier, consistent with the
# other query cells, instead of calling `index.query()` directly.
# NOTE(review): `index.query()` is believed to be the older llama_index API —
# confirm against the installed version.
dd = query_engine.query(prompt)
print(dd.response)

### 用`Gradio`建立Demo

In [None]:
# gr.Interface.load("huggingface/ckiplab/gpt2-base-chinese").launch(share=True)

In [None]:
# def chatTSMC(question):
#     prompt = str(question) + '請用正體中文'
#     dd = index.query(prompt)
#     return dd.response

In [15]:
def chatTSMC(question):
    """Send a question to the shared query engine, framed as a client enquiry.

    The question is wrapped in a fixed Chinese prompt that casts the model as
    a bank wealth advisor and asks for an answer within 100 characters, in
    Traditional Chinese, citing the information source.

    Args:
        question: The user's question; converted with ``str()`` before use.

    Returns:
        Whatever ``query_engine.query`` returns for the assembled prompt.
    """
    preamble = '你好ChatGPT。請假設你是一位銀行的理專，客戶詢問你：'
    instructions = '請在100個字內用正體中文回答客戶並提出資訊來源'
    return query_engine.query(''.join((preamble, str(question), instructions)))

#### 目前可以測試的句型為：請問"銀行名稱"對台積電的看法？
已經建立INDEX的銀行名單：
* Citi, UBS, MS, DW, HTI, JPM, GS, CL, NMR, HSBC

In [None]:
# 未加入年報的版本
# gr.Interface(chatTSMC, inputs='textbox', outputs='textbox').launch(share=True)
# gr.Interface(fn = chatTSMC, 
#              inputs='textbox', 
#              outputs='textbox', 
#              title='ChatTSMC',).launch(share=True)

In [16]:
# Single-turn demo: a text box in, a text box out, backed by chatTSMC.
# share=True asks Gradio for a public link (see the captured output below).
gr.Interface(fn = chatTSMC, 
             inputs='textbox', 
             outputs='textbox', 
             title='歡迎使用ChatTSMC&CTBC',
             description='我可以快速回應你Citi, UBS, Morgan Stanley等國際大行對台積電法說會的最新看法\n目前提供的國際大行有Citi, UBS, MS, DW, HTI, JPM, GS, CL, NMR及HSBC\n也許可以試試看CTBC年報的內容').launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://6b6ab4d1666fb833cb.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces




In [None]:
## try Gradio 對話機器人模式的function寫法
# history = []
# question = '請問Citi對台積電的看法?'
# prompt = '你好ChatGPT。請假設你是一位銀行的理專，客戶詢問你：'+str(question) + '請用正體中文回答客戶並提出資訊來源'
# response = query_engine.query(prompt)
# history.append((question, response))
# question
# type(response)
# response.response
# print(history, history)

In [None]:
# prompt = '你好ChatGPT。請假設你是一位銀行的理專，客戶詢問你：'+str(question) + '請用正體中文回答客戶並提出資訊來源，回答字數請嚴格縮短在100個字內'
# prompt = '你好ChatGPT。請假設你是一位銀行的理專，客戶詢問你：'+str(question) + '請用正體中文回答客戶並提出資訊來源'

In [22]:
def chat(question, history):
    """Answer one chat turn and append it to the conversation history.

    Args:
        question: The user's message; converted with ``str()`` for the prompt.
        history: List of ``(question, answer)`` tuples from earlier turns, or
            a falsy value (e.g. ``None``) on the first turn.

    Returns:
        The updated history twice, as a 2-tuple: the Gradio interface below
        maps one copy to the chatbot display and the other to the state.
    """
    if not history:
        history = []
    preamble = '你好ChatGPT。請假設你是一位銀行的理專，客戶詢問你：'
    instructions = '請用正體中文回答客戶並提出資訊來源，回答字數請嚴格縮短在100個字內'
    answer = query_engine.query(preamble + str(question) + instructions)
    history.append((question, answer.response))
    return history, history

In [23]:
# Create the chat display component.
# NOTE(review): `.style(color_map=...)` may be deprecated or removed in newer
# Gradio releases — confirm against the installed version.
chatbot = gr.Chatbot().style(color_map=("green", "pink"))
gr.Interface(
    chat,
    # Add a "state" component so the history is passed back in on each turn
    ["text", "state"],
    [chatbot, "state"],
    # Do not show the flagging (save data) button
    allow_flagging="never",
    title='歡迎使用ChatTSMC&CTBC',
    description='我可以快速回應你Citi, UBS, Morgan Stanley等國際大行對台積電法說會的最新看法\n目前提供的國際大行有Citi, UBS, MS, DW, HTI, JPM, GS, CL, NMR及HSBC\n也許可以試試看CTBC年報的內容').launch(share=True)



Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://1e1a1a270e1616e1af.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces




## Databricks/dolly
[Model Description_HuggingFace](https://huggingface.co/databricks/dolly-v2-12b)     
[GitHub Repo](https://github.com/databrickslabs/dolly#getting-started-with-response-generation)

In [None]:
!pip install transformers

In [None]:
!pip install accelerate

In [None]:
from transformers import pipeline
import torch

In [None]:
# instruct_pipeline = pipeline(model="databricks/dolly-v2-12b", 
#                              torch_dtype=torch.bfloat16, 
#                              trust_remote_code=True, 
#                              device_map="auto")

In [None]:
# generate_text = pipeline(model="databricks/dolly-v2-12b", 
#                          torch_dtype=torch.bfloat16, 
#                          trust_remote_code=True, 
#                          device_map="auto")

In [None]:
!nvidia-smi

/bin/bash: nvidia-smi: command not found


In [None]:
# NOTE: the command that used to be here, `apt install nvidia-driver-<version>`,
# was a template with the placeholder left unfilled. Bash parses `<version>` as
# redirections, so it failed with a syntax error before apt ever ran.
# `nvidia-smi: command not found` above means this runtime has no GPU attached.
# On Colab, select a GPU runtime (Runtime > Change runtime type) and re-run
# `!nvidia-smi`, rather than installing a driver from inside the notebook.

In [None]:
# Build a transformers pipeline for databricks/dolly-v2-7b in bfloat16.
# trust_remote_code=True allows code shipped with the model repository to run.
# NOTE(review): device_map="auto" presumably relies on the `accelerate`
# package installed above — confirm.
generate_text = pipeline(model="databricks/dolly-v2-7b", 
                         torch_dtype=torch.bfloat16,
                         trust_remote_code=True, 
                         device_map="auto", 
                         return_full_text=True)

In [None]:
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline

# template for an instruction with no input
prompt = PromptTemplate(
    input_variables=["instruction"],
    template="{instruction}")

# template for an instruction with input
prompt_with_context = PromptTemplate(
    input_variables=["instruction", "context"],
    template="{instruction}\n\nInput:\n{context}")

# Wrap the local transformers pipeline so LangChain can use it as an LLM.
hf_pipeline = HuggingFacePipeline(pipeline=generate_text)

# One chain per template: instruction only, and instruction plus context.
llm_chain = LLMChain(llm=hf_pipeline, prompt=prompt)
llm_context_chain = LLMChain(llm=hf_pipeline, prompt=prompt_with_context)

https://python.langchain.com/en/latest/modules/chains.html     
https://python.langchain.com/en/latest/modules/models/llms/integrations/huggingface_hub.html     

In [None]:
!pip install huggingface_hub > /dev/null

In [None]:
from getpass import getpass

# Prompt for the Hugging Face Hub token at run time so it is not stored in
# the notebook.
HUGGINGFACEHUB_API_TOKEN = getpass()
# (A real token had been pasted in a comment here; it is redacted and should be revoked.)

In [None]:
import os
# Export the token that was entered at the getpass() prompt in the previous cell.
# SECURITY: a literal token used to be hard-coded on this line; treat that
# token as leaked and revoke it.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN

In [None]:
from langchain import HuggingFaceHub
from langchain import PromptTemplate, LLMChain

In [None]:
# Use the model through the Hugging Face Hub wrapper instead of loading it
# locally; temperature 0 and max_length 64 are passed as model kwargs.
repo_id = "databricks/dolly-v2-7b"

llm = HuggingFaceHub(repo_id=repo_id, model_kwargs={"temperature":0, "max_length":64})

In [None]:
# Question/answer template that nudges the model to reason step by step.
template = """Question: {question}

Answer: Let's think step by step."""
# Rebinds the notebook-level `prompt` and `llm_chain` to the Hub-backed versions.
prompt = PromptTemplate(template=template, input_variables=["question"])
llm_chain = LLMChain(prompt=prompt, llm=llm)

question = "Who won the FIFA World Cup in the year 1994? "

# The captured output below shows this call was interrupted (KeyboardInterrupt).
print(llm_chain.run(question))

KeyboardInterrupt: ignored

In [None]:
# Reuse the prompt and question from above.
llm_chain = LLMChain(prompt=prompt, llm=llm)
print(llm_chain.run(question))

### 順便玩玩看Dolly測試一下他的表現

In [None]:
gr.Interface.load("huggingface/databricks/dolly-v2-12b").launch()

In [None]:
%ls

CTBC_AR_Y110_en.pdf  [0m[01;34mflagged[0m/  [01;34msample_data[0m/  [01;34mTSMC2023Q1[0m/  TSMC2023Q1.zip
