## 安裝 langchain 與 llama_index 等必要套件

In [None]:
# Install LangChain into the runtime.
# NOTE(review): unpinned — later cells import langchain.document_loaders and
# langchain.text_splitter; confirm the installed release still provides them.
! pip install langchain

In [None]:
! pip install llama_index

In [None]:
# Install pypdf — presumably the PDF backend required by PyPDFLoader, which is
# used in the loading cells further down (TODO confirm).
! pip install pypdf

In [62]:
import nltk
# Fetch the 'punkt' data package.
# Presumably required by NLTKTextSplitter used later in this notebook — TODO confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 讀取外資PDF報告 

In [53]:
# Download the report archive from Google Drive (the captured log shows it is
# saved as TSMC2023Q1.zip).
# NOTE(review): gdown is not installed by this notebook; assumed to be
# preinstalled in the runtime.
! gdown https://drive.google.com/uc?id=1Z_ww5ZASdIq0uZrg8jt_d7E_PzFQdYH-

Downloading...
From: https://drive.google.com/uc?id=1Z_ww5ZASdIq0uZrg8jt_d7E_PzFQdYH-
To: /content/TSMC2023Q1.zip
  0% 0.00/10.7M [00:00<?, ?B/s] 84% 8.91M/10.7M [00:00<00:00, 85.6MB/s]100% 10.7M/10.7M [00:00<00:00, 93.3MB/s]


In [None]:
# Extract the downloaded archive; later cells read files under TSMC2023Q1/.
! unzip TSMC2023Q1.zip

In [57]:
from langchain.document_loaders import PyPDFLoader
# Load the Citi report and split it into documents (each exposes .page_content).
# The loader variable name must match the one used on the next line; the
# previous misspelling made this cell fail with NameError on a fresh kernel.
loader = PyPDFLoader('TSMC2023Q1/20230330 TSMC Citi.pdf')
pages = loader.load_and_split()

In [None]:
# Inspect the raw text of the first loaded document.
pages[0].page_content

In [63]:
from langchain.text_splitter import NLTKTextSplitter

# Splitter configured with chunk_size=1000 and chunk_overlap=200.
splitter = NLTKTextSplitter(
  chunk_size=1000,
  chunk_overlap=200,
)
# Split only the first loaded document's text into chunks.
chunks = splitter.split_text(pages[0].page_content)



In [None]:
# Show every chunk followed by a 50-character rule so chunk boundaries are visible.
for chunk in chunks:
  print(chunk, '=' * 50, sep='\n')

In [65]:
from llama_index.node_parser import SimpleNodeParser

# Node parser that chunks documents with the splitter defined earlier.
parser = SimpleNodeParser(
  text_splitter=splitter,
)

## 建立 Document 與 Nodes

In [66]:
from llama_index import Document
# Wrap the first loaded document's text in a single Document
# (note: despite the plural name, `docs` holds one Document, not a list).
docs = Document(pages[0].page_content)
# Convert that document into nodes using the parser created earlier.
nodes = parser.get_nodes_from_documents([docs])



In [70]:
# Tag every node with its source report; each node receives its own dict object.
for node in nodes:
  node.extra_info = {'Document': 'citi'}

## 將 Nodes 插入 Index 中

In [72]:
import os
from getpass import getpass

from llama_index import GPTSimpleVectorIndex

# Do not hardcode the API key in the notebook: reuse the value already present
# in the environment, otherwise prompt for it without echoing the input.
if not os.environ.get('OPENAI_API_KEY'):
  os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

# Start from an empty index, then insert the nodes built in the previous cells.
index = GPTSimpleVectorIndex([])
index.insert_nodes(nodes)

In [71]:
# Inspect the first node (its repr includes the chunk text).
nodes[0]

Node(text="ab 31 March 2023Global Research and Evidence Lab\nTaiwan Semiconductor Manufacturing\nQ123 earnings preview\nSofter Q2 and 2023 outlook generally priced in\nTSMC's Q123 earnings call is scheduled for 20 April (Thu) after market close.\n\nWe expect \nQ1 earnings to be near the high-end of guidance (sales to fall 12% QoQ in US$ terms  vs \nour previous estimate of a 14% QoQ decline, GM at 55.3% vs 55.0% previously), \nalthough we estimate Q2 sales to decline 8% QoQ, below the implied guidance of a 6% \nQoQ decline provided in mid-January.\n\nNote that we cut our TSMC estimates in late \nFebruary   to factor in the potential downside of high-end smartphones and CPUs \ndemand.\n\nWe are already more conservative on 2023 sales, but now slightly lift to a 1.5% \ndecline in US$ terms from a 2.3% decline to reflect a slightly better Q1.\n\nHowever, we are \ncutting capex estimates to US$32/35bn for 2023/24 from US$33/36bn to factor in the \npotential impact of a more meaningful cycl

## 查詢外資報告資料

In [73]:
# Ask for Citi's view on TSMC, requesting a Traditional Chinese answer.
# Leaving the call as the last expression displays the whole Response object,
# including the source nodes that were retrieved.
index.query('#zh-tw 請問citi 對台積電的看法? 請用正體中文')

Response(response='\n台積電的業績會在H2有所改善，但具體的程度仍然不確定，Citi仍然維持買入評級，並將價格目標定為NT$690.00，並將2023年EPS微調為NT$33.6，2024年EPS保持在NT$40.7。', source_nodes=[NodeWithScore(node=Node(text="For \nsmartphones, we believe Apple has recently trimmed legacy N5/N4 app processor \norders, while MediaTek and Qualcomm may remain cautious ahead of H2.\n\nWhat will the earnings call focus on?\n\n1) TSMC's expectations for the destocking cycle (which the company forecasted to finish \nin H123) and the growth trajectory from H223; 2) HPC visibility and growth potential \nfrom generative AI; 3) a capex update; 4) the profitability outlook, taking account of \noverseas expansion and its US subsidy; 5) N3 ramp and customer migration.\n\nValuation:   maintain Buy rating and price target of NT$690.00\nWe fine-tune 2023E EPS from NT$33.4 to NT$33.6 to reflect the expected slightly better \nQ1 performance, but maintain 2024E EPS at NT$40.7.\n\nAlthough the magnitude of the \npotential H2 recovery is uncertain, we believe consen

In [74]:
# Same question, but print only the answer text (the .response attribute).
# Note: this issues the query again rather than reusing the previous result.
print(index.query('#zh-tw 請問citi 對台積電的看法? 請用正體中文').response)


台積電的業績會在H2有所改善，但具體的程度仍然不確定，Citi仍然維持買入評級，並將價格目標定為NT$690.00，並將2023年EPS微調為NT$33.6，2024年EPS保持在NT$40.7。


## 將所有外資報告加入索引中

In [92]:
def build_nodes(f, max_pages=1):
  """Parse the leading page(s) of one PDF into nodes tagged with the file path.

  Args:
    f: Path of the PDF file to load.
    max_pages: How many leading entries returned by
      ``PyPDFLoader.load_and_split`` are converted into nodes. The default of
      1 keeps the original behaviour (first entry only); pass ``None`` to
      process every entry.

  Returns:
    The nodes produced by ``SimpleNodeParser.get_nodes_from_documents``, each
    with ``extra_info`` set to ``{'Document': f}``. An empty list is returned
    when the loader yields nothing.
  """
  pages = PyPDFLoader(f).load_and_split()
  if not pages:
    # Indexing pages[0] on an empty result would raise IndexError; an empty
    # node list lets the caller carry on with the remaining files.
    return []

  splitter = NLTKTextSplitter(chunk_size=1000, chunk_overlap=200)
  parser = SimpleNodeParser(text_splitter=splitter)

  selected_pages = pages if max_pages is None else pages[:max_pages]
  documents = [Document(page.page_content) for page in selected_pages]
  nodes = parser.get_nodes_from_documents(documents)
  for node in nodes:
    # A fresh dict per node so the metadata objects are never shared.
    node.extra_info = {'Document': f}
  return nodes

In [94]:
import glob
# Rebuild the index from scratch. This rebinds the global `index` (and, inside
# the loop, `nodes`) that earlier cells created.
index = GPTSimpleVectorIndex([])
# Every entry under TSMC2023Q1/ is handed to build_nodes — assumes all of them
# are PDF files (TODO confirm against the archive contents).
for f in glob.glob('TSMC2023Q1/*'):
  nodes = build_nodes(f)
  index.insert_nodes(nodes)



## 查詢外資對台積電的看法

In [96]:
# Query the combined index for JPM's TSMC price target and print only the answer text.
print(index.query('#zh-tw 請問JPM對台積電目標價的看法? 請用正體中文回答').response)


JPM對台積電的目標價是NT$650.0。


## 歡迎訂閱按讚分享開啟小鈴鐺

給資料科學家的 Python 基礎課： 
- https://www.youtube.com/watch?v=uzInb5gbl4M 

大數學堂 - 學習資料科學的第一站： 
- https://www.youtube.com/channel/UCSmvtvsTjqkvKLqpmsFWRQw 