#### 환경설정

In [1]:
!pip install --quiet langchain
!pip install -q langchain-community

In [2]:
# User environment setup.
# Uses the API key issued through modulabs; keys issued later for OpenAI or
# Gemini can be configured in the same way.

# 01 Imports: Colab secret store, the OpenAI SDK, and the packages whose versions are reported
import os

import langchain
import openai
from google.colab import userdata
from openai import OpenAI  # openai-python SDK

# 02 Configure the OpenAI API key
# "modulabsKey" is the name the key was registered under in the Colab user environment-variable tab.
api_key = userdata.get("modulabsKey")
os.environ["OPENAI_API_KEY"] = api_key
client = OpenAI(api_key=api_key)

# 03 Version check (optional)
print("LangChain ver:", langchain.__version__)
print("OpenAI SDK ver:", openai.__version__)

LangChain ver: 0.3.26
OpenAI SDK ver: 1.86.0


#### 01 Word 파일 불러오기
- Docx2txtLoader

In [3]:
!pip install --upgrade --quiet docx2txt

In [4]:
# [1] Upload a file through the Colab file picker
from google.colab import files
uploaded = files.upload()

# Take the name of the first uploaded file.
# The upload result may come back empty (e.g. if the dialog is cancelled), so
# fail with a clear message instead of an opaque IndexError.
import os
if not uploaded:
    raise RuntimeError("업로드된 파일이 없습니다. 셀을 다시 실행하고 .docx 파일을 선택하세요.")
filename = next(iter(uploaded))
print(f"업로드된 파일명: {filename}")

# [2] Import Docx2txtLoader (it lives in langchain_community in current releases)
from langchain_community.document_loaders import Docx2txtLoader

# [3] Load the .docx file and split it into chunks
loader = Docx2txtLoader(filename)
data = loader.load_and_split()

# [4] Preview one chunk, wrapped with textwrap to limit the line width
import textwrap

chunk_index = 12  # zero-based index, i.e. the 13th chunk
if chunk_index < len(data):
    print("\n💡 13번째 chunk (최대 500자):\n")
    print(textwrap.fill(data[chunk_index].page_content[:500], width=100))
else:
    # Shorter documents do not have a 13th chunk; report that instead of raising IndexError.
    print(f"문서가 {len(data)}개의 chunk로만 분할되어 13번째 chunk가 없습니다.")

Saving [삼성전자] 사업보고서(일반법인) (2021.03.09).docx to [삼성전자] 사업보고서(일반법인) (2021.03.09) (4).docx
업로드된 파일명: [삼성전자] 사업보고서(일반법인) (2021.03.09) (4).docx

💡 13번째 chunk (최대 500자):

2011.09.01      '갤럭시 노트' 공개          2012.04.01      LCD사업부 분사(삼성디스플레이㈜ 설립)          2013.06.26
'갤럭시 S4 LTE-A' LTE-A 스마트폰 출시          2013.08.06      '3D V-NAND' 3차원 수직구조 낸드플래시 메모리 양산
2013.09.24      '아이소셀(ISOCELL)' 차세대 CMOS 이미지센서 개발          2017.03.05      'QLED TV' 진화된 퀀텀닷 기술 적용
TV 출시          2017.03.10      전장 기업 Harman International Industries, Inc.사(100%) 지분 인수 2017.07.04
세계 최대 규모 평택 반도체 라인 가동          2017.11.01      프린팅솔루션 사업 매각          2018.08.30      'QLED 8K TV'
퀀텀닷 기술과 8K 해상도 적용 TV 출시          2018.11.07      '폴더블 디스플레이' 삼성 개발자 컨퍼런스에서 공개          2019.04.03
'갤럭시 S10 5G' 5G 스마트폰 출시          20


In [5]:
# Inspect the metadata attached to the 13th chunk of the loaded Word file
chunk_metadata = data[12].metadata
print(chunk_metadata)

{'source': '[삼성전자] 사업보고서(일반법인) (2021.03.09) (4).docx'}


#### 02 CSV파일 불러오기
- 이 순간부터 colab 환경에서 mount 해서 처리하는 것으로 코드 수정

In [6]:
# Mount Google Drive; the cells below read their input files from under this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Data folder: /content/drive/MyDrive/ColabNotebooks/03_Modulabs/Modu_LLM/data/
from langchain_community.document_loaders.csv_loader import CSVLoader

csv_path = r"/content/drive/MyDrive/ColabNotebooks/03_Modulabs/Modu_LLM/data/mlb_teams_2012.csv"

# Load the CSV file into Document objects
loader = CSVLoader(file_path=csv_path)
data = loader.load()

# Display the first Document
data[0]

Document(metadata={'source': '/content/drive/MyDrive/ColabNotebooks/03_Modulabs/Modu_LLM/data/mlb_teams_2012.csv', 'row': 0}, page_content='Team: Nationals\n"Payroll (millions)": 81.34\n"Wins": 98')

In [8]:
import os

folder_path = "/content/drive/MyDrive/ColabNotebooks/03_Modulabs/Modu_LLM/data"

# Named `file_names` rather than `files` so that the `google.colab.files` module
# imported by the upload cell is not shadowed by a plain list.
# sorted() keeps the listing order deterministic (os.listdir order is arbitrary).
file_names = sorted(os.listdir(folder_path))
print("data 폴더 내 파일 목록:")
for name in file_names:
    print(name)

data 폴더 내 파일 목록:
BOK 이슈노트 제2022-38호 인공지능 언어모형을 이용한 인플레이션 어조지수 개발 및 시사점.pdf
Copilot-scenarios-for-Marketing.pptx
Transformer_paper.pdf
[삼성전자] 사업보고서(일반법인) (2021.03.09).docx
[이슈리포트 2022-2호] 혁신성장 정책금융 동향.pdf
mlb_teams_2012.csv
state_of_the_union.txt
★육아휴직제도 사용안내서_배포.pdf
대한민국헌법(헌법)(제00010호)(19880225).pdf


#### 03 PPT 파일 불러오기
- UnstructuredPowerPointLoader

In [9]:
# python-pptx 패키지 설치
!pip install --quiet python-pptx

In [27]:
pip install --quiet unstructured

In [11]:
# Import UnstructuredPowerPointLoader
from langchain_community.document_loaders import UnstructuredPowerPointLoader

pptx_path = r"/content/drive/MyDrive/ColabNotebooks/03_Modulabs/Modu_LLM/data/Copilot-scenarios-for-Marketing.pptx"

# mode="elements" returns each element of the pptx as its own Document object
loader = UnstructuredPowerPointLoader(pptx_path, mode="elements")

# Load the pptx file and split it
data = loader.load_and_split()

# Display the second Document
data[1]

Document(metadata={'source': '/content/drive/MyDrive/ColabNotebooks/03_Modulabs/Modu_LLM/data/Copilot-scenarios-for-Marketing.pptx', 'category_depth': 0, 'file_directory': '/content/drive/MyDrive/ColabNotebooks/03_Modulabs/Modu_LLM/data', 'filename': 'Copilot-scenarios-for-Marketing.pptx', 'last_modified': '2025-06-24T01:36:13', 'page_number': 2, 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', 'category': 'Title', 'element_id': '37991e0f3ff36db43b1ba8379a3657de'}, page_content='Copilot scenarios for\x0bMarketing')

In [12]:
# Print the category and text of every element whose metadata places it on page 2
for element in data:
    meta = element.metadata
    if meta['page_number'] != 2:
        continue
    print(meta['category'])
    print(element.page_content)
    print("\n")

Title
Copilot scenarios forMarketing


Title
Overview and KPIs


Title
Use Case by Role


Title
Day in the Life


NarrativeText
KPIs play a crucial role in organizations, providing a compass to navigate toward success. Let's dive into KPIs for Marketing and how Copilot can assist.


NarrativeText
Copilot can simplify the tasks that execs perform every day. Look at key use cases and how Copilot can be your AI assistant along the way.


NarrativeText
See how real-life marketers are using Copilot in their day to day.




#### 04 인터넷 정보 로드, WebBaseLoader
- 랭체인의 WebBaseLoader 활용하면 주어진 문서뿐만 아니라 웹 페이지의 텍스트도 추출 가능함
- 이러한 기능은 특정 웹 사이트의 내용을 기반으로 대화가 가능한 챗봇을 만드는 핵심 기능이 될 수 있음

In [13]:
from langchain_community.document_loaders import WebBaseLoader

# URL to extract text from
espn_url = "https://www.espn.com/"
loader = WebBaseLoader(espn_url)

# Work around SSL verification errors.
# NOTE(review): verify=False presumably disables TLS certificate checking for the
# request — acceptable for a throwaway exercise only; confirm it is still needed.
ssl_options = {'verify': False}
loader.requests_kwargs = ssl_options

data = loader.load()
data



[Document(metadata={'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}, page_content="\n\n\n\n\n\n\n\n\nESPN - Serving Sports Fans. Anytime. Anywhere.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n        Skip to main content\n    \n\n        Skip to navigation\n    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n<\n\n>\n\n\n\n\n\n\n\n\n\nMenuESPN\n\n\n\n\n\nscores\n\n\n\nNFLNBANHLMLBWNBASoccerMMAMore SportsBoxingNCAACricketF1GamingGolfHorseLLWSNASCARNLLNBA G LeagueNBA Summer LeagueNCAAFNCAAMNCAAWNWSLOlympicsPLLProfessional WrestlingRacingRN BBRN FBRugbySports BettingTennisTGLUFLX GamesEditionsFantasyWatchESPN BETESPN+\n\n\n\n\n\n\n\n\n\n\

In [14]:
import bs4

from langchain_community.document_loaders import WebBaseLoader

# Restrict parsing with a SoupStrainer on the class "headlineStack top-headlines"
headline_filter = bs4.SoupStrainer(class_="headlineStack top-headlines")

# URL to extract text from, plus the BeautifulSoup options built above
loader = WebBaseLoader(
    "https://www.espn.com/",
    bs_kwargs={"parse_only": headline_filter},
)

In [15]:
# Unlike the textbook, this step is needed for the hands-on environment:
# turn off urllib3's InsecureRequestWarning.
import urllib3

urllib3.disable_warnings(category=urllib3.exceptions.InsecureRequestWarning)

In [16]:
# Work around SSL verification errors, then fetch the page
loader.requests_kwargs = dict(verify=False)
data = loader.load()

In [17]:
# Display the Documents returned by the preceding loader.load() call
data

[Document(metadata={'source': 'https://www.espn.com/'}, page_content="Sources: Celtics trading Holiday to Trail BlazersHaliburton on playing injured: 'I'd do it again'Padres' Tatis sues BLA to void future earnings dealRaleigh hits MLB-leading 32nd HR in Mariners' winSources: Mavs to extend Gafford on $60M dealMiami's unbeaten run sets up CWC clash with PSGD-backs' Carroll headed to IL with wrist fracture'We are Vegas' team': A's begin building stadium\uf8ffüîÅ Redrafting MLB draft top-10 picks, 2015-24 Top HeadlinesSources: Celtics trading Holiday to Trail BlazersHaliburton on playing injured: 'I'd do it again'Padres' Tatis sues BLA to void future earnings dealRaleigh hits MLB-leading 32nd HR in Mariners' winSources: Mavs to extend Gafford on $60M dealMiami's unbeaten run sets up CWC clash with PSGD-backs' Carroll headed to IL with wrist fracture'We are Vegas' team': A's begin building stadium\uf8ffüîÅ Redrafting MLB draft top-10 picks, 2015-24")]

##### 여러 개 웹 페이지
- WebBaseLoader는 여러 개의 웹 페이지에서 텍스트 동시 추출 가능

In [18]:
# Several pages are requested by handing WebBaseLoader a list of URLs
target_urls = ["https://www.espn.com/", "https://google.com"]
loader = WebBaseLoader(target_urls)
docs = loader.load()
docs

[Document(metadata={'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}, page_content="\n\n\n\n\n\n\n\n\nESPN - Serving Sports Fans. Anytime. Anywhere.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n        Skip to main content\n    \n\n        Skip to navigation\n    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n<\n\n>\n\n\n\n\n\n\n\n\n\nMenuESPN\n\n\n\n\n\nscores\n\n\n\nNFLNBANHLMLBWNBASoccerMMAMore SportsBoxingNCAACricketF1GamingGolfHorseLLWSNASCARNLLNBA G LeagueNBA Summer LeagueNCAAFNCAAMNCAAWNWSLOlympicsPLLProfessional WrestlingRacingRN BBRN FBRugbySports BettingTennisTGLUFLX GamesEditionsFantasyWatchESPN BETESPN+\n\n\n\n\n\n\n\n\n\n\

#### 05 경로 내의 모든 파일 로드하기
- DirectoryLoader
- 교재와 다르게 unstructured[pdf] 모듈 설치가 필요
  - 최신 기술일수록 라이브러리 버전 dependency 문제 상존하므로 항시 체크할 것!

In [29]:
!pip install --quiet "unstructured[pdf]"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m89.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m85.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.1/48.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.5/112.5 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.9/527.9 kB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m87.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m59.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [30]:
# NOTE: PyPDFLoader is imported here but not referenced anywhere in this cell.
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader

# The first argument is the directory; `glob` selects which files in it are loaded.
# `*` is a wildcard matching any characters, so "*.pdf" means every file ending in .pdf.
pdf_dir = r"/content/drive/MyDrive/ColabNotebooks/03_Modulabs/Modu_LLM/data/"
loader = DirectoryLoader(pdf_dir, glob="*.pdf")
docs = loader.load()

# Source path recorded in each loaded Document's metadata
[doc.metadata['source'] for doc in docs]

['/content/drive/MyDrive/ColabNotebooks/03_Modulabs/Modu_LLM/data/BOK 이슈노트 제2022-38호 인공지능 언어모형을 이용한 인플레이션 어조지수 개발 및 시사점.pdf',
 '/content/drive/MyDrive/ColabNotebooks/03_Modulabs/Modu_LLM/data/Transformer_paper.pdf',
 '/content/drive/MyDrive/ColabNotebooks/03_Modulabs/Modu_LLM/data/[이슈리포트 2022-2호] 혁신성장 정책금융 동향.pdf',
 '/content/drive/MyDrive/ColabNotebooks/03_Modulabs/Modu_LLM/data/★육아휴직제도 사용안내서_배포.pdf',
 '/content/drive/MyDrive/ColabNotebooks/03_Modulabs/Modu_LLM/data/대한민국헌법(헌법)(제00010호)(19880225).pdf']