##1. requests, beutifulsoup이용한 웹 수집
###1.1. 네이버주가

In [53]:
import requests
from bs4 import BeautifulSoup

# ✅ 삼성전자(005930) 주식 페이지
url = "https://finance.naver.com/item/main.nhn?code=005930"
headers = {"User-Agent": "Mozilla/5.0"}  # User-Agent 넣어야 차단 안됨
res = requests.get(url, headers=headers)
res.raise_for_status()

soup = BeautifulSoup(res.text, "html.parser")

# ✅ 주가 정보 추출
price = soup.select_one("p.no_today span.blind").text  # 현재가
change = soup.select_one("p.no_exday span.blind").text  # 전일 대비
percent = soup.select("p.no_exday span.blind")[-1].text  # 변동률

print("현재가:", price)
print("전일비:", change)
print("등락률:", percent)


현재가: 70,000
전일비: 0
등락률: 0.14


###1.2. 여러 종목

In [54]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

codes = {
    "삼성전자": "005930",
    "현대차": "005380",
    "네이버": "035420",
    "카카오": "035720"
}

headers = {"User-Agent": "Mozilla/5.0"}
results = []

for name, code in codes.items():
    url = f"https://finance.naver.com/item/main.nhn?code={code}"
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")

    price = soup.select_one("p.no_today span.blind").text
    change = soup.select_one("p.no_exday span.blind").text
    percent = soup.select("p.no_exday span.blind")[-1].text

    results.append({
        "종목명": name,
        "현재가": price,
        "전일비": change,
        "등락률": percent
    })

df = pd.DataFrame(results)
print(df)


    종목명      현재가    전일비   등락률
0  삼성전자   70,000      0  0.14
1   현대차  219,000  2,500  0.92
2   네이버  225,500    500  0.22
3   카카오   65,200    200  0.46


##2. selenium을 활용한 웹 수집


In [11]:
# 필수 패키지
!pip -q install selenium webdriver-manager

# 크롬 설치 (Colab용)
!apt-get update -y
!apt-get install -y wget gnupg unzip
!wget -q https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i google-chrome-stable_current_amd64.deb || apt-get -fy install

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.83)] [                                                                               Hit:2 https://cli.github.com/packages stable InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:6 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:7 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Hit:9 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:10 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:11 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Reading package lists... Done
W: Skipping acq

###2.1. 네이버뉴스 수집

In [47]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

chrome_options = Options()
chrome_options.add_argument("--headless=new")  # 최신 헤드리스 모드
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--window-size=1366,768")
# 필요시 UA 지정
# chrome_options.add_argument("--user-agent=Mozilla/5.0 ...")

# Colab에 설치된 Chrome 바이너리 명시 (안정)
chrome_options.binary_location = "/usr/bin/google-chrome"

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

In [48]:
driver.get("https://news.naver.com/section/105")  # IT/과학 뉴스

articles = driver.find_elements(By.CSS_SELECTOR, "div.sa_text")
for a in articles:
    title = a.find_element(By.TAG_NAME, "a").text
    link = a.find_element(By.TAG_NAME, "a").get_attribute("href")
    print(title, link)
driver.quit()


업스테이지, AWS와 미국·아태지역 공공·규제산업 AI 도입 촉진 https://n.news.naver.com/mnews/article/421/0008434628
펄어비스 '붉은사막', 게임스컴 어워드 4개 부문 노미네이트 https://n.news.naver.com/mnews/article/008/0005237516
웹툰을 쇼트폼 애니처럼…네이버웹툰 ‘비디오 에피소드’ 시범 도입 https://n.news.naver.com/mnews/article/032/0003390433
AWS “세계에서 가장 유용한 AI 에이전트 구축 장소될 것” https://n.news.naver.com/mnews/article/009/0005543829
강대임 UST 총장 “매년 5건 학생창업 도전, 제도 손볼 것” https://n.news.naver.com/mnews/article/016/0002516369
 https://n.news.naver.com/mnews/article/056/0012010962
 https://n.news.naver.com/mnews/article/001/0015573201
 https://n.news.naver.com/mnews/article/366/0001101391
“마피아가 뜬다”…페이팔 이어 실리콘밸리 주름잡는 ‘팰런티어’ 경쟁력은 https://n.news.naver.com/mnews/article/009/0005544154
“0명, 이건 너무 심했다” 줄폐업 속출 하더니…6천원 할인에 ‘우르르’ https://n.news.naver.com/mnews/article/016/0002516478
학습지 끊자 태블릿 거치대 값까지 내라? 방통위 나선다 https://n.news.naver.com/mnews/article/006/0000131389
네이버·안랩 이어...'AI 반도체' 손에 쥔 리벨리온도 사우디로 https://n.news.naver.com/mnews/article/469/0000882413
“신호 위반, 진입 금지 무시” 쏟아진 

- 더보기 클릭하여 수집

In [49]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

chrome_options = Options()
chrome_options.add_argument("--headless=new")  # 최신 헤드리스 모드
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--window-size=1366,768")
# 필요시 UA 지정
# chrome_options.add_argument("--user-agent=Mozilla/5.0 ...")

# Colab에 설치된 Chrome 바이너리 명시 (안정)
chrome_options.binary_location = "/usr/bin/google-chrome"

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

In [50]:
# ✅ IT/과학 뉴스 섹션 (예시)
url = "https://news.naver.com/section/105"
driver.get(url)
time.sleep(3)

# ✅ "더보기" 버튼을 두 번 클릭
for _ in range(2):
    try:
        more_button = driver.find_element(By.CSS_SELECTOR, "a.section_more_inner")
        driver.execute_script("arguments[0].click();", more_button)
        time.sleep(3)  # 클릭 후 로딩 대기
    except:
        print("더보기 버튼 없음 또는 클릭 실패")
        break

# ✅ 기사 제목 + URL 추출
articles = driver.find_elements(By.CSS_SELECTOR, "div.sa_text a.sa_text_title")

data = []
for a in articles:
    title = a.text.strip()
    link = a.get_attribute("href")
    if title and link:
        data.append({"title": title, "link": link})

# ✅ DataFrame 변환 및 저장
df = pd.DataFrame(data)
print(df.head())
df.to_excel("naver_news.xlsx", index=False)
print("엑셀 저장 완료 → naver_news.xlsx")

driver.quit()


                                  title  \
0    글로벌 4000만 게이머 눈도장…게임스컴 전야제 뜬 'K게임'   
1      쇼츠·릴스 잡아라…네이버웹툰 ‘비디오 에피소드’ 시범 도입   
2            "AI 필수 인프라 6G, 정부 적극 지원해야"   
3  AWS "세계에서 가장 신뢰할 수 있는 AI 환경 구축"[AI픽]   
4           강대임 UST 총장 "학생 1000명 증원 목표"   

                                                link  
0  https://n.news.naver.com/mnews/article/003/001...  
1  https://n.news.naver.com/mnews/article/025/000...  
2  https://n.news.naver.com/mnews/article/018/000...  
3  https://n.news.naver.com/mnews/article/001/001...  
4  https://n.news.naver.com/mnews/article/011/000...  
엑셀 저장 완료 → naver_news.xlsx


###2.2. 유튜브 댓글 수집

In [16]:
# 1. 유튜브 영상 접속
url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"  # 영상 ID 부분만 바꿔주면 됨
driver.get(url)
time.sleep(3)

# 2. 영상 제목 추출
title = driver.find_element(By.CSS_SELECTOR, "h1.style-scope.ytd-watch-metadata").text
print("영상 제목:", title)

# 3. 댓글 영역으로 스크롤
driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
time.sleep(3)

# 여러 번 스크롤 해서 댓글 더 가져오기
last_height = driver.execute_script("return document.documentElement.scrollHeight")
for _ in range(10):  # 10번 스크롤 → 댓글 많이 로드됨
    driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
    time.sleep(2)
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# 4. 댓글 수집
comments_data = []
comments = driver.find_elements(By.CSS_SELECTOR, "#content #content-text")
authors = driver.find_elements(By.CSS_SELECTOR, "#author-text span")
likes = driver.find_elements(By.CSS_SELECTOR, "#vote-count-middle")

for author, comment, like in zip(authors, comments, likes):
    comments_data.append({
        "author": author.text,
        "comment": comment.text,
        "likes": like.text
    })

# 5. DataFrame 변환 후 저장
df = pd.DataFrame(comments_data)
print(df.head())

df.to_excel("youtube_comments_colab.xlsx", index=False)
print("엑셀 저장 완료: youtube_comments_colab.xlsx")

driver.quit()

영상 제목: Rick Astley - Never Gonna Give You Up (Official Video) (4K Remaster)
          author                                            comment likes
0                                  can confirm: he never gave us up   90K
1                 Nobody can fool me anymore with this, I now re...  114K
2    @Siuraneta5  Vote Rick Astley for president\nHe will never:...  183K
3                 I didn't get rickrolled today, I just really e...  334K
4  @georgeealien  man I just got legitimately rickrolled right n...   425
엑셀 저장 완료: youtube_comments_colab.xlsx


In [None]:
# 크롬드라이버 실행 (코랩에 설치된 경로 사용)
chrome_options = Options()
chrome_options.add_argument("--headless=new")  # 최신 헤드리스 모드
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--window-size=1366,768")
# 필요시 UA 지정
# chrome_options.add_argument("--user-agent=Mozilla/5.0 ...")

# Colab에 설치된 Chrome 바이너리 명시 (안정)
chrome_options.binary_location = "/usr/bin/google-chrome"

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

In [20]:
# 여러 개의 유튜브 영상 URL 리스트
video_urls = [
    "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
    "https://www.youtube.com/watch?v=3JZ_D3ELwOQ",
    "https://www.youtube.com/watch?v=kJQP7kiw5Fk"
]

#  전체 결과 저장용 리스트
all_comments = []

for url in video_urls:
    print(f"영상 처리 중: {url}")
    driver.get(url)
    time.sleep(3)

    # 영상 제목
    title = driver.find_element(By.CSS_SELECTOR, "h1.style-scope.ytd-watch-metadata").text

    # 댓글 영역으로 스크롤
    driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
    time.sleep(3)

    # 무한 스크롤 (댓글 더 불러오기)
    last_height = driver.execute_script("return document.documentElement.scrollHeight")
    for _ in range(5):  # 반복 횟수 늘리면 더 많은 댓글 수집 가능
        driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
        time.sleep(2)
        new_height = driver.execute_script("return document.documentElement.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # 댓글 수집
    comments = driver.find_elements(By.CSS_SELECTOR, "#content #content-text")
    authors = driver.find_elements(By.CSS_SELECTOR, "#author-text span")
    likes = driver.find_elements(By.CSS_SELECTOR, "#vote-count-middle")

    for author, comment, like in zip(authors, comments, likes):
        all_comments.append({
            "video_url": url,
            "video_title": title,
            "author": author.text,
            "comment": comment.text,
            "likes": like.text
        })

#  DataFrame으로 변환
df = pd.DataFrame(all_comments)
print(df.head())

#  엑셀 저장
df.to_excel("youtube_multi_comments.xlsx", index=False)
print("엑셀 저장 완료: youtube_multi_comments.xlsx")

driver.quit()


▶ 영상 처리 중: https://www.youtube.com/watch?v=dQw4w9WgXcQ
▶ 영상 처리 중: https://www.youtube.com/watch?v=3JZ_D3ELwOQ
▶ 영상 처리 중: https://www.youtube.com/watch?v=kJQP7kiw5Fk
                                     video_url           video_title  \
0  https://www.youtube.com/watch?v=3JZ_D3ELwOQ  Flexin' On Ya (2014)   
1  https://www.youtube.com/watch?v=3JZ_D3ELwOQ  Flexin' On Ya (2014)   
2  https://www.youtube.com/watch?v=3JZ_D3ELwOQ  Flexin' On Ya (2014)   
3  https://www.youtube.com/watch?v=3JZ_D3ELwOQ  Flexin' On Ya (2014)   
4  https://www.youtube.com/watch?v=3JZ_D3ELwOQ  Flexin' On Ya (2014)   

                 author                                            comment  \
0            @XMan2001X                      He was super Kangaroo Jacked!   
1       @crimsonnva2599                            Arkansas don't play man   
2              @r4m3n4u                  kangaroo listens to NBA young boy   
3  @christinaarnold4106                             Who’s watching in 2001   
4           