In [15]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [16]:
def get_google_news_url(keywords, days):
    """Build a Google News search URL restricted to the last ``days`` days.

    Args:
        keywords: Iterable of search terms, or a single string that is
            treated as one phrase. Every term is URL-encoded, so spaces
            become ``+`` and reserved characters such as ``&`` are escaped.
        days: Look-back window; rendered as ``when:<days>d`` in the query.

    Returns:
        str: The search URL, e.g.
        ``https://news.google.com/search?q=Bank+BCA+when:7d``.
    """
    from urllib.parse import quote_plus

    # A bare string would otherwise be iterated character by character.
    if isinstance(keywords, str):
        keywords = [keywords]

    terms = [quote_plus(term) for term in keywords]
    # The ``when:`` operator is appended unescaped so its colon is sent literally.
    terms.append(f"when:{days}d")
    return "https://news.google.com/search?q=" + "+".join(terms)

In [19]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_news(symbols, urls, timeout=10):
    """Scrape headlines from Google News search pages, one URL per ticker.

    Each page is fetched with a browser-like User-Agent and parsed with
    ``html.parser``. Every ``<article>`` element that yields a non-empty
    title contributes one row. A failure while handling one ticker is
    printed and does not stop the remaining tickers.

    Note: in the run recorded in this notebook this approach found 0
    articles for every ticker.

    Args:
        symbols: Sequence of ticker labels.
        urls: Sequence of search URLs; ``urls[i]`` belongs to ``symbols[i]``.
            Extra URLs beyond ``len(symbols)`` are ignored.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        pandas.DataFrame with columns ``Ticker``, ``Date``, ``Headline``,
        de-duplicated on ``Headline``.

    Raises:
        IndexError: If ``urls`` has fewer entries than ``symbols``.
    """
    tickers = symbols
    news_urls = urls

    # Fail before any network traffic instead of partway through the loop.
    if len(news_urls) < len(tickers):
        raise IndexError(
            f"get_news needs one URL per symbol: got {len(tickers)} symbols "
            f"but only {len(news_urls)} URLs"
        )

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    all_news = []

    for ticker, news_url in zip(tickers, news_urls):
        # Debug: show which URL is being requested.
        print(f"Fetching: {news_url[:80]}...")
        found = 0

        try:
            response = requests.get(news_url, headers=headers, timeout=timeout)
            # Report HTTP errors (4xx/5xx) instead of parsing an error page.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Try several selectors, since the page markup changes often.
            for article in soup.select('article'):
                # Locate the headline.
                title_elem = (
                    article.select_one('a[href*="./articles/"]')
                    or article.select_one('h3')
                    or article.select_one('h4')
                    or article.select_one('[class*="title"]')
                )
                if title_elem is None:
                    continue

                # Locate the publication time.
                time_elem = article.select_one('time') or article.select_one('[datetime]')

                title = title_elem.get_text(strip=True)
                timedate = time_elem.get('datetime', 'N/A') if time_elem else 'N/A'

                if title:  # only keep entries that actually have a headline
                    all_news.append((ticker, timedate, title))
                    found += 1

            print(f"  Found {found} articles for {ticker}")

        except Exception as e:
            # Deliberately best-effort: one bad ticker must not abort the run.
            print(f"Error fetching {ticker}: {e}")

    news = pd.DataFrame(all_news, columns=['Ticker', 'Date', 'Headline'])

    # Remove duplicate headlines, if any.
    news = news.drop_duplicates(subset=['Headline'])

    return news


# ============ ALTERNATIVE: Use Google News RSS ============
def get_google_news_url_rss(keywords, days=7):
    """Generate a Google News RSS search URL.

    Args:
        keywords: A single search phrase (str) or an iterable of terms.
            Every term is URL-encoded, so spaces become ``+`` and reserved
            characters such as ``&`` are escaped; terms are joined with ``+``.
        days: Look-back window; rendered as ``when:<days>d`` in the query.

    Returns:
        str: The RSS URL, with ``hl=id``, ``gl=ID`` and ``ceid=ID:id``
        appended as fixed query parameters.
    """
    from urllib.parse import quote_plus

    terms = [keywords] if isinstance(keywords, str) else keywords
    query = "+".join(quote_plus(term) for term in terms)

    return f"https://news.google.com/rss/search?q={query}+when:{days}d&hl=id&gl=ID&ceid=ID:id"


def get_news_rss(symbols, urls, timeout=10):
    """Collect headlines from RSS feeds, one feed URL per ticker.

    Each feed is downloaded and parsed with BeautifulSoup's ``xml`` parser;
    every ``<item>`` with a non-empty ``<title>`` contributes one row.
    A failure while handling one ticker is printed and does not stop the
    remaining tickers.

    Args:
        symbols: Sequence of ticker labels.
        urls: Sequence of feed URLs; ``urls[i]`` belongs to ``symbols[i]``.
            Extra URLs beyond ``len(symbols)`` are ignored.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        pandas.DataFrame with columns ``Ticker``, ``Date``, ``Headline``.
        ``Date`` holds the raw ``<pubDate>`` text, or ``'N/A'`` when absent.

    Raises:
        IndexError: If ``urls`` has fewer entries than ``symbols``.
    """
    # Fail before any network traffic instead of partway through the loop.
    if len(urls) < len(symbols):
        raise IndexError(
            f"get_news_rss needs one URL per symbol: got {len(symbols)} symbols "
            f"but only {len(urls)} URLs"
        )

    all_news = []

    for ticker, news_url in zip(symbols, urls):
        try:
            response = requests.get(news_url, timeout=timeout)
            # Report HTTP errors (4xx/5xx) instead of parsing an error page.
            response.raise_for_status()
            # Hand the parser raw bytes so it can detect the document
            # encoding itself rather than relying on requests' decoded text.
            soup = BeautifulSoup(response.content, 'xml')

            items = soup.find_all('item')

            for item in items:
                title = item.title.text if item.title else ''
                pubdate = item.pubDate.text if item.pubDate else 'N/A'

                if title:
                    all_news.append((ticker, pubdate, title))

            print(f"{len(items)} news from {ticker}")

        except Exception as e:
            # Deliberately best-effort: one bad feed must not abort the run.
            print(f"Error fetching {ticker}: {e}")

    news = pd.DataFrame(all_news, columns=['Ticker', 'Date', 'Headline'])
    return news

In [21]:
import nltk
# Download the VADER lexicon data package; presumably this is what the
# SentimentIntensityAnalyzer imported at the top of the notebook relies on.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [22]:
# lxml is required by BeautifulSoup's 'xml' parser, which the RSS helpers use.
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install lxml



In [20]:
# Popular Indonesian stocks (IDX)
symbols = ["BBCA", "BBRI", "TLKM", "ASII", "BMRI"]

# One search phrase per ticker, in the same order as `symbols`.
# Each phrase is passed as a single-element keyword list;
# the number 7 means news from the last 7 days.
news_url = [
    get_google_news_url([phrase], 7)
    for phrase in (
        "Bank BCA saham",
        "Bank BRI saham",
        "Telkom Indonesia saham",
        "Astra International saham",
        "Bank Mandiri saham",
    )
]

# Sanity check: the number of symbols must equal the number of URLs.
print(f"Symbols: {len(symbols)}, URLs: {len(news_url)}")

All_News = get_news(symbols, news_url)
print(All_News)

Symbols: 5, URLs: 5
Fetching: https://news.google.com/search?q=Bank BCA saham+when:7d...
  Found 0 articles for BBCA
Fetching: https://news.google.com/search?q=Bank BRI saham+when:7d...
  Found 0 articles for BBRI
Fetching: https://news.google.com/search?q=Telkom Indonesia saham+when:7d...
  Found 0 articles for TLKM
Fetching: https://news.google.com/search?q=Astra International saham+when:7d...
  Found 0 articles for ASII
Fetching: https://news.google.com/search?q=Bank Mandiri saham+when:7d...
  Found 0 articles for BMRI
Empty DataFrame
Columns: [Ticker, Date, Headline]
Index: []


In [24]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_google_news_url_rss(keywords, days=7):
    """Generate a Google News RSS search URL.

    Args:
        keywords: A single search phrase (str) or an iterable of terms.
            Every term is URL-encoded, so spaces become ``+`` and reserved
            characters such as ``&`` are escaped; terms are joined with ``+``.
        days: Look-back window; rendered as ``when:<days>d`` in the query.

    Returns:
        str: The RSS URL for the search.
    """
    from urllib.parse import quote_plus

    terms = [keywords] if isinstance(keywords, str) else keywords
    query = "+".join(quote_plus(term) for term in terms)

    # RSS URL with Indonesian language/region parameters.
    return f"https://news.google.com/rss/search?q={query}+when:{days}d&hl=id&gl=ID&ceid=ID:id"


def get_news_rss(symbols, urls, timeout=10):
    """Collect headlines and links from RSS feeds, one feed URL per ticker.

    Each feed is downloaded and its raw bytes are parsed with
    BeautifulSoup's ``xml`` parser; every ``<item>`` with a non-empty
    ``<title>`` contributes one row. A failure while handling one ticker is
    printed and does not stop the remaining tickers.

    Args:
        symbols: Sequence of ticker labels.
        urls: Sequence of feed URLs; ``urls[i]`` belongs to ``symbols[i]``.
            Extra URLs beyond ``len(symbols)`` are ignored.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        pandas.DataFrame with columns ``Ticker``, ``Date``, ``Headline``,
        ``Link``. ``Date`` holds the raw ``<pubDate>`` text (``'N/A'`` when
        absent) and ``Link`` the ``<link>`` text (``''`` when absent).

    Raises:
        IndexError: If ``urls`` has fewer entries than ``symbols``.
    """
    # Fail before any network traffic instead of partway through the loop.
    if len(urls) < len(symbols):
        raise IndexError(
            f"get_news_rss needs one URL per symbol: got {len(symbols)} symbols "
            f"but only {len(urls)} URLs"
        )

    all_news = []

    for ticker, news_url in zip(symbols, urls):
        try:
            response = requests.get(news_url, timeout=timeout)
            # Report HTTP errors (4xx/5xx) instead of parsing an error page.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'xml')

            items = soup.find_all('item')

            for item in items:
                title = item.title.text if item.title else ''
                pubdate = item.pubDate.text if item.pubDate else 'N/A'
                link = item.link.text if item.link else ''

                if title:
                    all_news.append((ticker, pubdate, title, link))

            # Counts every <item> in the feed, including any skipped for
            # lacking a title.
            print(f"{len(items)} news from {ticker}")

        except Exception as e:
            # Deliberately best-effort: one bad feed must not abort the run.
            print(f"Error fetching {ticker}: {e}")

    news = pd.DataFrame(all_news, columns=['Ticker', 'Date', 'Headline', 'Link'])
    return news


# ========== RUN ==========
symbols = ["BBCA", "BBRI", "TLKM", "ASII", "BMRI"]

# One search phrase per ticker, in the same order as `symbols`;
# every feed is limited to the last 7 days.
news_url = [
    get_google_news_url_rss(phrase, 7)
    for phrase in (
        "Bank BCA saham",
        "Bank BRI saham",
        "Telkom Indonesia saham",
        "Astra International saham",
        "Bank Mandiri saham",
    )
]

# Show the generated feed URL next to its ticker.
print("URLs generated:")
for i, url in enumerate(news_url):
    print(f"  {symbols[i]}: {url}")
print()

All_News = get_news_rss(symbols, news_url)
print(f"\nTotal: {len(All_News)} articles")
print(All_News)

# ========== SAVE TO CSV ==========
# 'utf-8-sig' writes the file with a UTF-8 byte-order mark.
filename = "indonesia_market_news.csv"
All_News.to_csv(filename, index=False, encoding='utf-8-sig')
print(f"\n✅ Data tersimpan ke: {filename}")

URLs generated:
  BBCA: https://news.google.com/rss/search?q=Bank+BCA+saham+when:7d&hl=id&gl=ID&ceid=ID:id
  BBRI: https://news.google.com/rss/search?q=Bank+BRI+saham+when:7d&hl=id&gl=ID&ceid=ID:id
  TLKM: https://news.google.com/rss/search?q=Telkom+Indonesia+saham+when:7d&hl=id&gl=ID&ceid=ID:id
  ASII: https://news.google.com/rss/search?q=Astra+International+saham+when:7d&hl=id&gl=ID&ceid=ID:id
  BMRI: https://news.google.com/rss/search?q=Bank+Mandiri+saham+when:7d&hl=id&gl=ID&ceid=ID:id

44 news from BBCA
42 news from BBRI
49 news from TLKM
39 news from ASII
57 news from BMRI

Total: 231 articles
    Ticker                           Date  \
0     BBCA  Wed, 19 Nov 2025 23:30:00 GMT   
1     BBCA  Thu, 20 Nov 2025 01:05:51 GMT   
2     BBCA  Mon, 17 Nov 2025 22:17:03 GMT   
3     BBCA  Sun, 16 Nov 2025 06:19:00 GMT   
4     BBCA  Thu, 20 Nov 2025 06:09:00 GMT   
..     ...                            ...   
226   BMRI  Tue, 18 Nov 2025 10:19:02 GMT   
227   BMRI  Fri, 14 Nov 2025 10:18