## Scraping IQPlus

Notebook ini melakukan scraping link artikel berita yang ada di http://www.iqplus.info/ menggunakan Selenium dan menyimpannya dalam file .txt, kemudian menggunakan BeautifulSoup untuk mengambil konten teks artikel dari daftar link tersebut dan menyimpannya ke file JSON.

### Import library yang dibutuhkan

In [None]:
import json
import os
import random
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

### Fungsi untuk scraping link di IQPlus:

In [2]:
def getlinks(page, category, timeout=10):
    """Scrape article links from one IQPlus listing page into a text file.

    Opens ``http://www.iqplus.info/news/<category>/go-to-page,<page>.html`` in
    a headless Chrome session, waits for the ``<ul>`` that follows the
    "NEWS MORE" heading, and appends every distinct ``href`` found inside it
    to ``<category>_links.txt`` (one URL per line, in page order).

    Args:
        page: Page number inserted into the listing URL.
        category: Category slug used both in the URL and in the output file
            name (this notebook passes "stock_news" and "market_news").
        timeout: Seconds to wait for the link list to appear in the DOM.

    Returns:
        None. Any error raised while loading or scraping the page is printed
        (together with the page and category) instead of being raised, so a
        single bad page does not abort a multi-page crawl.
    """
    # Set up a headless Chrome session for this page.
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()), options=options
    )

    url = f"http://www.iqplus.info/news/{category}/go-to-page,{page}.html"
    links_path = f"{category}_links.txt"

    try:
        # Navigate inside the try block so the browser is always closed by
        # the ``finally`` clause, even when the page itself fails to load.
        driver.get(url)

        # Block until the list under the "... NEWS MORE" heading is present.
        news_list = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located(
                (By.XPATH, "//h2[contains(text(),'NEWS MORE')]/following-sibling::ul")
            )
        )

        hrefs = (
            anchor.get_attribute("href")
            for anchor in news_list.find_elements(By.TAG_NAME, "a")
        )
        # dict.fromkeys de-duplicates while keeping the order in which the
        # links appear on the page (a set would write them in arbitrary order).
        article_links = list(dict.fromkeys(href for href in hrefs if href))

        # Append so that successive pages accumulate in the same file.
        with open(links_path, "a", encoding="utf-8") as file:
            file.writelines(f"{link}\n" for link in article_links)

        print("links from page", page, f"have been added to {links_path}")

    except Exception as e:
        # Name the failing page: Selenium timeouts carry an empty message, so
        # without this the log does not show which page has to be re-run.
        print(f"Error on page {page} ({category}): {e}")

    finally:
        driver.quit()

### Scraping link Stock News

In [None]:
# Walk the stock-news listing pages from page 141 down to page 0.
for page_number in reversed(range(142)):
    getlinks(page_number, "stock_news")

links from page 141 have been added to stock_news_links.txt
links from page 140 have been added to stock_news_links.txt
links from page 139 have been added to stock_news_links.txt
links from page 138 have been added to stock_news_links.txt
links from page 137 have been added to stock_news_links.txt
links from page 136 have been added to stock_news_links.txt
links from page 135 have been added to stock_news_links.txt
links from page 134 have been added to stock_news_links.txt
links from page 133 have been added to stock_news_links.txt
links from page 132 have been added to stock_news_links.txt
links from page 131 have been added to stock_news_links.txt
links from page 130 have been added to stock_news_links.txt
links from page 129 have been added to stock_news_links.txt
links from page 128 have been added to stock_news_links.txt
links from page 127 have been added to stock_news_links.txt
links from page 126 have been added to stock_news_links.txt
links from page 125 have been added to s

### Scraping link Market News

In [4]:
# Walk the market-news listing pages from page 163 down to page 0.
for page_number in reversed(range(164)):
    getlinks(page_number, "market_news")

links from page 163 have been added to market_news_links.txt
links from page 162 have been added to market_news_links.txt
links from page 161 have been added to market_news_links.txt
links from page 160 have been added to market_news_links.txt
links from page 159 have been added to market_news_links.txt
Error: Message: 
Stacktrace:
	GetHandleVerifier [0x00B94CA3+225091]
	(No symbol) [0x00AC4DF1]
	(No symbol) [0x00969A7A]
	(No symbol) [0x009A175B]
	(No symbol) [0x009A188B]
	(No symbol) [0x009D7882]
	(No symbol) [0x009BF5A4]
	(No symbol) [0x009D5CB0]
	(No symbol) [0x009BF2F6]
	(No symbol) [0x009979B9]
	(No symbol) [0x0099879D]
	sqlite3_dbdata_init [0x01009A43+4064547]
	sqlite3_dbdata_init [0x0101104A+4094762]
	sqlite3_dbdata_init [0x0100B948+4072488]
	sqlite3_dbdata_init [0x00D0C9A9+930953]
	(No symbol) [0x00AD07C4]
	(No symbol) [0x00ACACE8]
	(No symbol) [0x00ACAE11]
	(No symbol) [0x00ABCA80]
	BaseThreadInitThunk [0x752A5D49+25]
	RtlInitializeExceptionChain [0x7756CE3B+107]
	RtlGetAppCon

### Save Data ke JSON

In [None]:
def savejson(new_data, json_file):
    """Append one record to a JSON file that stores a list of records.

    The current contents of ``json_file`` are loaded, ``new_data`` is
    appended, and the whole list is written back. When the file does not
    exist, cannot be read, is not valid JSON, or does not contain a list,
    the function starts from an empty list instead.

    The new contents are serialized in memory and written to a temporary
    sibling file that then replaces ``json_file``, so a failure while
    serializing or writing never truncates the records already on disk.

    Args:
        new_data: Record to append. Must be JSON-serializable and support
            ``new_data['title']``, which is used in the success message.
        json_file: Path of the JSON file to update.

    Returns:
        None. Read and write errors (``OSError``) are reported via ``print``.

    Raises:
        Whatever ``json.dumps`` raises for data it cannot serialize (for
        example ``TypeError``); the existing file is left untouched then.
    """
    data = []  # used when the file is missing, unreadable, or not a list
    if os.path.exists(json_file):
        try:
            with open(json_file, "r", encoding="utf-8") as file:
                loaded = json.load(file)
        except (json.JSONDecodeError, OSError) as e:
            # Corrupt or unreadable file: report it and start a new list.
            print(f"⚠️ Terjadi kesalahan saat membaca {json_file}: {e}")
        else:
            # Only a top-level list can be appended to.
            if isinstance(loaded, list):
                data = loaded

    data.append(new_data)

    # Serialize before touching the disk: if this raises, nothing is lost.
    payload = json.dumps(data, ensure_ascii=False, indent=4)

    tmp_path = f"{json_file}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as file:
            file.write(payload)
        # Swap the finished file into place in a single step.
        os.replace(tmp_path, json_file)
    except OSError as e:
        print(f"❌ Gagal menyimpan data ke {json_file}: {e}")
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # the temporary file was never created or is already gone
        return

    print(f"✅ Data dengan judul '{new_data['title']}' berhasil disimpan ke {json_file}")


# Fetch one article page and store its parsed content.
def get_data(url, json_file, max_retries=5, timeout=10):
    """Download one article, parse it, and append it to a JSON file.

    The page is requested with a browser-like User-Agent. The text of
    ``<div id="zoomthis">`` is split into non-empty lines: the first line is
    stored as ``date``, the second as ``title``, and the remaining lines
    joined by spaces as ``text``. The record (plus ``url``) is handed to
    ``savejson``.

    Args:
        url: Article URL to fetch. Empty or whitespace-only values are skipped.
        json_file: Path of the JSON file passed on to ``savejson``.
        max_retries: Maximum number of attempts when the request raises a
            ``requests`` exception.
        timeout: Per-request timeout in seconds.

    Returns:
        None. Every outcome (saved, missing content, incomplete data, bad
        status code, connection failure) is reported via ``print``.
    """
    if not url or not url.strip():
        # A blank line in a links file is not a URL; without this guard it
        # would be retried max_retries times with a sleep between attempts.
        print("⚠️ URL kosong, dilewati")
        return

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }

    for attempt in range(1, max_retries + 1):
        # Only the network call is retried; parsing and saving are not.
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException as e:
            print(f"⚠️ Koneksi gagal untuk {url} (Percobaan {attempt}/{max_retries}): {e}")
            if attempt < max_retries:
                # Random pause before the next attempt; pointless after the last.
                time.sleep(random.uniform(2, 5))
            continue

        if response.status_code != 200:
            # The server answered but refused the request: do not retry.
            print(f"❌ Request gagal untuk {url} dengan status: {response.status_code}")
            return

        soup = BeautifulSoup(response.text, "html.parser")

        # The article body lives in <div id="zoomthis">.
        content_element = soup.find("div", id="zoomthis")
        if not content_element:
            print(f"⚠️ Konten tidak ditemukan untuk URL: {url}")
            return

        content = content_element.get_text(separator="\n").strip()
        # Keep only the lines that contain something other than whitespace.
        raw = [line for line in content.split("\n") if line.strip()]

        # At least date, title, and one line of body text are required.
        if len(raw) < 3:
            print(f"⚠️ Data tidak lengkap di {url}")
            return

        title = raw[1]
        new_data = {
            "date": raw[0],
            "title": title,
            "text": " ".join(raw[2:]).strip(),
            "url": url,
        }

        savejson(new_data, json_file)
        print(f"✅ Data berhasil disimpan untuk: {title}")
        return  # success: no further attempts needed

    print(f"❌ Gagal mengambil data setelah {max_retries} percobaan: {url}")


### Buka daftar link, scrape artikel, lalu simpan ke JSON

In [None]:
# Open the file and read line by line
# Scrape every market-news article listed in the master link file.
with open("market_news_links_master.txt", "r", encoding="utf-8") as links_file:
    for raw_line in links_file:
        # Each line holds one URL; strip the trailing newline and stray spaces.
        article_url = raw_line.strip()
        get_data(article_url, "NEW_articles_market_news.json")


In [None]:
# Open the file and read line by line
# Scrape every stock-news article listed in the link file.
with open("stock_news_links.txt", "r", encoding="utf-8") as links_file:
    for raw_line in links_file:
        # get_data requires the output JSON path as its second argument;
        # the file name mirrors the one used for the market-news articles.
        get_data(raw_line.strip(), "NEW_articles_stock_news.json")
