In [1]:
import time
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

In [3]:
#Web Crawling
def is_valid_url(url):
    """Return True when ``url`` carries both a scheme and a network location.

    Relative paths, scheme-only references (for example ``mailto:``) and
    protocol-relative ``//host`` references all yield False.
    """
    parts = urlparse(url)
    if not parts.scheme:
        return False
    return bool(parts.netloc)

def crawl(site, max_depth, allowed_prefix="https://news.detik.com/", timeout=10, delay=0):
    """Breadth-first crawl starting at ``site``, collecting in-scope links.

    Args:
        site: URL of the page to start from (depth 0).
        max_depth: Maximum number of link hops from ``site`` that get fetched.
        allowed_prefix: Only links whose absolute URL starts with this prefix
            are collected and followed.
        timeout: Seconds to wait for each HTTP response before giving up.
        delay: Seconds to sleep before each request; 0 disables the pause.

    Returns:
        Sorted list of the unique absolute URLs (fragment removed) that were
        found on the fetched pages and start with ``allowed_prefix``.
        Pages that fail to download or parse are reported with ``print`` and
        skipped.
    """
    visited = set()                 # URLs whose fetch has been attempted
    queued = {site}                 # URLs that have ever been put on the queue
    to_visit = deque([(site, 0)])   # FIFO queue gives breadth-first order
    news_links = set()

    while to_visit:
        current_url, depth = to_visit.popleft()
        if current_url in visited or depth > max_depth:
            continue
        # Mark before fetching so a URL that fails is never requested twice.
        visited.add(current_url)
        if delay:
            time.sleep(delay)

        try:
            response = requests.get(current_url, timeout=timeout)
            # Treat HTTP error statuses as failures instead of parsing error pages.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
        except Exception as e:
            print(f"Failed to process {current_url}: {e}")
            continue

        for link in soup.find_all('a', href=True):
            try:
                # Drop the fragment so "page" and "page#section" count once.
                full_url = urljoin(current_url, link['href']).split('#', 1)[0]
                if not is_valid_url(full_url):
                    continue
            except ValueError:
                # A malformed href must not discard the rest of the page's links.
                continue

            # Prefix match rather than substring match: a URL that merely
            # embeds the prefix (e.g. in a query string) is out of scope.
            if full_url in visited or not full_url.startswith(allowed_prefix):
                continue

            news_links.add(full_url)
            if depth < max_depth and full_url not in queued:
                queued.add(full_url)
                to_visit.append((full_url, depth + 1))

    # Sorted so repeated runs over the same pages give the same ordering.
    return sorted(news_links)


# Seed the crawler with the detik news front page and follow links up to
# three hops away, then show the collected URLs.
site = 'https://news.detik.com/'
news_links = crawl(site, max_depth=3)

print(news_links)


[]


In [90]:
import csv
import os
def _scrape_article(url, timeout=10):
    """Fetch one page and extract its title, lead image URL and body text.

    Args:
        url: Absolute URL of the page to scrape.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        ``[title, url, image_src, body_text]``, or ``None`` when the page
        cannot be downloaded or lacks the ``<h1>`` / article-body markup.
        ``image_src`` is the string ``"No Image"`` when no image is found.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to fetch {url}: {e}")
        return None

    soup = BeautifulSoup(response.text, 'lxml')

    heading = soup.h1
    # A class_ string containing a space is matched against the exact value
    # of the class attribute, so both names must appear in this order.
    body = soup.find('div', class_='detail__body-text itp_bodycontent')
    if heading is None or body is None:
        # Without this guard one page lacking the markup would abort the
        # whole run before the CSV is written.
        print(f"Skipping {url}: missing title or article body")
        return None

    title = heading.text.strip()

    # Lead image: fall back to the placeholder when the container, the <img>
    # tag, or its src attribute is absent.
    media = soup.find('div', class_='detail__media')
    image = media.find('img') if media is not None else None
    image_src = (image.get('src') if image is not None else None) or "No Image"

    # Join the text of every paragraph in the article body into one string.
    body_text = ' '.join(p.get_text() for p in body.find_all('p'))

    return [title, url, image_src, body_text]


all_news_data = []
for i, current_link in enumerate(news_links, start=1):
    article_row = _scrape_article(current_link)
    if article_row is None:
        continue

    print(article_row[0])
    print(i)
    print("=====")
    all_news_data.append(article_row)


csv_file_path = 'news_data.csv'
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['Title', 'News Link', 'Image Link', 'Content'])
    writer.writerows(all_news_data)


print("CSV created ")

TNI Jelaskan Motif Oknum Prajurit Aniaya Definus Kogoya Anggota KKB
=====
Menpan-RB: Jokowi Minta K/L Segera Integrasikan Layanan pada Portal Nasional
=====
Bagaimana Cuaca RI pada Jumat Agung 29 Maret 2024? Begini Prakiraan BMKG
=====
Momen Jaksa Geledah Rumah Mewah Helena Lim, Ada Brankas Isi Perhiasan
=====
Menteri LHK Dorong Penguatan Peran Polhut Jaga Hutan, Ini 5 Arahannya
=====
Menlu RI Harap Resolusi Gencatan Senjata DK PBB Segera Dijalankan di Gaza
=====
Wapres Ma'ruf-Gus Miftah Isi Ceramah Bukber Jokowi dan Kabinet di Istana
=====
Viral Maling Bercelurit Acak-acak Rumah di Cibinong, Bawa Kabur Uang THR
=====
Polisi Bakal Kawal Pemudik Motor di Merak-Bakauheni dan Ketapang-Gilimanuk
=====
Terungkap Siasat Culas Tersangka Campur BBM dengan Air di Bekasi
=====
Menko Hadi Minta Panglima TNI Hukum Prajurit Terbukti Aniaya KKB Papua
=====
DJKA Siapkan Kuota 12.180 Motor Gratis Mudik via KA
=====
Menko Hadi: Pemerintah Kutuk Serangan Teror di Moskow Rusia
=====
RI Akan Kirim Bantuan

In [1]:
import csv
import requests
import os
import string
import shutil

def clean_filename(filename):
    """Reduce an arbitrary title to a conservative ASCII file name.

    Slashes are replaced by underscores first; after that every character
    other than ASCII letters, digits, space and ``-_.()`` is dropped.
    """
    allowed = set("-_.() " + string.ascii_letters + string.digits)
    kept = [ch for ch in filename.replace('/', '_') if ch in allowed]
    return ''.join(kept)

# Download the lead image of every article listed in the CSV into
# image_directory, naming each file after the sanitised article title.
image_directory = 'images/'
os.makedirs(image_directory, exist_ok=True)

csv_file = 'news_data.csv'

# newline='' lets the csv module itself interpret line breaks that occur
# inside quoted fields such as the article text.
with open(csv_file, 'r', encoding='utf-8', newline='') as csv_handle:
    reader = csv.DictReader(csv_handle)
    for row in reader:
        news_title = row['Title']
        img_src = row['Image Link']

        # The scraping step stores this placeholder when an article has no
        # image; there is nothing to download for such rows.
        if img_src == "No Image":
            continue

        filename = clean_filename(news_title.strip()) + '.jpg'
        save_path = os.path.join(image_directory, filename)

        try:
            response = requests.get(img_src, timeout=10)
            if response.status_code == 200:
                # Separate name so the CSV handle being iterated is not rebound.
                with open(save_path, 'wb') as image_file:
                    image_file.write(response.content)
            else:
                print(f"Failed to download {img_src} - Status code: {response.status_code}")
        except requests.RequestException as e:
            print(f"Error downloading {img_src} - Error: {e}")
