## Crawl Detik

Program ini, "Detik News Scraper", dirancang untuk mengambil informasi dari situs detik.com terkait topik berita yang diinginkan. Pengguna memasukkan topik, program mengambil URL berita, dan kemudian ekstrak informasi seperti judul, penulis, tanggal, dan isi berita.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

class Detik:
    def __init__(self, topic):
        self.topic = topic
        self.df = None  # Inisialisasi DataFrame sebagai None

    def get_urls(self):
        news_links = []
        # get news URL from page 1
        page = 1
        url = f"https://www.detik.com/search/searchall?query={self.topic}&siteid=2&sortby=time&page={page}"
        html_page = requests.get(url).content
        soup = BeautifulSoup(html_page, 'lxml')
        articles = soup.find_all('article')

        # Ambil hanya satu URL berita, jika ada
        if articles:
            url = articles[0].find('a')['href']
            news_links.append(url)

        return news_links

    def has_link(self, text):
        # Fungsi untuk memeriksa apakah teks mengandung tautan
        return 'href=' in text

    def extract_news(self):
        # get news article details from scraped URLs
        scraped_info = []
        for news in self.get_urls():
            source = news
            html_page = requests.get(news).content
            soup = BeautifulSoup(html_page, 'lxml')
            # check if title, author, date, news div, is not None type
            title = soup.find('h1', class_='detail__title')
            if title is not None:
                title = title.text
                title = title.replace('\n', '')
                title = title.strip()

            author = soup.find('div', class_='detail__author')
            if author is not None:
                author = author.text

            date = soup.find('div', class_='detail__date')
            if date is not None:
                date = date.text

            # Ambil isi berita dari div dengan class 'detail__body-text itp_bodycontent'
            content_div = soup.find("div", {"class": "detail__body-text itp_bodycontent"})
            if content_div:
                # Hilangkan elemen-elemen <a> yang merupakan tautan
                for a_tag in content_div.find_all(self.has_link):
                    a_tag.decompose()

                # Ambil teks dari div
                news_content = ' '.join(content_div.stripped_strings)

                # convert scraped data into a dictionary
                news_data = {
                    "url": source,
                    "judul": title,
                    "penulis": author,
                    "tanggal": date,
                    "isi": news_content
                }
                # add dictionaries to a list
                scraped_info.append(news_data)

        self.df = pd.DataFrame.from_dict(scraped_info)
        self.df.to_csv(f'{self.topic}.csv', index=False)  # Simpan DataFrame ke dalam file CSV

# Input topik berita
topic = input("Masukkan topik berita yang ingin diambil: ")
detik_crawler = Detik(topic)
detik_crawler.extract_news()

# Setelah DataFrame df dibuat dalam metode extract_news, Anda dapat mengaksesnya di sini
detik_crawler.df


Masukkan topik berita yang ingin diambil: olahraga


Unnamed: 0,url,judul,penulis,tanggal,isi
0,https://www.detik.com/jatim/berita/d-7047627/s...,SBY Sebut Persahabatannya dengan Prabowo Terja...,Faiq Azmi - detikJatim,"Senin, 20 Nov 2023 20:36 WIB",Madiun - Ketua Majelis Tinggi Partai Demokrat ...


## Ekstraksi Kalimat

In [13]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import string
import re

nltk.download('punkt')
nltk.download('stopwords')

# Fungsi untuk melakukan tokenisasi kata, konversi ke huruf kecil, dan menghapus tanda baca dan angka
def extract_and_preprocess_words(text):
    stop_words = set(stopwords.words('indonesian'))
    words = word_tokenize(text)
    preprocessed_words = [word.lower() for word in words if (word.isalpha() and word.lower() not in stop_words)]

    return preprocessed_words

# Buat DataFrame baru untuk kata-kata
kata_df = pd.DataFrame(columns=['berita_id', 'kata'])

# Ekstrak dan proses setiap berita
for i in range(len(detik_crawler.df)):
    berita_id = i
    berita = detik_crawler.df.loc[i, 'isi']
    kata_berita = extract_and_preprocess_words(berita)

    # Tambahkan setiap kata ke DataFrame
    for kata in kata_berita:
        kata_df = pd.concat([kata_df, pd.DataFrame({'berita_id': [berita_id], 'kata': [kata]})], ignore_index=True)

# Tampilkan DataFrame dengan kolom 'berita_id' dan 'kata'
print(kata_df[['berita_id', 'kata']])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


    berita_id      kata
0           0    madiun
1           0     ketua
2           0   majelis
3           0    partai
4           0  demokrat
..        ...       ...
207         0   pilpres
208         0       sby
209         0   prabowo
210         0   politik
211         0    madiun

[212 rows x 2 columns]


## TF-IDF

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Inisialisasi objek TfidfVectorizer untuk token kata
tfidf_vectorizer_kata = TfidfVectorizer(tokenizer=None, preprocessor=None)  # Sesuaikan max_features sesuai kebutuhan

# Transforms kata menjadi matriks TF-IDF
tfidf_matrix_kata = tfidf_vectorizer_kata.fit_transform(kata_df['kata'])

# Dapatkan daftar fitur (kata-kata) yang terkandung dalam TF-IDF
feature_names_kata = tfidf_vectorizer_kata.get_feature_names_out()

# Buat DataFrame dari matriks TF-IDF dengan nama fitur sebagai kolom
tfidf_df_kata = pd.DataFrame(tfidf_matrix_kata.toarray(), columns=feature_names_kata)

# Tampilkan DataFrame TF-IDF
print(tfidf_df_kata)


     acara  advertisement  ahy  air  akabri  akademi  aksi  aktif  aston  \
0      0.0            0.0  0.0  0.0     0.0      0.0   0.0    0.0    0.0   
1      0.0            0.0  0.0  0.0     0.0      0.0   0.0    0.0    0.0   
2      0.0            0.0  0.0  0.0     0.0      0.0   0.0    0.0    0.0   
3      0.0            0.0  0.0  0.0     0.0      0.0   0.0    0.0    0.0   
4      0.0            0.0  0.0  0.0     0.0      0.0   0.0    0.0    0.0   
..     ...            ...  ...  ...     ...      ...   ...    ...    ...   
207    0.0            0.0  0.0  0.0     0.0      0.0   0.0    0.0    0.0   
208    0.0            0.0  0.0  0.0     0.0      0.0   0.0    0.0    0.0   
209    0.0            0.0  0.0  0.0     0.0      0.0   0.0    0.0    0.0   
210    0.0            0.0  0.0  0.0     0.0      0.0   0.0    0.0    0.0   
211    0.0            0.0  0.0  0.0     0.0      0.0   0.0    0.0    0.0   

     baca  ...  timur  titip  tni   to  tugas  turun  video  wilayah  with  \
0     0.0

## Matrix Cosinus Similarity

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# Ambil matriks TF-IDF dari DataFrame untuk kata
tfidf_matrix_kata = tfidf_df_kata.values

# Hanya ambil kata-kata yang muncul dalam setidaknya satu dokumen
non_empty_words = tfidf_df_kata.columns[tfidf_df_kata.sum(axis=0) > 0]
tfidf_matrix_kata_filtered = tfidf_df_kata[non_empty_words].values

# Hitung kemiripan kosinus antara kata-kata
cosine_similarity_kata = cosine_similarity(tfidf_matrix_kata_filtered, tfidf_matrix_kata_filtered)

# Buat DataFrame dari matriks kemiripan kosinus
cosine_similarity_df_kata = pd.DataFrame(cosine_similarity_kata, columns=non_empty_words, index=non_empty_words)


ValueError: ignored

## Membentuk Graph

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Buat graf
G_kata = nx.Graph()

# Tambahkan simpul (node) untuk setiap kata
for kata in feature_names_kata:
    G_kata.add_node(kata, label=kata)

# Tambahkan tepi (edge) antara kata-kata berdasarkan skor kemiripan kosinus
for i, kata_i in enumerate(feature_names_kata):
    for j, kata_j in enumerate(feature_names_kata):
        if i < j:  # Hindari menghitung skor untuk diri sendiri dan duplikat
            similarity_score = cosine_similarity_df_kata.loc[kata_i, kata_j]
            G_kata.add_edge(kata_i, kata_j, weight=similarity_score)

# Dapatkan label dari simpul-simpul
labels_kata = nx.get_node_attributes(G_kata, 'label')

# Gambar graf kata-kata
pos_kata = nx.spring_layout(G_kata)
weights_kata = [G_kata[u][v]['weight'] for u, v in G_kata.edges]
nx.draw(G_kata, pos_kata, with_labels=True, labels=labels_kata, width=weights_kata, edge_cmap=plt.cm.viridis, node_color='skyblue')

plt.show()

## Closeness, Eigenvector, Pagerank, Betweenness Centrality

In [None]:
closeness_centrality = nx.closeness_centrality(G)
eigenvector_centrality = nx.eigenvector_centrality(G)
pagerank_centrality = nx.pagerank(G)
betweenness_centrality = nx.pagerank(G)

## Closeness Centrality

In [None]:
# Tampilkan closeness centrality dari nilai tertinggi ke terendah
sorted_closeness = sorted(closeness_centrality.items(), key=lambda x: x[1], reverse=True)

print("Closeness Similarity Scores (Dari Tertinggi ke Terendah):")
for node, score in sorted_closeness:
    print(f"{node}: {score}")

# Tampilkan 3 kalimat teratas dari nilai tertinggi
top_3_nodes = sorted_closeness[:3]

print("Tiga Kalimat Teratas dari Nilai Tertinggi Closeness Centrality:")
for node, _ in top_3_nodes:
    kalimat_index = int(node.split("#")[1])  # Mendapatkan indeks kalimat dari simpul
    kalimat = kalimat_df['kalimat'][kalimat_index]
    print(f"Kalimat #{kalimat_index}: {kalimat}")


## Pagerank Centrality

In [None]:
sorted_pagerank = sorted(pagerank_centrality.items(), key=lambda x: x[1], reverse=True)

print("Pagerank Scores (Dari Tertinggi ke Terendah):")
for node, score in sorted_pagerank:
    print(f"{node}: {score}")

# Tampilkan 3 kalimat teratas dari nilai tertinggi
top_3_nodes = sorted_pagerank[:3]

print("Tiga Kalimat Teratas dari Nilai Tertinggi Closeness Centrality:")
for node, _ in top_3_nodes:
    kalimat_index = int(node.split("#")[1])  # Mendapatkan indeks kalimat dari simpul
    kalimat = kalimat_df['kalimat'][kalimat_index]
    print(f"Kalimat #{kalimat_index}: {kalimat}")

## Eigenvector Centrality

In [None]:
sorted_eigenvector = sorted(eigenvector_centrality.items(), key=lambda x: x[1], reverse=True)

print("eigenvector Scores (Dari Tertinggi ke Terendah):")
for node, score in sorted_eigenvector:
    print(f"{node}: {score}")

# Tampilkan 3 kalimat teratas dari nilai tertinggi
top_3_nodes = sorted_pagerank[:3]

print("Tiga Kalimat Teratas dari Nilai Tertinggi Closeness Centrality:")
for node, _ in top_3_nodes:
    kalimat_index = int(node.split("#")[1])  # Mendapatkan indeks kalimat dari simpul
    kalimat = kalimat_df['kalimat'][kalimat_index]
    print(f"Kalimat #{kalimat_index}: {kalimat}")

## Betweenes Centrality

In [None]:
sorted_betweeness = sorted(betweenness_centrality.items(), key=lambda x: x[1], reverse=True)

print("eigenvector Scores (Dari Tertinggi ke Terendah):")
for node, score in sorted_betweeness:
    print(f"{node}: {score}")

# Tampilkan 3 kalimat teratas dari nilai tertinggi
top_3_nodes = sorted_betweeness[:3]

print("Tiga Kalimat Teratas dari Nilai Tertinggi Closeness Centrality:")
for node, _ in top_3_nodes:
    kalimat_index = int(node.split("#")[1])  # Mendapatkan indeks kalimat dari simpul
    kalimat = kalimat_df['kalimat'][kalimat_index]
    print(f"Kalimat #{kalimat_index}: {kalimat}")


