# Crawling Berita

In [40]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

## Scraping Berita

In [41]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urlparse, urljoin

def dapatkan_kategori_berita():
    """
    Fungsi untuk mengambil daftar semua kategori berita dari menu navigasi
    website bangsaonline.com.
    """
    print("Mencari kategori berita di bangsaonline.com...")
    kategori_list = {}
    url_home = "https://bangsaonline.com/"
    try:
        response = requests.get(url_home, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        
        nav_menu = soup.select_one('ul#nav')
        if not nav_menu:
            print("Menu navigasi (ul#nav) tidak ditemukan.")
            return {}

        for item in nav_menu.find_all("a"):
            href = item.get("href")
            nama_kategori = item.get_text(strip=True)
            
            if href and nama_kategori:
                path_parts = urlparse(href).path.strip("/").split("/")
                
                if len(path_parts) == 2 and path_parts[0] == 'kanal':
                    url_lengkap = urljoin(url_home, href)
                    if nama_kategori not in kategori_list:
                        kategori_list[nama_kategori] = url_lengkap

        print(f"Ditemukan {len(kategori_list)} kategori berita valid.")
        
    except requests.exceptions.RequestException as e:
        print(f"Gagal mengambil daftar kategori berita: {e}")
        
    return kategori_list

def scrape_semua_berita():
    """
    Fungsi utama untuk melakukan scraping berita dari semua kategori yang ditemukan.
    """
    daftar_kategori = dapatkan_kategori_berita()

    if not daftar_kategori:
        print("Tidak ada kategori yang bisa di-scrape. Program berhenti.")
        return

    data_berita = []
    scraped_links = set()
    url_home = "https://bangsaonline.com/"

    for nama_kategori, url_kategori in daftar_kategori.items():
        print(f"\n--- Scraping Kategori: {nama_kategori.upper()} ---")
        artikel_diambil = 0
        
        try:
            response_kategori = requests.get(url_kategori, timeout=10)
            response_kategori.raise_for_status()
            soup_kategori = BeautifulSoup(response_kategori.text, "html.parser")

            # --- SELECTOR DIKEMBALIKAN SESUAI PERMINTAAN ---
            list_artikel = soup_kategori.select("h3.entry-title a")
            
            if not list_artikel:
                print("Tidak ada tautan artikel ditemukan di halaman ini.")
                continue

            for artikel in list_artikel:
                # Anda bisa mengubah angka 2 ini jika ingin scrape lebih banyak per kategori
                if artikel_diambil >= 2:
                    break
                
                link_parsial = artikel.get("href")
                if not link_parsial:
                    continue
                
                link = urljoin(url_home, link_parsial)
                
                if link in scraped_links:
                    continue

                scraped_links.add(link)

                try:
                    resp_detail = requests.get(link, timeout=10)
                    resp_detail.raise_for_status()
                    soup_detail = BeautifulSoup(resp_detail.text, "html.parser")

                    # --- SELECTOR DIKEMBALIKAN SESUAI PERMINTAAN ---
                    judul_element = soup_detail.select_one("h1.entry-title")
                    konten_berita = soup_detail.select_one("div.post")
                    
                    if judul_element and konten_berita:
                        judul = judul_element.get_text(strip=True)
                        
                        id_berita = None
                        try:
                            path_parts = urlparse(link).path.strip("/").split("/")
                            if len(path_parts) > 1 and path_parts[1].isdigit():
                                id_berita = path_parts[1]
                        except (IndexError, AttributeError):
                            id_berita = None
                        
                        for unwanted in konten_berita.select("div.baca-juga"):
                            unwanted.decompose()
                        
                        paragraf = [p.get_text(strip=True) for p in konten_berita.select("p")]
                        isi = " ".join(paragraf)

                        if isi:
                            data_berita.append({
                                "id_berita": id_berita,
                                "kategori": nama_kategori,
                                "judul": judul,
                                "isi_berita": isi,
                                "link": link
                            })
                            artikel_diambil += 1
                            print(f"({artikel_diambil}/2) Berhasil scrape: {judul[:60]}...")
                    
                    time.sleep(1)

                except requests.exceptions.RequestException as e:
                    print(f"  -> Gagal mengambil detail dari {link}: {e}")

        except requests.exceptions.RequestException as e:
            print(f"Gagal memproses halaman kategori {url_kategori}: {e}")
            
    if not data_berita:
        print("\nTidak ada berita yang berhasil di-scrape.")
        return

    df = pd.DataFrame(data_berita)
    df = df[["id_berita", "kategori", "judul", "isi_berita", "link"]]
    
    df.to_csv("hasil_scraping_berita.csv", index=False, encoding="utf-8-sig")
    print(f"\n✅ Proses scraping selesai. {len(df)} berita disimpan ke 'hasil_scraping_berita.csv'")
    
    return df

# --- Untuk Menjalankan Seluruh Proses Scraping ---
if __name__ == "__main__":
    df_hasil = scrape_semua_berita()
    if df_hasil is not None:
        pd.set_option('display.max_colwidth', 100)
        print("\nContoh hasil data:")

Mencari kategori berita di bangsaonline.com...
Ditemukan 37 kategori berita valid.

--- Scraping Kategori: JATIM ---
(1/2) Berhasil scrape: Komisi II DPRD Tuban dan PT PRPP Gelar Raker soal CSR Beasis...
(2/2) Berhasil scrape: RLD Gelar Pelatihan Ajarkan Pelaku UMKM Membangun Jejak Digi...

--- Scraping Kategori: JATIM METRO ---
(1/2) Berhasil scrape: Baznas Salurkan Rp300 Juta untuk Rekonstruksi Musala Ponpes ...
(2/2) Berhasil scrape: Senator Lia Istifhama Turut RasakanDuka Para Wali Santri Al ...

--- Scraping Kategori: JATIM TENGAH ---
(1/2) Berhasil scrape: Hadiri Pengukuhan Pengurus Kormi Kota Kediri, Gus Qowim Doro...
(2/2) Berhasil scrape: Teguhkan Komitmen Kebangsaan, Polres Kediri Kota Gelar Upaca...

--- Scraping Kategori: JATIM UTARA ---
(1/2) Berhasil scrape: Polres Tuban Catat 31 Rumah di 2 Desa Rusak Akibat Puting Be...
(2/2) Berhasil scrape: Begini Cara BPJS Kesehatan Bayar Biaya Kesehatan Peserta di ...

--- Scraping Kategori: JATIM SELATAN ---
(1/2) Berhasil scrape: B

In [42]:
df_hasil.head()

Unnamed: 0,id_berita,kategori,judul,isi_berita,link
0,153069,Jatim,Komisi II DPRD Tuban dan PT PRPP Gelar Raker soal CSR Beasiswa,"TUBAN,BANGSAONLINE.com- Komisi II DPRD Tuban bersama sejumlah OPD, PT Pertamina Rosneft Pengolah...",https://bangsaonline.com/berita/153069/komisi-ii-dprd-tuban-dan-pt-prpp-gelar-raker-soal-csr-bea...
1,153068,Jatim,RLD Gelar Pelatihan Ajarkan Pelaku UMKM Membangun Jejak Digital,"SURABAYA,BANGSAONLINE.com- Rumah Literasi Digital (RLD) Surabaya menggelar pelatihan intensif di...",https://bangsaonline.com/berita/153068/rld-gelar-pelatihan-ajarkan-pelaku-umkm-membangun-jejak-d...
2,153086,Jatim Metro,Baznas Salurkan Rp300 Juta untuk Rekonstruksi Musala Ponpes Al Khoziny Sidoarjo,"SURABAYA, BANGSAONLINE.com- Baznas RI menyalurkan bantuan awal sebesar Rp300 juta dan mengerahka...",https://bangsaonline.com/berita/153086/baznas-salurkan-rp300-juta-untuk-rekonstruksi-musala-ponp...
3,153067,Jatim Metro,Senator Lia Istifhama Turut RasakanDuka Para Wali Santri Al Khoziny,"SURABAYA,BANGSAONLINE.com- Bangunan musala lantai tiga Pondok Pesantren Al Khoziny, Siwalan Panj...",https://bangsaonline.com/berita/153067/senator-lia-istifhama-turut-rasakanduka-para-wali-santri-...
4,153053,Jatim Tengah,"Hadiri Pengukuhan Pengurus Kormi Kota Kediri, Gus Qowim Dorong Olahraga untuk Semua Usia","KOTA KEDIRI, BANGSAONLINE.com- Wakil Wali Kota Kediri, Qowimuddin Thoha atau yang akrab disapa G...",https://bangsaonline.com/berita/153053/hadiri-pengukuhan-pengurus-kormi-kota-kediri-gus-qowim-do...


## Preprocesing Berita

In [43]:
import pandas as pd

# Load data from berita
df = pd.read_csv('hasil_scraping_berita.csv')

In [44]:
# Display basic information about the dataset
print("Dataset shape:", df.shape)
df

Dataset shape: (74, 5)


Unnamed: 0,id_berita,kategori,judul,isi_berita,link
0,153069,Jatim,Komisi II DPRD Tuban dan PT PRPP Gelar Raker soal CSR Beasiswa,"TUBAN,BANGSAONLINE.com- Komisi II DPRD Tuban bersama sejumlah OPD, PT Pertamina Rosneft Pengolah...",https://bangsaonline.com/berita/153069/komisi-ii-dprd-tuban-dan-pt-prpp-gelar-raker-soal-csr-bea...
1,153068,Jatim,RLD Gelar Pelatihan Ajarkan Pelaku UMKM Membangun Jejak Digital,"SURABAYA,BANGSAONLINE.com- Rumah Literasi Digital (RLD) Surabaya menggelar pelatihan intensif di...",https://bangsaonline.com/berita/153068/rld-gelar-pelatihan-ajarkan-pelaku-umkm-membangun-jejak-d...
2,153086,Jatim Metro,Baznas Salurkan Rp300 Juta untuk Rekonstruksi Musala Ponpes Al Khoziny Sidoarjo,"SURABAYA, BANGSAONLINE.com- Baznas RI menyalurkan bantuan awal sebesar Rp300 juta dan mengerahka...",https://bangsaonline.com/berita/153086/baznas-salurkan-rp300-juta-untuk-rekonstruksi-musala-ponp...
3,153067,Jatim Metro,Senator Lia Istifhama Turut RasakanDuka Para Wali Santri Al Khoziny,"SURABAYA,BANGSAONLINE.com- Bangunan musala lantai tiga Pondok Pesantren Al Khoziny, Siwalan Panj...",https://bangsaonline.com/berita/153067/senator-lia-istifhama-turut-rasakanduka-para-wali-santri-...
4,153053,Jatim Tengah,"Hadiri Pengukuhan Pengurus Kormi Kota Kediri, Gus Qowim Dorong Olahraga untuk Semua Usia","KOTA KEDIRI, BANGSAONLINE.com- Wakil Wali Kota Kediri, Qowimuddin Thoha atau yang akrab disapa G...",https://bangsaonline.com/berita/153053/hadiri-pengukuhan-pengurus-kormi-kota-kediri-gus-qowim-do...
...,...,...,...,...,...
69,153055,Peristiwa,Ada Santri yang Diduga Dihukum untuk Cor Bangunan Musala Ambruk Ponpes Al Khoziny,"SIDOARJO,BANGSAONLINE.com- Proses pencarian korban para santri tertimbun reruntuhan musala ambru...",https://bangsaonline.com/berita/153055/ada-santri-yang-diduga-dihukum-untuk-cor-bangunan-musala-...
70,152633,Teknologi,"Rumor iPhone Fold Terbaru yang Mirip iPhone Air, Bagaimana Spesifikasinya?","SURABAYA, BANGSAONLINE.com- Belakangan ini isu kehadiran iPhone layar lipat semakin kencang bere...",https://bangsaonline.com/berita/152633/rumor-iphone-fold-terbaru-yang-mirip-iphone-air-bagaimana...
71,152577,Teknologi,Tips Menjaga Battery Health iPhone Agar Tidak Cepat Turun,"BANGSAONLINE.com– Bagi kalian para pengguna iPhone, mungkin pernah mengeluhkan baterai iPhone bo...",https://bangsaonline.com/berita/152577/tips-menjaga-battery-health-iphone-agar-tidak-cepat-turun
72,153002,Komunitas dan Lingkungan,"Perkuat Sinergi dan Apresiasi Program Ecopark, PWI Tuban Kunjungi PT Semen Indonesia","TUBAN, BANGSAONLINE.com- PWI Tuban melakukan kunjungan kerja ke PT Semen Indonesia (Persero) Tbk...",https://bangsaonline.com/berita/153002/perkuat-sinergi-dan-apresiasi-program-ecopark-pwi-tuban-k...


In [45]:
print("\nDataset info:")
df.info()


Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74 entries, 0 to 73
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id_berita   74 non-null     int64 
 1   kategori    74 non-null     object
 2   judul       74 non-null     object
 3   isi_berita  74 non-null     object
 4   link        74 non-null     object
dtypes: int64(1), object(4)
memory usage: 3.0+ KB


In [46]:
# Tampilkan data "isi"
df['isi_berita']

0     TUBAN,BANGSAONLINE.com- Komisi II DPRD Tuban bersama sejumlah OPD, PT Pertamina Rosneft Pengolah...
1     SURABAYA,BANGSAONLINE.com- Rumah Literasi Digital (RLD) Surabaya menggelar pelatihan intensif di...
2     SURABAYA, BANGSAONLINE.com- Baznas RI menyalurkan bantuan awal sebesar Rp300 juta dan mengerahka...
3     SURABAYA,BANGSAONLINE.com- Bangunan musala lantai tiga Pondok Pesantren Al Khoziny, Siwalan Panj...
4     KOTA KEDIRI, BANGSAONLINE.com- Wakil Wali Kota Kediri, Qowimuddin Thoha atau yang akrab disapa G...
                                                     ...                                                 
69    SIDOARJO,BANGSAONLINE.com- Proses pencarian korban para santri tertimbun reruntuhan musala ambru...
70    SURABAYA, BANGSAONLINE.com- Belakangan ini isu kehadiran iPhone layar lipat semakin kencang bere...
71    BANGSAONLINE.com– Bagi kalian para pengguna iPhone, mungkin pernah mengeluhkan baterai iPhone bo...
72    TUBAN, BANGSAONLINE.com- PWI Tuban melak

## Hapus missing Value & Data Duplikat

In [47]:
# Hapus baris dengan Missing Value di 'isi'
df.dropna(subset=['isi_berita'], inplace=True)

# Hapus data duplikat
df.drop_duplicates(inplace=True)

## Cleaning

In [48]:
import re

# Fungsi untuk membersihkan teks
def clean_text(text):
    text = text.lower() # Ubah ke huruf kecil
    text = re.sub(r'[^\w\s]', '', text) # Hapus tanda baca
    text = re.sub(r'\d+', '', text) # Hapus nomor
    return text

# Terapkan pembersihan ke kolom 'isi'
df['cleaned_isi'] = df['isi_berita'].apply(clean_text)

# Tampilkan DataFrame
display(df[['isi_berita', 'cleaned_isi']].head())

Unnamed: 0,isi_berita,cleaned_isi
0,"TUBAN,BANGSAONLINE.com- Komisi II DPRD Tuban bersama sejumlah OPD, PT Pertamina Rosneft Pengolah...",tubanbangsaonlinecom komisi ii dprd tuban bersama sejumlah opd pt pertamina rosneft pengolahan d...
1,"SURABAYA,BANGSAONLINE.com- Rumah Literasi Digital (RLD) Surabaya menggelar pelatihan intensif di...",surabayabangsaonlinecom rumah literasi digital rld surabaya menggelar pelatihan intensif digital...
2,"SURABAYA, BANGSAONLINE.com- Baznas RI menyalurkan bantuan awal sebesar Rp300 juta dan mengerahka...",surabaya bangsaonlinecom baznas ri menyalurkan bantuan awal sebesar rp juta dan mengerahkan tim ...
3,"SURABAYA,BANGSAONLINE.com- Bangunan musala lantai tiga Pondok Pesantren Al Khoziny, Siwalan Panj...",surabayabangsaonlinecom bangunan musala lantai tiga pondok pesantren al khoziny siwalan panji bu...
4,"KOTA KEDIRI, BANGSAONLINE.com- Wakil Wali Kota Kediri, Qowimuddin Thoha atau yang akrab disapa G...",kota kediri bangsaonlinecom wakil wali kota kediri qowimuddin thoha atau yang akrab disapa gus q...


## Tokenisasi

In [49]:
import sys

# Perintah untuk menginstal library menggunakan path Python yang sedang aktif
!{sys.executable} -m pip install nltk


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [50]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('punkt_tab') 

# Fungsi untuk melakukan tokenisasi
def tokenize_text(text):
    return word_tokenize(text)

# Terapkan tokenisasi ke kolom 'cleaned_isi'
df['tokenized_isi'] = df['cleaned_isi'].apply(tokenize_text)

# Tampilkan DataFrame dengan kolom hasil tokenisasi
display(df[['cleaned_isi', 'tokenized_isi']].head())

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,cleaned_isi,tokenized_isi
0,tubanbangsaonlinecom komisi ii dprd tuban bersama sejumlah opd pt pertamina rosneft pengolahan d...,"[tubanbangsaonlinecom, komisi, ii, dprd, tuban, bersama, sejumlah, opd, pt, pertamina, rosneft, ..."
1,surabayabangsaonlinecom rumah literasi digital rld surabaya menggelar pelatihan intensif digital...,"[surabayabangsaonlinecom, rumah, literasi, digital, rld, surabaya, menggelar, pelatihan, intensi..."
2,surabaya bangsaonlinecom baznas ri menyalurkan bantuan awal sebesar rp juta dan mengerahkan tim ...,"[surabaya, bangsaonlinecom, baznas, ri, menyalurkan, bantuan, awal, sebesar, rp, juta, dan, meng..."
3,surabayabangsaonlinecom bangunan musala lantai tiga pondok pesantren al khoziny siwalan panji bu...,"[surabayabangsaonlinecom, bangunan, musala, lantai, tiga, pondok, pesantren, al, khoziny, siwala..."
4,kota kediri bangsaonlinecom wakil wali kota kediri qowimuddin thoha atau yang akrab disapa gus q...,"[kota, kediri, bangsaonlinecom, wakil, wali, kota, kediri, qowimuddin, thoha, atau, yang, akrab,..."


## Stopword Removal

In [51]:
from nltk.corpus import stopwords
nltk.download('stopwords')

# Dapatkan Stop Word bahasa Indonesia
list_stopwords = set(stopwords.words('indonesian'))

# Fungsi untuk menghapus stop words
def remove_stopwords(tokens):
    return [word for word in tokens if word not in list_stopwords]

# Terapkan penghapusan Stop Word ke kolom 'tokenized_isi'
df['stopwords_removed_isi'] = df['tokenized_isi'].apply(remove_stopwords)

# Tampilkan DataFrame
display(df[['tokenized_isi', 'stopwords_removed_isi']].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,tokenized_isi,stopwords_removed_isi
0,"[tubanbangsaonlinecom, komisi, ii, dprd, tuban, bersama, sejumlah, opd, pt, pertamina, rosneft, ...","[tubanbangsaonlinecom, komisi, ii, dprd, tuban, opd, pt, pertamina, rosneft, pengolahan, petroki..."
1,"[surabayabangsaonlinecom, rumah, literasi, digital, rld, surabaya, menggelar, pelatihan, intensi...","[surabayabangsaonlinecom, rumah, literasi, digital, rld, surabaya, menggelar, pelatihan, intensi..."
2,"[surabaya, bangsaonlinecom, baznas, ri, menyalurkan, bantuan, awal, sebesar, rp, juta, dan, meng...","[surabaya, bangsaonlinecom, baznas, ri, menyalurkan, bantuan, rp, juta, mengerahkan, tim, baznas..."
3,"[surabayabangsaonlinecom, bangunan, musala, lantai, tiga, pondok, pesantren, al, khoziny, siwala...","[surabayabangsaonlinecom, bangunan, musala, lantai, pondok, pesantren, al, khoziny, siwalan, pan..."
4,"[kota, kediri, bangsaonlinecom, wakil, wali, kota, kediri, qowimuddin, thoha, atau, yang, akrab,...","[kota, kediri, bangsaonlinecom, wakil, wali, kota, kediri, qowimuddin, thoha, akrab, disapa, gus..."


## Stemming

In [52]:
!{sys.executable} -m pip install Sastrawi


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [53]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Buat stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Fungsi untuk melakukan stemming
def stem_tokens(tokens):
    return [stemmer.stem(token) for token in tokens]

# Terapkan stemming ke kolom 'stopwords_removed_isi'
df['stemmed_isi'] = df['stopwords_removed_isi'].apply(stem_tokens)

# Function to find and format stemmed word changes
def get_stemming_changes(original_tokens, stemmed_tokens):
    changes = []
    for original, stemmed in zip(original_tokens, stemmed_tokens):
        if original != stemmed:
            changes.append(f"{original} : {stemmed}")
    return changes

# Dapatkan semua perubahan stemming dari kolom 'stopwords_removed_isi' dan 'stemmed_isi'
all_changes = []
for index, row in df.iterrows():
    all_changes.extend(get_stemming_changes(row['stopwords_removed_isi'], row['stemmed_isi']))

# Dapatkan perubahan unik dan urutkan
unique_changes = sorted(list(set(all_changes)))

# Cetak perubahan unik
print("Kata-kata yang mengalami stemming:")
for change in unique_changes:
    print(change)

# Tampilkan DataFrame
display(df[['stopwords_removed_isi', 'stemmed_isi']].head())

Kata-kata yang mengalami stemming:
abdian : abdi
acaranya : acara
acuan : acu
adakah : ada
ajakan : aja
ajaran : ajar
akibatnya : akibat
akui : aku
alasan : alas
alasannya : alas
aliran : alir
amalkan : amal
amanahnya : amanah
amankan : aman
ambruknya : ambruk
amiluṣṣāliḥāti : amilu li ti
anaknya : anak
ancaman : ancam
andalan : andal
anehnya : aneh
anggapan : anggap
anggaran : anggar
anggotanya : anggota
angkatan : angkat
anjuran : anjur
apapun : apa
arahan : arah
arahannya : arah
arādū : ar d
asāwira : as wira
atapnya : atap
aturan : atur
aui_ : aui
autentiknya : autentik
ayahnya : ayah
ażābalḥarīqi : a bal ar qi
baginya : bagi
baiknya : baik
bali : bal
balutan : balut
bangkalan : bangkal
bangunan : bangun
bankan : ban
bantuan : bantu
banyaknya : banyak
barakah : bara
barunya : baru
batalnya : batal
batarai : batara
bawaan : bawa
bawahnya : bawah
bayangkan : bayang
bazari : bazar
bebannya : beban
beberkan : kan
belajar : ajar
bepergian : pergi
beragam : agam
beragama : agama
beraktiv

Unnamed: 0,stopwords_removed_isi,stemmed_isi
0,"[tubanbangsaonlinecom, komisi, ii, dprd, tuban, opd, pt, pertamina, rosneft, pengolahan, petroki...","[tubanbangsaonlinecom, komisi, ii, dprd, tuban, opd, pt, pertamina, rosneft, olah, petrokimia, p..."
1,"[surabayabangsaonlinecom, rumah, literasi, digital, rld, surabaya, menggelar, pelatihan, intensi...","[surabayabangsaonlinecom, rumah, literasi, digital, rld, surabaya, gelar, latih, intensif, digit..."
2,"[surabaya, bangsaonlinecom, baznas, ri, menyalurkan, bantuan, rp, juta, mengerahkan, tim, baznas...","[surabaya, bangsaonlinecom, baznas, ri, salur, bantu, rp, juta, kerah, tim, baznas, tanggap, ben..."
3,"[surabayabangsaonlinecom, bangunan, musala, lantai, pondok, pesantren, al, khoziny, siwalan, pan...","[surabayabangsaonlinecom, bangun, musala, lantai, pondok, pesantren, al, khoziny, siwalan, panji..."
4,"[kota, kediri, bangsaonlinecom, wakil, wali, kota, kediri, qowimuddin, thoha, akrab, disapa, gus...","[kota, diri, bangsaonlinecom, wakil, wali, kota, diri, qowimuddin, thoha, akrab, sapa, gus, qowi..."


In [54]:
from collections import Counter

# Gabungkan semua token yang bertangkai menjadi satu daftar
all_stemmed_words = [word for tokens in df['stemmed_isi'] for word in tokens]

# Hitung frekuensi setiap kata
word_frequencies = Counter(all_stemmed_words)

# Menampilkan kata-kata yang paling umum dan frekuensinya
print("Top Most Frequent Words:")
for word, frequency in word_frequencies.most_common():
    print(f"{word}: {frequency}")

Top Most Frequent Words:
indonesia: 112
kiai: 79
rp: 63
jalan: 62
kota: 62
hukum: 58
hadir: 56
bangsaonlinecom: 54
langsung: 50
masyarakat: 50
surabaya: 50
milik: 46
gus: 46
salah: 45
warga: 45
iran: 45
pesantren: 44
bantu: 43
rumah: 43
bangun: 43
dunia: 43
sehat: 43
program: 42
kuat: 42
diri: 42
makan: 41
cepat: 40
negara: 40
layan: 40
pizza: 40
gelar: 39
santri: 39
nu: 39
harga: 38
yahya: 38
kerja: 37
terima: 37
didik: 37
ketua: 37
pilih: 37
kh: 37
tim: 36
angin: 36
klaim: 36
bangsa: 35
asep: 35
harap: 34
tingkat: 34
laku: 34
sakit: 34
timur: 34
bangsaonline: 34
serta: 33
presiden: 33
korban: 33
orang: 33
penuh: 33
tanah: 33
lengkap: 32
hasil: 32
main: 32
kali: 32
restoran: 32
temu: 31
informasi: 31
kabupaten: 30
masuk: 30
kepala: 30
nomor: 30
iphone: 30
tuban: 29
utama: 29
proses: 29
sesuai: 29
usaha: 29
dukung: 29
kenal: 29
tafsir: 29
islam: 29
pasta: 29
giat: 28
jadi: 28
arah: 28
motor: 28
nama: 28
italia: 28
wib: 27
nilai: 27
acara: 27
israel: 27
ubud: 27
rabu: 26
pimpin: 26
kait

In [55]:
# Buat DataFrame baru dengan isi asli, isi stemmed, dan kategori
processed_df = df[['isi_berita', 'stemmed_isi', 'kategori']].copy()

# Ganti nama kolom
processed_df.rename(columns={'stemmed_isi': 'hasil_preprocessing'}, inplace=True)

# Konversi frekuensi kata ke DataFrame
frequency_df = pd.DataFrame.from_dict(word_frequencies, orient='index', columns=['frequency'])
frequency_df.index.name = 'word'
frequency_df.sort_values(by='frequency', ascending=False, inplace=True)

# Simpan ke dua file CSV terpisah
processed_df.to_csv('hasil_preprocessing_berita.csv', index=False, encoding='utf-8')
frequency_df.to_csv('frek_kata_berita.csv', encoding='utf-8')

print("Hasil preprocessing disimpan di 'hasil_preprocessing_berita.csv'")
print("Frekuensi kata disimpan di 'frek_kata_berita.csv'")

Hasil preprocessing disimpan di 'hasil_preprocessing_berita.csv'
Frekuensi kata disimpan di 'frek_kata_berita.csv'


In [56]:
hasil_preprocessing = "hasil_preprocessing_berita.csv"  
df = pd.read_csv(hasil_preprocessing)

# Tampilkan data
df

Unnamed: 0,isi_berita,hasil_preprocessing,kategori
0,"TUBAN,BANGSAONLINE.com- Komisi II DPRD Tuban bersama sejumlah OPD, PT Pertamina Rosneft Pengolah...","['tubanbangsaonlinecom', 'komisi', 'ii', 'dprd', 'tuban', 'opd', 'pt', 'pertamina', 'rosneft', '...",Jatim
1,"SURABAYA,BANGSAONLINE.com- Rumah Literasi Digital (RLD) Surabaya menggelar pelatihan intensif di...","['surabayabangsaonlinecom', 'rumah', 'literasi', 'digital', 'rld', 'surabaya', 'gelar', 'latih',...",Jatim
2,"SURABAYA, BANGSAONLINE.com- Baznas RI menyalurkan bantuan awal sebesar Rp300 juta dan mengerahka...","['surabaya', 'bangsaonlinecom', 'baznas', 'ri', 'salur', 'bantu', 'rp', 'juta', 'kerah', 'tim', ...",Jatim Metro
3,"SURABAYA,BANGSAONLINE.com- Bangunan musala lantai tiga Pondok Pesantren Al Khoziny, Siwalan Panj...","['surabayabangsaonlinecom', 'bangun', 'musala', 'lantai', 'pondok', 'pesantren', 'al', 'khoziny'...",Jatim Metro
4,"KOTA KEDIRI, BANGSAONLINE.com- Wakil Wali Kota Kediri, Qowimuddin Thoha atau yang akrab disapa G...","['kota', 'diri', 'bangsaonlinecom', 'wakil', 'wali', 'kota', 'diri', 'qowimuddin', 'thoha', 'akr...",Jatim Tengah
...,...,...,...
69,"SIDOARJO,BANGSAONLINE.com- Proses pencarian korban para santri tertimbun reruntuhan musala ambru...","['sidoarjobangsaonlinecom', 'proses', 'cari', 'korban', 'santri', 'timbun', 'runtuh', 'musala', ...",Peristiwa
70,"SURABAYA, BANGSAONLINE.com- Belakangan ini isu kehadiran iPhone layar lipat semakin kencang bere...","['surabaya', 'bangsaonlinecom', 'isu', 'hadir', 'iphone', 'layar', 'lipat', 'kencang', 'embus', ...",Teknologi
71,"BANGSAONLINE.com– Bagi kalian para pengguna iPhone, mungkin pernah mengeluhkan baterai iPhone bo...","['bangsaonlinecom', 'guna', 'iphone', 'keluh', 'baterai', 'iphone', 'boros', 'jangka', 'guna', '...",Teknologi
72,"TUBAN, BANGSAONLINE.com- PWI Tuban melakukan kunjungan kerja ke PT Semen Indonesia (Persero) Tbk...","['tuban', 'bangsaonlinecom', 'pwi', 'tuban', 'kunjung', 'kerja', 'pt', 'semen', 'indonesia', 'pe...",Komunitas dan Lingkungan


In [57]:
frekuensi_kata = "frek_kata_berita.csv"  
df = pd.read_csv(frekuensi_kata)

# Tampilkan data
df

Unnamed: 0,word,frequency
0,indonesia,112
1,kiai,79
2,rp,63
3,jalan,62
4,kota,62
...,...,...
4133,bambang,1
4134,wna,1
4135,wni,1
4136,wahyutama,1


## Ektrasi Fitur

In [58]:
import pandas as pd

# Load data from pta_manajemen.csv
df = pd.read_csv('hasil_preprocessing_berita.csv')

In [59]:
# Display basic information about the dataset
print("Dataset shape:", df.shape)
df


Dataset shape: (74, 3)


Unnamed: 0,isi_berita,hasil_preprocessing,kategori
0,"TUBAN,BANGSAONLINE.com- Komisi II DPRD Tuban bersama sejumlah OPD, PT Pertamina Rosneft Pengolah...","['tubanbangsaonlinecom', 'komisi', 'ii', 'dprd', 'tuban', 'opd', 'pt', 'pertamina', 'rosneft', '...",Jatim
1,"SURABAYA,BANGSAONLINE.com- Rumah Literasi Digital (RLD) Surabaya menggelar pelatihan intensif di...","['surabayabangsaonlinecom', 'rumah', 'literasi', 'digital', 'rld', 'surabaya', 'gelar', 'latih',...",Jatim
2,"SURABAYA, BANGSAONLINE.com- Baznas RI menyalurkan bantuan awal sebesar Rp300 juta dan mengerahka...","['surabaya', 'bangsaonlinecom', 'baznas', 'ri', 'salur', 'bantu', 'rp', 'juta', 'kerah', 'tim', ...",Jatim Metro
3,"SURABAYA,BANGSAONLINE.com- Bangunan musala lantai tiga Pondok Pesantren Al Khoziny, Siwalan Panj...","['surabayabangsaonlinecom', 'bangun', 'musala', 'lantai', 'pondok', 'pesantren', 'al', 'khoziny'...",Jatim Metro
4,"KOTA KEDIRI, BANGSAONLINE.com- Wakil Wali Kota Kediri, Qowimuddin Thoha atau yang akrab disapa G...","['kota', 'diri', 'bangsaonlinecom', 'wakil', 'wali', 'kota', 'diri', 'qowimuddin', 'thoha', 'akr...",Jatim Tengah
...,...,...,...
69,"SIDOARJO,BANGSAONLINE.com- Proses pencarian korban para santri tertimbun reruntuhan musala ambru...","['sidoarjobangsaonlinecom', 'proses', 'cari', 'korban', 'santri', 'timbun', 'runtuh', 'musala', ...",Peristiwa
70,"SURABAYA, BANGSAONLINE.com- Belakangan ini isu kehadiran iPhone layar lipat semakin kencang bere...","['surabaya', 'bangsaonlinecom', 'isu', 'hadir', 'iphone', 'layar', 'lipat', 'kencang', 'embus', ...",Teknologi
71,"BANGSAONLINE.com– Bagi kalian para pengguna iPhone, mungkin pernah mengeluhkan baterai iPhone bo...","['bangsaonlinecom', 'guna', 'iphone', 'keluh', 'baterai', 'iphone', 'boros', 'jangka', 'guna', '...",Teknologi
72,"TUBAN, BANGSAONLINE.com- PWI Tuban melakukan kunjungan kerja ke PT Semen Indonesia (Persero) Tbk...","['tuban', 'bangsaonlinecom', 'pwi', 'tuban', 'kunjung', 'kerja', 'pt', 'semen', 'indonesia', 'pe...",Komunitas dan Lingkungan


##  Ekstraksi Fitur TF-IDF

In [60]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [61]:
# Kolom teks hasil preprocessing
texts = df["hasil_preprocessing"].astype(str)

# Kalau ada kolom label
labels = df["kategori"] 

##  TF-IDF Vectorization

In [63]:
vectorizer = TfidfVectorizer()

# Fit & transform ke matriks TF-IDF
tfidf_matrix = vectorizer.fit_transform(texts)

# Konversi ke DataFrame agar lebih mudah dibaca
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out()
)

# Tambahkan kolom label
tfidf_df["kategori"] = labels.values

In [64]:
print("Ukuran matriks TF-IDF:", tfidf_df.shape)
tfidf_df

Ukuran matriks TF-IDF: (74, 4122)


Unnamed: 0,aam,abadi,abah,abai,abar,abdi,abdul,abduljabbar,abdullah,abdulloh,...,zakiyah,zaman,zamroni,zero,zilan,zina,zinois,zionis,zohran,zona
0,0.0,0.0,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.10834,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,0.0,0.0,0.0,0.0,0.0,0.00000,0.055428,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70,0.0,0.0,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71,0.0,0.0,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72,0.0,0.0,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
# Simpan ke file CSV
output_file_csv = "hasil_tfidf_berita.csv"
tfidf_df.to_csv(output_file_csv, index=False)

print(f"Hasil TF-IDF berhasil disimpan ke file: {output_file_csv}")

Hasil TF-IDF berhasil disimpan ke file: hasil_tfidf_berita.csv


## CBO Berita

In [66]:
import sys
!{sys.executable} -m pip install gensim


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [67]:
!{sys.executable} -m pip install pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [68]:
import pandas as pd
import numpy as np
import ast # Library untuk mengubah string menjadi list
from gensim.models import Word2Vec
import logging

# --- 1. Persiapan Corpus ---
print("--- TAHAP 1: MEMPERSIAPKAN CORPUS ---")
# Load kembali data Anda (atau lanjutkan dari DataFrame yang sudah ada)
df = pd.read_csv('hasil_preprocessing_berita.csv')

# --- Konversi kolom 'hasil_preprocessing' dari string ke list ---
# Ini adalah langkah penting!
# ast.literal_eval akan membaca string "['a', 'b']" dan mengubahnya menjadi list ['a', 'b']
df['tokens'] = df['hasil_preprocessing'].apply(ast.literal_eval)

# Buat corpus yang siap untuk dilatih
corpus = df['tokens'].tolist()


print("Proses persiapan corpus selesai.")
print("Berikut adalah contoh 1 dokumen (berita) yang sudah diubah menjadi daftar token:")
print(corpus[0])

--- TAHAP 1: MEMPERSIAPKAN CORPUS ---
Proses persiapan corpus selesai.
Berikut adalah contoh 1 dokumen (berita) yang sudah diubah menjadi daftar token:
['tubanbangsaonlinecom', 'komisi', 'ii', 'dprd', 'tuban', 'opd', 'pt', 'pertamina', 'rosneft', 'olah', 'petrokimia', 'prpp', 'gelar', 'rapat', 'kerja', 'raker', 'wali', 'murid', 'terima', 'program', 'beasiswa', 'didik', 'prpp', 'ruang', 'paripurna', 'gedung', 'dprd', 'tuban', 'rabu', 'sore', 'rapat', 'pimpin', 'langsung', 'ketua', 'komisi', 'ii', 'dprd', 'tuban', 'fahmi', 'fikroni', 'serta', 'anggota', 'hadir', 'langsung', 'plt', 'presiden', 'direktur', 'pt', 'prpp', 'sigit', 'pradjaka', 'sugestihanto', 'damping', 'sr', 'officer', 'csr', 'prpp', 'yuli', 'witantra', 'raker', 'gelar', 'wib', 'seledai', 'wib', 'fokus', 'bahas', 'poin', 'utama', 'sol', 'program', 'beasiswa', 'didik', 'tara', 'd', 'pem', 'akamigas', 'yamg', 'salah', 'komitmen', 'prpp', 'kembang', 'masyarakat', 'fahmi', 'fikroni', 'tuntut', 'utama', 'mahasiswa', 'terima', 'be

In [69]:
# --- 2. Melatih Model Word2Vec (dengan Proses Terlihat) ---

print("--- TAHAP 2: MELATIH MODEL WORD2VEC (CBOW) ---")

# Mengaktifkan logging untuk melihat proses training dari Gensim
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

embedding_dim = 150
print(f"Parameter: Dimensi vektor = {embedding_dim}, Arsitektur = CBOW")
print("Gensim akan menampilkan log proses training di bawah ini:")

model_cbow = Word2Vec(
    sentences=corpus,
    vector_size=embedding_dim,
    window=5,
    min_count=2,
    sg=0,
    workers=4
)

print("\nPelatihan model selesai!")
print("\n--- Melihat Hasil Pelatihan Model ---")
# Mengetahui ukuran kosakata yang berhasil dipelajari model
vocab_size = len(model_cbow.wv.index_to_key)
print(f"Model berhasil mempelajari {vocab_size} kata unik.")

# Melihat kata-kata yang paling mirip secara semantik dengan kata tertentu
# Ini membuktikan model sudah belajar konteks
try:
    print("\nContoh kata yang paling mirip dengan 'polisi':")
    print(model_cbow.wv.most_similar('polisi', topn=5))

    print("\nContoh kata yang paling mirip dengan 'surabaya':")
    print(model_cbow.wv.most_similar('surabaya', topn=5))
except KeyError as e:
    print(f"\nKata {e} tidak ditemukan di vocabulary (mungkin karena jarang muncul).")

print("\n" + "="*50 + "\n")

2025-10-02 04:36:12,148 : INFO : collecting all words and their counts
2025-10-02 04:36:12,156 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-10-02 04:36:12,174 : INFO : collected 4138 word types from a corpus of 16970 raw words and 74 sentences
2025-10-02 04:36:12,183 : INFO : Creating a fresh vocabulary
2025-10-02 04:36:12,216 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 retains 2163 unique words (52.27% of original 4138, drops 1975)', 'datetime': '2025-10-02T04:36:12.216837', 'gensim': '4.3.3', 'python': '3.12.1 (main, Jul 10 2025, 11:57:50) [GCC 13.3.0]', 'platform': 'Linux-6.8.0-1030-azure-x86_64-with-glibc2.39', 'event': 'prepare_vocab'}
2025-10-02 04:36:12,224 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 14995 word corpus (88.36% of original 16970, drops 1975)', 'datetime': '2025-10-02T04:36:12.224372', 'gensim': '4.3.3', 'python': '3.12.1 (main, Jul 10 2025, 11:57:50) [GCC 13.3.0]', 'platform': 'Lin

--- TAHAP 2: MELATIH MODEL WORD2VEC (CBOW) ---
Parameter: Dimensi vektor = 150, Arsitektur = CBOW
Gensim akan menampilkan log proses training di bawah ini:


2025-10-02 04:36:12,436 : INFO : estimated required memory for 2163 words and 150 dimensions: 3677100 bytes
2025-10-02 04:36:12,443 : INFO : resetting layer weights
2025-10-02 04:36:12,460 : INFO : Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2025-10-02T04:36:12.460465', 'gensim': '4.3.3', 'python': '3.12.1 (main, Jul 10 2025, 11:57:50) [GCC 13.3.0]', 'platform': 'Linux-6.8.0-1030-azure-x86_64-with-glibc2.39', 'event': 'build_vocab'}
2025-10-02 04:36:12,462 : INFO : Word2Vec lifecycle event {'msg': 'training model with 4 workers on 2163 vocabulary and 150 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2025-10-02T04:36:12.462798', 'gensim': '4.3.3', 'python': '3.12.1 (main, Jul 10 2025, 11:57:50) [GCC 13.3.0]', 'platform': 'Linux-6.8.0-1030-azure-x86_64-with-glibc2.39', 'event': 'train'}
2025-10-02 04:36:12,601 : INFO : EPOCH 0: training on 16970 raw words (14744 effective words) took 0.1s, 123833 effective w


Pelatihan model selesai!

--- Melihat Hasil Pelatihan Model ---
Model berhasil mempelajari 2163 kata unik.

Contoh kata yang paling mirip dengan 'polisi':
[('yahya', 0.5794425010681152), ('tingkat', 0.5785065293312073), ('kiai', 0.5765122175216675), ('kuat', 0.5753619074821472), ('rp', 0.5708229541778564)]

Contoh kata yang paling mirip dengan 'surabaya':
[('indonesia', 0.9073991775512695), ('rp', 0.8956034779548645), ('kiai', 0.8930966854095459), ('jalan', 0.8884968161582947), ('milik', 0.8855631947517395)]




In [70]:
# --- 3. Membedah Proses Agregasi Vektor Dokumen ---
print("--- TAHAP 3: MEMBEDAH PROSES AGREGRASI MENJADI VEKTOR DOKUMEN ---")

def create_document_vector(doc, model, num_features):
    word_vectors = [model.wv[word] for word in doc if word in model.wv]
    if not word_vectors:
        return np.zeros(num_features)
    return np.mean(word_vectors, axis=0)

contoh_berita = corpus[0]
print("Kita akan menganalisis berita pertama:")
print(f"Isi berita (token): {contoh_berita}")

print("\nVektor untuk 3 kata pertama dalam berita:")
for i, word in enumerate(contoh_berita[:3]):
    if word in model_cbow.wv:
        print(f"  - Vektor kata '{word}': {model_cbow.wv[word][:5]}... (ditampilkan 5 dimensi pertama)")
    else:
        print(f"  - Kata '{word}' tidak ada di vocabulary model.")

vektor_berita_contoh = create_document_vector(contoh_berita, model_cbow, embedding_dim)
print("\nHasil vektor dokumen (setelah dirata-ratakan):")
print(f"{vektor_berita_contoh[:10]}... (ditampilkan 10 dimensi pertama)")
print(f"Panjang vektor: {len(vektor_berita_contoh)} dimensi (sesuai yang kita tentukan).")

--- TAHAP 3: MEMBEDAH PROSES AGREGRASI MENJADI VEKTOR DOKUMEN ---
Kita akan menganalisis berita pertama:
Isi berita (token): ['tubanbangsaonlinecom', 'komisi', 'ii', 'dprd', 'tuban', 'opd', 'pt', 'pertamina', 'rosneft', 'olah', 'petrokimia', 'prpp', 'gelar', 'rapat', 'kerja', 'raker', 'wali', 'murid', 'terima', 'program', 'beasiswa', 'didik', 'prpp', 'ruang', 'paripurna', 'gedung', 'dprd', 'tuban', 'rabu', 'sore', 'rapat', 'pimpin', 'langsung', 'ketua', 'komisi', 'ii', 'dprd', 'tuban', 'fahmi', 'fikroni', 'serta', 'anggota', 'hadir', 'langsung', 'plt', 'presiden', 'direktur', 'pt', 'prpp', 'sigit', 'pradjaka', 'sugestihanto', 'damping', 'sr', 'officer', 'csr', 'prpp', 'yuli', 'witantra', 'raker', 'gelar', 'wib', 'seledai', 'wib', 'fokus', 'bahas', 'poin', 'utama', 'sol', 'program', 'beasiswa', 'didik', 'tara', 'd', 'pem', 'akamigas', 'yamg', 'salah', 'komitmen', 'prpp', 'kembang', 'masyarakat', 'fahmi', 'fikroni', 'tuntut', 'utama', 'mahasiswa', 'terima', 'beasiswa', 'didik', 'prpp', '

In [71]:
# --- 4. Membuat DataFrame Akhir ---
print("--- TAHAP 4: MEMBUAT DATAFRAME AKHIR ---")
doc_vectors = [create_document_vector(doc, model_cbow, embedding_dim) for doc in corpus]
cbow_df = pd.DataFrame(doc_vectors, columns=[f'dim_{i+1}' for i in range(embedding_dim)])
cbow_df['kategori'] = df['kategori'].values

print("Proses pembuatan DataFrame selesai.")
print("Berikut adalah contoh hasil akhirnya:")
print(cbow_df.head())

--- TAHAP 4: MEMBUAT DATAFRAME AKHIR ---
Proses pembuatan DataFrame selesai.
Berikut adalah contoh hasil akhirnya:
      dim_1     dim_2     dim_3     dim_4     dim_5     dim_6     dim_7  \
0 -0.000084 -0.000751 -0.004719 -0.002588  0.000312 -0.003472  0.000721   
1 -0.000005 -0.001359 -0.004314 -0.002470 -0.000002 -0.003726  0.000913   
2  0.000474 -0.000858 -0.005555 -0.003203  0.000925 -0.004845  0.000137   
3 -0.000333 -0.001256 -0.005020 -0.002545  0.001195 -0.004435 -0.000060   
4 -0.000464 -0.001608 -0.004656 -0.003346 -0.000062 -0.005857 -0.000910   

      dim_8     dim_9    dim_10  ...   dim_142   dim_143   dim_144   dim_145  \
0  0.009129 -0.002807  0.001805  ...  0.002202  0.005687  0.003623  0.007346   
1  0.006680 -0.001844  0.002314  ...  0.001510  0.004840  0.002654  0.005078   
2  0.008784 -0.002908  0.001711  ...  0.002011  0.005840  0.005033  0.007231   
3  0.007910 -0.003420  0.001290  ...  0.000876  0.003926  0.003216  0.005825   
4  0.008381 -0.002382  0.000660  .