# CRAWLING PTA MANAJEMEN & Berita

In [2]:
!pip install builtwith
!pip install nltk
!pip install Sastrawi
!pip install pyspellchecker

Collecting builtwith
  Downloading builtwith-1.3.4.tar.gz (34 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: builtwith
  Building wheel for builtwith (pyproject.toml): started
  Building wheel for builtwith (pyproject.toml): finished with status 'done'
  Created wheel for builtwith: filename=builtwith-1.3.4-py3-none-any.whl size=36133 sha256=ca02abe2427ec9929d20aea7005a5c9ce1ebaf626727aeb8f726ea92bc3d390f
  Stored in directory: c:\users\raihan fadillah\appdata\local\pip\cache\wheels\ac\22\54\51e70a84f60595e3a31a30ecdde2438d1f76007d17ab8c9270
Successfully built builtwith
Installing collected packages: builtwith
Successfully installed builtwith-1.3.4
Collecting nltk


## 1. Crawling PTA

In [1]:
import builtwith

# Analisis teknologi yang digunakan
res = builtwith.parse('https://pta.trunojoyo.ac.id')
print(res)

{'web-servers': ['Nginx'], 'javascript-frameworks': ['jQuery', 'jQuery UI']}


In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import sys

# --- Fungsi Bantu Scraping ---
def get_text_or_na(soup, selectors):
    for selector in selectors:
        element = soup.select_one(selector)
        if element:
            text = element.get_text(strip=True)
            if text:
                return text.strip()
    return 'N/A'

def get_data_from_span(soup, text_contains):
    span = soup.select_one(f'span:-soup-contains("{text_contains}")')
    if span:
        text = span.get_text(strip=True)
        if ':' in text:
            return text.split(':', 1)[1].strip()
    return 'N/A'

def get_abstract_robust(soup, keywords):
    for tag_b in soup.find_all('b'):
        tag_text = tag_b.get_text(strip=True)
        if any(keyword.lower() in tag_text.lower() for keyword in keywords):
            parent_div = tag_b.find_parent('div')
            if parent_div:
                next_div_sibling = parent_div.find_next_sibling('div')
                if next_div_sibling:
                    p_tag = next_div_sibling.find('p', align="justify")
                    if p_tag:
                        return p_tag.get_text(strip=True)
    return 'N/A'

def get_total_pages(soup):
    try:
        pagination = soup.select_one('ol.pagination')
        if pagination:
            last_page_li = pagination.select('li')[-1]
            last_page_link = last_page_li.select_one('a')
            if last_page_link and 'href' in last_page_link.attrs:
                return int(last_page_link['href'].split('/')[-1])
    except:
        pass
    return 1

# --- Fungsi Scraping ---
def scrape_manajemen_raw():
    prodi_data = [{'name': 'Manajemen', 'url': 'https://pta.trunojoyo.ac.id/c_search/byprod/7'}]
    all_data = {"penulis": [], "judul": [], "pembimbing_pertama": [], "pembimbing_kedua": [],
                "abstrak_indonesia": [], "abstrak_inggris": [], "prodi": []}

    total_data_count = 0
    print("--- MULAI SCRAPING (RAW DATA) ---")

    for prodi in prodi_data:
        first_page_url = prodi['url']
        try:
            r = requests.get(first_page_url)
            r.raise_for_status()
            soup = BeautifulSoup(r.content, "html.parser")
            total_pages = get_total_pages(soup)
            print(f"Ditemukan {total_pages} halaman untuk {prodi['name']}")
        except:
            total_pages = 1

        for i in range(1, total_pages + 1):
            url = f"{prodi['url']}/{i}"
            try:
                r = requests.get(url)
                r.raise_for_status()
                soup = BeautifulSoup(r.content, "html.parser")
                jurnals = soup.select('li[data-cat="#luxury"]')

                if not jurnals:
                    break

                for jurnal in jurnals:
                    jurnal_url = jurnal.select_one('a.gray.button')['href']
                    try:
                        response = requests.get(jurnal_url)
                        response.raise_for_status()
                        soup1 = BeautifulSoup(response.content, "html.parser")
                        isi = soup1.select_one('div#content_journal')
                        if isi:
                            judul = get_text_or_na(isi, ['a.title', 'b.title', 'h2.title'])
                            penulis = get_data_from_span(isi, "Penulis")
                            pembimbing_pertama = get_data_from_span(isi, "Dosen Pembimbing I")
                            pembimbing_kedua = get_data_from_span(isi, "Dosen Pembimbing II")
                            abstrak_indonesia = get_abstract_robust(isi, ["Abstraksi", "Abstrak"])
                            abstrak_inggris = get_abstract_robust(isi, ["Abstraction", "Abstract", "ABSTRACT"])

                            all_data["penulis"].append(penulis)
                            all_data["judul"].append(judul)
                            all_data["pembimbing_pertama"].append(pembimbing_pertama)
                            all_data["pembimbing_kedua"].append(pembimbing_kedua)
                            all_data["abstrak_indonesia"].append(abstrak_indonesia)
                            all_data["abstrak_inggris"].append(abstrak_inggris)
                            all_data["prodi"].append(prodi['name'])

                            total_data_count += 1
                            print(f"Data ke-{total_data_count} berhasil diambil")
                    except Exception as e:
                        print(f"Gagal proses {jurnal_url}: {e}", file=sys.stderr)

                time.sleep(1)

            except Exception as e:
                print(f"Gagal ambil URL {url}: {e}", file=sys.stderr)
                continue

    df = pd.DataFrame(all_data)
    df.to_csv("pta_manajemen_raw.csv", index=False)
    print(f"✅ Scraping selesai. Total data: {total_data_count}")
    print("Data disimpan di pta_manajemen_raw.csv")
    return df

if __name__ == "__main__":
    scrape_manajemen_raw()

--- MULAI SCRAPING (RAW DATA) ---
Ditemukan 207 halaman untuk Manajemen
Data ke-1 berhasil diambil
Data ke-2 berhasil diambil
Data ke-3 berhasil diambil
Data ke-4 berhasil diambil
Data ke-5 berhasil diambil
Data ke-6 berhasil diambil
Data ke-7 berhasil diambil
Data ke-8 berhasil diambil
Data ke-9 berhasil diambil
Data ke-10 berhasil diambil
Data ke-11 berhasil diambil
Data ke-12 berhasil diambil
Data ke-13 berhasil diambil
Data ke-14 berhasil diambil
Data ke-15 berhasil diambil
Data ke-16 berhasil diambil
Data ke-17 berhasil diambil
Data ke-18 berhasil diambil
Data ke-19 berhasil diambil
Data ke-20 berhasil diambil
Data ke-21 berhasil diambil
Data ke-22 berhasil diambil
Data ke-23 berhasil diambil
Data ke-24 berhasil diambil
Data ke-25 berhasil diambil
Data ke-26 berhasil diambil
Data ke-27 berhasil diambil
Data ke-28 berhasil diambil
Data ke-29 berhasil diambil
Data ke-30 berhasil diambil
Data ke-31 berhasil diambil
Data ke-32 berhasil diambil
Data ke-33 berhasil diambil
Data ke-34 be

In [3]:
import pandas as pd

# baca file hasil scraping
df_raw = pd.read_csv("pta_manajemen_raw.csv")

print("=== DATA HASIL CRAWLING (RAW) ===")
print(df_raw.head(10))   # tampilkan 10 baris pertama
print("\nJumlah data:", len(df_raw))

=== DATA HASIL CRAWLING (RAW) ===
                   penulis                                              judul  \
0                  SATIYAH  PENGARUH FAKTOR-FAKTOR PELATIHAN DAN PENGEMBAN...   
1                  Faishal  ANALISIS PERSEPSI BRAND ASSOCIATION MENURUT PE...   
2          Wahyu Kurniawan  PENGARUH GAYA KEPEMIMPINAN DEMOKRATIK TERHADAP...   
3   Muhammad Zakaria Utomo  Pengukuran Website Quality Pada Situs Sistem A...   
4  Hendri Wahyudi Prayitno  PENGARUH KEPEMIMPINAN DAN KOMPENSASI TERHADAP ...   
5               Aththaariq  PENGARUH KOMPETENSI DOSEN TERHADAP KINERJA DOS...   
6           Haryono Arifin  PENGARUH PERILAKU KONSUMEN TERHADAP KEPUTUSAN ...   
7       Dharma Abidin Syah  PENGARUH TIPE KEPEMIMPINAN TERHADAP PRESTASI K...   
8            Toni Budianto  PENGARUH DIMENSI KUALITAS PELAYANAN TERHADAP K...   
9     Iwan Kurniawan Gomes  ANALISIS TINGKAT RISIKO KREDIT \r\nPADA PD. BP...   

                   pembimbing_pertama                        pembimbing_ke

## Preprocessing Crawling PTA

In [5]:
import pandas as pd
import re, string
from bs4 import BeautifulSoup
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from spellchecker import SpellChecker
from collections import Counter
from tqdm import tqdm

# Setup tqdm untuk apply pandas
tqdm.pandas()

# Kamus slang
contractions_dict = {
    "gak": "tidak", "ga": "tidak", "nggak": "tidak", "enggak": "tidak", "ngga": "tidak", "gk": "tidak",
    "gue": "saya", "gw": "saya", "gua": "saya", "lu": "kamu", "loe": "kamu",
    "dah": "sudah", "udah": "sudah", "aja": "saja", "yg": "yang", "utk": "untuk",
    "tp": "tetapi", "tapi": "tetapi", "bgt": "sekali", "lg": "lagi"
}

stop_words = set([
    "yang","di","ke","dan","dari","ini","itu","pada","untuk",
    "dengan","sebagai","adalah","merupakan","dalam","yaitu",
    "suatu","sebuah","akan","telah","bisa","agar","oleh",
    "bahwa","juga","atau","tidak","namun","tetapi","kemudian"
])

# Stemmer dan SpellChecker
factory = StemmerFactory()
stemmer = factory.create_stemmer()
spell = SpellChecker()

# --- Fungsi Preprocessing ---
def clean_base_text(text):
    if not isinstance(text, str): return ''
    text = text.lower()
    words = text.split()
    words = [contractions_dict.get(w, w) for w in words]
    text = ' '.join(words)
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    text = BeautifulSoup(text, "html.parser").get_text()
    return text

def tokenize(text): return text.split()
def remove_stopwords(tokens): return [w for w in tokens if w not in stop_words]
def stemming(tokens): return stemmer.stem(' '.join(tokens)).split()
def correct(tokens): return [spell.correction(w) or w for w in tokens]

# --- Main Preprocessing ---
def preprocess():
    print("📂 Membaca file CSV...")
    df = pd.read_csv("pta_manajemen_raw.csv")
    print(f"✅ Dataset terbaca, jumlah data: {len(df)} baris\n")

    print("🔄 Tahap 1: Cleaning...")
    df["abstrak_indonesia_clean"] = df["abstrak_indonesia"].progress_apply(clean_base_text)

    print("🔄 Tahap 2: Tokenizing...")
    df["abstrak_indonesia_tokens"] = df["abstrak_indonesia_clean"].progress_apply(tokenize)

    print("🔄 Tahap 3: Remove Stopwords...")
    df["abstrak_indonesia_stopwords"] = df["abstrak_indonesia_tokens"].progress_apply(remove_stopwords)

    print("🔄 Tahap 4: Stemming...")
    df["abstrak_indonesia_stemmed"] = df["abstrak_indonesia_stopwords"].progress_apply(stemming)

    print("🔄 Tahap 5: Spell Correction...")
    df["abstrak_indonesia_corrected"] = df["abstrak_indonesia_stemmed"].progress_apply(correct)

    print("🔄 Tahap 6: Hitung Frekuensi Kata...")
    df["frekuensi_kata_indonesia"] = df["abstrak_indonesia_stemmed"].progress_apply(lambda x: dict(Counter(x)))

    print("\n💾 Menyimpan hasil preprocessing ke pta_manajemen_preprocessed.csv...")
    df.to_csv("pta_manajemen_preprocessed.csv", index=False)
    print("✅ Preprocessing selesai.")

if __name__ == "__main__":
    preprocess()

📂 Membaca file CSV...
✅ Dataset terbaca, jumlah data: 1031 baris

🔄 Tahap 1: Cleaning...


100%|████████████████████████████████████████████████████████████████████████████| 1031/1031 [00:00<00:00, 4258.63it/s]


🔄 Tahap 2: Tokenizing...


100%|███████████████████████████████████████████████████████████████████████████| 1031/1031 [00:00<00:00, 79988.30it/s]


🔄 Tahap 3: Remove Stopwords...


100%|███████████████████████████████████████████████████████████████████████████| 1031/1031 [00:00<00:00, 48555.21it/s]


🔄 Tahap 4: Stemming...


100%|██████████████████████████████████████████████████████████████████████████████| 1031/1031 [08:47<00:00,  1.96it/s]


🔄 Tahap 5: Spell Correction...


100%|████████████████████████████████████████████████████████████████████████████| 1031/1031 [8:41:51<00:00, 30.37s/it]


🔄 Tahap 6: Hitung Frekuensi Kata...


100%|███████████████████████████████████████████████████████████████████████████| 1031/1031 [00:00<00:00, 31355.93it/s]



💾 Menyimpan hasil preprocessing ke pta_manajemen_preprocessed.csv...
✅ Preprocessing selesai.


In [6]:
import pandas as pd

# baca file hasil preprocessing
df_pre = pd.read_csv("pta_manajemen_preprocessed.csv")

print("=== DATA HASIL PREPROCESSING ===")
print(df_pre.head(10))   # tampilkan 10 baris pertama
print("\nJumlah data:", len(df_pre))

=== DATA HASIL PREPROCESSING ===
                   penulis                                              judul  \
0                  SATIYAH  PENGARUH FAKTOR-FAKTOR PELATIHAN DAN PENGEMBAN...   
1                  Faishal  ANALISIS PERSEPSI BRAND ASSOCIATION MENURUT PE...   
2          Wahyu Kurniawan  PENGARUH GAYA KEPEMIMPINAN DEMOKRATIK TERHADAP...   
3   Muhammad Zakaria Utomo  Pengukuran Website Quality Pada Situs Sistem A...   
4  Hendri Wahyudi Prayitno  PENGARUH KEPEMIMPINAN DAN KOMPENSASI TERHADAP ...   
5               Aththaariq  PENGARUH KOMPETENSI DOSEN TERHADAP KINERJA DOS...   
6           Haryono Arifin  PENGARUH PERILAKU KONSUMEN TERHADAP KEPUTUSAN ...   
7       Dharma Abidin Syah  PENGARUH TIPE KEPEMIMPINAN TERHADAP PRESTASI K...   
8            Toni Budianto  PENGARUH DIMENSI KUALITAS PELAYANAN TERHADAP K...   
9     Iwan Kurniawan Gomes  ANALISIS TINGKAT RISIKO KREDIT \r\nPADA PD. BP...   

                   pembimbing_pertama                        pembimbing_ked

## Page & Link Keluar PTA

In [7]:
import urllib3
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd

# matikan warning SSL insecure
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def scrape_all_links(base_url, max_pages=50):
    visited = set()
    results = []

    def scrape_page(url):
        try:
            response = requests.get(url, verify=False, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')

            # ambil semua link keluar
            for a in soup.find_all('a', href=True):
                href = a['href'].strip()
                if not href or href == "#":
                    continue
                full_link = urljoin(url, href)
                results.append({
                    "Page": url,
                    "Link Keluar": full_link
                })

            # cari link internal untuk dilanjutkan
            for a in soup.find_all('a', href=True):
                href = a['href'].strip()
                if not href or href == "#":
                    continue
                full_link = urljoin(url, href)
                # hanya ambil link internal (masih di domain utama)
                if full_link.startswith(base_url) and full_link not in visited:
                    visited.add(full_link)
                    if len(visited) < max_pages:
                        scrape_page(full_link)

        except Exception as e:
            print(f"⚠️ Gagal akses {url}: {e}")

    # mulai dari base_url
    visited.add(base_url)
    scrape_page(base_url)

    # rapikan dataframe
    df = pd.DataFrame(results).reset_index(drop=True)
    df.index += 1
    df.insert(0, "No", df.index)
    return df

# contoh penggunaan
url = "https://manajemen.trunojoyo.ac.id/"
df_links = scrape_all_links(url, max_pages=30)  # max_pages = batas biar ga infinite loop

df_links
# df_links.to_csv("semua_link.csv", index=False, encoding="utf-8-sig")

Unnamed: 0,No,Page,Link Keluar
1,1,https://manajemen.trunojoyo.ac.id/,https://manajemen.trunojoyo.ac.id/#content
2,2,https://manajemen.trunojoyo.ac.id/,tel:082330605254
3,3,https://manajemen.trunojoyo.ac.id/,mailto:manajemen@trunojoyo.ac.id
4,4,https://manajemen.trunojoyo.ac.id/,https://manajemen.trunojoyo.ac.id/
5,5,https://manajemen.trunojoyo.ac.id/,https://manajemen.trunojoyo.ac.id/
...,...,...,...
2469,2469,https://manajemen.trunojoyo.ac.id/kalender-aka...,https://manajemen.trunojoyo.ac.id/visit-compan...
2470,2470,https://manajemen.trunojoyo.ac.id/kalender-aka...,https://manajemen.trunojoyo.ac.id/internationa...
2471,2471,https://manajemen.trunojoyo.ac.id/kalender-aka...,https://wordpress.org/
2472,2472,https://manajemen.trunojoyo.ac.id/kalender-aka...,https://wenthemes.com/


## Preprocessing Berita

In [3]:
import pandas as pd
import re
import string
import sys
import time
import random
from collections import Counter
from tqdm import tqdm
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from spellchecker import SpellChecker
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# ===============================
# KONFIGURASI PREPROCESSING
# ===============================

# Kamus normalisasi kata tidak baku → baku
contractions_dict = {
    "gak": "tidak", "ga": "tidak", "nggak": "tidak", "enggak": "tidak", "ngga": "tidak", "gk": "tidak",
    "gue": "saya", "gw": "saya", "gua": "saya", "lu": "kamu", "loe": "kamu",
    "dah": "sudah", "udah": "sudah", "aja": "saja", "ajah": "saja",
    "yg": "yang", "utk": "untuk", "dlm": "dalam", "dr": "dari", "dg": "dengan",
    "jd": "jadi", "krn": "karena", "tp": "tetapi", "tapi": "tetapi",
    "banget": "sekali", "bgt": "sekali", "lg": "lagi",
}

# Inisialisasi alat bantu preprocessing
stemmer = StemmerFactory().create_stemmer()
spell = SpellChecker()
stopword_remover = StopWordRemoverFactory().create_stop_word_remover()


# ===============================
# FUNGSI-FUNGSI PREPROCESSING
# ===============================

def clean_base_text(text):
    """Membersihkan teks dari angka, simbol, dan mengganti slang."""
    if not isinstance(text, str):
        return ''
    words = text.split()
    expanded_words = [contractions_dict.get(word.lower(), word) for word in words]
    text = ' '.join(expanded_words)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def tokenize_text(text):
    """Tokenisasi teks menjadi daftar kata."""
    return text.split()


def remove_stopwords(tokens):
    """Menghapus kata umum (stopwords)."""
    text = ' '.join(tokens)
    text = stopword_remover.remove(text)
    return text.split()


def apply_stemming(tokens):
    """Melakukan stemming (mengubah ke kata dasar)."""
    text = ' '.join(tokens)
    stemmed_text = stemmer.stem(text)
    return stemmed_text.split()


def correct_spelling(tokens):
    """Melakukan pembetulan ejaan kata menggunakan SpellChecker."""
    corrected = [spell.correction(word) if spell.correction(word) else word for word in tokens]
    return corrected


# ===============================
# FUNGSI UTAMA PREPROCESSING
# ===============================

def preprocessing_berita(input_file="crawling_detik_berita.csv", output_file="hasil_preprocessing_berita.csv"):
    print("📥 Membaca data dari file:", input_file)
    df = pd.read_csv(input_file)
    print(f"✅ Data berhasil dimuat. Total {len(df)} baris.\n")

    processed_data = []
    start_time = time.time()

    # Gunakan tqdm untuk progress bar
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="🔄 Memproses Berita", ncols=90):
        content = str(row.get("isi_berita_original", ""))
        title = str(row.get("judul_berita", ""))
        category = str(row.get("kategori_berita", ""))

        if not content.strip():
            continue

        # === Tahapan preprocessing ===
        clean_text = clean_base_text(content)
        tokens = tokenize_text(clean_text)
        stop_removed = remove_stopwords(tokens)
        stemmed = apply_stemming(stop_removed)
        corrected = correct_spelling(stemmed)
        freq = Counter(corrected)

        processed_data.append({
            "id_berita": row.get("id_berita"),
            "judul_berita": title,
            "kategori_berita": category,
            "isi_berita_original": content,
            "isi_berita_bersih": clean_text,
            "isi_berita_diproses": ' '.join(corrected),
            "jumlah_kata": len(corrected),
            "frekuensi_kata": dict(freq)
        })

        time.sleep(random.uniform(0.1, 0.3))  # jeda kecil agar progress bar smooth

    result_df = pd.DataFrame(processed_data)
    result_df.to_csv(output_file, index=False, encoding="utf-8-sig")

    end_time = time.time()
    elapsed = int(end_time - start_time)
    menit, detik = divmod(elapsed, 60)
    print(f"\n✅ Preprocessing selesai!")
    print(f"📊 Total berita diproses: {len(result_df)}")
    print(f"💾 File hasil disimpan sebagai: {output_file}")
    print(f"⏱️ Durasi: {menit} menit {detik} detik")
    print("\n📌 Contoh hasil 5 berita pertama:")
    print(result_df.head())

    return result_df


# ===============================
# EKSEKUSI
# ===============================
if __name__ == "__main__":
    preprocessing_berita()


📥 Membaca data dari file: crawling_detik_berita.csv
✅ Data berhasil dimuat. Total 701 baris.



🔄 Memproses Berita: 100%|███████████████████████████| 701/701 [10:35:37<00:00, 54.40s/it]


✅ Preprocessing selesai!
📊 Total berita diproses: 701
💾 File hasil disimpan sebagai: hasil_preprocessing_berita.csv
⏱️ Durasi: 635 menit 37 detik

📌 Contoh hasil 5 berita pertama:
   id_berita                                       judul_berita  \
0    8158197  Pria di Bekasi Curi Kabel Rel KA, KAI Ungkap R...   
1    8158178  3 Pelaku Ditangkap Terkait 2 Orang di Kintaman...   
2    8158168  Bripda Aprilia Eka Raih Medali Emas di Uzbekis...   
3    8158166  DKI Gratiskan Layanan Angkut Sampah Besar, War...   
4    8158148  Video Call Terakhir Letda Fauzy dengan Ayah Se...   

  kategori_berita                                isi_berita_original  \
0         politik  Petugas pengamanan (PAM) PT Kereta Api Indones...   
1         politik  Polisi telah mengamankan tiga orang dari perke...   
2         politik  Bripda Aprilia Eka Putri Lumbantungkup menjuar...   
3         politik  Pemerintah Provinsi DKI Jakartakini menggratis...   
4         politik  Perwira muda TNI AD, Letda Inf Fauzy 


