# CRAWLING PTA MANAJEMEN

In [None]:
!pip install builtwith
!pip install nltk
!pip install Sastrawi
!pip install pyspellchecker

Collecting builtwith
  Downloading builtwith-1.3.4.tar.gz (34 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: builtwith
  Building wheel for builtwith (setup.py) ... done
  Created wheel for builtwith: filename=builtwith-1.3.4-py3-none-any.whl size=36077 sha256=f29094318d557c75bce2f55da1b54274f018be4ddcd1f78c64e98f0d3d6bbe8c
  Stored in directory: /root/.cache/pip/wheels/7f/2d/b2/606e3df914d4aeeab99c4a4e3e9a61673d2293c2e346db00c8
Successfully built builtwith
Installing collected packages: builtwith
Successfully installed builtwith-1.3.4
Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 209.7/209.7 kB 11.8 MB/s eta 0:00:00
Installing collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1
Collecting pyspellchecker
  Downloading pys

## 1. Crawling PTA

In [None]:
import builtwith

# Detect the technologies used by the PTA site and print what builtwith reports.
res = builtwith.parse('https://pta.trunojoyo.ac.id')
print(res)

{'web-servers': ['Nginx'], 'javascript-frameworks': ['jQuery', 'jQuery UI']}


## Preprocessing Crawling PTA

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import sys
import re
import string
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from spellchecker import SpellChecker
from collections import Counter

# --- PREPROCESSING CONFIGURATION ---
# Lookup table used to normalise Indonesian slang / abbreviated words to their
# standard form. All keys are lowercase.
contractions_dict = {
    # negation
    "gak": "tidak",
    "ga": "tidak",
    "nggak": "tidak",
    "enggak": "tidak",
    "ngga": "tidak",
    "gk": "tidak",
    # pronouns
    "gue": "saya",
    "gw": "saya",
    "gua": "saya",
    "lu": "kamu",
    "loe": "kamu",
    # adverbs
    "dah": "sudah",
    "udah": "sudah",
    "aja": "saja",
    "ajah": "saja",
    # abbreviated function words
    "yg": "yang",
    "utk": "untuk",
    "dlm": "dalam",
    "dr": "dari",
    "dg": "dengan",
    "jd": "jadi",
    "krn": "karena",
    "tp": "tetapi",
    "tapi": "tetapi",
    # intensifiers and others
    "banget": "sekali",
    "bgt": "sekali",
    "lg": "lagi",
}

# Indonesian stop words removed from the token stream before stemming.
# Written as a set literal, one spelling per word, in alphabetical order.
stop_words = {
    "adalah", "agar", "akan", "atau",
    "bahwa", "bisa",
    "dalam", "dan", "dari", "dengan", "di",
    "guna",
    "hal", "hanya",
    "ini", "itu",
    "juga",
    "karena", "ke", "kemudian",
    "maka", "melalui", "merupakan",
    "namun",
    "oleh",
    "pada",
    "saat", "saja", "sampai", "sebagai", "sebelum", "sebuah", "sehingga",
    "sejak", "seperti", "serta", "setelah", "suatu",
    "tanpa", "telah", "terhadap", "tetapi", "tidak",
    "untuk",
    "yaitu", "yang",
}

# Build the Sastrawi stemmer once at module level; it is reused for every
# abstract by apply_stemming_and_lemmatization().
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Build the spell checker once at module level; it is reused by correct_spelling().
# NOTE(review): no language argument is passed, so the library's default
# dictionary is used (presumably English) - confirm this is what is wanted
# for Indonesian text.
spell = SpellChecker()

# --- FUNGSI-FUNGSI PRA-PEMROSESAN TERPISAH ---

def clean_base_text(text):
    """
    Normalise raw text into lowercase words separated by single spaces.

    Steps, in order: strip HTML markup, lowercase, remove digits and ASCII
    punctuation, collapse whitespace, then replace slang / abbreviated words
    using ``contractions_dict``.

    Args:
        text: Raw text to clean. Any non-string value yields ''.

    Returns:
        The cleaned string (possibly empty).
    """
    if not isinstance(text, str):
        return ''

    # Markup has to be removed first: once punctuation is stripped the "<" and
    # ">" characters are gone and the parser can no longer recognise a tag, so
    # tag and attribute names would leak into the text as ordinary words.
    # A space separator keeps the text of adjacent elements from fusing.
    text = BeautifulSoup(text, "html.parser").get_text(" ")

    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Slang lookup runs on punctuation-free tokens so that e.g. "yg," or
    # "(gak" are recognised. split() also collapses every whitespace run and
    # trims both ends.
    words = [contractions_dict.get(word, word) for word in text.split()]
    return ' '.join(words)

def tokenize_text(text):
    """
    Tokenisation step: split the text on whitespace.

    Returns the list of words (tokens); an empty or blank string gives [].
    """
    tokens = text.split()
    return tokens

def remove_stopwords(tokens):
    """
    Stop-word removal step.

    Returns a new list holding, in their original order, the tokens that are
    not members of the module-level ``stop_words`` set.
    """
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def apply_stemming_and_lemmatization(tokens):
    """
    Stemming step: reduce affixed words to their base form.

    The tokens are joined into one space-separated string, passed through the
    module-level Sastrawi ``stemmer`` in a single call, and the result is
    split back into a list of tokens.
    """
    joined = ' '.join(tokens)
    return stemmer.stem(joined).split()

def correct_spelling(tokens):
    """
    Spell-check step: replace each token with the checker's suggested spelling.

    Uses the module-level ``spell`` checker. When the checker has no
    suggestion (``correction`` returns None) the token is kept unchanged.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A list with one (possibly corrected) word per input token, in order.
    """
    # Each distinct word is looked up only once: the previous version called
    # spell.correction() twice per token and again for every repeated word.
    suggestions = {}
    corrected_words = []
    for word in tokens:
        if word not in suggestions:
            suggestion = spell.correction(word)
            suggestions[word] = word if suggestion is None else suggestion
        corrected_words.append(suggestions[word])
    return corrected_words

# --- FUNGSI-FUNGSI BANTUAN SCRAPING ---

def get_text_or_na(soup, selectors):
    """Try each CSS selector in order and return the first non-empty text found, or 'N/A'."""
    for css in selectors:
        node = soup.select_one(css)
        if not node:
            continue
        content = node.get_text(strip=True)
        if not content:
            continue
        trimmed = content.strip()
        if trimmed:
            return trimmed
    return 'N/A'

def get_data_from_span(soup, text_contains):
    """
    Return the value of the first "<label> : <value>" span whose label matches.

    Every ``<span>`` under ``soup`` is inspected in document order. Its text is
    split at the first colon, and the part before the colon must contain
    ``text_contains`` as a whole phrase, i.e. not directly preceded or followed
    by another word character.

    Args:
        soup: BeautifulSoup node to search in.
        text_contains: Label to look for, e.g. "Penulis".

    Returns:
        The stripped text after the first colon, or 'N/A' when no span matches.
    """
    # Whole-phrase match: a plain substring test lets "Dosen Pembimbing I"
    # match the "Dosen Pembimbing II" span as well. Matching in Python also
    # avoids interpolating the label into a CSS selector string, which breaks
    # on labels that contain a double quote.
    label_pattern = re.compile(r'(?<!\w)' + re.escape(text_contains) + r'(?!\w)')

    for span in soup.find_all('span'):
        text = span.get_text(strip=True)
        label, colon, value = text.partition(':')
        if colon and label_pattern.search(label):
            return value.strip()
    return 'N/A'

def get_abstract_robust(soup, keywords):
    """
    Locate an abstract by its bold heading and return the paragraph after it.

    For every ``<b>`` tag whose text contains one of ``keywords``
    (case-insensitive), look at the ``<div>`` sibling that follows the tag's
    parent ``<div>`` and take the first ``<p align="justify">`` inside it.
    The first non-empty paragraph text found is returned; otherwise 'N/A'.
    """
    for bold in soup.find_all('b'):
        heading = bold.get_text(strip=True).lower()
        if not any(keyword.lower() in heading for keyword in keywords):
            continue

        container = bold.find_parent('div')
        if not container:
            continue

        following = container.find_next_sibling('div')
        if not following:
            continue

        paragraph = following.find('p', align="justify")
        if not paragraph:
            continue

        body = paragraph.get_text(strip=True)
        if body:
            return body
    return 'N/A'

def get_total_pages(soup):
    """
    Read the total number of result pages from the pagination navigation.

    The number is taken from the last path segment of the link inside the
    last ``<li>`` of ``ol.pagination``. Returns 1 whenever the pagination,
    the link, or a numeric last segment cannot be found.
    """
    try:
        nav = soup.select_one('ol.pagination')
        if not nav:
            return 1
        anchor = nav.select('li')[-1].select_one('a')
        if not anchor or 'href' not in anchor.attrs:
            return 1
        last_segment = anchor['href'].split('/')[-1]
        return int(last_segment)
    except (IndexError, ValueError, KeyError):
        return 1

# --- FUNGSI UTAMA SCRAPING ---

def scrape_manajemen_all_data():
    """
    Scrape every thesis record of the Manajemen programme and preprocess its abstracts.

    For each listing page the detail page of every entry is fetched; title,
    author, both supervisors and the Indonesian and English abstracts are
    extracted. Each abstract then goes through: base cleaning, tokenisation,
    stop-word removal, stemming and spell correction. Every intermediate
    result is stored in its own column.

    NOTE(review): the English abstract is passed through the same Indonesian
    stop-word list and Sastrawi stemmer as the Indonesian one.

    Side effects:
        Prints progress to stdout and errors to stderr, and writes
        ``pta_manajemen_all_preprocessing_steps.csv`` to the working directory.

    Returns:
        pandas.DataFrame with one row per scraped record.
    """
    # Seconds to wait on any single HTTP request. Without a timeout a stalled
    # connection blocks the crawl forever; a timeout raises a
    # RequestException subclass, which the handlers below already catch.
    request_timeout = 30

    prodi_data = [
        {'name': 'Manajemen', 'url': 'https://pta.trunojoyo.ac.id/c_search/byprod/7'},
    ]

    all_scraped_data = {
        "penulis": [], "judul": [], "pembimbing_pertama": [], "pembimbing_kedua": [],
        "abstrak_indonesia_raw": [], "abstrak_inggris_raw": [],
        "abstrak_indonesia_clean": [], "abstrak_inggris_clean": [],
        "abstrak_indonesia_stopwords": [], "abstrak_inggris_stopwords": [],
        "abstrak_indonesia_stemmed": [], "abstrak_inggris_stemmed": [],
        "abstrak_indonesia_corrected": [], "abstrak_inggris_corrected": [],
        "prodi": []
    }

    total_data_count = 0
    print("--- MULAI PROSES SCRAPING Prodi Manajemen ---")

    for prodi in prodi_data:
        print(f"\nScraping data for program: {prodi['name']}")
        print("-" * 50)
        first_page_url = prodi['url']

        # The first page is only used to discover how many pages exist.
        try:
            r = requests.get(first_page_url, timeout=request_timeout)
            r.raise_for_status()
            soup = BeautifulSoup(r.content, "html.parser")
            total_pages = get_total_pages(soup)
            print(f"Ditemukan {total_pages} halaman untuk {prodi['name']}")
        except requests.exceptions.RequestException as e:
            print(f"ERROR: Gagal mengambil halaman pertama untuk {prodi['name']}: {e}", file=sys.stderr)
            total_pages = 1

        for i in range(1, total_pages + 1):
            url = f"{prodi['url']}/{i}"
            try:
                r = requests.get(url, timeout=request_timeout)
                r.raise_for_status()
                soup = BeautifulSoup(r.content, "html.parser")
                jurnals = soup.select('li[data-cat="#luxury"]')

                if not jurnals:
                    print(f"Tidak ada jurnal lagi di halaman {i}. Berhenti untuk program ini.")
                    break

                for jurnal in jurnals:
                    # A listing entry without a usable detail link is skipped;
                    # indexing the missing element directly would raise and
                    # abort the whole crawl.
                    detail_link = jurnal.select_one('a.gray.button')
                    jurnal_url = detail_link.get('href') if detail_link else None
                    if not jurnal_url:
                        print(f"ERROR: Tautan detail tidak ditemukan di halaman {url}, entri dilewati.", file=sys.stderr)
                        continue

                    try:
                        response = requests.get(jurnal_url, timeout=request_timeout)
                        response.raise_for_status()
                        soup1 = BeautifulSoup(response.content, "html.parser")
                        isi = soup1.select_one('div#content_journal')
                        if isi:
                            judul = get_text_or_na(isi, ['a.title', 'b.title', 'h2.title'])
                            penulis = get_data_from_span(isi, "Penulis")
                            pembimbing_pertama = get_data_from_span(isi, "Dosen Pembimbing I")
                            pembimbing_kedua = get_data_from_span(isi, "Dosen Pembimbing II")
                            abstrak_indonesia = get_abstract_robust(isi, ["Abstraksi", "Abstrak"])
                            abstrak_inggris = get_abstract_robust(isi, ["Abstraction", "Abstract", "ABSTRACT"])

                            # --- Text preprocessing pipeline ---

                            # 1. Base cleaning (lowercase, slang normalisation, drop symbols and digits)
                            clean_indonesia_text = clean_base_text(abstrak_indonesia)
                            clean_inggris_text = clean_base_text(abstrak_inggris)

                            # 2. Tokenisation
                            tokens_indonesia = tokenize_text(clean_indonesia_text)
                            tokens_inggris = tokenize_text(clean_inggris_text)

                            # 3. Stop-word removal
                            stopwords_indonesia_tokens = remove_stopwords(tokens_indonesia)
                            stopwords_inggris_tokens = remove_stopwords(tokens_inggris)

                            # 4. Stemming
                            stemmed_indonesia_tokens = apply_stemming_and_lemmatization(stopwords_indonesia_tokens)
                            stemmed_inggris_tokens = apply_stemming_and_lemmatization(stopwords_inggris_tokens)

                            # 5. Spell correction
                            corrected_indonesia_tokens = correct_spelling(stemmed_indonesia_tokens)
                            corrected_inggris_tokens = correct_spelling(stemmed_inggris_tokens)

                            # Word frequencies are computed on the stemmed tokens
                            # (before spell correction).
                            word_frequency_indonesia = Counter(stemmed_indonesia_tokens)
                            word_frequency_inggris = Counter(stemmed_inggris_tokens)

                            # Append one value to every column so all lists stay the same length.
                            all_scraped_data["penulis"].append(penulis)
                            all_scraped_data["judul"].append(judul)
                            all_scraped_data["pembimbing_pertama"].append(pembimbing_pertama)
                            all_scraped_data["pembimbing_kedua"].append(pembimbing_kedua)
                            all_scraped_data["prodi"].append(prodi['name'])
                            all_scraped_data["abstrak_indonesia_raw"].append(abstrak_indonesia)
                            all_scraped_data["abstrak_inggris_raw"].append(abstrak_inggris)
                            all_scraped_data["abstrak_indonesia_clean"].append(' '.join(tokens_indonesia))
                            all_scraped_data["abstrak_inggris_clean"].append(' '.join(tokens_inggris))
                            all_scraped_data["abstrak_indonesia_stopwords"].append(' '.join(stopwords_indonesia_tokens))
                            all_scraped_data["abstrak_inggris_stopwords"].append(' '.join(stopwords_inggris_tokens))
                            all_scraped_data["abstrak_indonesia_stemmed"].append(' '.join(stemmed_indonesia_tokens))
                            all_scraped_data["abstrak_inggris_stemmed"].append(' '.join(stemmed_inggris_tokens))
                            all_scraped_data["abstrak_indonesia_corrected"].append(' '.join(corrected_indonesia_tokens))
                            all_scraped_data["abstrak_inggris_corrected"].append(' '.join(corrected_inggris_tokens))

                            total_data_count += 1
                            print(f"\n--- Data #{total_data_count} ---")
                            print(f"Prodi: {prodi['name']}")
                            print(f"Penulis: {penulis}")
                            print(f"Judul: {judul}")
                            print(f"Pembimbing 1: {pembimbing_pertama}")
                            print(f"Pembimbing 2: {pembimbing_kedua}")
                            print(f"Abstrak (Raw): {abstrak_indonesia}")
                            print(f"Abstrak (Clean): {clean_indonesia_text}")
                            print(f"Abstrak (Stopwords Removed): {' '.join(stopwords_indonesia_tokens)}")
                            print(f"Abstrak (Stemmed): {' '.join(stemmed_indonesia_tokens)}")
                            print(f"Abstrak (Corrected): {' '.join(corrected_indonesia_tokens)}")

                            # Word-frequency output
                            print("\n--- Tokenisasi (Perhitungan Jumlah Kata) ---")
                            print(f"Frekuensi Kata (Indonesia): {word_frequency_indonesia}")
                            print(f"Frekuensi Kata (Inggris): {word_frequency_inggris}")
                            print("-------------------------------------------\n")

                    except Exception as e:
                        # Best effort: one broken detail page must not stop the crawl.
                        print(f"ERROR: Terjadi kesalahan saat memproses URL: {jurnal_url} - {e}", file=sys.stderr)

                # Pause between listing pages to avoid hammering the server.
                time.sleep(1)

            except requests.exceptions.RequestException as e:
                print(f"ERROR: Gagal mengambil URL {url}: {e}", file=sys.stderr)
                continue

    df = pd.DataFrame(all_scraped_data)
    df.to_csv("pta_manajemen_all_preprocessing_steps.csv", index=False)
    print("\n---")
    print(f"Scraping selesai. Total data yang diambil: {total_data_count}")
    print("Data disimpan ke pta_manajemen_all_preprocessing_steps.csv")
    print("---")
    return df

# Run the main scraping routine (network access; writes the CSV file).
scrape_manajemen_all_data()

Output streaming akan dipotong hingga 5000 baris terakhir.
--- Data #407 ---
Prodi: Manajemen
Penulis: Subhan Juniarto
Judul: PENGARUH PENGAWASAN PIMPINAN DAN MOTIVASI PEGAWAI TERHADAP KINERJA DINAS KEPENDUDUKAN DAN PENCATATAN SIPIL KABUPATEN SUMENEP
Pembimbing 1: Drs. Ec. Mudji Kuswinarno, M. Si.
Pembimbing 2: Faidal, SE., MM.
Abstrak (Raw): Rumusan masalah dalam penelitian ini adalah (1) apakah Pengawasan Pimpinan berpengaruh terhadap Kinerja Dinas Kependudukan dan Pencatatan Sipil Kabupaten Sumenep? (2) apakah Motivasi Pegawai berpengaruh terhadap Kinerja Dinas Kependudukan dan Pencatatan Sipil Kabupaten Sumenep? (3) apakah Pengawasan Pimpinan dan Motivasi Pegawai berpengaruh terhadap Kinerja Dinas Kependudukan dan Pencatatan Sipil Kabupaten Sumenep? Penelitian ini menggunakan metode kuantitatif dimana populasi dan sampelnya adalah semua Pegawai Negeri Sipil Dinas Kependudukan dan Pencatatan Sipil Kabupaten Sumenep yang berjumlah 74 pegawai. Berdasarkan hasill peneliti

## Page & Link Keluar PTA

In [None]:
import urllib3
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd

# Silence urllib3's InsecureRequestWarning; the crawler below sends its
# requests with verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def scrape_all_links(base_url, max_pages=50):
    """
    Crawl a site depth-first and list every outgoing link of each visited page.

    Starting at ``base_url``, each page is fetched, every ``<a href>`` on it is
    recorded as a (page, link) row, and links whose absolute URL starts with
    ``base_url`` are queued for crawling in document order.

    Args:
        base_url: Start URL; also the prefix that marks a link as internal.
        max_pages: Maximum number of pages to fetch (failed fetches count).

    Returns:
        pandas.DataFrame with columns "No" (1-based row number), "Page" and
        "Link Keluar". The index is 1-based as well. The columns are present
        even when nothing was collected.
    """
    # Local import: the file-level imports only bring in urljoin.
    from urllib.parse import urldefrag

    results = []
    visited = set()
    # Explicit stack instead of recursion, so the crawl depth is not limited
    # by the interpreter's recursion limit.
    pending = [base_url]
    pages_fetched = 0

    while pending and pages_fetched < max_pages:
        # URLs that differ only in their "#fragment" are the same document,
        # so pages are tracked and fetched without the fragment.
        url = urldefrag(pending.pop()).url
        if url in visited:
            continue
        visited.add(url)
        pages_fetched += 1

        try:
            # NOTE(review): verify=False disables TLS certificate validation.
            response = requests.get(url, verify=False, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')

            internal_links = []
            for a in soup.find_all('a', href=True):
                href = a['href'].strip()
                if not href or href == "#":
                    continue
                full_link = urljoin(url, href)

                # Every link is reported, internal or not.
                results.append({
                    "Page": url,
                    "Link Keluar": full_link
                })

                # Only links under base_url are followed.
                if full_link.startswith(base_url):
                    target = urldefrag(full_link).url
                    if target not in visited:
                        internal_links.append(target)

            # Reversed so the first link on the page is popped (crawled) first.
            pending.extend(reversed(internal_links))

        except Exception as e:
            # Best effort: a page that cannot be fetched or parsed is reported and skipped.
            print(f"⚠️ Gagal akses {url}: {e}")

    # Tidy the frame: fixed column set, 1-based numbering.
    df = pd.DataFrame(results, columns=["Page", "Link Keluar"]).reset_index(drop=True)
    df.index += 1
    df.insert(0, "No", df.index)
    return df

# Example usage
url = "https://manajemen.trunojoyo.ac.id/"
df_links = scrape_all_links(url, max_pages=30)  # max_pages caps the crawl so it cannot run indefinitely

df_links
# Optional CSV export: df_links.to_csv("semua_link.csv", index=False, encoding="utf-8-sig")

Unnamed: 0,No,Page,Link Keluar
1,1,https://manajemen.trunojoyo.ac.id/,https://manajemen.trunojoyo.ac.id/#content
2,2,https://manajemen.trunojoyo.ac.id/,tel:082330605254
3,3,https://manajemen.trunojoyo.ac.id/,mailto:manajemen@trunojoyo.ac.id
4,4,https://manajemen.trunojoyo.ac.id/,https://manajemen.trunojoyo.ac.id/
5,5,https://manajemen.trunojoyo.ac.id/,https://manajemen.trunojoyo.ac.id/
...,...,...,...
2469,2469,https://manajemen.trunojoyo.ac.id/kalender-aka...,https://manajemen.trunojoyo.ac.id/visit-compan...
2470,2470,https://manajemen.trunojoyo.ac.id/kalender-aka...,https://manajemen.trunojoyo.ac.id/internationa...
2471,2471,https://manajemen.trunojoyo.ac.id/kalender-aka...,https://wordpress.org/
2472,2472,https://manajemen.trunojoyo.ac.id/kalender-aka...,https://wenthemes.com/
