# Crawling and Preprocessing PTA Management


In [3]:
!pip install Sastrawi



In [2]:
!pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.3-py3-none-any.whl.metadata (9.5 kB)
Downloading pyspellchecker-0.8.3-py3-none-any.whl (7.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.3


In [4]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
import time
from tqdm import tqdm

base_url = "https://pta.trunojoyo.ac.id/c_search/byprod/7/"
detail_base = "https://pta.trunojoyo.ac.id"

total_pages = 207

abstrak_list = []

for page in tqdm(range(1, total_pages + 1), desc="Mengambil halaman"):
    url = f"{base_url}{page}"
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")

    links = soup.select("ul.items.list_style li a.gray.button")

    for link in links:
        detail_url = urljoin(detail_base, link["href"])
        try:
            r_detail = requests.get(detail_url)
            soup_detail = BeautifulSoup(r_detail.text, "html.parser")

            abstrak_tag = soup_detail.find("p", {"align": "justify"})
            if abstrak_tag:
                abstrak = abstrak_tag.text.strip()
                abstrak_list.append(abstrak)
        except Exception as e:
            print(f" Gagal ambil {detail_url}: {e}")

df = pd.DataFrame({"abstrak_raw": abstrak_list})
df.to_csv("pta_manajemen_raw.csv", index=False, encoding="utf-8", sep="\t")

print(f"Selesai! Total abstrak diambil: {len(abstrak_list)}")
print("Disimpan ke pta_manajemen_raw.csv")


Mengambil halaman: 100%|██████████| 207/207 [27:27<00:00,  7.96s/it]

Selesai! Total abstrak diambil: 1031
Disimpan ke pta_manajemen_raw.csv





Preprocessing yang dilakukan yakni proses:

- Lowercasing (semua huruf diubah menjadi huruf kecil)
- Normalisasi Slang (kata tidak baku)
- Penghapusan angka
- Penghapusan tanda baca
- Penghapusan spasi berlebih
- Tokenisasi
- Penghapsan Stopword
- Stemming
- Menghitung jumlah kemunculan kata

In [None]:
import pandas as pd
import re
import string
from collections import Counter
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


df = pd.read_csv("pta_manajemen_raw.csv", sep=";")
df.columns = df.columns.str.strip()

if "abstrak_raw" not in df.columns:
    raise KeyError(f"Kolom 'abstrak_raw' tidak ditemukan. Kolom yang ada: {df.columns.tolist()}")

stemmer = StemmerFactory().create_stemmer()
stop_remover = StopWordRemoverFactory().create_stop_word_remover()

contractions_dict = {
    "gak": "tidak", "ga": "tidak", "nggak": "tidak", "enggak": "tidak",
    "gue": "saya", "gw": "saya", "gua": "saya",
    "lu": "kamu", "loe": "kamu",
    "udah": "sudah", "dah": "sudah", "aja": "saja",
    "yg": "yang", "utk": "untuk", "dlm": "dalam", "dr": "dari", "dg": "dengan",
    "jd": "jadi", "krn": "karena", "tp": "tetapi", "tapi": "tetapi"
}

def normalize_slang(text):
    words = text.split()
    return " ".join([contractions_dict.get(w, w) for w in words])

def clean_text(text):
    if pd.isna(text): return ""
    text = text.lower()
    text = normalize_slang(text)
    text = re.sub(r"\d+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\s+", " ", text).strip()
    return text

def tokenize(text): return text.split()
def remove_stopwords(text): return stop_remover.remove(text)
def apply_stemming(text): return stemmer.stem(text)
def word_frequency(tokens): return "; ".join([f"{w}:{c}" for w, c in Counter(tokens).most_common()])

results = []
for idx, abstrak in enumerate(df["abstrak_raw"], start=1):
    print(f"Sedang memproses baris {idx}/{len(df)}...")

    raw = abstrak
    clean = clean_text(raw)
    tokens = tokenize(clean)
    stop_removed = remove_stopwords(clean)
    stemmed = apply_stemming(stop_removed)
    freq = word_frequency(tokenize(stemmed))

    results.append({
        "abstrak_raw": raw,
        "abstrak_clean": clean,
        "tokens": " ".join(tokens),
        "stop_removed": stop_removed,
        "stemmed": stemmed,
        "frekuensi": freq,
        "final_preprocessed": stemmed
    })

df_preprocessed = pd.DataFrame(results)
df_preprocessed.to_csv("pta_manajemen_preprocessed.csv", index=False, encoding="utf-8")
print("Preprocessing selesai! Hasil disimpan di pta_manajemen_preprocessed.csv")


Sedang memproses baris 1/1031...
Sedang memproses baris 2/1031...
Sedang memproses baris 3/1031...
Sedang memproses baris 4/1031...
Sedang memproses baris 5/1031...
Sedang memproses baris 6/1031...
Sedang memproses baris 7/1031...
Sedang memproses baris 8/1031...
Sedang memproses baris 9/1031...
Sedang memproses baris 10/1031...
Sedang memproses baris 11/1031...
Sedang memproses baris 12/1031...
Sedang memproses baris 13/1031...
Sedang memproses baris 14/1031...
Sedang memproses baris 15/1031...
Sedang memproses baris 16/1031...
Sedang memproses baris 17/1031...
Sedang memproses baris 18/1031...
Sedang memproses baris 19/1031...
Sedang memproses baris 20/1031...
Sedang memproses baris 21/1031...
Sedang memproses baris 22/1031...
Sedang memproses baris 23/1031...
Sedang memproses baris 24/1031...
Sedang memproses baris 25/1031...
Sedang memproses baris 26/1031...
Sedang memproses baris 27/1031...
Sedang memproses baris 28/1031...
Sedang memproses baris 29/1031...
Sedang memproses baris 