In [1]:
from google.colab import files

# Upload file ZIP dari komputermu
uploaded = files.upload()


Saving pdf_putusan.zip to pdf_putusan.zip


In [3]:
import zipfile
import os

zip_path = "/content/pdf_putusan.zip"  # ubah jika beda
extract_folder = "/content/pdf_putusan"

os.makedirs(extract_folder, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)

print(f"Berhasil diekstrak ke: {extract_folder}")


Berhasil diekstrak ke: /content/pdf_putusan


# **Tahap 1 – Membangun Case Base**

In [4]:
!pip install pdfminer.six



**1 Konversi PDF → Plain Text**

In [5]:
from pdfminer.high_level import extract_text
import os

input_folder = "/content/pdf_putusan/pdf_putusan"
output_folder = "/content/data/raw"
os.makedirs(output_folder, exist_ok=True)

def pdf_to_txt(pdf_path):
    try:
        text = extract_text(pdf_path)
        return text
    except:
        return ""

# Konversi semua PDF
for i, filename in enumerate(sorted(os.listdir(input_folder))):
    if filename.endswith(".pdf"):
        case_id = f"case_{i+1:03d}"
        path_pdf = os.path.join(input_folder, filename)
        isi = pdf_to_txt(path_pdf)

        with open(os.path.join(output_folder, f"{case_id}.txt"), "w", encoding="utf-8") as f:
            f.write(isi)


**2 Pembersihan Teks**

In [6]:
import re

def bersihkan_teks(teks):
    teks = teks.replace('\n', ' ')
    teks = re.sub(r"\s+", " ", teks)
    teks = re.sub(r"\d+\s*/\s*\d+", "", teks)  # hapus nomor halaman
    teks = re.sub(r"Halaman\s+\d+", "", teks)
    return teks.strip()

# Bersihkan semua
for file in os.listdir(output_folder):
    if file.endswith(".txt"):
        path = os.path.join(output_folder, file)
        with open(path, encoding="utf-8") as f:
            teks = f.read()
        teks_bersih = bersihkan_teks(teks)
        with open(path, "w", encoding="utf-8") as f:
            f.write(teks_bersih)


# **Tahap 2 – Representasi Kasus**

**1 Ekstrak Metadata**

In [7]:
import pandas as pd

def ekstrak_metadata(teks):
    no_perkara = re.search(r"(nomor|perkara)[\s:]*([\w/\-.]+)", teks, re.IGNORECASE)
    no_perkara = no_perkara.group(2) if no_perkara else ""
    pihak = re.search(r"antara\s+(.*?)\s+dengan\s+(.*?)\s", teks, re.IGNORECASE)
    pihak = f"{pihak.group(1)} vs {pihak.group(2)}" if pihak else ""
    pasal = re.findall(r"pasal\s+\d+[\s\S]{0,20}", teks, re.IGNORECASE)
    pasal = "; ".join(set(pasal)) if pasal else ""
    fakta = re.search(r"(menimbang|bahwa)[\s\S]{0,500}", teks.lower())
    fakta = fakta.group(0) if fakta else ""
    amar_patterns = [
        r"menolak", r"permohonan.*?ditolak", r"menghukum",
        r"mengabulkan", r"permohonan.*?dikabulkan", r"terkabulkan",
        r"tidak dapat diterima", r"diterima", r"terima"
    ]
    amar = ""
    for pattern in amar_patterns:
        match = re.search(pattern, teks.lower())
        if match:
            amar += match.group(0).strip() + " | "
    return no_perkara, pihak, pasal, fakta.strip(), amar.strip()


**2 Proses Semua Dokumen → CSV**

In [8]:
data = []
for file in os.listdir(output_folder):
    if file.endswith(".txt"):
        with open(os.path.join(output_folder, file), encoding="utf-8") as f:
            teks = f.read()
            no_perkara, pihak, pasal, fakta, amar = ekstrak_metadata(teks)
            data.append({
                "case_id": file.replace(".txt", ""),
                "no_perkara": no_perkara,
                "pihak": pihak,
                "pasal": pasal,
                "ringkasan_fakta": fakta,
                "amar_patterns": amar,
                "isi_teks": teks
            })

df = pd.DataFrame(data)
os.makedirs("/content/data/raw", exist_ok=True)
df.to_csv("/content/data/cases.csv", index=False)
df.head()


Unnamed: 0,case_id,no_perkara,pihak,pasal,ringkasan_fakta,amar_patterns,isi_teks
0,case_022,270,Mak Ake selaku pembeli vs Baharuddin,Pasal 3 ayat (2) Peradilan ; Pasal 1946 hukum ...,"menimbang, bahwa dari surat-surat tersebut ter...",menolak | permohonan tergugat putusan pengadil...,Direktori Putusan Mahkamah Agung Republik Indo...
1,case_005,456,almarkum Haji Mukhtar sebagai wakif vs para,Pasal 30 Undang-Undang Nomor,"menimbang, bahwa dari surat-surat tersebut ter...","menolak | permohonan para tergugat, telah diba...",Direktori Putusan Mahkamah Agung Republik Indo...
2,case_040,982,pihak Penggugat menurut bagiannya masing-masin...,"Pasal 189 (2) Rbg., justru me; Pasal 1365 meng...","menimbang, bahwa dari surat-surat tersebut ter...",menolak | menghukum | mengabulkan | permohonan...,Direktori Putusan Mahkamah Agung Republik Indo...
3,case_020,perdata,lain : - Sebidang tanah yang terletak di Desa ...,Pasal 1868 KUH Perdata dan bil; Pasal 1874 KUH...,"menimbang, bahwa dari surat-surat tersebut ter...",menolak | permohonan - permohonan maupun akta ...,Direktori Putusan Mahkamah Agung Republik Indo...
4,case_030,3743,"Tergugat I, II dan saudara-saudaranya yang lai...",,"menimbang, bahwa dari surat-surat tersebut ter...","menolak | permohonan penggugat i, ii, iii/para...",Direktori Putusan Mahkamah Agung Republik Indo...


**3 Normalisasi Label Amar**

In [9]:
def normalisasi_amar(text):
    text = text.lower()
    if "tidak dapat diterima" in text:
        return "tidak diterima"
    elif "ditolak" in text or "menolak" in text:
        return "menolak"
    elif "dikabulkan" in text or "mengabulkan" in text:
        return "mengabulkan"
    elif "menerima" in text:
        return "menerima"
    elif "menghukum" in text:
        return "menghukum"
    else:
        return "lainnya"

df["label_amar"] = df["amar_patterns"].apply(normalisasi_amar)
df["label_amar"].value_counts()


Unnamed: 0_level_0,count
label_amar,Unnamed: 1_level_1
tidak diterima,33
menolak,7


# **Tahap 3 – Case Retrieval**

**1 TF-IDF Vectorizer**

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["ringkasan_fakta"].fillna(""))


**2 Train-Test Split**

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(df["label_amar"])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


**3 Model Machine Learning (SVM)**

In [12]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

model = LinearSVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("CLASSIFICATION REPORT")
print(classification_report(y_test, y_pred, target_names=le.classes_))


CLASSIFICATION REPORT
                precision    recall  f1-score   support

       menolak       0.00      0.00      0.00         1
tidak diterima       0.88      1.00      0.93         7

      accuracy                           0.88         8
     macro avg       0.44      0.50      0.47         8
  weighted avg       0.77      0.88      0.82         8



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**4 Fungsi Retrieval dengan Cosine Similarity**

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def retrieve(query: str, k=5):
    q_vec = vectorizer.transform([query])
    sims = cosine_similarity(q_vec, X).flatten()
    top_k_idx = sims.argsort()[-k:][::-1]
    return df.iloc[top_k_idx]["case_id"].tolist()

# Uji fungsi
retrieve("Penggugat meminta ganti rugi akibat wanprestasi", k=5)


**5 Simpan Queries untuk Evaluasi**

In [13]:
import json
eval_queries = [
    {"query_id": "q1", "query": "Penggugat minta pembatalan jual beli"},
    {"query_id": "q2", "query": "Tergugat tidak membayar sesuai kontrak"},
    {"query_id": "q3", "query": "Gugatan wanprestasi"},
    {"query_id": "q4", "query": "Perbuatan melawan hukum"},
    {"query_id": "q5", "query": "Kontrak dibatalkan sepihak"}
]
os.makedirs("/content/data/eval", exist_ok=True)
with open("/content/data/eval/queries.json", "w") as f:
    json.dump(eval_queries, f, indent=2)


# **Tahap 4 – Solution Reuse**

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

def retrieve(query: str, k=5):
    q_vec = vectorizer.transform([query])
    sims = cosine_similarity(q_vec, X).flatten()
    top_k_idx = sims.argsort()[-k:][::-1]
    return df.iloc[top_k_idx]["case_id"].tolist()


In [16]:
from collections import Counter

case_solutions = {row["case_id"]: row["label_amar"] for _, row in df.iterrows()}

def predict_outcome(query: str, k=5):
    top_k = retrieve(query, k=k)
    solusi = [case_solutions[c] for c in top_k if c in case_solutions]
    return Counter(solusi).most_common(1)[0][0]

predict_outcome("Tergugat tidak memenuhi isi perjanjian")


'tidak diterima'

**2 Simpan Hasil Prediksi ke CSV**

In [17]:
results = []
for q in eval_queries:
    query = q["query"]
    pred = predict_outcome(query)
    top_ids = retrieve(query)
    results.append({
        "query_id": q["query_id"],
        "query": query,
        "predicted_solution": pred,
        "top_5_case_ids": ", ".join(top_ids)
    })

df_result = pd.DataFrame(results)
os.makedirs("/content/data/results", exist_ok=True)
df_result.to_csv("/content/data/results/predictions.csv", index=False)
df_result


Unnamed: 0,query_id,query,predicted_solution,top_5_case_ids
0,q1,Penggugat minta pembatalan jual beli,tidak diterima,"case_013, case_033, case_009, case_002, case_019"
1,q2,Tergugat tidak membayar sesuai kontrak,tidak diterima,"case_016, case_031, case_032, case_010, case_020"
2,q3,Gugatan wanprestasi,tidak diterima,"case_011, case_036, case_026, case_033, case_015"
3,q4,Perbuatan melawan hukum,tidak diterima,"case_024, case_035, case_001, case_026, case_023"
4,q5,Kontrak dibatalkan sepihak,tidak diterima,"case_023, case_033, case_002, case_021, case_009"


# **Tahap 5 – Evaluasi Model**

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

# Pastikan X dan vectorizer dibuat dari isi_teks
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["isi_teks"].fillna(""))

def retrieve(query, k=5):
    q_vec = vectorizer.transform([query])
    sims = cosine_similarity(q_vec, X).flatten()
    top_k_idx = sims.argsort()[-k:][::-1]
    return df.iloc[top_k_idx]["case_id"].tolist()


In [24]:
eval_queries = []

# Ambil 5 case_id dan ringkasan fakta dari dataframe
for i, row in df.head(5).iterrows():
    eval_queries.append({
        "query_id": f"q{i+1}",
        "query": row["ringkasan_fakta"][:100]  # pakai ringkasan fakta sebagai query
    })

# Buat ground truth berdasarkan case_id asli
ground_truth_map = {q["query_id"]: df.iloc[i]["case_id"] for i, q in enumerate(eval_queries)}


In [25]:
def eval_retrieval(queries, ground_truth_map, k=5):
    correct = 0
    hasil = []
    for q in queries:
        qid = q["query_id"]
        query = q["query"]
        gt = ground_truth_map.get(qid)
        retrieved = retrieve(query, k)
        match = gt in retrieved
        hasil.append({
            "query_id": qid,
            "query": query,
            "ground_truth": gt,
            "retrieved_top_k": retrieved,
            "match": match
        })
        if match:
            correct += 1
    acc = correct / len(queries)
    print(f"Accuracy@{k}: {acc:.2f}")
    return pd.DataFrame(hasil)


In [26]:
df_eval = eval_retrieval(eval_queries, ground_truth_map, k=5)
df_eval


Accuracy@5: 0.20


Unnamed: 0,query_id,query,ground_truth,retrieved_top_k,match
0,q1,"menimbang, bahwa dari surat-surat tersebut ter...",case_022,"[case_033, case_002, case_009, case_019, case_...",False
1,q2,"menimbang, bahwa dari surat-surat tersebut ter...",case_005,"[case_006, case_033, case_009, case_002, case_...",True
2,q3,"menimbang, bahwa dari surat-surat tersebut ter...",case_040,"[case_011, case_006, case_007, case_027, case_...",False
3,q4,"menimbang, bahwa dari surat-surat tersebut ter...",case_020,"[case_006, case_027, case_011, case_007, case_...",False
4,q5,"menimbang, bahwa dari surat-surat tersebut ter...",case_030,"[case_006, case_033, case_009, case_002, case_...",False
