In [None]:
# ============================================================
# Course   : Information Retrieval Systems (Sistem Temu Kembali Informasi)
# Topic    : Boolean Retrieval Model
# Session  : Week 3
# Lecturer : STKI Team
# ============================================================

# Banner printed once at the top of the notebook output.
banner = "🔍 Boolean Retrieval Model - Sistem Temu Kembali Informasi"
print(banner)

🔍 Boolean Retrieval Model - Sistem Temu Kembali Informasi


In [None]:
import pandas as pd
import re

In [3]:
# Sample corpus from the week-3 lecture material (Boolean model).
# Keys are document names, values are the raw document text.
documents = dict(
    Doc1="New home sales top forecasts",
    Doc2="Home sales rise in july",
    Doc3="Increase in home sales in july",
    Doc4="July new home sales rise",
)

# Echo the corpus, one "name: text" line per document.
for doc_name in documents:
    print(f"{doc_name}: {documents[doc_name]}")


Doc1: New home sales top forecasts
Doc2: Home sales rise in july
Doc3: Increase in home sales in july
Doc4: July new home sales rise


In [4]:
def preprocess(text):
    """Normalize raw text and split it into tokens.

    The text is lowercased, every character that is not a lowercase ASCII
    letter or whitespace is deleted (not replaced by a space), and the
    remainder is split on runs of whitespace.

    Args:
        text: the raw document or query string.

    Returns:
        A list of lowercase tokens; empty when nothing survives the cleanup.
    """
    return re.sub(r'[^a-z\s]', '', text.lower()).split()

# Tokenize every document; the result keeps the same keys and order as `documents`.
tokenized_docs = dict(zip(documents, map(preprocess, documents.values())))

# Show the token list produced for each document.
print("=== TOKEN HASIL PREPROCESSING ===")
for doc in tokenized_docs:
    tokens = tokenized_docs[doc]
    print(f"{doc}: {tokens}")


=== TOKEN HASIL PREPROCESSING ===
Doc1: ['new', 'home', 'sales', 'top', 'forecasts']
Doc2: ['home', 'sales', 'rise', 'in', 'july']
Doc3: ['increase', 'in', 'home', 'sales', 'in', 'july']
Doc4: ['july', 'new', 'home', 'sales', 'rise']


In [5]:
# Build the vocabulary: every distinct token in the corpus, sorted alphabetically.
unique_terms = set()
for words in tokenized_docs.values():
    unique_terms.update(words)
vocab = sorted(unique_terms)

# Binary term-document incidence matrix: one row per term, one column per
# document, initialised to 0 everywhere.
matrix = pd.DataFrame(0, index=vocab, columns=documents.keys())

# Mark each (term, document) pair where the term occurs. Every token is in
# `vocab` by construction, so the row being assigned always exists.
for doc, tokens in tokenized_docs.items():
    for term in set(tokens):
        matrix.loc[term, doc] = 1

print("=== INCIDENCE MATRIX ===")
display(matrix)


=== INCIDENCE MATRIX ===


Unnamed: 0,Doc1,Doc2,Doc3,Doc4
forecasts,1,0,0,0
home,1,1,1,1
in,0,1,1,0
increase,0,0,1,0
july,0,1,1,1
new,1,0,0,1
rise,0,1,0,1
sales,1,1,1,1
top,1,0,0,0


In [6]:
def get_vector(term):
    """Return the binary incidence vector of a term.

    The lookup is case-insensitive: the term is lowercased before it is
    matched against the row labels of the module-level `matrix`.

    Args:
        term: the word to look up.

    Returns:
        The term's row of `matrix` as a list, or a list of zeros with one
        entry per document in `documents` when the term is not in the matrix.
    """
    key = term.lower()
    if key not in matrix.index:
        # Unknown term: it occurs in no document.
        return [0] * len(documents)
    return matrix.loc[key].tolist()

def boolean_not(vector):
    """Return the element-wise complement (1 - x) of a binary vector as a new list."""
    flipped = []
    for bit in vector:
        flipped.append(1 - bit)
    return flipped

def boolean_and(v1, v2):
    """Return the element-wise AND (`&`) of two binary vectors as a new list.

    Elements are paired with `zip`, so the result is as long as the shorter input.
    """
    conjunction = []
    for left, right in zip(v1, v2):
        conjunction.append(left & right)
    return conjunction

def boolean_or(v1, v2):
    """Return the element-wise OR (`|`) of two binary vectors as a new list.

    Elements are paired with `zip`, so the result is as long as the shorter input.
    """
    disjunction = []
    for left, right in zip(v1, v2):
        disjunction.append(left | right)
    return disjunction


In [11]:
def _apply_binary_operator(token_vectors, operator, combine):
    """Collapse every `operand <operator> operand` triple in place, left to right.

    Args:
        token_vectors: list of (name, vector) pairs; operators carry None
            as their vector, operands carry a list.
        operator: name of the operator to reduce ("AND" or "OR").
        combine: two-argument function merging the left and right vectors.

    Raises:
        ValueError: if the operator is missing an operand on either side.
    """
    i = 0
    while i < len(token_vectors):
        name, vector = token_vectors[i]
        if name != operator or vector is not None:
            i += 1
            continue
        # Bounds are checked explicitly: at i == 0 the index i - 1 would
        # silently wrap around to the last element instead of failing.
        has_left = i > 0 and token_vectors[i - 1][1] is not None
        has_right = i + 1 < len(token_vectors) and token_vectors[i + 1][1] is not None
        if not (has_left and has_right):
            raise ValueError(
                f"Kesalahan sintaks: operator {operator} membutuhkan term di kedua sisi."
            )
        combined = combine(token_vectors[i - 1][1], token_vectors[i + 1][1])
        # The triple shrinks to a single operand at i - 1, so index i now
        # points at the next unread token; do not advance.
        token_vectors[i - 1:i + 2] = [("TEMP", combined)]


def evaluate_query(query):
    """
    Evaluate a simple Boolean query (no parentheses).

    Operator precedence is NOT > AND > OR. Operators are matched
    case-insensitively; every other whitespace-separated token is treated
    as a term and resolved with `get_vector`.

    Args:
        query: the query string, e.g. "Home AND Sales AND NOT July".

    Returns:
        The binary result vector (list of 0/1), one entry per document.

    Raises:
        ValueError: if the query is empty, an operator is missing an
            operand, or two operands appear without an operator between them.
    """
    operators = {"AND", "OR", "NOT"}

    # STEP 1: pair every token with its vector; operators get None.
    # str.split() without arguments never yields empty tokens.
    token_vectors = [
        (token, None if token in operators else get_vector(token))
        for token in query.upper().split()
    ]

    # STEP 2: resolve NOT first. Walking right to left means the innermost
    # NOT of a chain ("NOT NOT x") is reduced before the outer one sees it.
    for i in range(len(token_vectors) - 1, -1, -1):
        name, vector = token_vectors[i]
        if name != "NOT" or vector is not None:
            continue
        if i + 1 < len(token_vectors) and token_vectors[i + 1][1] is not None:
            negated = boolean_not(token_vectors[i + 1][1])
            token_vectors[i:i + 2] = [("TEMP", negated)]
        else:
            raise ValueError("Kesalahan sintaks: operator NOT tanpa term sesudahnya.")

    # STEP 3 and 4: AND binds tighter than OR, so reduce AND first.
    _apply_binary_operator(token_vectors, "AND", boolean_and)
    _apply_binary_operator(token_vectors, "OR", boolean_or)

    # STEP 5: a well-formed query reduces to exactly one operand.
    if len(token_vectors) != 1:
        raise ValueError("Query tidak valid atau urutan operator salah.")
    return token_vectors[0][1]


In [15]:
# Example queries covering AND, OR and NOT.
queries = [
    "Home AND Sales AND NOT July",
    "Home AND July AND NOT Sales",
    "Sales OR NOT July"
]

# The result vector is positionally aligned with the document names.
doc_names = list(documents.keys())
separator = "-" * 50

for query_text in queries:
    hits = evaluate_query(query_text)
    relevant = [name for name, flag in zip(doc_names, hits) if flag == 1]
    print(f"Query: {query_text}")
    print(f"Vector Hasil: {hits}")
    print(f"Dokumen relevan: {relevant}")
    print(separator)


Query: Home AND Sales AND NOT July
Vector Hasil: [1, 0, 0, 0]
Dokumen relevan: ['Doc1']
--------------------------------------------------
Query: Home AND July AND NOT Sales
Vector Hasil: [0, 0, 0, 0]
Dokumen relevan: []
--------------------------------------------------
Query: Sales OR NOT July
Vector Hasil: [1, 1, 1, 1]
Dokumen relevan: ['Doc1', 'Doc2', 'Doc3', 'Doc4']
--------------------------------------------------


In [16]:
# Closing summary of the lesson, printed verbatim.
conclusion = """
Kesimpulan:
Model Boolean Retrieval menggunakan logika AND, OR, dan NOT untuk menentukan relevansi dokumen.
- Hasilnya bersifat biner (1 = relevan, 0 = tidak relevan)
- Incidence Matrix menunjukkan kemunculan kata pada dokumen
- Query Boolean memungkinkan kombinasi logika pencarian

Contoh:
Query: Home AND Sales AND NOT July
Artinya mencari dokumen yang mengandung 'home' dan 'sales' tetapi tidak mengandung 'july'.
"""
print(conclusion)



Kesimpulan:
Model Boolean Retrieval menggunakan logika AND, OR, dan NOT untuk menentukan relevansi dokumen.
- Hasilnya bersifat biner (1 = relevan, 0 = tidak relevan)
- Incidence Matrix menunjukkan kemunculan kata pada dokumen
- Query Boolean memungkinkan kombinasi logika pencarian

Contoh:
Query: Home AND Sales AND NOT July
Artinya mencari dokumen yang mengandung 'home' dan 'sales' tetapi tidak mengandung 'july'.



In [17]:
# Write the incidence matrix to a UTF-8 CSV file, keeping the row index.
csv_path = "incidence_matrix.csv"
matrix.to_csv(csv_path, encoding="utf-8", index=True)
print("Incidence Matrix disimpan ke file 'incidence_matrix.csv'")

Incidence Matrix disimpan ke file 'incidence_matrix.csv'
