In [12]:
import pandas as pd
import re
import string
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [13]:
# Load the query set. Columns are picked by POSITION, so this depends on the
# column order of query.csv: column 0 is the query id, column 2 the abstract
# text, column 3 the expert-labelled similar documents (read later under the
# name 'dokumen_yang_mirip').
df = pd.read_csv('query.csv')
# Query frame: id + abstract text.
query = pd.concat([df.iloc[:, 0], df.iloc[:, 2]], axis=1)
# Gold standard frame: id + expert judgement, row-aligned with `query`.
gold_standard = pd.concat([df.iloc[:, 0], df.iloc[:, 3]], axis=1)

print(query)


   doc_id_query                                            abstrak
0             1  Masalah stunting di Indonesia merupakan ancama...
1             2  Systemic Lupus Erytemathosus (SLE) merupakan p...
2             3  Emosi merupakan respons reflektif dari pengala...
3             4  Pemberian kredit adalah salah satu layanan uta...
4             5  Advertorial terselubung atau unlabeled adverto...
5             6  Diabetes melitus (DM) atau diabetes adalah pen...
6             7  Seleksi Nasional Berdasarkan Tes (SNBT) merupa...
7             8  Gangguan penglihatan memiliki prevalensi yang ...
8             9  Pengendalian persediaan merupakan faktor penti...
9            10  Deepfake adalah teknologi yang menggunakan kec...


In [31]:
# Load the document corpus and keep only columns 0 and 2 (selected by
# position). Later cells read column 0 under the name 'doc_id'.
# NOTE(review): column 2 is presumably the abstract text - confirm against corpus.csv.
corpus = pd.read_csv('corpus.csv')
corpus = pd.concat([corpus.iloc[:, 0], corpus.iloc[:, 2]], axis=1)
# Indonesian stop-word list from NLTK, as a set for O(1) membership tests.
stop_words = set(stopwords.words('indonesian'))

def preprocess_text(text):
    """Normalize raw text into a space-joined string of filtered tokens.

    Steps: lowercase, delete punctuation, tokenize with NLTK's
    ``word_tokenize``, then drop tokens that are in the module-level
    ``stop_words`` set or that have two characters or fewer.

    Args:
        text: Raw text. Non-string values (for example ``None`` or the float
            ``NaN`` that pandas produces for empty CSV cells) are treated as
            empty text instead of raising ``AttributeError``.

    Returns:
        str: The surviving tokens joined by single spaces, or ``''`` when
        ``text`` is not a string or no token survives filtering.
    """
    # Missing cells arrive as NaN/None; calling .lower() on them would raise.
    if not isinstance(text, str):
        return ''

    text = text.lower()

    # Punctuation is deleted, not replaced by a space, so tokens joined only by
    # punctuation are merged (e.g. "anak-anak" -> "anakanak").
    text = re.sub(r'[^\w\s]', '', text)
    # The regex keeps "_" because it is a word character; this pass removes it.
    text = text.translate(str.maketrans('', '', string.punctuation))

    tokens = word_tokenize(text)

    # Keep only non-stop-word tokens longer than two characters.
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]

    return ' '.join(tokens)

# Clean the text column (position 1) of both frames and store the result under
# 'abstrak'. For `query` this overwrites the raw text in place, so the raw
# abstracts are no longer available after this cell runs.
corpus['abstrak'] = corpus.iloc[:, 1].apply(preprocess_text)
query['abstrak'] = query.iloc[:, 1].apply(preprocess_text)

# Single shared Sastrawi stemmer instance, created once and reused for every row.
stemmer = StemmerFactory().create_stemmer()
def stem_text(text):
    """Return `text` stemmed by the shared Sastrawi `stemmer`."""
    return stemmer.stem(text)

# Sastrawi-stemmed variant of the cleaned text, stored in a separate column.
corpus['abstrak_sastrawi'] = corpus['abstrak'].apply(stem_text)
query['abstrak_sastrawi'] = query['abstrak'].apply(stem_text)

In [38]:
from nltk.stem import PorterStemmer

# Shared NLTK Porter stemmer instance, reused for every token.
porter_stemmer = PorterStemmer()

def porter_stem_text(text):
    """Stem each whitespace-separated token of `text` with the Porter stemmer.

    Args:
        text: Space-separated tokens (already cleaned).

    Returns:
        str: The stemmed tokens joined by single spaces, in the original order.
    """
    return ' '.join(porter_stemmer.stem(word) for word in text.split())

# Porter-stemmed variant of the cleaned text, stored next to the Sastrawi
# variant so both stemmers can be compared on the same input.
corpus['abstrak_porter'] = corpus['abstrak'].apply(porter_stem_text)
query['abstrak_porter'] = query['abstrak'].apply(porter_stem_text)

In [39]:
def rabin_karp_ngrams(tokens, n):
    """Collect the distinct n-grams of consecutive tokens.

    Each n-gram is stored as a tuple of tokens; no hash value is computed here.

    Args:
        tokens: Sliceable sequence of tokens.
        n: Number of tokens per n-gram.

    Returns:
        set: Unique n-gram tuples; empty when `tokens` has fewer than `n` items.
    """
    # A non-positive window count yields an empty range, hence an empty set.
    window_count = len(tokens) - n + 1
    return {tuple(tokens[start:start + n]) for start in range(window_count)}

# Token lists per document, keyed by the corpus 'doc_id' value: one dictionary
# for each stemmer variant. Built once and reused by every similarity function.
processed_documents_sastrawi = {}
processed_documents_porter = {}

for _, row in corpus.iterrows():
    doc_id = row['doc_id']
    # Stemmed text is space-joined, so split() recovers the token list.
    processed_documents_sastrawi[doc_id] = row['abstrak_sastrawi'].split()
    processed_documents_porter[doc_id] = row['abstrak_porter'].split()

def calculate_dice_similarity(query_text, processed_documents, n_gram_size=3, threshold=0.0):
    """Score a query against every document with the Dice coefficient on token n-grams.

    The n-gram sets come from `rabin_karp_ngrams`. For each document the score
    is ``2 * |Q & D| / (|Q| + |D|)``, or 0.0 when both sets are empty.

    Args:
        query_text: Query text as space-separated tokens.
        processed_documents: Mapping of document id to its list of tokens.
        n_gram_size: Number of tokens per n-gram (default: 3).
        threshold: A document is kept only when its score, after rounding to
            4 decimals, is strictly greater than this value (default: 0.0).

    Returns:
        list: ``(int(doc_id), score)`` tuples sorted by score, highest first.
    """
    query_grams = rabin_karp_ngrams(query_text.split(), n_gram_size)
    scored = []

    for doc_id, doc_words in processed_documents.items():
        doc_grams = rabin_karp_ngrams(doc_words, n_gram_size)
        total_grams = len(query_grams) + len(doc_grams)

        if total_grams > 0:
            score = (2 * len(query_grams & doc_grams)) / total_grams
        else:
            score = 0.0
        # Rounding happens before the threshold comparison.
        score = round(score, 4)

        if score > threshold:
            scored.append((int(doc_id), score))

    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored

# ========== RABIN-KARP with DICE COEFFICIENT ==========
# Dice Coefficient threshold: 0.001 (0.1% n-gram overlap). Only documents whose
# score is strictly greater than this value are kept.
DICE_THRESHOLD = 0.001

# Score every query against the Sastrawi-stemmed corpus.
query_similarities_sastrawi = []
for _, query_row in query.iterrows():
    query_id = query_row['doc_id_query']
    query_text = query_row['abstrak_sastrawi']
    similarities = calculate_dice_similarity(query_text, processed_documents_sastrawi, threshold=DICE_THRESHOLD)
    query_similarities_sastrawi.append({
        'query_id': query_id,
        'similarities': similarities
    })

# Same scoring against the Porter-stemmed corpus.
query_similarities_porter = []
for _, query_row in query.iterrows():
    query_id = query_row['doc_id_query']
    query_text = query_row['abstrak_porter']
    similarities = calculate_dice_similarity(query_text, processed_documents_porter, threshold=DICE_THRESHOLD)
    query_similarities_porter.append({
        'query_id': query_id,
        'similarities': similarities
    })

# One summary row per query: matched document ids for both stemmers,
# formatted as "(id), (id), ..." in descending score order.
combined_results = []

for sastrawi_result, porter_result in zip(query_similarities_sastrawi, query_similarities_porter):
    query_id = sastrawi_result['query_id']  # or porter_result['query_id']; both lists follow the same query order
    porter_sims = ', '.join([f"({doc_id})" for doc_id, score in porter_result['similarities']])
    sastrawi_sims = ', '.join([f"({doc_id})" for doc_id, score in sastrawi_result['similarities']])
    
    combined_results.append({
        'id_kueri': query_id,
        'rk_porter': porter_sims,
        'rk_sastrawi': sastrawi_sims
    })

# Long-format records (one row per query/document pair) for the Porter variant.
rk_porter_results = []
for result in query_similarities_porter:
    query_id = result['query_id']
    for doc_id, similarity in result['similarities']:
        rk_porter_results.append({
            'query_id': query_id,
            'doc_id': doc_id,
            'similarity': similarity
        })

# Long-format records for the Sastrawi variant.
rk_sastrawi_results = []
for result in query_similarities_sastrawi:
    query_id = result['query_id']
    for doc_id, similarity in result['similarities']:
        rk_sastrawi_results.append({
            'query_id': query_id,
            'doc_id': doc_id,
            'similarity': similarity
        })

rk_porter_df = pd.DataFrame(rk_porter_results)
rk_sastrawi_df = pd.DataFrame(rk_sastrawi_results)
similarity_table = pd.DataFrame(combined_results)
# Attach the expert labels positionally; this relies on `gold_standard` having
# the same row order and length as `query`.
similarity_table['expert'] = gold_standard['dokumen_yang_mirip'].tolist()

# Write the summary and both detailed tables as separate sheets of one workbook.
with pd.ExcelWriter('RK_similarity_results.xlsx', engine='openpyxl') as writer:
    similarity_table.to_excel(writer, sheet_name='Results RK', index=False)
    rk_porter_df.to_excel(writer, sheet_name='RK Porter Detailed', index=False)
    rk_sastrawi_df.to_excel(writer, sheet_name='RK Sastrawi Detailed', index=False)

In [40]:
# Import sklearn untuk Cosine Similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [41]:
def calculate_cosine_similarity(query_text, processed_documents, threshold=0.0):
    """Score a query against every document with TF-IDF cosine similarity.

    A new `TfidfVectorizer` is fitted on each call over the query plus all
    documents, so the query text contributes to the vocabulary and IDF weights.

    Args:
        query_text: Query text as a single string.
        processed_documents: Mapping of document id to its list of tokens.
        threshold: A document is kept only when its unrounded similarity is
            strictly greater than this value (default: 0.0).

    Returns:
        list: ``(int(doc_id), similarity)`` tuples with the similarity rounded
        to 4 decimals, sorted by similarity, highest first.
    """
    # Ids and texts are taken in the same dictionary order so they stay aligned.
    doc_ids = list(processed_documents.keys())
    documents = [' '.join(words) for words in processed_documents.values()]

    # Row 0 of the matrix is the query; rows 1.. are the documents.
    tfidf_matrix = TfidfVectorizer().fit_transform([query_text] + documents)
    scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

    # The threshold is checked on the raw score; rounding is applied afterwards.
    results = [
        (int(doc_ids[position]), round(score, 4))
        for position, score in enumerate(scores)
        if score > threshold
    ]

    results.sort(key=lambda pair: pair[1], reverse=True)
    return results

In [42]:
# ========== COSINE SIMILARITY with TF-IDF ==========
# Cosine Similarity threshold: 0.05 (5% cosine similarity). Only documents whose
# score is strictly greater than this value are kept.
COSINE_THRESHOLD = 0.05

# Score every query against the Sastrawi-stemmed corpus.
query_similarities_sastrawi_cosine = []
for _, query_row in query.iterrows():
    query_id = query_row['doc_id_query']
    query_text = query_row['abstrak_sastrawi']
    similarities = calculate_cosine_similarity(query_text, processed_documents_sastrawi, threshold=COSINE_THRESHOLD)
    query_similarities_sastrawi_cosine.append({
        'query_id': query_id,
        'similarities': similarities
    })

# Same scoring against the Porter-stemmed corpus.
query_similarities_porter_cosine = []
for _, query_row in query.iterrows():
    query_id = query_row['doc_id_query']
    query_text = query_row['abstrak_porter']
    similarities = calculate_cosine_similarity(query_text, processed_documents_porter, threshold=COSINE_THRESHOLD)
    query_similarities_porter_cosine.append({
        'query_id': query_id,
        'similarities': similarities
    })

# One summary row per query: matched document ids for both stemmers,
# formatted as "(id), (id), ..." in descending score order.
combined_results_cosine = []

for sastrawi_result, porter_result in zip(query_similarities_sastrawi_cosine, query_similarities_porter_cosine):
    query_id = sastrawi_result['query_id']
    porter_sims = ', '.join([f"({doc_id})" for doc_id, score in porter_result['similarities']])
    sastrawi_sims = ', '.join([f"({doc_id})" for doc_id, score in sastrawi_result['similarities']])
    
    combined_results_cosine.append({
        'id_kueri': query_id,
        'cosine_porter': porter_sims,
        'cosine_sastrawi': sastrawi_sims
    })

# Long-format records (one row per query/document pair) for the Porter variant.
# Despite the "rk_" prefix, these hold cosine scores, not Dice scores.
rk_porter_results_cosine = []
for result in query_similarities_porter_cosine:
    query_id = result['query_id']
    for doc_id, similarity in result['similarities']:
        rk_porter_results_cosine.append({
            'query_id': query_id,
            'doc_id': doc_id,
            'similarity': similarity
        })

# Long-format records for the Sastrawi variant (cosine scores as well).
rk_sastrawi_results_cosine = []
for result in query_similarities_sastrawi_cosine:
    query_id = result['query_id']
    for doc_id, similarity in result['similarities']:
        rk_sastrawi_results_cosine.append({
            'query_id': query_id,
            'doc_id': doc_id,
            'similarity': similarity
        })

cosine_porter_df = pd.DataFrame(rk_porter_results_cosine)
cosine_sastrawi_df = pd.DataFrame(rk_sastrawi_results_cosine)
similarity_table_cosine = pd.DataFrame(combined_results_cosine)
# Attach the expert labels positionally; this relies on `gold_standard` having
# the same row order and length as `query`.
similarity_table_cosine['expert'] = gold_standard['dokumen_yang_mirip'].tolist()

# Write the summary and both detailed tables as separate sheets of one workbook.
with pd.ExcelWriter('Cosine_similarity_results.xlsx', engine='openpyxl') as writer:
    similarity_table_cosine.to_excel(writer, sheet_name='Results Cosine', index=False)
    cosine_porter_df.to_excel(writer, sheet_name='Cosine Porter Detailed', index=False)
    cosine_sastrawi_df.to_excel(writer, sheet_name='Cosine Sastrawi Detailed', index=False)


In [43]:
# ========== METHOD COMPARISON ANALYSIS ==========
# Text report only: prints the thresholds, result counts, score distributions
# and a top-5 example. Nothing is computed here that later code depends on,
# except the notebook globals `dice_values`, `cosine_values` and `query_id`.

print("=== KONFIGURASI THRESHOLD ===")
print(f"Threshold Rabin-Karp (Dice Coefficient): {DICE_THRESHOLD} ({DICE_THRESHOLD*100}%)")
print(f"Threshold Cosine Similarity: {COSINE_THRESHOLD} ({COSINE_THRESHOLD*100}%)")
print("Note: Threshold berbeda karena karakteristik metrik yang berbeda")
print()

print("=== RINGKASAN HASIL ===")
print(f"Total query: {len(query)}")
print(f"Total dokumen dalam corpus: {len(corpus)}")
print()

# Rabin-Karp (Dice Coefficient) results: number of query/document pairs kept.
print("--- RABIN-KARP dengan DICE COEFFICIENT ---")
print(f"RK Sastrawi - Total hasil: {len(rk_sastrawi_results)}")
print(f"RK Porter - Total hasil: {len(rk_porter_results)}")

# Distribution of Dice scores; guarded because min()/max() fail on an empty list.
if len(rk_sastrawi_results) > 0:
    dice_values = [result['similarity'] for result in rk_sastrawi_results]
    print(f"  Nilai Dice Sastrawi - Min: {min(dice_values):.4f}, Max: {max(dice_values):.4f}, Avg: {np.mean(dice_values):.4f}")

if len(rk_porter_results) > 0:
    dice_values = [result['similarity'] for result in rk_porter_results]
    print(f"  Nilai Dice Porter - Min: {min(dice_values):.4f}, Max: {max(dice_values):.4f}, Avg: {np.mean(dice_values):.4f}")
print()

# Cosine Similarity results: number of query/document pairs kept.
print("--- COSINE SIMILARITY dengan TF-IDF ---")
print(f"Cosine Sastrawi - Total hasil: {len(rk_sastrawi_results_cosine)}")
print(f"Cosine Porter - Total hasil: {len(rk_porter_results_cosine)}")

# Distribution of cosine scores; same empty-list guard as above.
if len(rk_sastrawi_results_cosine) > 0:
    cosine_values = [result['similarity'] for result in rk_sastrawi_results_cosine]
    print(f"  Nilai Cosine Sastrawi - Min: {min(cosine_values):.4f}, Max: {max(cosine_values):.4f}, Avg: {np.mean(cosine_values):.4f}")

if len(rk_porter_results_cosine) > 0:
    cosine_values = [result['similarity'] for result in rk_porter_results_cosine]
    print(f"  Nilai Cosine Porter - Min: {min(cosine_values):.4f}, Max: {max(cosine_values):.4f}, Avg: {np.mean(cosine_values):.4f}")
print()

# Example comparison for the first query: top 5 documents per method/stemmer.
if len(query_similarities_sastrawi) > 0 and len(query_similarities_sastrawi_cosine) > 0:
    query_id = query_similarities_sastrawi[0]['query_id']
    print(f"=== CONTOH PERBANDINGAN UNTUK QUERY ID: {query_id} ===")
    
    print("RK Sastrawi (Dice Coefficient):")
    for doc_id, score in query_similarities_sastrawi[0]['similarities'][:5]:
        print(f"  Doc {doc_id}: {score}")
    
    print("\nCosine Sastrawi (TF-IDF):")
    for doc_id, score in query_similarities_sastrawi_cosine[0]['similarities'][:5]:
        print(f"  Doc {doc_id}: {score}")
    
    print("\nRK Porter (Dice Coefficient):")
    for doc_id, score in query_similarities_porter[0]['similarities'][:5]:
        print(f"  Doc {doc_id}: {score}")
    
    print("\nCosine Porter (TF-IDF):")
    for doc_id, score in query_similarities_porter_cosine[0]['similarities'][:5]:
        print(f"  Doc {doc_id}: {score}")

# NOTE(review): the printed Dice range (0.1-0.3) does not match the configured
# DICE_THRESHOLD of 0.001 - confirm which one is intended.
# NOTE(review): TF-IDF cosine measures weighted term overlap; the "semantic"
# wording in the printed text may overstate it - confirm.
print("\n=== REKOMENDASI THRESHOLD ===")
print("1. Dice Coefficient (0.1-0.3): Konservatif, fokus pada kesamaan n-gram yang jelas")
print("2. Cosine Similarity (0.05-0.2): Lebih liberal, menangkap kesamaan semantik")
print("3. Sesuaikan threshold berdasarkan analisis distribusi nilai di atas")

# Names of the Excel workbooks written by the two export steps.
print("\n=== FILE OUTPUT YANG DIHASILKAN ===")
print("1. RK_similarity_results.xlsx - Hasil Rabin-Karp dengan Dice Coefficient")
print("2. Cosine_similarity_results.xlsx - Hasil Cosine Similarity dengan TF-IDF")

=== KONFIGURASI THRESHOLD ===
Threshold Rabin-Karp (Dice Coefficient): 0.001 (0.1%)
Threshold Cosine Similarity: 0.05 (5.0%)
Note: Threshold berbeda karena karakteristik metrik yang berbeda

=== RINGKASAN HASIL ===
Total query: 10
Total dokumen dalam corpus: 90

--- RABIN-KARP dengan DICE COEFFICIENT ---
RK Sastrawi - Total hasil: 158
RK Porter - Total hasil: 153
  Nilai Dice Sastrawi - Min: 0.0057, Max: 0.0535, Avg: 0.0135
  Nilai Dice Porter - Min: 0.0057, Max: 0.0535, Avg: 0.0139

--- COSINE SIMILARITY dengan TF-IDF ---
Cosine Sastrawi - Total hasil: 409
Cosine Porter - Total hasil: 338
  Nilai Cosine Sastrawi - Min: 0.0500, Max: 0.4306, Avg: 0.0932
  Nilai Cosine Porter - Min: 0.0502, Max: 0.3993, Avg: 0.0921

=== CONTOH PERBANDINGAN UNTUK QUERY ID: 1 ===
RK Sastrawi (Dice Coefficient):
  Doc 22: 0.0358
  Doc 13: 0.0197
  Doc 4: 0.0139
  Doc 38: 0.0081
  Doc 90: 0.0078

Cosine Sastrawi (TF-IDF):
  Doc 41: 0.3767
  Doc 22: 0.1693
  Doc 18: 0.1278
  Doc 12: 0.1255
  Doc 13: 0.1231

R