In [None]:
import pandas as pd
import re
import string
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Load the query set. Columns are picked by position: 0 and 2 form the
# query frame, 0 and 3 the gold-standard frame.
df = pd.read_csv('dataset.csv')
query = df.iloc[:, [0, 2]].copy()
gold_standard = df.iloc[:, [0, 3]].copy()

print(query)

# Load the corpus and keep only its columns 0 and 2 (by position).
corpus = pd.read_csv('corpus.csv').iloc[:, [0, 2]].copy()

# NLTK's Indonesian stop-word list, stored as a set for O(1) lookups.
stop_words = set(stopwords.words('indonesian'))

def preprocess_text(text):
    """Lowercase, strip punctuation, tokenize and filter one raw text.

    Args:
        text: Raw text of a single document. Missing values (``None`` or
            NaN, which is what pandas produces for empty CSV cells) are
            treated as empty text instead of raising ``AttributeError``.
            Other non-string values are converted with ``str()``.

    Returns:
        str: The remaining tokens joined by single spaces. Tokens that are
        in the module-level ``stop_words`` set, or that have fewer than
        three characters, are dropped.
    """
    if not isinstance(text, str):
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    text = text.lower()

    # Remove everything that is neither a word character nor whitespace,
    # and underscores as well: "_" is the only string.punctuation character
    # that \w matches, so this single pass is equivalent to the former
    # regex + str.translate pair.
    text = re.sub(r'[^\w\s]|_', '', text)

    tokens = word_tokenize(text)

    # Keep informative tokens only: not a stop word and longer than 2 chars.
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]

    return ' '.join(tokens)

# Clean the text held in the second column (position 1) of each frame and
# store the result under 'abstrak'; the corpus is processed first, then the
# queries.
for frame in (corpus, query):
    frame['abstrak'] = frame.iloc[:, 1].apply(preprocess_text)

# One shared Sastrawi stemmer instance, created once and reused by stem_text.
stemmer = StemmerFactory().create_stemmer()


def stem_text(text):
    """Return ``text`` after passing it through the shared Sastrawi stemmer."""
    stemmed = stemmer.stem(text)
    return stemmed

# Add a Sastrawi-stemmed copy of the cleaned text to both frames
# (corpus first, then queries).
for frame in (corpus, query):
    frame['abstrak_sastrawi'] = frame['abstrak'].apply(stem_text)


   doc_id_query                                            abstrak
0             1  Masalah stunting di Indonesia merupakan ancama...
1             2  Systemic Lupus Erytemathosus (SLE) merupakan p...
2             3  Emosi merupakan respons reflektif dari pengala...
3             4  Pemberian kredit adalah salah satu layanan uta...
4             5  Advertorial terselubung atau unlabeled adverto...
5             6  Diabetes melitus (DM) atau diabetes adalah pen...
6             7  Seleksi Nasional Berdasarkan Tes (SNBT) merupa...
7             8  Gangguan penglihatan memiliki prevalensi yang ...
8             9  Pengendalian persediaan merupakan faktor penti...
9            10  Deepfake adalah teknologi yang menggunakan kec...


In [4]:
from nltk.stem import PorterStemmer

# One shared NLTK Porter stemmer instance, reused by porter_stem_text.
porter_stemmer = PorterStemmer()


def porter_stem_text(text):
    """Stem every whitespace-separated token of ``text`` with the Porter stemmer.

    Returns the stemmed tokens joined by single spaces.
    """
    return ' '.join(porter_stemmer.stem(word) for word in text.split())

# Add a Porter-stemmed copy of the cleaned text to both frames
# (corpus first, then queries).
for frame in (corpus, query):
    frame['abstrak_porter'] = frame['abstrak'].apply(porter_stem_text)

In [None]:
def rabin_karp_ngrams(tokens, n):
    """Collect the distinct n-grams of a token sequence.

    Args:
        tokens: Sliceable sequence of tokens (e.g. a list of words).
        n: Number of consecutive tokens per n-gram.

    Returns:
        set: Every window of ``n`` consecutive tokens as a tuple. The set is
        empty when ``tokens`` holds fewer than ``n`` items.
    """
    window_count = len(tokens) - n + 1
    if window_count <= 0:
        return set()

    return {tuple(tokens[start:start + n]) for start in range(window_count)}

# Token lists of every corpus document, keyed by its 'doc_id', one mapping
# per stemmer variant.
processed_documents_sastrawi = {}
processed_documents_porter = {}

for _, row in corpus.iterrows():
    doc_id = row['doc_id']
    processed_documents_sastrawi[doc_id] = row['abstrak_sastrawi'].split()
    processed_documents_porter[doc_id] = row['abstrak_porter'].split()


# Defined at module level: it used to live inside the loop above, which
# re-created it for every corpus row and left it undefined for an empty corpus.
def calculate_query_similarity(query_text, processed_documents, n_gram_size=3):
    """Rank documents by the Dice coefficient of their word n-gram sets.

    Args:
        query_text: Preprocessed query text; it is split on whitespace.
        processed_documents: Mapping of document id to its list of tokens.
        n_gram_size: Number of tokens per n-gram (default 3).

    Returns:
        list[tuple[int, float]]: ``(doc_id, dice)`` pairs for every document
        whose rounded score is greater than zero, sorted by score in
        descending order. Scores are rounded to 4 decimal places and the
        document id is converted with ``int()``.
    """
    query_ngrams = rabin_karp_ngrams(query_text.split(), n_gram_size)
    similarities = []

    for doc_id, doc_words in processed_documents.items():
        doc_ngrams = rabin_karp_ngrams(doc_words, n_gram_size)
        total = len(query_ngrams) + len(doc_ngrams)
        if total == 0:
            # Both n-gram sets are empty: the score is 0 and would be filtered out.
            continue

        common = query_ngrams & doc_ngrams
        dice_coefficient = round((2 * len(common)) / total, 4)  # Round to 4 decimal places

        if dice_coefficient > 0:
            similarities.append((int(doc_id), dice_coefficient))

    return sorted(similarities, key=lambda x: x[1], reverse=True)

# Score every query against the Sastrawi-stemmed corpus; one record per
# query, in the row order of the query frame.
query_similarities_sastrawi = [
    {
        'query_id': row['doc_id_query'],
        'similarities': calculate_query_similarity(
            row['abstrak_sastrawi'], processed_documents_sastrawi
        ),
    }
    for _, row in query.iterrows()
]

# Same scoring on the Porter-stemmed texts.
query_similarities_porter = [
    {
        'query_id': row['doc_id_query'],
        'similarities': calculate_query_similarity(
            row['abstrak_porter'], processed_documents_porter
        ),
    }
    for _, row in query.iterrows()
]

# One summary row per query: the ranked document ids of both stemmer
# variants rendered as "(id), (id), ...". The two result lists are paired by
# position, so either entry's 'query_id' could be used (they are the same).
combined_results = [
    {
        'id_kueri': sastrawi_entry['query_id'],
        'rk_porter': ', '.join(
            f"({doc_id})" for doc_id, _score in porter_entry['similarities']
        ),
        'rk_sastrawi': ', '.join(
            f"({doc_id})" for doc_id, _score in sastrawi_entry['similarities']
        ),
    }
    for sastrawi_entry, porter_entry in zip(
        query_similarities_sastrawi, query_similarities_porter
    )
]

# Flatten the per-query rankings into one record per (query, document) pair.
rk_porter_results = [
    {'query_id': entry['query_id'], 'doc_id': doc_id, 'similarity': similarity}
    for entry in query_similarities_porter
    for doc_id, similarity in entry['similarities']
]

rk_sastrawi_results = [
    {'query_id': entry['query_id'], 'doc_id': doc_id, 'similarity': similarity}
    for entry in query_similarities_sastrawi
    for doc_id, similarity in entry['similarities']
]

# Summary table, with the expert judgement attached by row position.
similarity_table = pd.DataFrame(combined_results)
similarity_table['expert'] = gold_standard['dokumen_yang_mirip'].tolist()

# Long-format tables with one row per (query, document) score.
rk_porter_df = pd.DataFrame(rk_porter_results)
rk_sastrawi_df = pd.DataFrame(rk_sastrawi_results)

# Write all three tables to a single workbook, one sheet each.
with pd.ExcelWriter('similarity_results.xlsx', engine='openpyxl') as writer:
    for sheet_name, frame in (
        ('Results RK', similarity_table),
        ('RK Porter Detailed', rk_porter_df),
        ('RK Sastrawi Detailed', rk_sastrawi_df),
    ):
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

In [62]:
def calculate_query_cosine_similarity(query_text, processed_documents, n_gram_size=3):
    """Rank documents by cosine similarity of their word n-gram sets.

    Each text is treated as a binary vector over its distinct n-grams, so the
    cosine reduces to ``|A & B| / sqrt(|A| * |B|)`` (Otsuka-Ochiai
    coefficient).
    NOTE(review): this cell previously called the Dice-based
    ``calculate_query_similarity``, so its "cosine" output was identical to
    the Dice results. The set-based cosine is assumed to be the intended
    measure - confirm against the experiment design.

    Args:
        query_text: Preprocessed query text; it is split on whitespace.
        processed_documents: Mapping of document id to its list of tokens.
        n_gram_size: Number of tokens per n-gram (default 3).

    Returns:
        list[tuple[int, float]]: ``(doc_id, cosine)`` pairs for every
        document whose rounded score is greater than zero, sorted by score
        in descending order. Scores are rounded to 4 decimal places.
    """
    query_ngrams = rabin_karp_ngrams(query_text.split(), n_gram_size)
    scored = []

    for doc_id, doc_words in processed_documents.items():
        doc_ngrams = rabin_karp_ngrams(doc_words, n_gram_size)
        if not query_ngrams or not doc_ngrams:
            # An empty n-gram set has zero norm; the similarity is 0.
            continue

        shared = len(query_ngrams & doc_ngrams)
        cosine = round(shared / (len(query_ngrams) * len(doc_ngrams)) ** 0.5, 4)

        if cosine > 0:
            scored.append((int(doc_id), cosine))

    return sorted(scored, key=lambda pair: pair[1], reverse=True)


# Cosine ranking of every query against the Sastrawi-stemmed corpus.
query_similarities_sastrawi_cosine = []
for _, query_row in query.iterrows():
    query_id = query_row['doc_id_query']
    query_text = query_row['abstrak_sastrawi']
    similarities = calculate_query_cosine_similarity(query_text, processed_documents_sastrawi)
    query_similarities_sastrawi_cosine.append({
        'query_id': query_id,
        'similarities': similarities
    })

# Cosine ranking of every query against the Porter-stemmed corpus.
query_similarities_porter_cosine = []
for _, query_row in query.iterrows():
    query_id = query_row['doc_id_query']
    query_text = query_row['abstrak_porter']
    similarities = calculate_query_cosine_similarity(query_text, processed_documents_porter)
    query_similarities_porter_cosine.append({
        'query_id': query_id,
        'similarities': similarities
    })

# One summary row per query: the ranked document ids of both stemmer
# variants rendered as "(id), (id), ...". The two lists are paired by position.
combined_results_cosine = [
    {
        'id_kueri': sastrawi_entry['query_id'],
        'rk_porter_cosine': ', '.join(
            f"({doc_id})" for doc_id, _score in porter_entry['similarities']
        ),
        'rk_sastrawi_cosine': ', '.join(
            f"({doc_id})" for doc_id, _score in sastrawi_entry['similarities']
        ),
    }
    for sastrawi_entry, porter_entry in zip(
        query_similarities_sastrawi_cosine, query_similarities_porter_cosine
    )
]

# Flatten the per-query rankings into one record per (query, document) pair.
rk_porter_results_cosine = [
    {'query_id': entry['query_id'], 'doc_id': doc_id, 'similarity': similarity}
    for entry in query_similarities_porter_cosine
    for doc_id, similarity in entry['similarities']
]

rk_sastrawi_results_cosine = [
    {'query_id': entry['query_id'], 'doc_id': doc_id, 'similarity': similarity}
    for entry in query_similarities_sastrawi_cosine
    for doc_id, similarity in entry['similarities']
]

# Summary table, with the expert judgement attached by row position.
similarity_table_cosine = pd.DataFrame(combined_results_cosine)
similarity_table_cosine['expert'] = gold_standard['dokumen_yang_mirip'].tolist()

# Long-format tables with one row per (query, document) score.
rk_porter_cosine_df = pd.DataFrame(rk_porter_results_cosine)
rk_sastrawi_cosine_df = pd.DataFrame(rk_sastrawi_results_cosine)

# Write all three tables to a single workbook, one sheet each.
with pd.ExcelWriter('Cosine_similarity_results.xlsx', engine='openpyxl') as writer:
    for sheet_name, frame in (
        ('Results Cosine', similarity_table_cosine),
        ('Porter Cosine Detailed', rk_porter_cosine_df),
        ('Sastrawi Cosine Detailed', rk_sastrawi_cosine_df),
    ):
        frame.to_excel(writer, sheet_name=sheet_name, index=False)
