In [46]:
import os
import re
from collections import Counter
import string
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

print("SASTRAWI STEMMER")

folder_path = r"C:\Users\Sultan Daris\Downloads\UAS TKI Projek\DokumenAbstrak\Dokumen" 

for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)

    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            words = content.split()
            first_10_words = ' '.join(words[:3])
            print(f"file {filename}:\n{first_10_words}\n")

processed_documents_sastrawi = {}
stemmer = StemmerFactory().create_stemmer()

for filename in os.listdir(folder_path):
    print(f"Preprocessing file: {filename}")
    file_path = os.path.join(folder_path, filename)

    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
                    
            content = content.lower()
            
            sentences = re.split(r'[.!?]+', content)
            
            content = re.sub(r'[^\w\s]', ' ', content)
            
            content = re.sub(r'\s+', ' ', content).strip()
            
            words = content.split()
            
            stop_words = {'dan', 'atau', 'yang', 'adalah', 'ini', 'itu', 'dengan', 'untuk', 'pada', 'dalam', 'dari', 'ke', 'di', 'akan', 'dapat', 'juga', 'tidak', 'ada', 'satu', 'dua', 'tiga'}
            filtered_words = [word for word in words if word not in stop_words and len(word) > 2]
            
            stemmed_words = [stemmer.stem(word) for word in filtered_words]
            
            processed_documents_sastrawi[filename] = stemmed_words
            
            segmented_lines = []
            for i in range(0, len(stemmed_words), 7):
                line = ' '.join(stemmed_words[i:i+7])
                segmented_lines.append(line)
            
            segmented_output = '\n'.join(segmented_lines)
            
            output_filename = f"preprocessed_{filename}"
            output_path = os.path.join(r"C:\Users\Sultan Daris\Downloads\UAS TKI Projek\DokumenAbstrak\DokumenSastrawi", output_filename)
            with open(output_path, 'w', encoding='utf-8') as output_file:
                output_file.write(segmented_output)
            
           
            print(f"Original words: {len(words)}")
            print(f"After preprocessing: {len(filtered_words)}")
            print(f"After stemming: {len(stemmed_words)}")
            unique_words = len(set(stemmed_words))
            print(f"Unique words: {unique_words}")
            print(f"Saved to: {output_filename}")
            print("\n")


SASTRAWI STEMMER
file Docs1.txt:
Penyakit jantung merupakan

file Docs2.txt:
Perkembangan industri video

file Docs3.txt:
Sulitnya penyedia layanan

file Docs4.txt:
Sulitnya penyedia layanan

Preprocessing file: Docs1.txt
Original words: 265
After preprocessing: 193
After stemming: 193
Unique words: 97
Saved to: preprocessed_Docs1.txt


Preprocessing file: Docs2.txt
Original words: 152
After preprocessing: 122
After stemming: 122
Unique words: 70
Saved to: preprocessed_Docs2.txt


Preprocessing file: Docs3.txt
Original words: 174
After preprocessing: 141
After stemming: 141
Unique words: 90
Saved to: preprocessed_Docs3.txt


Preprocessing file: Docs4.txt
Original words: 174
After preprocessing: 141
After stemming: 141
Unique words: 90
Saved to: preprocessed_Docs4.txt




In [47]:
import nltk
from nltk.stem import PorterStemmer

nltk.download('punkt', quiet=True)

print("PORTER STEMMER")

stemmer = PorterStemmer()

processed_documents_porter = {}

for filename in os.listdir(folder_path):
    print(f"Preprocessing file: {filename}")
    file_path = os.path.join(folder_path, filename)

    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
                    
            content = content.lower()
            
            sentences = re.split(r'[.!?]+', content)
            
            content = re.sub(r'[^\w\s]', ' ', content)
            
            content = re.sub(r'\s+', ' ', content).strip()
            
            words = content.split()
            
            stop_words = {'dan', 'atau', 'yang', 'adalah', 'ini', 'itu', 'dengan', 'untuk', 'pada', 'dalam', 'dari', 'ke', 'di', 'akan', 'dapat', 'juga', 'tidak', 'ada', 'satu', 'dua', 'tiga'}
            filtered_words = [word for word in words if word not in stop_words and len(word) > 2]
            
            stemmed_words = [stemmer.stem(word) for word in filtered_words]
            
            processed_documents_porter[filename] = stemmed_words
            
            segmented_lines = []
            for i in range(0, len(stemmed_words), 7):
                line = ' '.join(stemmed_words[i:i+7])
                segmented_lines.append(line)
            
            segmented_output = '\n'.join(segmented_lines)
            
            output_filename = f"preprocessed_{filename}"
            output_path = os.path.join(r"C:\Users\Sultan Daris\Downloads\UAS TKI Projek\DokumenAbstrak\DokumenPorter", output_filename)
            with open(output_path, 'w', encoding='utf-8') as output_file:
                output_file.write(segmented_output)
            
            print(f"Original words: {len(words)}")
            print(f"After preprocessing: {len(filtered_words)}")
            print(f"After Porter stemming: {len(stemmed_words)}")
            unique_words = len(set(stemmed_words))
            print(f"Unique words: {unique_words}")
            print(f"Saved to: {output_filename}")
            print("\n")


PORTER STEMMER
Preprocessing file: Docs1.txt
Original words: 265
After preprocessing: 193
After Porter stemming: 193
Unique words: 104
Saved to: preprocessed_Docs1.txt


Preprocessing file: Docs2.txt
Original words: 152
After preprocessing: 122
After Porter stemming: 122
Unique words: 76
Saved to: preprocessed_Docs2.txt


Preprocessing file: Docs3.txt
Original words: 174
After preprocessing: 141
After Porter stemming: 141
Unique words: 95
Saved to: preprocessed_Docs3.txt


Preprocessing file: Docs4.txt
Original words: 174
After preprocessing: 141
After Porter stemming: 141
Unique words: 95
Saved to: preprocessed_Docs4.txt




In [53]:
def rabin_karp_ngrams(words, n, base=256, mod=101):
    hashes = set()
    if len(words) < n:
        return hashes
    h = 0
    high_order = pow(base, n-1, mod)
    
    for i in range(n):
        h = (h * base + ord(words[i][0])) % mod
    hashes.add(h)

    for i in range(1, len(words) - n + 1):
        h = (h - ord(words[i-1][0]) * high_order) % mod
        h = (h * base + ord(words[i+n-1][0])) % mod
        hashes.add(h)
    return hashes

doc_scores_sastrawi = {doc: [] for doc in processed_documents_sastrawi}
doc_names_sastrawi = list(processed_documents_sastrawi.keys())
ngram_hashes_sastrawi = {doc: rabin_karp_ngrams(processed_documents_sastrawi[doc], n=3) for doc in doc_names_sastrawi}

doc_scores_porter = {doc: [] for doc in processed_documents_porter}
doc_names_porter = list(processed_documents_porter.keys())
ngram_hashes_porter = {doc: rabin_karp_ngrams(processed_documents_porter[doc], n=3) for doc in doc_names_porter}

for i in range(len(doc_names_sastrawi)):
    for j in range(i+1, len(doc_names_sastrawi)):
        doc1, doc2 = doc_names_sastrawi[i], doc_names_sastrawi[j]
        common = ngram_hashes_sastrawi[doc1] & ngram_hashes_sastrawi[doc2]
        total = ngram_hashes_sastrawi[doc1] | ngram_hashes_sastrawi[doc2]
        similarity = len(common) / len(total) if total else 0
        doc_scores_sastrawi[doc1].append(similarity)
        doc_scores_sastrawi[doc2].append(similarity)
        print(f"Similarity Score (Sastrawi) antara {doc1} and {doc2}: {similarity:.4f}")
print("\n")

for i in range(len(doc_names_porter)):
    for j in range(i+1, len(doc_names_porter)):
        doc1, doc2 = doc_names_porter[i], doc_names_porter[j]
        common = ngram_hashes_porter[doc1] & ngram_hashes_porter[doc2]
        total = ngram_hashes_porter[doc1] | ngram_hashes_porter[doc2]
        similarity = len(common) / len(total) if total else 0
        doc_scores_porter[doc1].append(similarity)
        doc_scores_porter[doc2].append(similarity)
        print(f"Similarity Score (Porter) antara {doc1} and {doc2}: {similarity:.4f}")
print("\n")

print("=== SASTRAWI vs PORTER STEMMING ===\n")

print("SASTRAWI STEMMING:")
for doc, scores in doc_scores_sastrawi.items():
    mean_score = sum(scores) / len(scores) if scores else 0
    print(f"Mean similarity score for {doc}: {mean_score:.4f}")

print("\nPORTER STEMMING:")
for doc, scores in doc_scores_porter.items():
    mean_score = sum(scores) / len(scores) if scores else 0
    print(f"similarity score untuk {doc}: {mean_score:.4f}")

print("\nPERBANDINGAN:")
print(f"{'Document':<12} {'Sastrawi':<10} {'Porter':<10} {'Difference':<10}")
print("-" * 45)
for doc in doc_names_sastrawi:
    sastrawi_score = sum(doc_scores_sastrawi[doc]) / len(doc_scores_sastrawi[doc]) if doc_scores_sastrawi[doc] else 0
    porter_score = sum(doc_scores_porter[doc]) / len(doc_scores_porter[doc]) if doc_scores_porter[doc] else 0
    difference = abs(sastrawi_score - porter_score)
    print(f"{doc:<12} {sastrawi_score:<10.4f} {porter_score:<10.4f} {difference:<10.4f}")


Similarity Score (Sastrawi) antara Docs1.txt and Docs2.txt: 0.6044
Similarity Score (Sastrawi) antara Docs1.txt and Docs3.txt: 0.6064
Similarity Score (Sastrawi) antara Docs1.txt and Docs4.txt: 0.6064
Similarity Score (Sastrawi) antara Docs2.txt and Docs3.txt: 0.4574
Similarity Score (Sastrawi) antara Docs2.txt and Docs4.txt: 0.4574
Similarity Score (Sastrawi) antara Docs3.txt and Docs4.txt: 1.0000


Similarity Score (Porter) antara Docs1.txt and Docs2.txt: 0.6105
Similarity Score (Porter) antara Docs1.txt and Docs3.txt: 0.6344
Similarity Score (Porter) antara Docs1.txt and Docs4.txt: 0.6344
Similarity Score (Porter) antara Docs2.txt and Docs3.txt: 0.5543
Similarity Score (Porter) antara Docs2.txt and Docs4.txt: 0.5543
Similarity Score (Porter) antara Docs3.txt and Docs4.txt: 1.0000


=== SASTRAWI vs PORTER STEMMING ===

SASTRAWI STEMMING:
Mean similarity score for Docs1.txt: 0.6057
Mean similarity score for Docs2.txt: 0.5064
Mean similarity score for Docs3.txt: 0.6879
Mean similarity 