In [2]:
import json
import pandas as pd
import string
import random
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from thefuzz import fuzz, process

In [3]:
# Initialise the Sastrawi stemmer for Bahasa Indonesia.
# `stemmer` is the module-level object the preprocessing functions call via stemmer.stem().
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [4]:
# Read the Excel workbook into a DataFrame. The path is relative, so the file
# is resolved against the kernel's current working directory.
# The search cell further down reads this frame's 'judul' column (via the JSON export).
excel_file_path = 'dataset_kuesioner.xlsx'
df = pd.read_excel(excel_file_path)

In [5]:
# Attach a random alphanumeric identifier to every row as the 'form_id' column.
# The identifiers are regenerated (and therefore change) each time this cell runs.
code_length = 16
id_alphabet = string.ascii_letters + string.digits

# Collect the codes in a set so every row is guaranteed a distinct identifier;
# a (very unlikely) repeated draw is simply drawn again.
# The collection is deliberately not called `id`, which would shadow the builtin
# for every later cell in the kernel session.
form_ids = set()
while len(form_ids) < len(df):
    form_ids.add(''.join(random.choices(id_alphabet, k=code_length)))

df['form_id'] = list(form_ids)

In [6]:
# Serialise the DataFrame to a JSON string and persist it as 'title.json'.
# DataFrame.to_json() is called with its defaults; the search cell relies on the
# resulting layout, where each column name maps to an object of row values.
json_data = df.to_json()
with open('title.json', 'w') as json_file:
    json_file.write(json_data)

In [7]:
# Read the exported file back. From here on `json_data` is the parsed JSON
# object (no longer the raw string), which is what search() indexes into.
with open('title.json', 'r') as json_file:
    json_data = json.load(json_file)

In [8]:
# Load the vocabulary file: one entry per line, with surrounding whitespace
# (including the trailing newline) stripped from each entry.
with open('indonesian-wordlist.lst', 'r') as file:
    lines = file.readlines()
indonesian_word_list = list(map(str.strip, lines))

In [9]:
# Load the stop words: every whitespace-separated token in the file becomes one
# entry. They are kept in a set because remove_stop_words() only ever tests
# membership against them.
with open('combined_stop_words.txt') as file:
    stop_words = set(file.read().split())

In [10]:
def remove_stop_words(text):
    """Return ``text`` without the tokens listed in ``stop_words``.

    The text is split on whitespace; a token is dropped when its lowercase
    form is a member of the module-level ``stop_words`` set. Surviving tokens
    keep their original casing and order and are re-joined with single spaces.

    Args:
        text: The string to filter.

    Returns:
        The filtered string (empty when every token is a stop word).
    """
    kept_tokens = [token for token in text.split() if token.lower() not in stop_words]
    return ' '.join(kept_tokens)

In [11]:
def handle_typos(text, min_score=0):
    """Replace tokens missing from the word list with their closest entry.

    The text is split on whitespace. A token found in the module-level
    ``indonesian_word_list`` is kept unchanged; any other token is replaced by
    the entry with the highest ``fuzz.ratio`` score against it.

    Args:
        text: The string whose tokens should be checked.
        min_score: Lowest ``fuzz.ratio`` score a candidate needs in order to
            replace a token. The default of 0 accepts the best candidate no
            matter how weak the match is; raise it to leave tokens that
            resemble nothing in the word list untouched.

    Returns:
        The tokens, corrected where applicable, joined by single spaces.
    """
    # Exact lookups go through a set so each token does not scan the whole list.
    known_words = set(indonesian_word_list)

    corrected_words = []
    for word in text.split():
        if word in known_words:
            corrected_words.append(word)
            continue

        # extractOne returns a (match, score) pair for the best-scoring entry,
        # or None when nothing reaches the cutoff or the word list is empty.
        best_match = process.extractOne(
            word, indonesian_word_list, scorer=fuzz.ratio, score_cutoff=min_score)
        corrected_words.append(word if best_match is None else best_match[0])

    return ' '.join(corrected_words)


In [12]:
def preprocess_text_query(text):
    """Normalise a search query into the form used for matching.

    Steps, in order: convert to ``str``, lowercase, replace every character of
    ``string.punctuation`` with a space, correct tokens with ``handle_typos``,
    stem with the module-level ``stemmer``, and drop stop words.

    Args:
        text: The raw query; any object is accepted and passed through ``str()``.

    Returns:
        The normalised query string.
    """
    # Two equal-length strings: each punctuation character maps to one space.
    punctuation_to_space = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation))
    normalised = str(text).lower().translate(punctuation_to_space)

    corrected = handle_typos(normalised)
    return remove_stop_words(stemmer.stem(corrected))

In [13]:
def preprocess_data(text):
    """Normalise a stored text (such as a title) into the form used for matching.

    Steps, in order: convert to ``str``, lowercase, replace every character of
    ``string.punctuation`` with a space, stem with the module-level
    ``stemmer``, and drop stop words. No typo correction is applied here.

    Args:
        text: The raw value; any object is accepted and passed through ``str()``.

    Returns:
        The normalised string.
    """
    # Translation table keyed by code point: every punctuation mark -> a space.
    space_for_punctuation = {ord(mark): ' ' for mark in string.punctuation}
    lowered = str(text).lower()
    stemmed = stemmer.stem(lowered.translate(space_for_punctuation))
    return remove_stop_words(stemmed)

In [17]:
def search(query):
    """Find the titles in ``json_data`` that match ``query``.

    The query is normalised with ``preprocess_text_query`` and every title with
    ``preprocess_data``. Matching is by substring on the normalised text and
    happens in two passes:

    1. titles containing the whole normalised query;
    2. for each word of the normalised query in turn, titles containing that word.

    Results are returned in that order, and each title appears at most once
    (at the position of its first match).

    Note: a query that normalises to the empty string is a substring of every
    title, so it returns all titles.

    Args:
        query: The raw search text.

    Returns:
        A list of JSON strings, each encoding ``{"id": ..., "title": ...}`` with
        the row's ``form_id`` and its original (un-normalised) title.
    """
    titles = list(json_data["judul"].values())
    form_ids = list(json_data["form_id"].values())

    # Normalise the query and each title exactly once: both involve stemming
    # (and fuzzy matching for the query), so repeating them per word is costly.
    query_preprocessed = preprocess_text_query(query)
    titles_preprocessed = [preprocess_data(title) for title in titles]

    # Whole-phrase pass first, then one pass per individual word.
    needles = [query_preprocessed] + query_preprocessed.split()

    results = []
    seen_rows = set()  # row indexes already emitted, so no title is repeated
    for needle in needles:
        for row, title_preprocessed in enumerate(titles_preprocessed):
            if row in seen_rows or needle not in title_preprocessed:
                continue
            seen_rows.add(row)
            hasil = {
                "id": form_ids[row],
                "title": titles[row]
            }
            results.append(json.dumps(hasil))

    return results

In [18]:
# Example search. The query is held in `query_text` rather than `input`, so the
# builtin input() stays usable in the rest of the notebook.
query_text = "dampak kebijakan"
hasil = search(query_text)

['{"id": "E87qYCXkCYQ7EK0M", "title": "Analisis Keuangan Rumah Tangga terkait Pelayanan Kesehatan: Dampak dan Strategi Pengelolaan di Tengah Pandemi COVID-19"}', '{"id": "sjdnlvnjqjbxEbwk", "title": "Evaluasi Dampak Ekonomi Beban Penyakit terhadap Keuangan Rumah Tangga: Studi Kasus pada Keluarga dengan Anggota yang Menderita Penyakit Kronis"}', '{"id": "R9LnuURgFmvAafqW", "title": "Dampak Pariwisata Medis terhadap Infrastruktur Kesehatan Lokal: Studi Kasus pada Destinasi Pariwisata Medis di Indonesia"}', '{"id": "cAxpCRe0iKbkA9Kn", "title": "Dampak Program Kesehatan Masyarakat dalam Meningkatkan Kesejahteraan Sosial: Studi Kasus pada Program Pemberdayaan Masyarakat di Daerah Terpencil"}', '{"id": "xUmuPak42YWhJOTj", "title": "Kesadaran Masyarakat tentang Dampak Lingkungan terhadap Kesehatan: Survei pada Penduduk di Wilayah Konservasi Alam"}', '{"id": "PVGGVUuxJDLMAPRl", "title": "Penggunaan Aplikasi Kesehatan Mobile dalam Meningkatkan Perilaku Hidup Sehat: Penilaian Penggunaan dan Damp