In [1]:
pip install pandas==2.0.3 ijson==3.2.3 numpy==1.26.4 Sastrawi==1.0.1

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os, time
from typing import List
import pandas as pd
import numpy as np

# using ijson to handle large JSON without running out of memory
# https://pythonspeed.com/articles/json-memory-streaming/
# https://github.com/ICRAR/ijson
import ijson

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [3]:
def get_docs(doc_data: pd.DataFrame, result: List[dict[str, int]]) -> pd.DataFrame:
    out = []
    for index, row in doc_data.iterrows():
        for dictionary in result:
            if str(row['id']) == dictionary['doc_id']:
                out.append(row)
    return pd.DataFrame(data=out)

def search(logical_operator: str, queries: List[str], model_path: str) -> List[List[dict[str, int]]]:
    """
        Will return each List[dict, int] for each query string.
        So, final search would be List[List[dict, int]] which contains all of queries result.
    """
    # {"docid1": 3, "docid1": 4, "docid2": 1}
    result = {}

    with open(model_path) as f:
        # dict[str, dict[str, List[dict[str, int]]]]
        data = ijson.kvitems(f, '')

        # If word1 OR word2 OR wordN is exist, just append to the result.
        if logical_operator == 'OR':
            for word, documents in data:
                for query in queries:
                    if word == query:
                        for document in documents:
                            for docid, score in document.items():
                                if docid in result:
                                    result[docid] = max(score, result[docid])
                                else:
                                    result[docid] = score

        # word1 AND word2 AND wordN must exist in the same document id.
        elif logical_operator == 'AND':
            all_doc_ids = []
            doc_id_score = {}
            for word, documents in data:
                for query in queries:
                    if word == query:
                        current_doc_ids = []
                        for document in documents:
                            for docid, score in document.items():
                                current_doc_ids.append(docid)
                                if docid in doc_id_score:
                                    doc_id_score[docid] = max(score, doc_id_score[docid])
                                else:
                                    doc_id_score[docid] = score

                        if len(all_doc_ids) <= 0:
                            all_doc_ids = current_doc_ids
                        all_doc_ids = np.intersect1d(all_doc_ids, current_doc_ids)

            for docid in all_doc_ids:
                result[docid] = doc_id_score[docid]
        
        # word1 AND word2 AND wordN must not exist in the same document id.
        # for example if you have:
        #   document a: aku, dia
        #   document b: kamu, mereka
        #   document c: kami, kita
        # then if we search NOT = [aku, kamu]
        # it should return only document c, because document a contains 'aku' and document b contains 'kamu'
        elif logical_operator == 'NOT':
            docids_per_word = {}
            doc_id_score = {}
            for word, documents in data:
                for document in documents:
                    for docid, score in document.items():
                        if word not in docids_per_word:
                            docids_per_word[word] = [docid]
                        else:
                            docids_per_word[word].append(docid)

                        # get maximum score
                        if docid in doc_id_score:
                            doc_id_score[docid] = max(score, doc_id_score[docid])
                        else:
                            doc_id_score[docid] = score

            contaminated_doc_ids = []
            for q in queries:
                for word in docids_per_word.keys():
                    for docid in docids_per_word[word]:
                        if q == word:
                            contaminated_doc_ids.append(docid)
                if q in docids_per_word:
                    del docids_per_word[q]

            contaminated_doc_ids = set(contaminated_doc_ids)
            for word in docids_per_word.keys():
                for docid in docids_per_word[word]:
                    if docid not in contaminated_doc_ids:
                        result[docid] = doc_id_score[docid]

    return sorted(result.items(), key=lambda item: item[1], reverse=True)


ab_query = ['a', 'b']
and_ab_result = search('AND', ab_query, f"{os.getcwd()}/word_map_test.json")
print(f"AND {ab_query} => {and_ab_result} {and_ab_result == [('doc1', 4)]}")

or_ab_result = search('OR', ab_query, f"{os.getcwd()}/word_map_test.json")
print(f"OR  {ab_query} => {or_ab_result} {or_ab_result == [('doc3', 10), ('doc1', 4), ('doc2', 2)]}")

not_ab_result = search('NOT', ab_query, f"{os.getcwd()}/word_map_test.json")
print(f"NOT {ab_query} => {not_ab_result} {not_ab_result == [('doc4', 10)]}")


AND ['a', 'b'] => [('doc1', 4)] True
OR  ['a', 'b'] => [('doc3', 10), ('doc1', 4), ('doc2', 2)] True
NOT ['a', 'b'] => [('doc4', 10)] True


In [4]:
def do_queries(doc_data: pd.DataFrame, model_path: str, logical_operator: str, queries_collection: List[List[str]], stemming: bool=False, k: int=5):
    for queries in queries_collection:
        print("=======================================================================================")
        start = time.time()
        print(f"Query                    : {f' {logical_operator} '.join(queries)} \n")

        if stemming:
            # create stemmer
            step_start = time.time()
            factory = StemmerFactory()
            stemmer = factory.create_stemmer()
            new_queries = []
            for q in queries:
                q = stemmer.stem(q)
                new_queries.append(q)
            queries = new_queries
            print(f"Time to stemming         : {(time.time() - step_start):.06f} seconds => {f' {logical_operator} '.join(queries)}")

        

        step_start = time.time()
        query_result = search(logical_operator=logical_operator, queries=queries, model_path=model_path)
        print(f"Time to search           : {(time.time() - step_start):.06f} seconds")

        step_start = time.time()
        keys = ["doc_id", "value"]
        list_of_dicts = [{keys[0]: doc_id, keys[1]: value} for doc_id, value in query_result]
        list_of_dicts = list_of_dicts[:k]
        print(f"Convert to list of dicts : {(time.time() - step_start):.06f} seconds")

        step_start = time.time()
        doc_result = get_docs(doc_data=doc_data, result=list_of_dicts)
        print(f"Time to get {k} docs       : {(time.time() - step_start):.06f} seconds")

        print(f"------------------------------------------")
        print(f"Total time taken         : {(time.time() - start):.06f} seconds\n")


        print(f"Found: {list_of_dicts}\n")
        
        # .to_string(index=False, max_colwidth=50)
        print(doc_result.to_string())
        print("=======================================================================================\n\n")


In [5]:
base_dir = os.getcwd()
df = pd.read_csv(f'{os.getcwd()}/news.csv')

# AND queries
and_query1 = ["pemerintah", "korupsi"]
and_query2 = ["jalan", "rusak"]

# OR queries
or_query1 = ["pemerintah", "korupsi"]
or_query2 = ["jalan", "rusak"]

# NOT queries
not_query1 = ["pemerintah", "korupsi"]
not_query2 = ["jalan", "rusak"]

# ALL WORDS are included and NOT doing STEMMING

In [6]:
word_map_not_stemmed_all_word_path = os.path.join(base_dir, 'word_map_not_stemmed_all_word.json')

do_queries(doc_data=df, model_path=word_map_not_stemmed_all_word_path, logical_operator='AND', queries_collection=[and_query1, and_query2])
do_queries(doc_data=df, model_path=word_map_not_stemmed_all_word_path, logical_operator='OR', queries_collection=[or_query1, or_query2])
do_queries(doc_data=df, model_path=word_map_not_stemmed_all_word_path, logical_operator='NOT', queries_collection=[not_query1, not_query2])

Query                    : pemerintah AND korupsi 

Time to search           : 0.868529 seconds
Convert to list of dicts : 0.000015 seconds
Time to get 5 docs       : 0.228780 seconds
------------------------------------------
Total time taken         : 1.097487 seconds

Found: [{'doc_id': '467', 'value': 44}, {'doc_id': '11063', 'value': 24}, {'doc_id': '5765', 'value': 21}, {'doc_id': '8442', 'value': 19}, {'doc_id': '10030', 'value': 18}]

          id  id_author                                                                            title        portal                        time                  author                editor                                                                                                                                                                                                                                                                                                                                                                         

# STEMMING is done but NOT REMOVING STOPWORDS

In [7]:
word_map_stemmed_not_stopword_path = os.path.join(base_dir, 'word_map_stemmed_not_stopword.json')

do_queries(doc_data=df, model_path=word_map_stemmed_not_stopword_path, logical_operator='AND', queries_collection=[and_query1, and_query2], stemming=True)
do_queries(doc_data=df, model_path=word_map_stemmed_not_stopword_path, logical_operator='OR', queries_collection=[or_query1, or_query2], stemming=True)
do_queries(doc_data=df, model_path=word_map_stemmed_not_stopword_path, logical_operator='NOT', queries_collection=[not_query1, not_query2], stemming=True)

Query                    : pemerintah AND korupsi 

Time to stemming         : 0.073066 seconds => perintah AND korupsi
Time to search           : 0.517297 seconds
Convert to list of dicts : 0.000025 seconds
Time to get 5 docs       : 0.224877 seconds
------------------------------------------
Total time taken         : 0.815436 seconds

Found: [{'doc_id': '467', 'value': 44}, {'doc_id': '10030', 'value': 43}, {'doc_id': '8442', 'value': 32}, {'doc_id': '11063', 'value': 24}, {'doc_id': '5765', 'value': 21}]

          id  id_author                                                                            title        portal                        time                  author                editor                                                                                                                                                                                                                                                                                                     

# STEMMING WORD AND REMOVE STOPWORDS

In [8]:
word_map_stemmed_all_word_path = os.path.join(base_dir, 'word_map_stemmed_all_word.json')

do_queries(doc_data=df, model_path=word_map_stemmed_all_word_path, logical_operator='AND', queries_collection=[and_query1, and_query2], stemming=True)
do_queries(doc_data=df, model_path=word_map_stemmed_all_word_path, logical_operator='OR', queries_collection=[or_query1, or_query2], stemming=True)
do_queries(doc_data=df, model_path=word_map_stemmed_all_word_path, logical_operator='NOT', queries_collection=[not_query1, not_query2], stemming=True)

Query                    : pemerintah AND korupsi 

Time to stemming         : 0.074332 seconds => perintah AND korupsi
Time to search           : 0.800644 seconds
Convert to list of dicts : 0.000021 seconds
Time to get 5 docs       : 0.223965 seconds
------------------------------------------
Total time taken         : 1.099131 seconds

Found: [{'doc_id': '467', 'value': 44}, {'doc_id': '10030', 'value': 43}, {'doc_id': '8442', 'value': 32}, {'doc_id': '11063', 'value': 24}, {'doc_id': '5765', 'value': 21}]

          id  id_author                                                                            title        portal                        time                  author                editor                                                                                                                                                                                                                                                                                                     