In [1]:
pip install pandas==2.0.3 Sastrawi==1.0.1 nltk==3.8.1 numpy==1.26.4 tqdm==4.66.4

Note: you may need to restart the kernel to use updated packages.


In [2]:
import logging, json, os, re, string, time
from typing import List
from tqdm import tqdm
import pandas as pd
import nltk
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [3]:
# Download the NLTK resources used by this notebook.
# 'stopwords' provides the list read via stopwords.words('indonesian') in the indexing function.
nltk.download('stopwords')
# NOTE(review): no use of the 'punkt' resource is visible in this notebook - confirm it is still needed.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yusufs/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/yusufs/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# https://stackoverflow.com/a/42328068
# Write the log to a file so we can trace which document has already been processed.
#
# Exactly ONE file handler is attached to the root logger. Previously
# basicConfig(filename=...) attached one handler and a second FileHandler for the
# same file was added on top, so every record was written twice (once per format).

log_filename = f"{os.getcwd()}/progress.log"

formatter = logging.Formatter('%(asctime)s : %(levelname)s : %(name)s : %(message)s')
# mode='w' truncates the log on every (re)configuration, like the original filemode='w'.
file_handler = logging.FileHandler(log_filename, mode='w')
file_handler.setFormatter(formatter)

# force=True removes and closes any handlers already attached to the root logger,
# which keeps this cell idempotent when it is re-run in the same kernel.
logging.basicConfig(handlers=[file_handler], level=logging.INFO, force=True)

logger = logging.getLogger()

In [5]:
# Load the corpus from news.csv located in the current working directory.
news_csv_path = f'{os.getcwd()}/news.csv'
df = pd.read_csv(news_csv_path)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,id,id_author,title,portal,time,author,editor,content,source
0,0,1,Infografis Pekerja Asing Dilarang Masuk Wilaya...,Liputan6.com,"24 Jul 2021, 09:02 WIB",Abdillah,Abdillah,Pemerintah melalui Menteri Hukum dan Hak Asasi...,https://www.liputan6.com/news/read/4614451/inf...
1,1,1,Infografis Jadwal Bulu Tangkis Indonesia di Ol...,Liputan6.com,"23 Jul 2021, 23:23 WIB",Abdillah,Abdillah,Bulu Tangkis menjadi andalan Indonesia berburu...,https://www.liputan6.com/bola/read/4614427/inf...
2,2,1,"Infografis Jangan Bebal, Kamu Tidak Kebal Covi...",Liputan6.com,"23 Jul 2021, 10:40 WIB",Abdillah,Abdillah,Covid-19 tidak mengenal usia dan status. Siapa...,https://www.liputan6.com/news/read/4613233/inf...
3,3,1,Infografis Awas Perokok Lebih Rentan Tertular ...,Liputan6.com,"22 Jul 2021, 10:35 WIB",Abdillah,Abdillah,Kebiasaan merokok berisiko menimbulkan sejumla...,https://www.liputan6.com/news/read/4612324/inf...
4,4,1,Infografis Perbedaan Aturan PPKM Level 3 dan 4,Liputan6.com,"22 Jul 2021, 09:01 WIB",Abdillah,Abdillah,Pemberlakuan Pembatasan Kegiatan Masyarakat at...,https://www.liputan6.com/news/read/4612511/inf...


In [6]:
# Sanity check: report how many rows (documents) were loaded into `df` and the type of `df`.
print(f"Contains {len(df)} rows of documents with data type {type(df)}.")

Contains 14343 rows of documents with data type <class 'pandas.core.frame.DataFrame'>.


In [7]:
def remove_tweet_special(text: str) -> str:
    """Strip tweet-style artifacts (escapes, non-ASCII, mentions, hashtags, links).

    Steps, in order:
      1. The literal two-character sequences backslash+t, backslash+n and
         backslash+u become a space; any remaining backslash is dropped.
      2. Every non-ASCII character is replaced by ``?``.
      3. ``@mention`` / ``#hashtag`` tokens and ``scheme:rest`` tokens (links)
         become a space, then whitespace runs are collapsed and the ends trimmed.
      4. Any leftover ``http://`` / ``https://`` prefix becomes a space.
    """
    # literal escape sequences and stray backslashes
    for escaped, substitute in (('\\t', " "), ('\\n', " "), ('\\u', " "), ('\\', "")):
        text = text.replace(escaped, substitute)

    # non-ASCII characters (emoticons, CJK, ...) turn into '?'
    ascii_only = text.encode('ascii', 'replace').decode('ascii')

    # mentions, hashtags and links
    without_tags = re.sub('([@#][A-Za-z0-9]+)|(\\w+:\\S+)', " ", ascii_only)
    collapsed = ' '.join(without_tags.split())

    # incomplete URLs
    for scheme in ("http://", "https://"):
        collapsed = collapsed.replace(scheme, " ")
    return collapsed
                
def remove_number(text: str) -> str:
    """Return ``text`` with every run of digits deleted."""
    digit_runs = re.compile(r"\d+")
    return digit_runs.sub("", text)

def remove_punctuation(text: str) -> str:
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # A translation table that maps a code point to None deletes that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletion_table)

def remove_whitespace_multiple(text: str) -> str:
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading/trailing whitespace is collapsed too, but not removed.
    """
    whitespace_runs = re.compile('\\s+')
    return whitespace_runs.sub(' ', text)

def remove_single_char(text: str) -> str:
    """Delete every ASCII letter that stands alone between word boundaries.

    Surrounding whitespace is left untouched, so gaps may remain.
    """
    lone_letter = re.compile(r"\b[a-zA-Z]\b")
    return lone_letter.sub("", text)


In [8]:
def sort_by_doc_freq(data: dict[str, dict[str, int]]) -> dict[str, List[dict[str, int]]]:
    """Turn each term's ``{doc_id: count}`` mapping into a list sorted by count.

    Example::

        {"pemerintah": {"0": 1}, "dan": {"1": 1, "0": 3}}
        -> {"pemerintah": [{"0": 1}], "dan": [{"0": 3}, {"1": 1}]}

    Args:
        data: Mapping of term -> {document id -> number of occurrences}.

    Returns:
        Mapping of term -> list of single-entry ``{document id: count}`` dicts,
        ordered by count, highest first. Entries with equal counts keep the
        order they had in the input mapping (the sort is stable).
    """
    return {
        term: [
            {doc_id: freq}
            for doc_id, freq in sorted(postings.items(), key=lambda item: item[1], reverse=True)
        ]
        for term, postings in data.items()
    }

In [9]:
# Informal / abbreviated tokens added on top of NLTK's Indonesian stopword list.
_EXTRA_STOPWORDS = (
    "yg", "dg", "rt", "dgn", "ny", "d", 'klo',
    'kalo', 'amp', 'biar', 'bikin', 'bilang',
    'gak', 'ga', 'krn', 'nya', 'nih', 'sih',
    'si', 'tau', 'tdk', 'tuh', 'utk', 'ya',
    'jd', 'jgn', 'sdh', 'aja', 'n', 't',
    'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
    '&amp', 'yah',
)


def _count_occurrence(index: dict, term: str, doc_id: int) -> None:
    """Record one more occurrence of ``term`` in document ``doc_id`` inside ``index``."""
    postings = index.setdefault(term, {})
    postings[doc_id] = postings.get(doc_id, 0) + 1


def inverted_index_sequential(data: pd.DataFrame) -> dict:
    """
    Build inverted indexes from the 'id' and 'content' columns of a DataFrame.

    Every row is lower-cased, split on whitespace, and each token is cleaned
    (tweet artifacts, digits, punctuation, extra whitespace, single letters).
    Tokens that end up empty are skipped. Progress is written to the module
    level ``logger`` for every token.

    Args:
        data: DataFrame with an 'id' column (convertible to int) and a
            'content' column holding the document text.

    Returns:
        A single dictionary with four entries:

        * 'document_content_map': {doc id -> original content value}
        * 'word_map_not_stemmed_all_word': index of cleaned, unstemmed tokens
        * 'word_map_stemmed_not_stopword': index of stemmed tokens that are
          not in the stopword set
        * 'word_map_stemmed_all_word': index of all stemmed tokens

        Each index maps term -> list of ``{doc id: count}`` dicts as produced
        by ``sort_by_doc_freq`` (highest count first).
    """
    start = time.time()

    # create stemmer
    stemmer = StemmerFactory().create_stemmer()

    # Indonesian stopwords plus the informal extras, as a set for O(1) lookups
    stopword_set = set(stopwords.words('indonesian'))
    stopword_set.update(_EXTRA_STOPWORDS)

    document_content_map = {}
    word_map_not_stemmed_all_word = {}
    word_map_stemmed_not_stopword = {}
    word_map_stemmed_all_word = {}

    total_docs = len(data)
    rows = tqdm(data.iterrows(), total=total_docs)
    # Count processed rows ourselves instead of deriving it from the index label,
    # which is only correct for a default 0..n-1 index.
    for position, (_, row) in enumerate(rows, start=1):
        # Percentage of documents reached so far; constant for the whole document.
        progress = f"{position / total_docs * 100:.2f}%"

        doc_id = int(row['id'])
        content = row['content']
        document_content_map[doc_id] = content

        # NOTE(review): a missing content value would be indexed as the token
        # produced by str(content) - confirm the column has no nulls.
        for raw_word in str(content).lower().split():  # case folding + split
            # tokenizing
            logger.info("%s Document %s tokenizing word: %s", progress, doc_id, raw_word)
            word = remove_tweet_special(text=raw_word)
            word = remove_number(text=word)
            word = remove_punctuation(text=word)
            word = remove_whitespace_multiple(text=word)
            word = remove_single_char(text=word)
            word = word.strip()
            logger.info("%s Document %s finish tokenizing word: %s", progress, doc_id, word)

            if word == "":
                continue

            # unstemmed token, regardless of whether it is a stopword
            _count_occurrence(word_map_not_stemmed_all_word, word, doc_id)

            logger.info("%s Document %s stemming word: %s", progress, doc_id, word)
            stemmed_word = stemmer.stem(word)
            logger.info("%s Document %s finish stemming word: %s -> %s",
                        progress, doc_id, word, stemmed_word)

            # stemmed token, only when it is not a stopword
            if stemmed_word not in stopword_set:
                _count_occurrence(word_map_stemmed_not_stopword, stemmed_word, doc_id)

            # every stemmed token, stopwords included
            _count_occurrence(word_map_stemmed_all_word, stemmed_word, doc_id)

    word_map_not_stemmed_all_word = sort_by_doc_freq(word_map_not_stemmed_all_word)
    word_map_stemmed_not_stopword = sort_by_doc_freq(word_map_stemmed_not_stopword)
    word_map_stemmed_all_word = sort_by_doc_freq(word_map_stemmed_all_word)

    elapsed_time = time.time() - start
    doc_count = len(document_content_map)
    # Guard against an empty DataFrame, which would otherwise divide by zero.
    avg_time = elapsed_time / doc_count if doc_count else 0.0
    print(f"Total time to index: {elapsed_time:.6f} seconds, avg: {avg_time:.6f} seconds/doc")
    return {
        'document_content_map': document_content_map,
        'word_map_not_stemmed_all_word': word_map_not_stemmed_all_word,
        'word_map_stemmed_not_stopword': word_map_stemmed_not_stopword,
        'word_map_stemmed_all_word': word_map_stemmed_all_word,
    }

In [10]:
# Build the indexes for the whole corpus.
result = inverted_index_sequential(data=df)

document_content_map = result['document_content_map']
word_map_not_stemmed_all_word = result['word_map_not_stemmed_all_word']
word_map_stemmed_not_stopword = result['word_map_stemmed_not_stopword']
word_map_stemmed_all_word = result['word_map_stemmed_all_word']


print(f"Success indexing {len(document_content_map)} documents")

base_dir = os.getcwd()

# Persist each map as <name>.json in the working directory, one loop instead of
# four copy-pasted blocks. json.dump streams to the file rather than building
# the complete JSON string in memory first.
for map_name in (
    'document_content_map',
    'word_map_not_stemmed_all_word',
    'word_map_stemmed_not_stopword',
    'word_map_stemmed_all_word',
):
    mapping = result[map_name]
    output_path = f"{base_dir}/{map_name}.json"
    with open(output_path, "w") as f:
        json.dump(mapping, f)
    print(f"Success write {output_path} ({len(mapping)})")


100%|██████████| 14343/14343 [1:35:58<00:00,  2.49it/s]  


Total time to index: 5761.201422 seconds, avg: 0.401673 seconds/doc
Success indexing 14343 documents
Success write /Users/yusufs/Magister-Ilmu-Komputer/materi-kuliah/semester-3/information-retrieval/20240504-boolean-retrieval/document_content_map.json (14343)
Success write /Users/yusufs/Magister-Ilmu-Komputer/materi-kuliah/semester-3/information-retrieval/20240504-boolean-retrieval/word_map_not_stemmed_all_word.json (92778)
Success write /Users/yusufs/Magister-Ilmu-Komputer/materi-kuliah/semester-3/information-retrieval/20240504-boolean-retrieval/word_map_stemmed_not_stopword.json (73256)
Success write /Users/yusufs/Magister-Ilmu-Komputer/materi-kuliah/semester-3/information-retrieval/20240504-boolean-retrieval/word_map_stemmed_all_word.json (73573)
