In [1]:
import pandas as pd
import sys
import time
import string
from preprocessor import (
    clean_word, stemmer, pipe, remove_punctuation,
    normalize_money, normalize_number, normalize_weekday,
    normalize_month, create_stemmer, create_stop_words_remover
)
from functools import reduce
import gensim

# Filter

In [2]:
def is_real_sentence(s):
    """Return True when `s` passes the heuristic sentence filter.

    A string is accepted only if it has more than 10 characters, more than
    10 whitespace-separated tokens, starts with an ASCII letter, and contains
    none of the markers '|', '=', 'html', ':' or '/'.
    """
    # Length checks come first so that s[0] is never read on an empty string.
    if len(s) <= 10 or len(s.split()) <= 10:
        return False
    if s[0] not in string.ascii_letters:
        return False
    rejected_markers = ('|', '=', 'html', ':', '/')
    return not any(marker in s for marker in rejected_markers)

In [3]:
# Whitespace-separated word list; it is handed to the stemmer and the
# stop-word remover factories further down in the notebook.
_ANNOTATED_WORDS_TEXT = '''asing atas badan baru berat 
besar bidang bintang bisa buah 
bulan bunga cabang cerah coklat 
dalam dasar dunia halaman harapan 
jalan jam jaringan kabur kaki 
kali kepala ketat kulit kunci 
layar lebat lingkungan mata membawa 
memecahkan menangkap mendorong menerima mengandung 
mengejar mengeluarkan mengikat mengisi menjaga 
menurunkan menyusun nilai panas pembagian 
rapat sarung tengah tinggi '''

# str.split() with no argument drops the newlines and trailing spaces.
annotated_words = _ANNOTATED_WORDS_TEXT.split()

In [4]:
# Stems of the annotated words, keeping only those stems that are not
# themselves present in annotated_words (order follows annotated_words).
exception_words = [
    stem
    for stem in (stemmer.stem(word) for word in annotated_words)
    if stem not in annotated_words
]

# Raw Data

In [5]:
# Read the whole dump into memory as a list of lines and report how long it took.
begin = time.perf_counter()
# The context manager closes the file handle even if reading fails part-way.
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm it matches the dump's actual encoding.
with open('../idwiki-latest-pages-articles-full.xml', 'r') as dump_file:
    raw = dump_file.readlines()
print('elapsed time:', time.perf_counter() - begin)
# Number of raw lines; used later as the loop bound and for progress reporting.
WIKI_LEN = len(raw)

elapsed time: 38.5666457


# List and Clean Sentences

In [6]:
# Ordered preprocessing steps applied to every sentence.
# NOTE(review): `pipe` comes from the local `preprocessor` module; presumably
# it chains the steps in the order listed — confirm against its definition.
_sentence_steps = (
    remove_punctuation,
    normalize_money,
    normalize_number,
    normalize_weekday,
    normalize_month,
    create_stemmer(annotated_words, exception_words),
    create_stop_words_remover(annotated_words, exception_words),
)

preprocess_sentence = pipe(*_sentence_steps)

In [7]:
# Walk the raw dump line by line, keep the pieces that pass is_real_sentence,
# strip wiki markup from them and run them through preprocess_sentence.
# The loop stops early once the time budget is exceeded.
clean_sentences = []
begin = time.perf_counter()
max_time = 1800  # budget in seconds, compared against perf_counter deltas

# Progress is reported about 5000 times over the whole dump. Clamp the step to
# 1 so that a dump with fewer than 5000 lines does not cause a modulo by zero.
progress_every = max(1, WIKI_LEN // 5000)

for i in range(WIKI_LEN):
    # Naive sentence splitting: cut at every period and put the period back.
    # NOTE(review): pieces after the first period keep any leading whitespace,
    # and is_real_sentence checks the first character — confirm that dropping
    # those pieces is intended.
    sentences = [piece + '.' for piece in raw[i].split('.')]
    for s in sentences:
        if not is_real_sentence(s):
            continue
        # Remove link brackets, newlines, quote markup and the &quot; entity.
        # The triple-quote marker must be removed before the double-quote one;
        # in the other order the triple replacement can never match and a
        # stray apostrophe is left behind.
        s = (
            s.replace('[[', '')
            .replace(']]', '')
            .replace('\n', '')
            .replace("'''", "")
            .replace("''", "")
            .replace('&quot;', '')
        )
        clean_sentences.append(preprocess_sentence(s))
    if i % progress_every == 0:
        elapsed = int(time.perf_counter() - begin)
        sys.stdout.write("\rRaw read: {0:.2f} % | Instances collected: {1}| Time elapsed: {2} s | Time left: {3} s".format(
            i/WIKI_LEN*100, len(clean_sentences), elapsed, max_time - elapsed
        ))
        sys.stdout.flush()
        # The budget is only checked on progress ticks, so the loop can
        # overrun max_time by up to one reporting interval.
        if elapsed > max_time:
            print()
            print('Time\'s up, raw lines read: ', i)
            break

Raw read: 21.32 % | Instances collected: 161221| Time elapsed: 1801 s | Time left: -1 ss
Time's up, raw lines read:  15023138


In [8]:
# Persist the cleaned sentences, one per line. The context manager closes the
# file even if a write raises, which the manual open()/close() pair did not.
# NOTE(review): the platform default encoding is used — confirm it matches
# whatever later reads this file.
with open('../wikipedia_clean_sentences.txt', 'w') as f:
    f.writelines(s + '\n' for s in clean_sentences)

# Load Processed Sentences

In [2]:
# Reload the cleaned sentences from disk (each element keeps its trailing
# newline). The context manager makes sure the file handle is closed.
with open('../wikipedia_clean_sentences.txt', 'r') as f:
    clean_sentences = f.readlines()

In [3]:
# Split every cleaned sentence on whitespace, printing the processed fraction
# and elapsed seconds on a single self-overwriting line every 1000 sentences.
tokenized_clean_sentences = []
begin = time.perf_counter()
total_sentences = len(clean_sentences)
for i, sentence in enumerate(clean_sentences):
    tokenized_clean_sentences.append(sentence.split())
    if i % 1000 != 0:
        continue
    seconds_elapsed = int(time.perf_counter() - begin)
    sys.stdout.write("\r{0:.2f} | Time elapsed: {1} s".format(
        i/total_sentences, seconds_elapsed
    ))
    sys.stdout.flush()

1.00 | Time elapsed: 1 s

In [29]:
# Word-vector dimensionality used when training the Word2Vec model; the same
# value is also embedded in the filename of the saved embedding.
EMBEDDING_SIZE = 37

In [30]:
# Train Word2Vec on the tokenized sentences and report the wall-clock time.
# NOTE(review): `size` is the keyword used by gensim 3.x; gensim 4 renamed it
# to `vector_size` — confirm the installed version before re-running.
word2vec_params = dict(window=5, size=EMBEDDING_SIZE, workers=7)

begin = time.perf_counter()
embedding_model = gensim.models.Word2Vec(tokenized_clean_sentences, **word2vec_params)
print('elapsed time:', time.perf_counter() - begin)

elapsed time: 14.553489300000365


In [31]:
# Export the learned word vectors in word2vec format; the output filename
# carries the embedding size.
embedding_path = '../wikipedia_indonesia_embedding{}.model'.format(EMBEDDING_SIZE)
embedding_model.wv.save_word2vec_format(embedding_path)

In [28]:
# Sanity check: show the nearest neighbours of 'kompas' in the trained
# embedding. Kept as a bare expression so the notebook displays the result.
embedding_model.wv.most_similar(positive='kompas')

[('meebo', 0.8299260139465332),
 ('minggu', 0.8268102407455444),
 ('kamis', 0.8045620918273926),
 ('revisi', 0.7995588183403015),
 ('sabtu', 0.7949458360671997),
 ('koran', 0.7927308082580566),
 ('tanggal', 0.7919483184814453),
 ('bulan', 0.7799400091171265),
 ('gapeka', 0.7795543074607849),
 ('senin', 0.76075679063797)]