In [1]:
import mlflow
from lib.constants import EXPERIMENT_NAME, MLFLOW_URI
from lib.dataset import load_train_data, load_test_data

# Make sure to have the MLFlow server on before running this code.
# mlflow.set_tracking_uri(uri=MLFLOW_URI)
# experiment = mlflow.set_experiment(EXPERIMENT_NAME)
X_train, y_train = load_train_data()
X_test = load_test_data()

In [3]:
import numpy as np
import pandas as pd

from lib.sklearn.preprocess import nlp

X = X_train + X_test

tokenizer = nlp.TextTokenizer()
X_tokenized = tokenizer.transform(X)

formalizer = nlp.WordsFormalizer()
X_formalized = formalizer.transform(X_tokenized)

custom_map = {
    row['asal']: row['tujuan']
    for _, row in pd.read_csv('custom-mapper.csv').iterrows()
}
custom_mapper = nlp.WordsMapper(custom_map)
X_custom_mapped = custom_mapper.transform(X_formalized)

lemmatization = nlp.WordsLemmatization()
X_lemmatization = lemmatization.transform(X_custom_mapped)

special_char_filter = nlp.SpecialCharacterFilter()
X_special_char_filtered = special_char_filter.transform(X_lemmatization)

# stop_words_filter = nlp.StopWordsFilter()
# X_stop_words_filtered = stop_words_filter.transform(X_special_char_filtered)
X_stop_words_filtered = X_special_char_filtered

unknown_words_filter = nlp.UnknownWordsFilter()
X_unknown_words_filtered = unknown_words_filter.transform(X_stop_words_filtered)

X_transformed = X_unknown_words_filtered
X_transformed

[['layan', 'adalah', 'tidak', 'sahabat', 'person', 'malam', 'jaga', 'gelas'],
 ['kakak',
  'enak',
  'sangat',
  'layan',
  'cepat',
  'tanggap',
  'dan',
  'yang',
  'pertama',
  'murah',
  'senyum'],
 ['layan', 'sangat', 'ramah', 'banyak', 'promosi'],
 ['tempat', 'nyaman', 'dan', 'bersih'],
 ['layan',
  'cepat',
  'dan',
  'bagus',
  'coba',
  'durian',
  'yang',
  'baru',
  'mantap'],
 ['enak', 'nyaman'],
 ['cepat', 'ramah', 'baik'],
 ['layan',
  'lama',
  'semrawut',
  'sani',
  'pesan',
  'langgan',
  'yang',
  'order',
  'kalah',
  'cepat',
  'dengan',
  'yang',
  'order',
  'belakang',
  'ayo',
  'semangat',
  'benah',
  'untuk',
  'lebih',
  'baik',
  'lagi'],
 ['tempat',
  'paling',
  'baik',
  'buat',
  'tugas',
  'layan',
  'ramah',
  'cuma',
  'wastafel',
  'lantai',
  'tidak',
  'bisa',
  'nyala',
  'cara',
  'seluruh',
  'semua',
  'bagus'],
 ['layan', 'ramah', 'mutu', 'selalu', 'stabil', 'rasa', 'kopi'],
 ['banyak', 'banget'],
 ['makas', 'mantap'],
 ['layan',
  'super',


In [5]:
words = set()
for tokens in X_stop_words_filtered:
    words = words.union(set(tokens))

unknown_words = nlp.extract_unknown_words(words)
len(unknown_words)

1346

In [9]:
(pd.DataFrame({'X': X, 'X_stop_words_filtered': X_stop_words_filtered})
 .assign(
   len_words=lambda df: df.X_stop_words_filtered.map(len),
   unknown_words=lambda df: df.X_stop_words_filtered.map(lambda words: nlp.extract_unknown_words(set(words))),
   len_unknown_words=lambda df: df.unknown_words.map(len)
 )
#  .loc[lambda df: df.len_unknown_words.gt(0)]
)

Unnamed: 0,X,X_stop_words_filtered,len_words,unknown_words,len_unknown_words
0,"The waiter is not friendly, the person wears n...","[the, layan, adalah, tidak, sahabat, the, pers...",11,"[wears, the]",2
1,"kakaknya enak bangetttt, pelayanannya cepet ta...","[kakak, enak, sangat, layan, cepat, tanggap, d...",11,[],0
2,"Pelayanan sangatt ramah, banyak promoanyaa","[layan, sangat, ramah, banyak, promosi]",5,[],0
3,Tempatnya nyaman dan bersih,"[tempat, nyaman, dan, bersih]",4,[],0
4,"Pelayanan cepat dan bagus, coba Durian Coffee ...","[layan, cepat, dan, bagus, coba, durian, coffe...",10,[coffee],1
...,...,...,...,...,...
1397,"Iki yakopo seh, titik lokasimu kok nng jl ahma...","[iki, yakopo, seh, titik, lokasi, kok, nng, ja...",10,"[yakopo, iki, yani, nng, ahmad]",5
1398,"Tempatnya enak, deket pintu keluar. Kalo menu ...","[tempat, enak, dekat, pintu, keluar, kalo, men...",13,[excelso],1
1399,Pelayannya agak nyebelin sih,"[layan, agak, sebal, sih]",4,[],0
1400,"Makanan nya untuk saya kurang cocok, hanya pis...","[makan, nya, untuk, saya, kurang, cocok, hanya...",19,"[nya, security]",2
