In [1]:
import mlflow
from lib.constants import EXPERIMENT_NAME, MLFLOW_URI
from lib.dataset import load_train_data, load_test_data

# Prerequisite: the MLflow tracking server must already be running before this
# cell is executed; the client is pointed at MLFLOW_URI below.
mlflow.set_tracking_uri(MLFLOW_URI)
experiment = mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

# Load the train and test splits through the project's dataset helpers.
train_data = load_train_data()
test_data = load_test_data()

In [2]:
from lib.sklearn.preprocess import nlp

# Name of the text column that every preprocessing stage is constructed with.
column = 'REVIEW'

# Build the five stage objects up front; each one receives the same column name.
tokenizer = nlp.TextTokenizer(column)
formalizer = nlp.WordsFormalizer(column)
lemmatization = nlp.WordsLemmatization(column)
stop_words_filter = nlp.StopWordsFilter(column)
unknown_words_filter = nlp.UnknownWordsFilter(column)

# Apply the stages one after another. Each intermediate result keeps its own
# name so it can be inspected (or reused) independently of the final frame.
tokenized_train_data = tokenizer.transform(train_data)
formalized_train_data = formalizer.transform(tokenized_train_data)
lemmatization_train_data = lemmatization.transform(formalized_train_data)
stop_words_filtered_train_data = stop_words_filter.transform(lemmatization_train_data)
unknown_words_filtered_train_data = unknown_words_filter.transform(stop_words_filtered_train_data)

# The output of the last stage is the fully preprocessed training frame;
# leave it as the cell's final expression so the notebook renders it.
transformed_train_data = unknown_words_filtered_train_data
transformed_train_data

Unnamed: 0,ID,REVIEW,LABEL
0,0,"[not, person]",1
1,1,"[kakak, enak, layan, cepat, tanggap, murah, se...",5
2,2,"[layan, ramah]",5
3,3,"[nyaman, bersih]",5
4,4,"[layan, cepat, bagus, coba, durian, mantap]",5
...,...,...,...
897,897,"[jujur, pas, masuk, restoran, suasana, nyaman,...",5
898,898,"[makan, enak, layan, ramah, bagus, deh, suka, ...",5
899,899,"[area, buka, cocok, kalo, tongkrong, teman]",5
900,900,"[salah, favorit, layan, ramah, luas, sedia, ar...",5


In [7]:
# Gather the distinct tokens found in the REVIEW column of the frame produced
# by the stop-words filter stage (i.e. before UnknownWordsFilter was applied).
# The set is updated in place: `words.union(...)` would allocate a brand-new
# copy of the accumulated vocabulary for every row.
words = set()
for tokens in stop_words_filtered_train_data['REVIEW']:
    words.update(tokens)

# Pass the vocabulary to nlp.extract_unknown_words and display how many
# entries it returns.
unknown_words = nlp.extract_unknown_words(words)
len(unknown_words)

1539