In [2]:
import mlflow
from lib.constants import EXPERIMENT_NAME, MLFLOW_URI
from lib.dataset import load_train_data, load_test_data

# Make sure to have the MLFlow server on before running this code.
mlflow.set_tracking_uri(uri=MLFLOW_URI)
experiment = mlflow.set_experiment(EXPERIMENT_NAME)
train_data = load_train_data()
test_data = load_test_data()

2024/03/02 19:02:50 INFO mlflow.tracking.fluent: Experiment with name 'ml-olympiad-tfugsurabaya-2024' does not exist. Creating a new experiment.


## Download necessary language models

In [46]:
import nltk

nltk.download('punkt')
nltk.download('words')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/yoiqsram/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/yoiqsram/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/yoiqsram/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/yoiqsram/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## NLTK example usage on English

### Tokenize text

In [21]:
from nltk.tokenize import word_tokenize, sent_tokenize

text = train_data.REVIEW.iloc[0]

sentences = sent_tokenize(text)
print(sentences)

words = word_tokenize(text)
print(words)

['The waiter is not friendly, the person wears night guard glasses']
['The', 'waiter', 'is', 'not', 'friendly', ',', 'the', 'person', 'wears', 'night', 'guard', 'glasses']


### Filter stop words

In [9]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word.lower() not in stop_words]
print(filtered_words)

['waiter', 'friendly', ',', 'person', 'wears', 'night', 'guard', 'glasses']


### Stemming & Lemmatization

In [15]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

porter = PorterStemmer()
stemmed_words = [porter.stem(word) for word in filtered_words]
print(stemmed_words)

lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
print(lemmatized_words)

['waiter', 'friendli', ',', 'person', 'wear', 'night', 'guard', 'glass']
['waiter', 'friendly', ',', 'person', 'wear', 'night', 'guard', 'glass']


### Part-of-Speech Tagging

In [20]:
from nltk import pos_tag

pos_tags = pos_tag(lemmatized_words)
print(pos_tags)

[('waiter', 'NN'), ('friendly', 'RB'), (',', ','), ('person', 'NN'), ('wear', 'JJ'), ('night', 'NN'), ('guard', 'NN'), ('glass', 'NN')]


## Example of usage in Bahasa

In [25]:
from nltk.stem import PorterStemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

text = train_data.REVIEW.iloc[1]

sentences = sent_tokenize(text)
words = word_tokenize(text)
print(words)

# Filter stop words
stop_words = set(stopwords.words('indonesian'))
filtered_words = [word for word in words if word.lower() not in stop_words]
print(filtered_words)

# Stemming
porter = PorterStemmer()
stemmed_words = [porter.stem(word) for word in filtered_words]
print(stemmed_words)

# Lemmatization (using Sastrawi library)
factory = StemmerFactory()
stemmer = factory.create_stemmer()
stemmed_words_sastrawi = [stemmer.stem(word) for word in filtered_words]
print(stemmed_words_sastrawi)

['kakaknya', 'enak', 'bangetttt', ',', 'pelayanannya', 'cepet', 'tanggap', 'dan', 'yang', 'pertama', 'murah', 'senyum']
['kakaknya', 'enak', 'bangetttt', ',', 'pelayanannya', 'cepet', 'tanggap', 'murah', 'senyum']
['kakaknya', 'enak', 'bangetttt', ',', 'pelayanannya', 'cepet', 'tanggap', 'murah', 'senyum']
['kakak', 'enak', 'bangetttt', '', 'layan', 'cepet', 'tanggap', 'murah', 'senyum']


In [68]:
import pandas as pd
from lib.constants import PROJECT_DIR

collex = pd.read_table(PROJECT_DIR / 'data' / 'indo-collex' / 'informal_to_formal_dictionary.tsv')
collex

Unnamed: 0,informal,formal
0,0kmh,0 kmh
1,1007mb,1007 mb
2,1008mb,1008 mb
3,1009mb,1009 mb
4,100k,100 ribu
...,...,...
2618,yuuk,ayo
2619,yuukk,yuk
2620,yuuu,ayo
2621,yuuuk,ayo


In [88]:
collex_dict = {row['informal']: row['formal'] for _, row in collex.iloc[:10].iterrows()}
collex_dict

{'0kmh': '0 kmh',
 '1007mb': '1007 mb',
 '1008mb': '1008 mb',
 '1009mb': '1009 mb',
 '100k': '100 ribu',
 '1010mb': '1010 mb',
 '1011mb': '1011 mb',
 '1012mb': '1012 mb',
 '1017mb': '1017 mb',
 '1018mb': '1018 mb'}

In [114]:
with open(PROJECT_DIR / 'data' / 'sastrawi' / 'kata-dasar.txt') as f:
    words = set(f.read().splitlines())

print(len(words))

29932
