In [20]:
import mlflow
from lib.constants import PROJECT_DIR, EXPERIMENT_NAME, MLFLOW_URI
from lib.dataset import load_train_data, load_test_data

# MLflow tracking is currently disabled: the two calls below are commented out,
# so this cell never configures a tracking URI or experiment.
# If they are re-enabled, the MLflow server must be running before this cell executes.
# mlflow.set_tracking_uri(uri=MLFLOW_URI)
# experiment = mlflow.set_experiment(EXPERIMENT_NAME)
# Load the data used by the later cells: (X_train, y_train) are passed to
# fit()/score(), and X_test is passed to predict() for the submission.
# NOTE(review): the container types returned by the loaders are not visible here — confirm in lib.dataset.
X_train, y_train = load_train_data()
X_test = load_test_data()

In [16]:
import pandas as pd
from lib.sklearn.preprocess import nlp
from sklearn.pipeline import Pipeline

# Custom word-replacement table handed to nlp.WordsMapper in the pipeline below:
# each value in the 'asal' column maps to the 'tujuan' value on the same row.
# The two columns are zipped directly instead of looping with
# DataFrame.iterrows(), which materialises a Series per row and is slow.
# If an 'asal' value appears more than once, the last row still wins.
custom_map_frame = pd.read_csv('custom-mapper.csv')
custom_map = dict(zip(custom_map_frame['asal'], custom_map_frame['tujuan']))
# Ordered text-cleaning stages; each (name, transformer) pair runs in sequence
# on the output of the previous one.
preprocess_steps = [
    ('tokenizer', nlp.TextTokenizer()),
    ('formalizer', nlp.WordsFormalizer()),
    ('custom_mapper', nlp.WordsMapper(custom_map)),
    ('lemmatization', nlp.WordsLemmatization()),
    ('special_char_filter', nlp.SpecialCharacterFilter()),
    # Disabled step, kept for reference at its original position:
    # ('stop_words_filter', nlp.StopWordsFilter()),
    ('unknown_words_filter', nlp.UnknownWordsFilter()),
]
preprocess_pipeline = Pipeline(steps=preprocess_steps)

# Fit the stages on the training inputs, then preview the first three results.
X_train_transformed = preprocess_pipeline.fit_transform(X_train)
X_train_transformed[:3]

[['layan', 'adalah', 'tidak', 'sahabat', 'person', 'malam', 'jaga', 'gelas'],
 ['kakak',
  'enak',
  'sangat',
  'layan',
  'cepat',
  'tanggap',
  'dan',
  'yang',
  'pertama',
  'murah',
  'senyum'],
 ['layan', 'sangat', 'ramah', 'banyak', 'promosi']]

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier

# Fixed seed for the SGD classifier. SGD shuffles the training data between
# epochs, so without a seed the fitted model, the reported score and the
# submission file all change from one run of the notebook to the next.
SGD_RANDOM_STATE = 42

# Prediction stages: token lists -> text -> TF-IDF features -> linear classifier.
# NOTE(review): TokenToTextTransformer presumably joins each token list back
# into one string for TfidfVectorizer — confirm in lib.sklearn.preprocess.nlp.
predictor_pipeline = Pipeline([
    ('token_to_text', nlp.TokenToTextTransformer()),
    ('tfidf_vectorizer', TfidfVectorizer()),
    ('classifier', SGDClassifier(random_state=SGD_RANDOM_STATE))
])

# End-to-end model: the preprocessing stages followed by the predictor stages,
# so it can be fitted on, and predict from, the raw inputs directly.
end_to_end_steps = [
    ('preprocessor', preprocess_pipeline),
    ('predictor', predictor_pipeline),
]
model_pipeline = Pipeline(steps=end_to_end_steps)

# fit() returns the pipeline itself, so the score call is chained onto it.
# The score is computed on the same data the model was fitted on, so it is a
# training score, not an estimate of performance on unseen data.
model_pipeline.fit(X_train, y_train).score(X_train, y_train)

0.9822616407982262

In [21]:
# Predict a label for every test row and write the submission file.
predictions = model_pipeline.predict(X_test)
# IDs are the row positions 0..n-1 in prediction order. The builtin range is
# used instead of np.arange because no visible cell imports numpy as `np`, so
# the original line raised NameError on a fresh kernel (Restart & Run All).
submission = pd.DataFrame({'ID': range(len(predictions)), 'LABEL': predictions})
display(submission)
# index=False keeps the DataFrame index out of the CSV; only ID and LABEL are written.
submission.to_csv('test_submission.csv', index=False)

Unnamed: 0,ID,LABEL
0,0,5
1,1,5
2,2,5
3,3,1
4,4,3
...,...,...
495,495,3
496,496,5
497,497,4
498,498,4
