In [1]:
import mlflow
from lib.constants import PROJECT_DIR, EXPERIMENT_NAME, MLFLOW_URI
from lib.dataset import load_train_data, load_test_data

# Requires a running MLflow tracking server reachable at MLFLOW_URI;
# start it before executing this cell.
mlflow.set_tracking_uri(MLFLOW_URI)
experiment = mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

# The training loader yields a (features, labels) pair; the test loader
# yields a single object, used later as the prediction input.
X_train, y_train = load_train_data()
X_test = load_test_data()

## Preprocessing

### Basic NLP Preprocessing

In [2]:
import pandas as pd
from lib.sklearn.preprocess import nlp
from sklearn.pipeline import Pipeline

# Word-replacement table read from custom-mapper.csv: each value in the
# 'asal' column maps to the 'tujuan' value of the same row.
# The two columns are zipped directly instead of walking
# DataFrame.iterrows(), which builds a Series per row. As before, when an
# 'asal' value is repeated the last row wins.
custom_map_table = pd.read_csv('custom-mapper.csv')
custom_map = dict(zip(custom_map_table['asal'], custom_map_table['tujuan']))

# Token-level cleaning chain; steps run in the order listed.
# NOTE(review): the behaviour of each nlp.* step is defined in
# lib.sklearn.preprocess and is not visible from this notebook.
preprocess_pipeline = Pipeline([
    ('tokenizer', nlp.TextTokenizer()),
    ('formalizer', nlp.WordsFormalizer()),
    ('custom_mapper', nlp.WordsMapper(custom_map)),
    ('lemmatization', nlp.WordsLemmatization()),
    ('special_char_filter', nlp.SpecialCharacterFilter()),
    # Stop-word filtering is currently disabled.
    # ('stop_words_filter', nlp.StopWordsFilter()),
    ('unknown_words_filter', nlp.UnknownWordsFilter())
])

# Fit on the training texts and preview the first three transformed samples.
X_train_transformed = preprocess_pipeline.fit_transform(X_train)
X_train_transformed[:3]

[['layan', 'adalah', 'tidak', 'sahabat', 'person', 'malam', 'jaga', 'gelas'],
 ['kakak',
  'enak',
  'sangat',
  'layan',
  'cepat',
  'tanggap',
  'dan',
  'yang',
  'pertama',
  'murah',
  'senyum'],
 ['layan', 'sangat', 'ramah', 'banyak', 'promosi']]

### Label distribution

In [3]:
import numpy as np

# Number of training samples per label.
train_labels, train_label_counts = np.unique(y_train, return_counts=True)
y_counts = dict(zip(train_labels, train_label_counts))
display(y_counts)

# Share of the training set taken by each label.
n_train = len(y_train)
y_props = {}
for label, count in y_counts.items():
    y_props[label] = count / n_train
display(y_props)

# Inverse-frequency weight per label (rarer labels get larger weights).
y_weight = {}
for label, count in y_counts.items():
    y_weight[label] = 1 / count
display(y_weight)

{1: 157, 2: 41, 3: 46, 4: 101, 5: 557}

{1: 0.17405764966740578,
 2: 0.045454545454545456,
 3: 0.050997782705099776,
 4: 0.11197339246119734,
 5: 0.6175166297117517}

{1: 0.006369426751592357,
 2: 0.024390243902439025,
 3: 0.021739130434782608,
 4: 0.009900990099009901,
 5: 0.0017953321364452424}

### Transformation and Modeling

#### SGDClassifier

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import RepeatedKFold, cross_val_score

# 5-fold cross-validation repeated 3 times; the fixed seed means every
# configuration below is scored on the same splits.
repeated_kfold = RepeatedKFold(n_repeats=3, n_splits=5, random_state=99)

# Settings held constant across the whole sweep.
control_params = dict(max_iter=2500, penalty='l2', shuffle=True, random_state=99)

# Every loss is tried under both learning-rate schedules: first all losses
# with 'optimal', then all losses with 'adaptive' (which starts from eta0).
var_params_options = [
    {'loss': loss, **schedule}
    for schedule in (
        {'learning_rate': 'optimal'},
        {'learning_rate': 'adaptive', 'eta0': 0.01},
    )
    for loss in ('hinge', 'log_loss', 'modified_huber', 'perceptron')
]

for var_params in var_params_options:
    # Fresh dict per run so the shared control settings are never mutated.
    params = {**control_params, **var_params}

    predictor_pipeline = Pipeline(steps=[
        ('token_to_text', nlp.TokenToTextTransformer()),
        ('tfidf_vectorizer', TfidfVectorizer()),
        ('classifier', SGDClassifier(**params)),
    ])
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocess_pipeline),
        ('predictor', predictor_pipeline),
    ])

    with mlflow.start_run() as run:
        # The 'model' tag is added only after the estimator was built, so it
        # is logged with the run but never passed to the constructor.
        params['model'] = 'SGDClassifier'
        mlflow.log_params(params)

        scores = cross_val_score(
            model_pipeline, X_train, y_train,
            scoring='f1_macro', cv=repeated_kfold, n_jobs=-1,
        )
        score = scores.mean()
        mlflow.log_metric('f1_macro', score)

        # cross_val_score only fits clones, so fit the pipeline itself on the
        # full training set before storing it as the run's 'model' artifact.
        model_pipeline.fit(X_train, y_train)
        mlflow.sklearn.log_model(model_pipeline, 'model')

        last_run_id = run.info.run_id

#### RandomForestClassifier

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedKFold, cross_val_score

# 5-fold cross-validation repeated 3 times; the fixed seed means every
# configuration below is scored on the same splits.
repeated_kfold = RepeatedKFold(n_repeats=3, n_splits=5, random_state=99)

# Settings held constant across the whole sweep.
control_params = dict(random_state=99, n_jobs=-1)

# Full grid of class_weight x criterion x n_estimators, with n_estimators
# varying fastest and class_weight slowest.
var_params_options = [
    {'n_estimators': n_estimators, 'criterion': criterion, 'class_weight': class_weight}
    for class_weight in ('balanced', 'balanced_subsample')
    for criterion in ('gini', 'log_loss')
    for n_estimators in (50, 100, 150, 200)
]

for var_params in var_params_options:
    # Fresh dict per run so the shared control settings are never mutated.
    params = {**control_params, **var_params}

    predictor_pipeline = Pipeline(steps=[
        ('token_to_text', nlp.TokenToTextTransformer()),
        ('tfidf_vectorizer', TfidfVectorizer()),
        ('classifier', RandomForestClassifier(**params)),
    ])
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocess_pipeline),
        ('predictor', predictor_pipeline),
    ])

    with mlflow.start_run() as run:
        # The 'model' tag is added only after the estimator was built, so it
        # is logged with the run but never passed to the constructor.
        params['model'] = 'RandomForestClassifier'
        mlflow.log_params(params)

        scores = cross_val_score(
            model_pipeline, X_train, y_train,
            scoring='f1_macro', cv=repeated_kfold, n_jobs=-1,
        )
        score = scores.mean()
        mlflow.log_metric('f1_macro', score)

        # cross_val_score only fits clones, so fit the pipeline itself on the
        # full training set before storing it as the run's 'model' artifact.
        model_pipeline.fit(X_train, y_train)
        mlflow.sklearn.log_model(model_pipeline, 'model')

        last_run_id = run.info.run_id

#### RandomForestRegressor

In [7]:
from lib.sklearn.model import RegressionClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold, cross_val_score

# 5-fold cross-validation repeated 3 times; the fixed seed means every
# configuration below is scored on the same splits.
repeated_kfold = RepeatedKFold(n_repeats=3, n_splits=5, random_state=99)

# Settings held constant across the whole sweep.
control_params = dict(random_state=99, n_jobs=-1)

# Grid of criterion x n_estimators, with n_estimators varying fastest.
var_params_options = [
    {'n_estimators': n_estimators, 'criterion': criterion}
    for criterion in ('squared_error', 'friedman_mse')
    for n_estimators in (50, 100, 150, 200)
]

for var_params in var_params_options:
    # Fresh dict per run so the shared control settings are never mutated.
    params = {**control_params, **var_params}

    # NOTE(review): RegressionClassifier is a project wrapper; presumably it
    # turns the regressor's output into a label within [min, max] - confirm
    # in lib.sklearn.model.
    wrapped_regressor = RegressionClassifier(RandomForestRegressor(**params), min=1, max=5)
    predictor_pipeline = Pipeline(steps=[
        ('token_to_text', nlp.TokenToTextTransformer()),
        ('tfidf_vectorizer', TfidfVectorizer()),
        ('regressor', wrapped_regressor),
    ])
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocess_pipeline),
        ('predictor', predictor_pipeline),
    ])

    with mlflow.start_run() as run:
        # The 'model' tag is added only after the estimator was built, so it
        # is logged with the run but never passed to the constructor.
        params['model'] = 'RandomForestRegressor'
        mlflow.log_params(params)

        scores = cross_val_score(
            model_pipeline, X_train, y_train,
            scoring='f1_macro', cv=repeated_kfold, n_jobs=-1,
        )
        score = scores.mean()
        mlflow.log_metric('f1_macro', score)

        # cross_val_score only fits clones, so fit the pipeline itself on the
        # full training set before storing it as the run's 'model' artifact.
        model_pipeline.fit(X_train, y_train)
        mlflow.sklearn.log_model(model_pipeline, 'model')

        last_run_id = run.info.run_id

#### ExtraTreesClassifier

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RepeatedKFold, cross_val_score

# 5-fold cross-validation repeated 3 times; the fixed seed means every
# configuration below is scored on the same splits.
repeated_kfold = RepeatedKFold(n_repeats=3, n_splits=5, random_state=99)

# Settings held constant across the whole sweep.
control_params = dict(random_state=99, n_jobs=-1)

# Full grid of class_weight x criterion x n_estimators, with n_estimators
# varying fastest and class_weight slowest.
var_params_options = [
    {'n_estimators': n_estimators, 'criterion': criterion, 'class_weight': class_weight}
    for class_weight in ('balanced', 'balanced_subsample')
    for criterion in ('gini', 'log_loss')
    for n_estimators in (50, 100, 150, 200)
]

for var_params in var_params_options:
    # Fresh dict per run so the shared control settings are never mutated.
    params = {**control_params, **var_params}

    predictor_pipeline = Pipeline(steps=[
        ('token_to_text', nlp.TokenToTextTransformer()),
        ('tfidf_vectorizer', TfidfVectorizer()),
        ('classifier', ExtraTreesClassifier(**params)),
    ])
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocess_pipeline),
        ('predictor', predictor_pipeline),
    ])

    with mlflow.start_run() as run:
        # The 'model' tag is added only after the estimator was built, so it
        # is logged with the run but never passed to the constructor.
        params['model'] = 'ExtraTreesClassifier'
        mlflow.log_params(params)

        scores = cross_val_score(
            model_pipeline, X_train, y_train,
            scoring='f1_macro', cv=repeated_kfold, n_jobs=-1,
        )
        score = scores.mean()
        mlflow.log_metric('f1_macro', score)

        # cross_val_score only fits clones, so fit the pipeline itself on the
        # full training set before storing it as the run's 'model' artifact.
        model_pipeline.fit(X_train, y_train)
        mlflow.sklearn.log_model(model_pipeline, 'model')

        last_run_id = run.info.run_id

#### MLPClassifier

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RepeatedKFold, cross_val_score

# 5-fold cross-validation repeated 3 times; the fixed seed means every
# configuration below is scored on the same splits.
repeated_kfold = RepeatedKFold(n_repeats=3, n_splits=5, random_state=99)

# Settings held constant across the whole sweep.
control_params = dict(max_iter=1000, random_state=99)

# Every hidden-layer layout is tried with each activation (all 'relu' runs
# first, then all 'logistic' runs); the solver is always 'adam'.
var_params_options = [
    {'hidden_layer_sizes': hidden_layer_sizes, 'activation': activation, 'solver': 'adam'}
    for activation in ('relu', 'logistic')
    for hidden_layer_sizes in (
        (64,),
        (128,),
        (64, 16),
        (64, 32),
        (128, 16),
        (128, 32),
        (64, 32, 8),
        (128, 32, 8),
        (128, 64, 8),
        (128, 64, 16),
        (128, 32, 16, 8),
        (128, 64, 16, 8),
        (128, 64, 32, 8),
        (128, 64, 16, 16, 8),
    )
]

for var_params in var_params_options:
    # Fresh dict per run so the shared control settings are never mutated.
    params = {**control_params, **var_params}

    predictor_pipeline = Pipeline(steps=[
        ('token_to_text', nlp.TokenToTextTransformer()),
        ('tfidf_vectorizer', TfidfVectorizer()),
        ('classifier', MLPClassifier(**params)),
    ])
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocess_pipeline),
        ('predictor', predictor_pipeline),
    ])

    with mlflow.start_run() as run:
        # The 'model' tag is added only after the estimator was built, so it
        # is logged with the run but never passed to the constructor.
        params['model'] = 'MLPClassifier'
        mlflow.log_params(params)

        scores = cross_val_score(
            model_pipeline, X_train, y_train,
            scoring='f1_macro', cv=repeated_kfold, n_jobs=-1,
        )
        score = scores.mean()
        mlflow.log_metric('f1_macro', score)

        # cross_val_score only fits clones, so fit the pipeline itself on the
        # full training set before storing it as the run's 'model' artifact.
        model_pipeline.fit(X_train, y_train)
        mlflow.sklearn.log_model(model_pipeline, 'model')

        last_run_id = run.info.run_id

#### MLPRegressor

In [18]:
from lib.sklearn.model import RegressionClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import RepeatedKFold, cross_val_score

# 5-fold cross-validation repeated 3 times; the fixed seed means every
# configuration below is scored on the same splits.
repeated_kfold = RepeatedKFold(n_repeats=3, n_splits=5, random_state=99)

# Settings held constant across the whole sweep.
control_params = dict(max_iter=1000, random_state=99)

# Every hidden-layer layout is tried with each activation (all 'relu' runs
# first, then all 'logistic' runs); the solver is always 'adam'.
var_params_options = [
    {'hidden_layer_sizes': hidden_layer_sizes, 'activation': activation, 'solver': 'adam'}
    for activation in ('relu', 'logistic')
    for hidden_layer_sizes in (
        (64,),
        (128,),
        (64, 16),
        (64, 32),
        (128, 16),
        (128, 32),
        (64, 32, 8),
        (128, 32, 8),
        (128, 64, 8),
        (128, 64, 16),
        (128, 32, 16, 8),
        (128, 64, 16, 8),
        (128, 64, 32, 8),
        (128, 64, 16, 16, 8),
    )
]

for var_params in var_params_options:
    # Fresh dict per run so the shared control settings are never mutated.
    params = {**control_params, **var_params}

    # NOTE(review): RegressionClassifier is a project wrapper; presumably it
    # turns the regressor's output into a label within [min, max] - confirm
    # in lib.sklearn.model.
    wrapped_regressor = RegressionClassifier(MLPRegressor(**params), min=1, max=5)
    predictor_pipeline = Pipeline(steps=[
        ('token_to_text', nlp.TokenToTextTransformer()),
        ('tfidf_vectorizer', TfidfVectorizer()),
        ('classifier', wrapped_regressor),
    ])
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocess_pipeline),
        ('predictor', predictor_pipeline),
    ])

    with mlflow.start_run() as run:
        # The 'model' tag is added only after the estimator was built, so it
        # is logged with the run but never passed to the constructor.
        params['model'] = 'MLPRegressor'
        mlflow.log_params(params)

        scores = cross_val_score(
            model_pipeline, X_train, y_train,
            scoring='f1_macro', cv=repeated_kfold, n_jobs=-1,
        )
        score = scores.mean()
        mlflow.log_metric('f1_macro', score)

        # cross_val_score only fits clones, so fit the pipeline itself on the
        # full training set before storing it as the run's 'model' artifact.
        model_pipeline.fit(X_train, y_train)
        mlflow.sklearn.log_model(model_pipeline, 'model')

        last_run_id = run.info.run_id



#### MLPRegressor -> SGDClassifier

In [4]:
from lib.sklearn.model import RegressionExtractor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import RepeatedKFold, cross_val_score

# 5-fold cross-validation repeated 3 times; the fixed seed means every
# configuration below is scored on the same splits.
repeated_kfold = RepeatedKFold(n_splits=5, n_repeats=3, random_state=99)

# Settings held constant across the whole sweep (applied to the MLPRegressor).
control_params = {
    'max_iter': 1000,
    'random_state': 99
}

# Hidden-layer layouts to try; activation and solver are fixed to relu/adam.
var_params_options = [
    {'hidden_layer_sizes': (64,), 'activation': 'relu', 'solver': 'adam'},
    {'hidden_layer_sizes': (128,), 'activation': 'relu', 'solver': 'adam'},
    {'hidden_layer_sizes': (64, 16), 'activation': 'relu', 'solver': 'adam'},
    {'hidden_layer_sizes': (64, 32), 'activation': 'relu', 'solver': 'adam'},
    {'hidden_layer_sizes': (128, 16), 'activation': 'relu', 'solver': 'adam'},
    {'hidden_layer_sizes': (128, 32), 'activation': 'relu', 'solver': 'adam'},
    {'hidden_layer_sizes': (64, 32, 8), 'activation': 'relu', 'solver': 'adam'},
    {'hidden_layer_sizes': (128, 32, 8), 'activation': 'relu', 'solver': 'adam'},
    {'hidden_layer_sizes': (128, 64, 8), 'activation': 'relu', 'solver': 'adam'},
    {'hidden_layer_sizes': (128, 64, 16), 'activation': 'relu', 'solver': 'adam'},
    {'hidden_layer_sizes': (128, 32, 16, 8), 'activation': 'relu', 'solver': 'adam'},
    {'hidden_layer_sizes': (128, 64, 16, 8), 'activation': 'relu', 'solver': 'adam'},
    {'hidden_layer_sizes': (128, 64, 32, 8), 'activation': 'relu', 'solver': 'adam'},
    {'hidden_layer_sizes': (128, 64, 16, 16, 8), 'activation': 'relu', 'solver': 'adam'}
]

for var_params in var_params_options:
    # Fresh dict per run so the shared control settings are never mutated.
    params = control_params.copy()
    params.update(var_params)

    predictor_pipeline = Pipeline([
        ('token_to_text', nlp.TokenToTextTransformer()),
        ('tfidf_vectorizer', TfidfVectorizer()),
        # NOTE(review): RegressionExtractor is a project wrapper; presumably it
        # exposes the regressor's output as features for the next step -
        # confirm in lib.sklearn.model.
        ('regressor', RegressionExtractor(MLPRegressor(**params))),
        # Seeded with the same random_state as the regressor and the CV
        # splitter. SGDClassifier shuffles the training data each epoch, so
        # an unseeded instance makes the logged f1_macro and the stored model
        # differ from one execution to the next.
        ('classifier', SGDClassifier(random_state=control_params['random_state'])),
    ])
    model_pipeline = Pipeline([
        ('preprocessor', preprocess_pipeline),
        ('predictor', predictor_pipeline)
    ])

    with mlflow.start_run() as run:
        # The 'model' tag is added only after the estimators were built, so it
        # is logged with the run but never passed to a constructor.
        params['model'] = 'MLPRegressor-SGDClassifier'
        mlflow.log_params(params)

        scores = cross_val_score(
            model_pipeline,
            X_train,
            y_train,
            cv=repeated_kfold,
            scoring='f1_macro',
            n_jobs=-1
        )
        score = np.mean(scores)
        mlflow.log_metric('f1_macro', score)

        # cross_val_score only fits clones, so fit the pipeline itself on the
        # full training set before storing it as the run's 'model' artifact.
        model_pipeline.fit(X_train, y_train)
        mlflow.sklearn.log_model(model_pipeline, 'model')

        last_run_id = run.info.run_id



#### Load preferred model

In [8]:
# MLflow run whose logged 'model' artifact is used for the final predictions.
chosen_run_id = '3300ff72bca545dd8b38e2deef87b694'

# Resolve the artifact through the tracking server configured earlier.
model_uri = f'runs:/{chosen_run_id}/model'
model_pipeline = mlflow.sklearn.load_model(model_uri)

Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 120.18it/s]




### Inference on test data

In [9]:
# Predict a label for every test sample with the loaded pipeline.
predictions = model_pipeline.predict(X_test)

# Submission table: a 0-based running ID next to each predicted label.
submission_columns = {
    'ID': np.arange(len(predictions)),
    'LABEL': predictions,
}
submission = pd.DataFrame(submission_columns)
display(submission)

# Written without the DataFrame index so the file holds only ID and LABEL.
submission.to_csv('test_submission.csv', index=False)

Unnamed: 0,ID,LABEL
0,0,5
1,1,5
2,2,5
3,3,1
4,4,1
...,...,...
495,495,3
496,496,5
497,497,4
498,498,1


#### Check predicted label distribution

In [10]:
# Number of test predictions per label.
predicted_labels, predicted_label_counts = np.unique(predictions, return_counts=True)
prediction_counts = dict(zip(predicted_labels, predicted_label_counts))
display(prediction_counts)

# Share of all predictions taken by each label, for comparison with the
# training label proportions.
n_predictions = len(predictions)
prediction_props = {}
for label, count in prediction_counts.items():
    prediction_props[label] = count / n_predictions
display(prediction_props)

{1: 84, 2: 12, 3: 18, 4: 54, 5: 332}

{1: 0.168, 2: 0.024, 3: 0.036, 4: 0.108, 5: 0.664}