In [1]:
from datasets import load_dataset

# Load the spam-detection dataset by its repository id.
DATASET_ID = "Deysi/spam-detection-dataset"
spam_detection_dataset = load_dataset(DATASET_ID)

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset parquet (C:/Users/yshen/.cache/huggingface/datasets/Deysi___parquet/Deysi--spam-detection-dataset-393b2a235e6c9981/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)
100%|██████████| 2/2 [00:00<00:00, 666.61it/s]


In [2]:
# Switch the dataset's output format to pandas for the cells that follow.
spam_detection_dataset.set_format('pandas')

In [3]:
# Take a full slice of the 'train' and 'test' splits, in that order.
train_df, test_df = (
    spam_detection_dataset[split_name][:] for split_name in ('train', 'test')
)

In [4]:
# Relative frequency of each label in the training split.
train_df['label'].value_counts(normalize=True)

label
spam        0.504587
not_spam    0.495413
Name: proportion, dtype: float64

In [5]:
# Relative frequency of each label in the test split.
test_df['label'].value_counts(normalize=True)

label
spam        0.504587
not_spam    0.495413
Name: proportion, dtype: float64

In [6]:
# Keep only a small, fixed-seed subsample of each split so that
# pre-processing and training stay fast.
SAMPLE_SEED = 10
train_df = train_df.sample(n=1500, random_state=SAMPLE_SEED)
test_df = test_df.sample(n=500, random_state=SAMPLE_SEED)

In [7]:
from joblib import dump

import os

# Write the sampled splits to CSV (without the index column).
# The target folder is created first because to_csv fails when the
# directory does not exist (e.g. on a fresh checkout).
os.makedirs('dataset', exist_ok=True)
train_df.to_csv('dataset/train_df.csv', index=False)
test_df.to_csv('dataset/test_df.csv', index=False)

In [8]:
from sentence_transformers import SentenceTransformer

# Load the model.
# No device is forced here: hard-coding 'cuda' fails on machines without
# a GPU, while leaving it unset lets SentenceTransformer use a GPU when one is
# available and fall back to CPU otherwise.
sentence_model = SentenceTransformer('all-mpnet-base-v2')

In [9]:
# Encode the training texts in batches of 32, showing a progress bar.
train_texts = train_df['text'].values
train_embeddings = sentence_model.encode(
    train_texts,
    batch_size=32,
    show_progress_bar=True,
)

Batches: 100%|██████████| 47/47 [00:08<00:00,  5.64it/s]


In [10]:
# Encode the test texts in batches of 32, showing a progress bar.
test_texts = test_df['text'].values
test_embeddings = sentence_model.encode(
    test_texts,
    batch_size=32,
    show_progress_bar=True,
)

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches: 100%|██████████| 16/16 [00:02<00:00,  6.51it/s]


In [11]:
import os

# Persist both embedding arrays with joblib.
# The output folder is created first so the dump does not fail when the
# directory is missing.
os.makedirs('embeddings', exist_ok=True)
dump(train_embeddings, 'embeddings/train_embeddings.joblib')
dump(test_embeddings, 'embeddings/test_embeddings.joblib')

['embeddings/test_embeddings.joblib']

In [12]:
# import randomforestclassifier from sklearn
from sklearn.ensemble import RandomForestClassifier

In [13]:
# Baseline random forest with default hyperparameters.
# The seed is fixed so the metrics reported below are reproducible on re-run.
clf = RandomForestClassifier(random_state=10)

In [14]:
# Fit the baseline classifier on the training embeddings and their labels.
clf.fit(X=train_embeddings, y=train_df['label'])

In [15]:
# Predicted labels for the test embeddings.
predict = clf.predict(X=test_embeddings)

In [16]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true test labels against the baseline predictions.
confusion_matrix(y_true=test_df['label'], y_pred=predict)

array([[246,   2],
       [  0, 252]], dtype=int64)

In [17]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the baseline predictions.
baseline_report = classification_report(y_true=test_df['label'], y_pred=predict)
print(baseline_report)

              precision    recall  f1-score   support

    not_spam       1.00      0.99      1.00       248
        spam       0.99      1.00      1.00       252

    accuracy                           1.00       500
   macro avg       1.00      1.00      1.00       500
weighted avg       1.00      1.00      1.00       500



In [18]:
from mlflow.tracking import MlflowClient


MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
MLFLOW_EXPERIMENT_NAME = "spam-detection-experiment"
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

# Create the experiment only when it does not exist yet. create_experiment
# raises if the name is already taken, which would break every re-run of this
# notebook against the same tracking database.
existing_experiment = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)
if existing_experiment is None:
    spam_experiment_id = client.create_experiment(name=MLFLOW_EXPERIMENT_NAME)
else:
    spam_experiment_id = existing_experiment.experiment_id

# Display the experiment id, as the bare create_experiment call did.
spam_experiment_id

2023/07/22 18:02:45 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2023/07/22 18:02:45 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

'1'

In [19]:
import optuna
import mlflow
from optuna.integration.mlflow import MLflowCallback

# Configure the fluent MLflow API with the tracking URI and experiment name
# defined above; the last call's return value is displayed as the cell output.
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name=MLFLOW_EXPERIMENT_NAME)

<Experiment: artifact_location='file:///e:/Projects/Git/mlops_project/mlruns/1', creation_time=1690020166075, experiment_id='1', last_update_time=1690020166075, lifecycle_stage='active', name='spam-detection-experiment', tags={}>

In [20]:
from sklearn.metrics import accuracy_score

def objective(trial):
    """Optuna objective: train a random forest and return its accuracy.

    Two hyperparameters are sampled on a log scale from ``trial``:
    ``rf_max_depth`` in [2, 32] and ``rf_n_estimators`` in [5, 100].
    The forest is fitted on ``train_embeddings`` / ``train_df['label']`` and
    scored on ``test_embeddings`` / ``test_df['label']``.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The accuracy of the fitted forest on the test embeddings.
    """
    rf_max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
    rf_n_estimators = trial.suggest_int("rf_n_estimators", 5, 100, log=True)

    # Fixed seed: differences between trials then come from the
    # hyperparameters only, not from random forest initialisation.
    trial_clf = RandomForestClassifier(
        max_depth=rf_max_depth,
        n_estimators=rf_n_estimators,
        random_state=10,
    )
    trial_clf.fit(train_embeddings, train_df['label'])

    # NOTE(review): the score is computed on the test split, which is also used
    # for the final evaluation - consider a separate validation split.
    predictions = trial_clf.predict(test_embeddings)
    return accuracy_score(test_df['label'], predictions)

# Callback that records each trial in MLflow, storing the objective value
# under the metric name 'accuracy'.
mlflc = MLflowCallback(
    tracking_uri=MLFLOW_TRACKING_URI,
    metric_name='accuracy',
    create_experiment=False,
)

# Seed the sampler so the same sequence of hyperparameters is proposed on
# every run; without a seed the search is not reproducible.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=10),
)
study.optimize(objective, n_trials=100, callbacks=[mlflc])

  mlflc = MLflowCallback(
[I 2023-07-22 18:02:46,213] A new study created in memory with name: no-name-951a08eb-2e4f-4569-9d91-7c1e60b9fd5e
[I 2023-07-22 18:02:46,397] Trial 0 finished with value: 0.98 and parameters: {'rf_max_depth': 6, 'rf_n_estimators': 7}. Best is trial 0 with value: 0.98.
[I 2023-07-22 18:02:47,150] Trial 1 finished with value: 0.988 and parameters: {'rf_max_depth': 2, 'rf_n_estimators': 61}. Best is trial 1 with value: 0.988.
[I 2023-07-22 18:02:47,536] Trial 2 finished with value: 0.98 and parameters: {'rf_max_depth': 23, 'rf_n_estimators': 11}. Best is trial 1 with value: 0.988.
[I 2023-07-22 18:02:48,610] Trial 3 finished with value: 0.996 and parameters: {'rf_max_depth': 20, 'rf_n_estimators': 32}. Best is trial 3 with value: 0.996.
[I 2023-07-22 18:02:51,350] Trial 4 finished with value: 1.0 and parameters: {'rf_max_depth': 26, 'rf_n_estimators': 85}. Best is trial 4 with value: 1.0.
[I 2023-07-22 18:02:51,803] Trial 5 finished with value: 0.992 and paramete

In [21]:
# Best objective value found by the study (the study maximizes accuracy).
study.best_value

1.0

In [22]:
# Search for runs in the experiment
# Get based on the best trial value with the lowest n_estimators
from mlflow.entities import ViewType

spam_detection_experiment=dict(mlflow.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME))
experiment_id=spam_detection_experiment['experiment_id']

# Fetch every active run that reached the best accuracy. The ordering is done
# in Python below: MLflow stores params as strings, so an order_by on
# rf_n_estimators sorts lexicographically ("100" < "5") and does not yield
# the numerically smallest forest.
top_runs = client.search_runs(
    experiment_ids=[experiment_id],
    filter_string=f'metrics.accuracy = {study.best_value}',
    run_view_type=ViewType.ACTIVE_ONLY,
)

# Ignore runs that did not log the parameter we rank on.
top_runs = [run for run in top_runs if 'rf_n_estimators' in run.data.params]
if not top_runs:
    raise RuntimeError(
        f'No MLflow run with accuracy {study.best_value} found in experiment {experiment_id}'
    )

# Among the best-scoring runs, prefer the one with the fewest trees.
best_run = min(top_runs, key=lambda run: int(run.data.params['rf_n_estimators']))


In [23]:
import os

# Hyperparameters of the selected run (MLflow returns params as strings).
best_max_depth = int(best_run.data.params['rf_max_depth'])
best_n_estimators = int(best_run.data.params['rf_n_estimators'])

# Retrain with the selected hyperparameters; the seed is fixed so that
# retraining is reproducible.
best_clf = RandomForestClassifier(
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    random_state=10,
)
best_clf.fit(train_embeddings, train_df['label'])

best_predictions = best_clf.predict(test_embeddings)
accuracy = accuracy_score(test_df['label'], best_predictions)

# Save a local copy of the model; create the folder first so dump cannot fail
# on a missing directory.
os.makedirs('models', exist_ok=True)
dump(best_clf, 'models/best_clf.joblib')

# Log inside an explicit run that is closed on exit. Logging without one opens
# an implicit run that stays active, so re-running this cell would write into
# the same run and MLflow rejects changed values for already-logged params.
with mlflow.start_run():
    mlflow.log_params({'rf_max_depth': best_max_depth, 'rf_n_estimators': best_n_estimators})
    mlflow.log_metric("accuracy", accuracy)
    model_info = mlflow.sklearn.log_model(best_clf, artifact_path="models", registered_model_name='spam-detector')

# Display the logged model's info, as the bare log_model call did.
model_info

Successfully registered model 'best-spam-detector'.
2023/07/22 18:06:05 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: best-spam-detector, version 1
Created version '1' of model 'best-spam-detector'.


<mlflow.models.model.ModelInfo at 0x1f412e780d0>

In [None]:
t