In [2]:
import os
import pickle
import re

import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

# MLflow configuration: where the tracking server lives and which experiment
# the runs in this notebook are recorded under.
TRACKING_URI = "http://127.0.0.1:8082"
EXPERIMENT_NAME = "/model-registry"

mlflow.set_tracking_uri(TRACKING_URI)
# Kept as the last expression so the cell displays the selected Experiment.
mlflow.set_experiment(EXPERIMENT_NAME)


<Experiment: artifact_location='mlflow-artifacts:/466190818076135293', creation_time=1720608017936, experiment_id='466190818076135293', last_update_time=1720608017936, lifecycle_stage='active', name='/model-registry', tags={}>

# Model Building and Experiment Tracking

In [3]:
def preprocess_text(text):
    """Normalise one raw review string for vectorisation.

    Each literal ``<br />`` tag is replaced by a space, then every character
    that is not an ASCII letter is replaced by a space, and finally the result
    is lower-cased.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The cleaned, lower-case string. Its length can differ from the input
        only where ``<br />`` tags were collapsed to a single space.
    """
    without_breaks = re.sub('<br />', ' ', text)
    letters_only = re.sub('[^a-zA-Z]', ' ', without_breaks)
    return letters_only.lower()

def preprocess_data(input_file):
    """Load the review CSV and build TF-IDF features plus binary labels.

    Args:
        input_file: Path to a CSV file containing a ``review`` text column and
            a ``sentiment`` column.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the matrix returned by
        ``TfidfVectorizer.fit_transform`` on the cleaned reviews (at most
        10,000 features) and ``y`` is a pandas Series holding 1 where
        ``sentiment == 'positive'`` and 0 otherwise.

    Side effects:
        Pickles the fitted vectorizer to ``models/tfidf_vectorizer.pkl``,
        creating the ``models`` directory first if it does not exist.
    """
    data = pd.read_csv(input_file)
    data['cleaned_review'] = data['review'].apply(preprocess_text)

    # NOTE(review): the vectorizer is fitted on the whole dataset, and the
    # train/test split happens later in the notebook, so the vocabulary and
    # IDF weights are influenced by rows that end up in the test set.
    vectorizer = TfidfVectorizer(max_features=10000)
    X = vectorizer.fit_transform(data['cleaned_review'])

    # Vectorised equivalent of mapping 'positive' -> 1 and anything else -> 0.
    y = (data['sentiment'] == 'positive').astype('int64')

    # Ensure the output directory exists so a fresh checkout does not fail
    # with FileNotFoundError when the vectorizer is written.
    os.makedirs('models', exist_ok=True)
    with open('models/tfidf_vectorizer.pkl', 'wb') as f:
        pickle.dump(vectorizer, f)
    return X, y

# Build the TF-IDF feature matrix and binary sentiment labels from the IMDB CSV.
X, y = preprocess_data(input_file='IMDB Dataset.csv')


In [4]:
# Train a logistic-regression sentiment classifier, log its parameters and
# test-set metrics, and register the fitted model, all within one MLflow run.
with mlflow.start_run() as run:
    # Fixed random_state keeps the 80/20 split reproducible across re-runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    params = {"max_iter": 100, "random_state": 42}
    model = LogisticRegression(**params)
    model.fit(X_train, y_train)

    # Log parameters
    mlflow.log_params(params)

    # Make predictions and log metrics
    y_pred = model.predict(X_test)
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1_score": f1_score(y_test, y_pred),
        # ROC AUC needs scores, so use the probability of the positive class.
        "roc_auc": roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    }
    mlflow.log_metrics(metrics)

    # Log the model and register it
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="logistic-regression-model",
        # A few rows are enough to record the input format; passing the full
        # training matrix would store the entire training set with the model.
        input_example=X_train[:5],
        registered_model_name="logistic-regression-sentiment-model"
    )

    # Save test data for later use; create the directory so this cell does
    # not depend on an earlier step having made it.
    os.makedirs('models', exist_ok=True)
    with open('models/X_test.pkl', 'wb') as f:
        pickle.dump(X_test, f)
    with open('models/y_test.pkl', 'wb') as f:
        pickle.dump(y_test, f)

    print("Model training and logging completed.")
    print("Run ID:", run.info.run_id)


Successfully registered model 'logistic-regression-sentiment-model'.
2024/07/25 01:36:59 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: logistic-regression-sentiment-model, version 1


Model training and logging completed.
Run ID: 5a267d3d081e462d8831d8b1a9d20dc9


Created version '1' of model 'logistic-regression-sentiment-model'.


In [5]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted binary classifier on held-out data and log the metrics.

    Args:
        model: A fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Feature matrix to evaluate on.
        y_test: True binary labels aligned with ``X_test``.

    Returns:
        A dict with the keys ``accuracy``, ``precision``, ``recall``,
        ``f1_score`` and ``roc_auc``.

    Side effects:
        Logs every metric through ``mlflow.log_metrics``. Callers should have
        an MLflow run active so the metrics are attached to the intended run.
    """
    y_pred = model.predict(X_test)
    # ROC AUC is computed from the score of the positive class (column 1).
    positive_scores = model.predict_proba(X_test)[:, 1]
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1_score": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, positive_scores)
    }
    # One batched call, matching how the training cell logs its metrics.
    mlflow.log_metrics(metrics)
    return metrics

# Load test data and model
# NOTE: pickle.load can execute arbitrary code; these files are expected to be
# the ones written by the training cell of this notebook.
with open('models/X_test.pkl', 'rb') as f:
    X_test = pickle.load(f)
with open('models/y_test.pkl', 'rb') as f:
    y_test = pickle.load(f)
# The registry version is pinned to 1; bump it if a newer version should be
# evaluated after re-training.
model = mlflow.sklearn.load_model("models:/logistic-regression-sentiment-model/1")

# Evaluate the model inside an explicit run. Logging without one makes MLflow
# open a run implicitly and leave it active, which then breaks a later
# `mlflow.start_run()` in this kernel.
with mlflow.start_run():
    metrics = evaluate_model(model, X_test, y_test)
print("Model evaluation completed:", metrics)


Downloading artifacts:   0%|          | 0/6 [00:00<?, ?it/s]

Model evaluation completed: {'accuracy': 0.8994, 'precision': 0.8913254414903939, 'recall': 0.9114903750744195, 'f1_score': 0.9012951334379906, 'roc_auc': 0.9641062962270623}
