In [5]:
from text_loader.loader import DataLoader
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import joblib
import mlflow
import datetime

### Pre-processing

In [10]:
data_loader = DataLoader(filepath='../data/Tweets.csv')
processed_features, labels = data_loader.preprocess()

labels = labels - 1

X_train, X_test, y_train, y_test = train_test_split(processed_features, labels, test_size=0.3)

### Train & evaluate

In [38]:
mlflow.set_tracking_uri('file:///Users/cullywest/git/MLEng-politicalparties-python/data') 
mlflow.set_experiment("MLflow")
with mlflow.start_run():
        model = XGBClassifier(booster='gbtree', max_depth=6, max_leaves=0)
        model.fit(X_train, y_train)

        train_accuracy = accuracy_score(y_train, model.predict(X_train))
        accuracy = accuracy_score(y_test, model.predict(X_test))

        print("  train_accuracy: %s" % train_accuracy)
        print("  accuracy: %s" % accuracy)

        params = model.get_xgb_params()

        for param in params.keys():
            mlflow.log_param(param, params[param])

        mlflow.log_metric("train_acc", train_accuracy)
        mlflow.log_metric("test_acc", accuracy)

        # Log the model with a timestamp
        timestamp = datetime.datetime.now()
        model_name = f"model_{timestamp}"
        mlflow.sklearn.log_model(model, artifact_path="test_path", registered_model_name=model_name)

  train_accuracy: 0.737252569313638
  accuracy: 0.6972897952889472


Successfully registered model 'model_2024-03-19 11:39:00.483712'.
Created version '1' of model 'model_2024-03-19 11:39:00.483712'.


In [11]:
print(labels)

[1 0 0 ... 1 1 1]


In [13]:
import mlflow.pyfunc

In [15]:
print(timestamp)

2024-03-15 11:25:28.997230


In [22]:
mlflow.get_tracking_uri()

'file:///Users/cullywest/git/MLEng-politicalparties-python/data'

In [35]:
model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/1")

In [19]:
from pprint import pprint
from mlflow import MlflowClient
client = MlflowClient()

In [49]:
for rm in client.search_registered_models():
    pprint(dict(rm), indent=4)

{   'aliases': {},
    'creation_timestamp': 1710524901210,
    'description': None,
    'last_updated_timestamp': 1710524901212,
    'latest_versions': [   <ModelVersion: aliases=[], creation_timestamp=1710524901212, current_stage='None', description=None, last_updated_timestamp=1710524901212, name='model_2024-03-15 11:48:19.976403', run_id='362d7c5c4af8498f974292af5e578d00', run_link=None, source='file:///Users/cullywest/git/MLEng-politicalparties-python/data/918993356073278198/362d7c5c4af8498f974292af5e578d00/artifacts/test_path', status='READY', status_message=None, tags={}, user_id=None, version=1>],
    'name': 'model_2024-03-15 11:48:19.976403',
    'tags': {}}
{   'aliases': {},
    'creation_timestamp': 1710869941646,
    'description': None,
    'last_updated_timestamp': 1710869941648,
    'latest_versions': [   <ModelVersion: aliases=[], creation_timestamp=1710869941648, current_stage='None', description=None, last_updated_timestamp=1710869941648, name='model_2024-03-19 11:3

In [37]:
model.predict(X_test)

array([0, 1, 1, ..., 1, 1, 0])

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
with mlflow.start_run() as r:
    model = TfidfVectorizer(max_features=2500, min_df=1, max_df=0.8)
    model.fit(data_loader.data.Tweet)
    
    timestamp = datetime.datetime.now().strftime("%Y_%m_%dT%H:%M:%S")
    model_name = f"tfidf_vectorizer_{timestamp}"
    
    mlflow.sklearn.log_model(model, artifact_path="vectorizer", registered_model_name=model_name)

Successfully registered model 'tfidf_vectorizer_2024_03_20T10:49:15'.
Created version '1' of model 'tfidf_vectorizer_2024_03_20T10:49:15'.


In [45]:
timestamp = datetime.datetime.now().strftime("%Y_%m_%dT%H:%M:%S")
print(timestamp)

2024_03_20T10:48:13


In [48]:
model_name

'tfidf_vectorizer_2024_03_20T10:49:15'