In [2]:
# Import required libraries
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from pycaret.classification import *
import mlflow
import mlflow.sklearn

# Load 20 newsgroups dataset
categories = ['rec.autos', 'sci.electronics', 'comp.graphics', 'rec.sport.hockey']  # Example categories
data = fetch_20newsgroups(subset='all', categories=categories)

# Display basic dataset information
print(f"Number of texts: {len(data.data)}")
print(f"Number of categories: {len(data.target_names)}")

# Split into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=42)

# Initialize TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000)

# Transform train and test datasets
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test).toarray()

# Create a dataframe for PyCaret
train_data = pd.DataFrame(X_train_tfidf)
train_data['target'] = y_train

# Initialize the PyCaret setup
clf_setup = setup(data=train_data, target='target', session_id=123, use_gpu=False, log_experiment=True, experiment_name="text_classification", log_plots=True, log_data=True)

# Compare models and select the best one
best_model = compare_models()

# Train the best model on the entire dataset
final_model = finalize_model(best_model)

# Evaluate the model on the test set
test_data = pd.DataFrame(X_test_tfidf)
test_data['target'] = y_test

predictions = predict_model(final_model, data=test_data)

# Display results
print(predictions[['Label', 'target']].head())

# Plot feature importance
plot_model(final_model, plot='feature')

# Log the model and results in MLflow
with mlflow.start_run():
    # Log the best model
    mlflow.sklearn.log_model(final_model, "best_model")
    
    # Extract metrics
    metrics = pull()
    mlflow.log_metrics({
        "Accuracy": metrics["Accuracy"], 
        "AUC": metrics["AUC"], 
        "F1": metrics["F1"], 
        "Recall": metrics["Recall"], 
        "Precision": metrics["Prec."]
    })
    
    # Log the feature importance plot
    feature_plot_path = plot_model(final_model, plot='feature', save=True)
    mlflow.log_artifact(feature_plot_path)

    # Log parameters
    mlflow.log_param("TF-IDF max features", tfidf.max_features)

    print("Model and metrics logged to MLflow.")



Number of texts: 3946
Number of categories: 4


Unnamed: 0,Description,Value
0,Session id,123
1,Target,target
2,Target type,Multiclass
3,Original data shape,"(3156, 1001)"
4,Transformed data shape,"(3156, 1001)"
5,Transformed train set shape,"(2209, 1001)"
6,Transformed test set shape,"(947, 1001)"
7,Numeric features,1000
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9158,0.9856,0.9158,0.9176,0.9157,0.8877,0.8883,0.243
ridge,Ridge Classifier,0.9154,0.0,0.9154,0.9165,0.9156,0.8871,0.8874,0.096
svm,SVM - Linear Kernel,0.9113,0.0,0.9113,0.9125,0.9114,0.8817,0.882,0.11
lr,Logistic Regression,0.9095,0.0,0.9095,0.9107,0.9097,0.8793,0.8795,0.643
lightgbm,Light Gradient Boosting Machine,0.9031,0.9854,0.9031,0.9057,0.9036,0.8708,0.8713,2.2
catboost,CatBoost Classifier,0.8977,0.9831,0.8977,0.9001,0.898,0.8636,0.8642,94.057
rf,Random Forest Classifier,0.8805,0.9777,0.8805,0.8829,0.8807,0.8406,0.8413,0.232
gbc,Gradient Boosting Classifier,0.8714,0.0,0.8714,0.8794,0.8731,0.8286,0.8302,3.866
lda,Linear Discriminant Analysis,0.8624,0.0,0.8624,0.865,0.863,0.8165,0.817,0.206
nb,Naive Bayes,0.8615,0.941,0.8615,0.8621,0.86,0.8153,0.8165,0.088




Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.9177,0.9835,0.9177,0.9192,0.9174,0.8903,0.891


KeyError: "['Label'] not in index"