In [None]:
!pip install mlflow boto3 awscli

Collecting mlflow
  Downloading mlflow-3.1.4-py3-none-any.whl.metadata (29 kB)
Collecting boto3
  Downloading boto3-1.39.13-py3-none-any.whl.metadata (6.7 kB)
Collecting awscli
  Downloading awscli-1.41.13-py3-none-any.whl.metadata (11 kB)
Collecting mlflow-skinny==3.1.4 (from mlflow)
  Downloading mlflow_skinny-3.1.4-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.1.4->mlflow)
  Downloading databricks_sdk-0.60.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==3.1.4->mlflow)
  Download

In [None]:
from google.colab import userdata
import os

# Read the AWS key pair and the MLflow tracking URI from Colab Secrets
# so that no credential is written into the notebook itself.
aws_access_key, aws_secret_key, mlflow_uri = [
    userdata.get(secret_name)
    for secret_name in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'MLFLOW_TRACKING_URI')
]

# Export the key pair as environment variables (for boto3 and AWS CLI)
os.environ.update({
    "AWS_ACCESS_KEY_ID": aws_access_key,
    "AWS_SECRET_ACCESS_KEY": aws_secret_key,
})

In [None]:
import mlflow

# Every run logged below is grouped under this experiment name.
EXPERIMENT_NAME = "Exp 3 - TfIdf_MaxFeatures"

# Point the MLflow client at the tracking URI read from Colab Secrets.
mlflow.set_tracking_uri(mlflow_uri)
# Kept as the last expression so the cell displays the returned Experiment.
mlflow.set_experiment(EXPERIMENT_NAME)

2025/07/24 23:21:16 INFO mlflow.tracking.fluent: Experiment with name 'Exp 3 - TfIdf_MaxFeatures' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://sumanthdatahub/mlflow-artifacts/925232728800309726', creation_time=1753399276864, experiment_id='925232728800309726', last_update_time=1753399276864, lifecycle_stage='active', name='Exp 3 - TfIdf_MaxFeatures', tags={}>

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the preprocessed Reddit data, then discard rows with no cleaned comment text
df = pd.read_csv('/content/reddit_preprocessing.csv')
df = df.dropna(subset=['clean_comment'])
# (rows, columns) remaining after the drop
df.shape

(36710, 6)

In [None]:
def _plot_confusion_matrix(y_true, y_pred, title, output_path):
  """Save a confusion-matrix heatmap to ``output_path``, closing the figure even on failure."""
  # Use the sorted union of observed labels for both the matrix and the axis ticks,
  # so the heatmap shows the real class values instead of positional indices.
  class_labels = np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_pred)]))
  conf_matrix = confusion_matrix(y_true, y_pred, labels=class_labels)
  tick_labels = class_labels.tolist()

  fig, ax = plt.subplots(figsize=(8, 6))
  try:
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(title)
    fig.savefig(output_path)
  finally:
    # Release the figure even if plotting or saving raised; this runs once per sweep step.
    plt.close(fig)


def run_experiment(vectorizer_type, ngram_range, vectorizer_max_features, vectorizer_name):
  """Train a RandomForest on vectorized comments and log the run to MLflow.

  Reads the module-level ``df`` (columns ``clean_comment`` and ``category``),
  makes an 80/20 train/test split with ``random_state=42``, fits the vectorizer
  on the training text only, and trains a 200-tree, depth-15 RandomForest.

  Args:
    vectorizer_type: ``'BoW'`` selects ``CountVectorizer``; any other value
      selects ``TfidfVectorizer``.
    ngram_range: ``(min_n, max_n)`` tuple passed to the vectorizer.
    vectorizer_max_features: vocabulary size cap passed to the vectorizer.
    vectorizer_name: human-readable vectorizer label used in the run name,
      description tag and plot title.

  Logged to the active MLflow experiment: tags, parameters, accuracy, the
  per-class entries of the classification report, a confusion-matrix PNG and
  the fitted model.
  """
  # Vectorization
  if vectorizer_type == 'BoW':
    vectorizer = CountVectorizer(ngram_range=ngram_range, max_features=vectorizer_max_features)
  else:
    vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=vectorizer_max_features)

  X_train, X_test, y_train, y_test = train_test_split(df['clean_comment'], df['category'], test_size=0.2, random_state=42)

  # Fit on the training split only; the test split is transformed with the learned vocabulary.
  X_train_vectorized = vectorizer.fit_transform(X_train)
  X_test_vectorized = vectorizer.transform(X_test)

  # Define and Train RF model
  with mlflow.start_run():
    # Set Tags
    tags = {
        "mlflow.runName": f"{vectorizer_type}_{vectorizer_name}_RandomForest",
        "experiment_type": "feature_engineering",
        "model_type": "RandomForestClassifier",
        "description": f"RandomForest with {vectorizer_name}, ngram_range={ngram_range}, max_features={vectorizer_max_features}",
    }
    mlflow.set_tags(tags)

    # Set Params
    n_estimators = 200
    max_depth = 15

    params = {
        "vectorizer_type": vectorizer_type,
        "ngram_range": ngram_range,
        "vectorizer_max_features": vectorizer_max_features,
        "n_estimators": n_estimators,
        "max_depth": max_depth
    }
    mlflow.log_params(params)

    # Initialize and Train the Model
    rf_model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    rf_model.fit(X_train_vectorized, y_train)

    # Make Predictions and Log Metrics
    y_pred = rf_model.predict(X_test_vectorized)

    # Log Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric("accuracy", accuracy)

    # Log Classification Report: only dict-valued entries are expanded,
    # so the scalar 'accuracy' entry is skipped here (it is logged above).
    classification_metrics = classification_report(y_test, y_pred, output_dict=True)
    for class_label, metrics in classification_metrics.items():
      if isinstance(metrics, dict):
        for metric_name, metric_value in metrics.items():
          mlflow.log_metric(f"{class_label}_{metric_name}", metric_value)

    # Log Confusion Matrix; the title is built from the arguments so it matches the run.
    plot_title = (
        f'Confusion Matrix: {vectorizer_name}, ngram_range = {ngram_range}, '
        f'max_features = {vectorizer_max_features}'
    )
    _plot_confusion_matrix(y_test, y_pred, plot_title, 'confusion_matrix.png')
    mlflow.log_artifact('confusion_matrix.png')

    # Log the Model under a name derived from the arguments rather than a fixed label.
    model_name = (
        f"random_forest_model_{vectorizer_type}_"
        f"{ngram_range[0]}-{ngram_range[1]}gram_{vectorizer_max_features}"
    )
    mlflow.sklearn.log_model(rf_model, model_name)


In [None]:
# Experiment 3: sweep the TF-IDF vocabulary size (max_features) at a fixed n-gram range.
ngram_ranges = [(1, 3)]  # unigrams through trigrams
max_features_values = list(range(1000, 10001, 1000))  # 1000, 2000, ..., 10000
vectorizer_types = ['Tfidf']  # any value other than 'BoW' selects TfidfVectorizer in run_experiment

# Drive the sweep from the lists above so that editing them actually changes what runs.
for vectorizer_type in vectorizer_types:
  for ngram_range in ngram_ranges:
    for max_features in max_features_values:
      run_experiment(vectorizer_type, ngram_range, max_features, vectorizer_type)



🏃 View run Tfidf_Tfidf_RandomForest at: http://ec2-54-87-129-244.compute-1.amazonaws.com:5000/#/experiments/925232728800309726/runs/e23063b785604b9ea999541eddeebecc
🧪 View experiment at: http://ec2-54-87-129-244.compute-1.amazonaws.com:5000/#/experiments/925232728800309726




🏃 View run Tfidf_Tfidf_RandomForest at: http://ec2-54-87-129-244.compute-1.amazonaws.com:5000/#/experiments/925232728800309726/runs/e85f8ff22eee410387f5a75f8fc59ee5
🧪 View experiment at: http://ec2-54-87-129-244.compute-1.amazonaws.com:5000/#/experiments/925232728800309726




🏃 View run Tfidf_Tfidf_RandomForest at: http://ec2-54-87-129-244.compute-1.amazonaws.com:5000/#/experiments/925232728800309726/runs/8f64d45255cc43a080d282087aacf8ef
🧪 View experiment at: http://ec2-54-87-129-244.compute-1.amazonaws.com:5000/#/experiments/925232728800309726




🏃 View run Tfidf_Tfidf_RandomForest at: http://ec2-54-87-129-244.compute-1.amazonaws.com:5000/#/experiments/925232728800309726/runs/b38f038999094ad2a836a5748be41edf
🧪 View experiment at: http://ec2-54-87-129-244.compute-1.amazonaws.com:5000/#/experiments/925232728800309726




🏃 View run Tfidf_Tfidf_RandomForest at: http://ec2-54-87-129-244.compute-1.amazonaws.com:5000/#/experiments/925232728800309726/runs/a70c2ffebd26452393c95849dfe481da
🧪 View experiment at: http://ec2-54-87-129-244.compute-1.amazonaws.com:5000/#/experiments/925232728800309726




🏃 View run Tfidf_Tfidf_RandomForest at: http://ec2-54-87-129-244.compute-1.amazonaws.com:5000/#/experiments/925232728800309726/runs/15e28b2f511f4c0ab76a6e8c01b249ed
🧪 View experiment at: http://ec2-54-87-129-244.compute-1.amazonaws.com:5000/#/experiments/925232728800309726




🏃 View run Tfidf_Tfidf_RandomForest at: http://ec2-54-87-129-244.compute-1.amazonaws.com:5000/#/experiments/925232728800309726/runs/b79a68b938b84bec96803c6eaf83830d
🧪 View experiment at: http://ec2-54-87-129-244.compute-1.amazonaws.com:5000/#/experiments/925232728800309726




🏃 View run Tfidf_Tfidf_RandomForest at: http://ec2-54-87-129-244.compute-1.amazonaws.com:5000/#/experiments/925232728800309726/runs/be176dfbe17648bf85401f68a3a99600
🧪 View experiment at: http://ec2-54-87-129-244.compute-1.amazonaws.com:5000/#/experiments/925232728800309726




🏃 View run Tfidf_Tfidf_RandomForest at: http://ec2-54-87-129-244.compute-1.amazonaws.com:5000/#/experiments/925232728800309726/runs/e3cd02a034934ce182b0424b0a3235c9
🧪 View experiment at: http://ec2-54-87-129-244.compute-1.amazonaws.com:5000/#/experiments/925232728800309726




🏃 View run Tfidf_Tfidf_RandomForest at: http://ec2-54-87-129-244.compute-1.amazonaws.com:5000/#/experiments/925232728800309726/runs/5541e5b678a34ee38e0936faf3055519
🧪 View experiment at: http://ec2-54-87-129-244.compute-1.amazonaws.com:5000/#/experiments/925232728800309726


Exp 2: Choose TF-IDF with trigrams |
Exp 3: Choose fewer features, e.g. 1000 |
Exp 4: Fix class imbalance for different feature settings
