In [5]:
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,f1_score,classification_report
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.pipeline import Pipeline
import pandas as pd
from mlflow.models.signature import infer_signature

In [6]:
#setting tracking 
mlflow.set_tracking_uri("http://ec2-43-205-211-96.ap-south-1.compute.amazonaws.com:5000/")

In [7]:
from mlflow.tracking import MlflowClient 
client = MlflowClient()

# Get all experiments (returns List[Experiment])
experiments_id = client.search_experiments()

for exp in experiments_id:
    print(f"Experiment name :{exp.name} and id:{exp.experiment_id}")


Experiment name :baseline_ML_model and id:408458959843314322
Experiment name :Navie_bayes_experiment and id:639483181163671325
Experiment name :Default and id:0


In [5]:
mlflow.set_experiment(experiment_name="baseline_ML_model")

2025/09/09 13:46:56 INFO mlflow.tracking.fluent: Experiment with name 'baseline_ML_model' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://s3bucmlflow/408458959843314322', creation_time=1757405817706, experiment_id='408458959843314322', last_update_time=1757405817706, lifecycle_stage='active', name='baseline_ML_model', tags={}>

In [8]:
df = pd.read_csv("/root/mlops_projects/FinancialSentiment_prediction/Datasets/Financial_data.csv")
df

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,neutral
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",neutral
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral
...,...,...
5837,RISING costs have forced packaging producer Hu...,neutral
5838,Nordic Walking was first used as a summer trai...,neutral
5839,"According shipping company Viking Line , the E...",neutral
5840,"In the building and home improvement trade , s...",negative


In [9]:
df.shape

(5842, 2)

In [10]:
df.duplicated().sum()

np.int64(520)

In [11]:
df.drop_duplicates(keep='first',inplace=True)

In [12]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords,wordnet

In [13]:
def preprocess_text(text):
    text = text.lower()
    # Retain numbers and basic punctuation, remove only special characters
    text = re.sub(r'http\S+|[^\w\s.]', '', text)  # Keep numbers, commas, periods
    text = re.sub(r'\s+',' ',text) #collapsing multiple spaces to one space
    #text = text.strip() #removes white spaces
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    stop_words = set(stopwords.words('english')) - {'not', 'no', 'never', 'very', 'bullish', 'bearish', 'buy', 'sell', 'strong', 'weak', 'profit', 'loss', 'growth'}
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [14]:
df["Sentence"] = df["Sentence"].apply(preprocess_text)

In [15]:
x = df["Sentence"]
y = df["Sentiment"]

In [16]:
def label_con(data):
    res = data.map(lambda x: 1 if x == "positive" else (0 if x == "neutral" else -1))
    return res

In [17]:
y = label_con(df["Sentiment"])

In [18]:
print(y)

0       0
1      -1
2       0
3       0
4       0
       ..
5835    0
5836    0
5838    0
5839    0
5841    0
Name: Sentiment, Length: 5322, dtype: int64


In [19]:
y.shape

(5322,)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(x, y, stratify=y, random_state=42)

In [21]:
y_train.to_numpy().flatten()

array([1, 0, 0, ..., 0, 0, 1], shape=(3991,))

In [22]:
experiment_name = "baseline_ML_model"
try:
    experiment = mlflow.get_experiment_by_name(name=experiment_name)
    experiment_Id = experiment.experiment_id
    print(f"experiment_id for {experiment_name} is :{experiment_Id}")
    
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise

experiment_id for baseline_ML_model is :408458959843314322


In [23]:
def train_and_log_model(model, model_name,experiment_id, vectorizer_type):
    with mlflow.start_run(experiment_id=experiment_id, run_name=f"{model_name}-{vectorizer_type}"):

        # Vectorization
        if vectorizer_type == "bow":
            vectorizer = CountVectorizer()
        elif vectorizer_type == "tfidf":
            vectorizer = TfidfVectorizer()
        else:
            raise ValueError("Invalid vectorizer type")

        X_train_vec = vectorizer.fit_transform(X_train)
        X_test_vec = vectorizer.transform(X_test)

        # Fit model
        model.fit(X_train_vec, y_train)
        y_pred = model.predict(X_test_vec)

        # Metrics
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')

        # Manual logging
        mlflow.log_param("model", model_name)
        mlflow.log_param("vectorizer", vectorizer_type)
        mlflow.log_param("vectorizer_vocab_size", len(vectorizer.vocabulary_))
        if hasattr(model, "get_params"):
            for k, v in model.get_params().items():
                mlflow.log_param(k, v)

        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("f1_score", f1)

        # Signature
        X_sample = X_train_vec[:2]
        y_sample = model.predict(X_sample)
        signature = infer_signature(X_sample, y_sample)

        # Save model and vectorizer
        mlflow.sklearn.log_model(model, name="model", signature=signature,input_example=X_sample)
        mlflow.sklearn.log_model(vectorizer, name="vectorizer")

        print(f"{model_name} ({vectorizer_type}) — Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")


In [24]:
train_and_log_model(LogisticRegression(max_iter=500),"LogisticRegression",experiment_Id,"tfidf")



LogisticRegression (tfidf) — Accuracy: 0.8062, F1 Score: 0.7817
🏃 View run LogisticRegression-tfidf at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/408458959843314322/runs/eb8a271d564c47faa32a81eada455677
🧪 View experiment at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/408458959843314322


In [25]:
train_and_log_model(LogisticRegression(max_iter=500), "LogisticRegression",experiment_Id,"bow")



LogisticRegression (bow) — Accuracy: 0.8535, F1 Score: 0.8449
🏃 View run LogisticRegression-bow at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/408458959843314322/runs/4ed1f70b9ab44d888afefb7efe2b6c6c
🧪 View experiment at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/408458959843314322


In [23]:
def train_and_log_model_with_tuning(model_class, model_name, vectorizer_type,
                                    X_train, X_test, y_train, y_test,
                                    experiment_id, param_distributions,
                                    n_iter_search=10, cv_folds=3, scoring_metric='f1_weighted'):
    """
    Trains and logs a model after performing RandomizedSearchCV for hyperparameter tuning.
    The best model from the search is logged.
    """
    run_name_prefix = f"{model_name}-{vectorizer_type}-Tuning"

    X_train = X_train.astype(str).fillna('')
    X_test = X_test.astype(str).fillna('')

    if model_class == XGBClassifier: #as classifier not accept negative labels
        y_train = y_train.map({0:0,1:1,-1:2})
        y_test  = y_test.map({0:0,1:1,-1:2})

    # Define the base vectorizer based on type
    if vectorizer_type == "bow":
        vectorizer_instance = CountVectorizer()
    elif vectorizer_type == "tfidf":
        vectorizer_instance = TfidfVectorizer()
    else:
        raise ValueError("Invalid vectorizer type")

    # Instantiate the classifier
    if model_class == XGBClassifier:
        classifier_instance = XGBClassifier(
            #use_label_encoder=False,  # Recommended to suppress future warnings
            eval_metric='mlogloss',   # Multi-class logloss for multi-class classification
            objective='multi:softmax', # For direct class predictions
            num_class=len(y.unique()), # Important: Set number of classes
            random_state=42
        )
    else:
        classifier_instance = model_class(random_state=42) # Setting random_state for reproducibility

    # Create the pipeline
    pipeline = Pipeline([
        ('vectorizer', vectorizer_instance),
        ('classifier', classifier_instance)
    ])

    # Initialize RandomizedSearchCV
    # We pass the pipeline here, and the param_distributions should be prefixed
    # e.g., 'classifier__n_estimators'
    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_distributions,
        n_iter=n_iter_search,
        cv=cv_folds,
        scoring=scoring_metric,
        random_state=42, # For reproducible search results
        n_jobs=4, # Use all available CPU cores for parallel processing
        verbose=1
    )

    print(f"\nStarting RandomizedSearchCV for {model_name} with {vectorizer_type}...")
    random_search.fit(X_train, y_train)

    # Get the best model found by the search
    best_pipeline = random_search.best_estimator_
    best_params = random_search.best_params_
    best_score = random_search.best_score_

    # Make predictions with the best model
    y_pred = best_pipeline.predict(X_test)

    # Calculate final metrics on the test set
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Start MLflow run for logging the BEST model from the search
    with mlflow.start_run(experiment_id=experiment_id, run_name=f"{run_name_prefix}-BestModel"):

        mlflow.log_param("model_name", model_name)
        mlflow.log_param("vectorizer_type", vectorizer_type)
        mlflow.log_param("tuning_method", "RandomizedSearchCV")
        mlflow.log_param("n_iter_search", n_iter_search)
        mlflow.log_param("cv_folds", cv_folds)
        mlflow.log_param("scoring_metric", scoring_metric)

        # Log all best parameters found by the search
        for k, v in best_params.items():
            mlflow.log_param(k, v)

        # Log metrics (best CV score and test set performance)
        mlflow.log_metric(f"best_cv_{scoring_metric}", best_score)
        mlflow.log_metric("test_accuracy", acc)
        mlflow.log_metric("test_f1_score_weighted", f1)

         # Directly log the full classification report as a text file
        full_report_str = classification_report(y_test, y_pred)
        mlflow.log_text(full_report_str, "classification_report.txt")

        print("\nClassification Report:\n", full_report_str)



        # Log vectorizer specific parameters (from the best pipeline's vectorizer)
        best_vectorizer = best_pipeline.named_steps['vectorizer']
        mlflow.log_param("vectorizer_vocab_size", len(best_vectorizer.vocabulary_))
        if hasattr(best_vectorizer, "get_params"):
            for k, v in best_vectorizer.get_params().items():
                # Filter out parameters that aren't typically interesting to log at top level
                if not (k.startswith("input") or k.startswith("dtype")):
                    mlflow.log_param(f"vectorizer_{k}", v)


        # Log the entire best pipeline
        # Signature for the entire pipeline: Input is raw text, Output is prediction
        pipeline_input_example = X_train[:2].astype('str').fillna('').to_list() # Ensure input example is raw text
        pipeline_output_example = best_pipeline.predict(pipeline_input_example)
        pipeline_signature = infer_signature(pipeline_input_example, pipeline_output_example)

        mlflow.sklearn.log_model(
            sk_model=best_pipeline,
            name= "text_classification_pipeline",
            signature=pipeline_signature,
            input_example=pipeline_input_example
        )

        print(f"\n--- Best Model from Tuning ---")
        print(f"Model: {model_name}, Vectorizer: {vectorizer_type}")
        print(f"Best CV Score ({scoring_metric}): {best_score:.4f}")
        print(f"Test Accuracy: {acc:.4f}, Test F1 Score (weighted): {f1:.4f}")
        print(f"Best Parameters: {best_params}")
        print(f"MLflow Run ID: {mlflow.active_run().info.run_id}")

In [27]:
pipeline_example = X_train[:2].astype('str').fillna('').to_list()

In [28]:
type(pipeline_example)

list

In [29]:
# Random Forest parameters 
rf_param_distributions_bow = {
    'vectorizer__max_df': np.arange(0.7, 1.0, 0.05),
    'vectorizer__min_df': [1, 5, 10],
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'classifier__n_estimators': [100, 200, 300, 500],
    'classifier__max_depth': [10, 20, 30, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_features': ['sqrt', 'log2', 0.5, 0.7],
    'classifier__class_weight': [None, 'balanced']
}

rf_param_distributions_tfidf = {
    'vectorizer__max_df': np.arange(0.7, 1.0, 0.05),
    'vectorizer__min_df': [1, 5, 10],
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'vectorizer__use_idf': [True, False],
    'classifier__n_estimators': [100, 250, 400],
    'classifier__max_depth': [15, 25, None],
    'classifier__min_samples_split': [2, 5, 8],
    'classifier__min_samples_leaf': [1, 2, 3],
    'classifier__max_features': ['sqrt', 0.6, 0.8],
    'classifier__class_weight': [None, 'balanced']
}

In [30]:
lr_param_distributions_tfidf = {
    'vectorizer__max_df': np.arange(0.7, 1.0, 0.05),
    'vectorizer__min_df': [1, 5, 10],
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'vectorizer__use_idf': [True, False], # Important for TF-IDF
    'classifier__C': np.logspace(-3, 2, 6), # Example: 0.001, 0.01, 0.1, 1, 10, 100
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear', 'saga'], # Solvers supporting l1/l2
    'classifier__max_iter': [100, 200, 500], # Increase if convergence warnings appear
    'classifier__class_weight': [None, 'balanced']
}

lr_param_distributions_bow = {
    'vectorizer__max_df': np.arange(0.7, 1.0, 0.05),
    'vectorizer__min_df': [1, 5, 10],
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'classifier__C': np.logspace(-4, 3, 8), # Broader range for C
    'classifier__penalty': ['l2'], # Common choice for Logistic Regression
    'classifier__solver': ['lbfgs', 'newton-cg', 'sag'], # Solvers supporting only l2
    'classifier__max_iter': [100, 300, 1000],
    'classifier__class_weight': [None, 'balanced']
}


In [25]:
xgb_param_distributions_bow = {
    'vectorizer__max_df': np.arange(0.7, 0.99, 0.05),
    'vectorizer__min_df': [1, 5, 10],
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.01, 0.05, 0.1],
    'classifier__max_depth': [3, 5, 7],
    'classifier__subsample': [0.7, 0.9],
    'classifier__colsample_bytree': [0.7, 0.9],
    'classifier__gamma': [0, 0.1, 0.2] # Min loss reduction to make a split
}

xgb_param_distributions_tf_idf = {
    'vectorizer__max_df': np.arange(0.7, 0.99, 0.05),
    'vectorizer__min_df': [1, 5, 10],
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'vectorizer__use_idf': [True, False], # Important for TF-IDF
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.01, 0.05, 0.1],
    'classifier__max_depth': [3, 5, 7],
    'classifier__subsample': [0.7, 0.9],
    'classifier__colsample_bytree': [0.7, 0.9],
    'classifier__gamma': [0, 0.1, 0.2] # Min loss reduction to make a split
}


In [32]:
print("--- Running Random Forest Tuning ---")
train_and_log_model_with_tuning(
    model_class=RandomForestClassifier,
    model_name="RandomForestClassifier",
    vectorizer_type="bow",
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    experiment_id=experiment_Id,
    param_distributions=rf_param_distributions_bow,
    n_iter_search=5, 
    cv_folds=2,      
    scoring_metric='f1_weighted'
)

--- Running Random Forest Tuning ---

Starting RandomizedSearchCV for RandomForestClassifier with bow...
Fitting 2 folds for each of 5 candidates, totalling 10 fits

Classification Report:
               precision    recall  f1-score   support

          -1       0.88      0.37      0.52        94
           0       0.84      0.97      0.90       871
           1       0.94      0.73      0.82       366

    accuracy                           0.86      1331
   macro avg       0.88      0.69      0.75      1331
weighted avg       0.87      0.86      0.85      1331



  "inputs": [
    "nq got hit hard lower look li.... Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: 'int' object has no attribute 'lower'



--- Best Model from Tuning ---
Model: RandomForestClassifier, Vectorizer: bow
Best CV Score (f1_weighted): 0.8349
Test Accuracy: 0.8640, Test F1 Score (weighted): 0.8536
Best Parameters: {'vectorizer__ngram_range': (1, 1), 'vectorizer__min_df': 1, 'vectorizer__max_df': np.float64(0.9500000000000002), 'classifier__n_estimators': 300, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 4, 'classifier__max_features': 0.5, 'classifier__max_depth': 20, 'classifier__class_weight': None}
MLflow Run ID: 8f0e625a42f04c1db70b05d169efd01d
🏃 View run RandomForestClassifier-bow-Tuning-BestModel at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/408458959843314322/runs/8f0e625a42f04c1db70b05d169efd01d
🧪 View experiment at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/408458959843314322


In [34]:
train_and_log_model_with_tuning(
    model_class=RandomForestClassifier,
    model_name="RandomForestClassifier",
    vectorizer_type="tfidf",
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    experiment_id=experiment_Id,
    param_distributions=rf_param_distributions_tfidf,
    n_iter_search=5,
    cv_folds=2,
    scoring_metric='f1_weighted'
)


Starting RandomizedSearchCV for RandomForestClassifier with tfidf...
Fitting 2 folds for each of 5 candidates, totalling 10 fits

Classification Report:
               precision    recall  f1-score   support

          -1       0.90      0.37      0.53        94
           0       0.84      0.97      0.90       871
           1       0.93      0.73      0.82       366

    accuracy                           0.86      1331
   macro avg       0.89      0.69      0.75      1331
weighted avg       0.87      0.86      0.85      1331



  "inputs": [
    "nq got hit hard lower look li.... Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: 'int' object has no attribute 'lower'



--- Best Model from Tuning ---
Model: RandomForestClassifier, Vectorizer: tfidf
Best CV Score (f1_weighted): 0.8369
Test Accuracy: 0.8640, Test F1 Score (weighted): 0.8536
Best Parameters: {'vectorizer__use_idf': True, 'vectorizer__ngram_range': (1, 1), 'vectorizer__min_df': 5, 'vectorizer__max_df': np.float64(0.8), 'classifier__n_estimators': 400, 'classifier__min_samples_split': 8, 'classifier__min_samples_leaf': 3, 'classifier__max_features': 0.6, 'classifier__max_depth': 25, 'classifier__class_weight': None}
MLflow Run ID: a4bef01702df4df596617229bfe0cd58
🏃 View run RandomForestClassifier-tfidf-Tuning-BestModel at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/408458959843314322/runs/a4bef01702df4df596617229bfe0cd58
🧪 View experiment at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/408458959843314322


In [35]:
train_and_log_model_with_tuning(
    model_class=LogisticRegression,
    model_name="LogisticRegression",
    vectorizer_type="bow",
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    experiment_id=experiment_Id,
    param_distributions=lr_param_distributions_bow,
    n_iter_search=5,
    cv_folds=2,
    scoring_metric='f1_weighted'
)


Starting RandomizedSearchCV for LogisticRegression with bow...
Fitting 2 folds for each of 5 candidates, totalling 10 fits

Classification Report:
               precision    recall  f1-score   support

          -1       0.76      0.37      0.50        94
           0       0.83      0.96      0.89       871
           1       0.91      0.69      0.78       366

    accuracy                           0.84      1331
   macro avg       0.83      0.67      0.72      1331
weighted avg       0.85      0.84      0.83      1331



  "inputs": [
    "nq got hit hard lower look li.... Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: 'int' object has no attribute 'lower'



--- Best Model from Tuning ---
Model: LogisticRegression, Vectorizer: bow
Best CV Score (f1_weighted): 0.7786
Test Accuracy: 0.8430, Test F1 Score (weighted): 0.8323
Best Parameters: {'vectorizer__ngram_range': (1, 2), 'vectorizer__min_df': 1, 'vectorizer__max_df': np.float64(0.7), 'classifier__solver': 'newton-cg', 'classifier__penalty': 'l2', 'classifier__max_iter': 100, 'classifier__class_weight': None, 'classifier__C': np.float64(10.0)}
MLflow Run ID: e73482521b374aeaa659ff657c201ed8
🏃 View run LogisticRegression-bow-Tuning-BestModel at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/408458959843314322/runs/e73482521b374aeaa659ff657c201ed8
🧪 View experiment at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/408458959843314322


In [36]:
train_and_log_model_with_tuning(
    model_class=LogisticRegression,
    model_name="LogisticRegression",
    vectorizer_type="tfidf",
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    experiment_id=experiment_Id,
    param_distributions=lr_param_distributions_tfidf,
    n_iter_search=5,
    cv_folds=2,
    scoring_metric='f1_weighted'
)


Starting RandomizedSearchCV for LogisticRegression with tfidf...
Fitting 2 folds for each of 5 candidates, totalling 10 fits





Classification Report:
               precision    recall  f1-score   support

          -1       0.69      0.43      0.53        94
           0       0.86      0.91      0.88       871
           1       0.80      0.77      0.78       366

    accuracy                           0.84      1331
   macro avg       0.78      0.70      0.73      1331
weighted avg       0.83      0.84      0.83      1331



  "inputs": [
    "nq got hit hard lower look li.... Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: 'int' object has no attribute 'lower'



--- Best Model from Tuning ---
Model: LogisticRegression, Vectorizer: tfidf
Best CV Score (f1_weighted): 0.7882
Test Accuracy: 0.8355, Test F1 Score (weighted): 0.8301
Best Parameters: {'vectorizer__use_idf': True, 'vectorizer__ngram_range': (1, 1), 'vectorizer__min_df': 1, 'vectorizer__max_df': np.float64(0.8500000000000001), 'classifier__solver': 'liblinear', 'classifier__penalty': 'l2', 'classifier__max_iter': 500, 'classifier__class_weight': 'balanced', 'classifier__C': np.float64(100.0)}
MLflow Run ID: 8c166b8b9e9844459516180c7fb7f251
🏃 View run LogisticRegression-tfidf-Tuning-BestModel at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/408458959843314322/runs/8c166b8b9e9844459516180c7fb7f251
🧪 View experiment at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/408458959843314322


In [37]:
print("Type of X_train elements:", type(X_train.iloc[0])) # Check first element type
print("Sample of X_train:", X_train.head())
print("Check for non-string types in X_train:")
# This will check if any element in the Series is not a string
non_string_elements = X_train[X_train.apply(lambda x: not isinstance(x, str))]
if  non_string_elements.any():
    print("Found non-string elements in X_train:", non_string_elements)
else:
    print("non string elements", non_string_elements)

Type of X_train elements: <class 'str'>
Sample of X_train: 196     nq got hit hard lower look like ha found suppo...
2765    company presently examining whether project wo...
4350    meeting glisten shareholder vote bid held 12 m...
4226       value confirmed aircraft order total eur 2bn .
1402    finnish financial software solution developer ...
Name: Sentence, dtype: object
Check for non-string types in X_train:
non string elements Series([], Name: Sentence, dtype: object)


In [38]:
train_and_log_model_with_tuning(
    model_class=XGBClassifier, # Pass the class itself
    model_name="XGBoostClassifier",
    vectorizer_type="bow",
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    experiment_id=experiment_Id, # Use the obtained experiment_id
    param_distributions= xgb_param_distributions_bow, #param_distributions,
    n_iter_search=5, 
    cv_folds=2,      
    scoring_metric='f1_weighted'
)


Starting RandomizedSearchCV for XGBoostClassifier with bow...
Fitting 2 folds for each of 5 candidates, totalling 10 fits

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.97      0.90       871
           1       0.95      0.71      0.81       366
           2       0.80      0.39      0.53        94

    accuracy                           0.86      1331
   macro avg       0.86      0.69      0.75      1331
weighted avg       0.87      0.86      0.85      1331



  "inputs": [
    "nq got hit hard lower look li.... Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: 'int' object has no attribute 'lower'



--- Best Model from Tuning ---
Model: XGBoostClassifier, Vectorizer: bow
Best CV Score (f1_weighted): 0.8326
Test Accuracy: 0.8603, Test F1 Score (weighted): 0.8506
Best Parameters: {'vectorizer__ngram_range': (1, 1), 'vectorizer__min_df': 1, 'vectorizer__max_df': np.float64(0.8500000000000001), 'classifier__subsample': 0.7, 'classifier__n_estimators': 200, 'classifier__max_depth': 3, 'classifier__learning_rate': 0.05, 'classifier__gamma': 0.2, 'classifier__colsample_bytree': 0.9}
MLflow Run ID: 44b850dd11e8492ab79ee6de8d217523
🏃 View run XGBoostClassifier-bow-Tuning-BestModel at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/408458959843314322/runs/44b850dd11e8492ab79ee6de8d217523
🧪 View experiment at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/408458959843314322


In [26]:
train_and_log_model_with_tuning(
    model_class=XGBClassifier, 
    model_name="XGBoostClassifier",
    vectorizer_type="tfidf",
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    experiment_id=experiment_Id, # Use the obtained experiment_id
    param_distributions= xgb_param_distributions_tf_idf, #param_distributions,
    n_iter_search=5, # Reduced for quick demo, increase for real tuning
    cv_folds=2,      # Reduced for quick demo
    scoring_metric='f1_weighted'
)


Starting RandomizedSearchCV for XGBoostClassifier with tfidf...
Fitting 2 folds for each of 5 candidates, totalling 10 fits

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.97      0.91       871
           1       0.95      0.74      0.83       366
           2       0.81      0.47      0.59        94

    accuracy                           0.87      1331
   macro avg       0.87      0.73      0.78      1331
weighted avg       0.88      0.87      0.87      1331



  "inputs": [
    "nq got hit hard lower look li.... Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: 'int' object has no attribute 'lower'



--- Best Model from Tuning ---
Model: XGBoostClassifier, Vectorizer: tfidf
Best CV Score (f1_weighted): 0.8246
Test Accuracy: 0.8730, Test F1 Score (weighted): 0.8660
Best Parameters: {'vectorizer__use_idf': False, 'vectorizer__ngram_range': (1, 2), 'vectorizer__min_df': 10, 'vectorizer__max_df': np.float64(0.8500000000000001), 'classifier__subsample': 0.9, 'classifier__n_estimators': 200, 'classifier__max_depth': 5, 'classifier__learning_rate': 0.05, 'classifier__gamma': 0.2, 'classifier__colsample_bytree': 0.9}
MLflow Run ID: 97a7578c949846c1b45fb1929516b327
🏃 View run XGBoostClassifier-tfidf-Tuning-BestModel at: http://ec2-43-205-211-96.ap-south-1.compute.amazonaws.com:5000/#/experiments/408458959843314322/runs/97a7578c949846c1b45fb1929516b327
🧪 View experiment at: http://ec2-43-205-211-96.ap-south-1.compute.amazonaws.com:5000/#/experiments/408458959843314322
