In [1]:
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,f1_score,classification_report
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.pipeline import Pipeline
import pandas as pd
from mlflow.models.signature import infer_signature

In [3]:
#setting tracking 
mlflow.set_tracking_uri("http://ec2-43-205-211-96.ap-south-1.compute.amazonaws.com:5000/")

In [4]:
from mlflow.tracking import MlflowClient 
client = MlflowClient()

# Get all experiments (returns List[Experiment])
experiments_id = client.search_experiments()

for exp in experiments_id:
    print(f"Experiment name :{exp.name} and id:{exp.experiment_id}")

Experiment name :baseline_ML_model and id:408458959843314322
Experiment name :Navie_bayes_experiment and id:639483181163671325
Experiment name :Default and id:0


In [5]:
experiment_name = "ML_model_dataaug"

# Get the experiment (even if soft-deleted)
exp = client.get_experiment_by_name(experiment_name)

if exp:
    runs = client.search_runs(exp.experiment_id)
    print(f"Found {len(runs)} runs in experiment '{experiment_name}' (status: {exp.lifecycle_stage})")
else:
    print("Experiment not found.")

Experiment not found.


In [6]:
mlflow.set_experiment(experiment_name="ML_model_dataaug")

2025/09/09 15:57:53 INFO mlflow.tracking.fluent: Experiment with name 'ML_model_dataaug' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://s3bucmlflow/918242205499858319', creation_time=1757413674527, experiment_id='918242205499858319', last_update_time=1757413674527, lifecycle_stage='active', name='ML_model_dataaug', tags={}>

In [7]:
df = pd.read_csv("/root/mlops_projects/FinancialSentiment_prediction/Datasets/Financial_data.csv")

In [8]:
df.head(5)

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,neutral
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",neutral
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [9]:
df.duplicated().sum()

np.int64(520)

In [10]:
df.drop_duplicates(keep="first",inplace=True)

In [11]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords,wordnet

In [12]:
# Enhanced text preprocessing for financial sentiment
def preprocess_text(text):
    text = text.lower()
    # Retain numbers and basic punctuation, remove only special characters
    text = re.sub(r'http\S+|[^\w\s.]', '', text)  # Keep numbers, commas, periods
    text = re.sub(r'\s+',' ',text) #collapsing multiple spaces to one space
    #text = text.strip() #removes white spaces
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    stop_words = set(stopwords.words('english')) - {'not', 'no', 'never', 'very', 'bullish', 'bearish', 'buy', 'sell', 'strong', 'weak', 'profit', 'loss', 'growth'}
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [13]:
df["Sentence"] = df["Sentence"].apply(preprocess_text)

In [14]:
x = df["Sentence"]
y = df["Sentiment"]

In [15]:
def label_con(data):
    res = data.map(lambda x: 1 if x == "positive" else (0 if x == "neutral" else -1))
    return res

In [16]:
y = label_con(df["Sentiment"])

In [17]:
print(y)

0       0
1      -1
2       0
3       0
4       0
       ..
5835    0
5836    0
5838    0
5839    0
5841    0
Name: Sentiment, Length: 5322, dtype: int64


In [18]:
y.shape

(5322,)

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, random_state=42)

In [20]:
y_train.to_numpy().flatten()

array([1, 0, 0, ..., 0, 0, 1], shape=(3991,))

In [21]:
experiment_name = "ML_model_dataaug"
try:
    experiment = mlflow.get_experiment_by_name(name=experiment_name)
    experiment_Id = experiment.experiment_id
    print(f"experiment_id for {experiment_name} is :{experiment_Id}")
    
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise

experiment_id for ML_model_dataaug is :918242205499858319


In [22]:
import nlpaug.augmenter.word as naw
#import nlpaug.augmenter.sentence as nas
from tqdm import tqdm
tqdm.pandas()

In [23]:
# Initialize augmenter
aug = naw.SynonymAug(aug_src='wordnet', 
                     lang='eng', 
                     aug_p=0.4, 
                     aug_max=5)
# Ensure indices are properly aligned
x_train = x_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

# Get indices of samples to augment (sentiment 0 or 2)
indices = y_train[y_train.isin([-1, 1])].index
augmented = []

for idx in tqdm(indices, desc="Augmenting"):
    try:
        # Access sentence (Series: x_train.loc[idx], DataFrame: x_train.loc[idx, 'Sentence'])
        sentence = x_train.loc[idx] if isinstance(x_train, pd.Series) else x_train.loc[idx, 'Sentence']
        sentiment = y_train.loc[idx]
        
        new_text = aug.augment(sentence)
        if isinstance(new_text,list):
            new_text = "".join(new_text)
        augmented.append({'Sentence': new_text, 
                          'Sentiment': sentiment,
                          "Source": "Augmented"})
    except Exception as e:
        print(f"Skipping index {idx}: {str(e)}")

# Combine with original data
if augmented:
    aug_df = pd.DataFrame(augmented)
    
    # Handle x_train (Series or DataFrame)
    original_df = pd.DataFrame({"Sentence" : x_train, 
                            "Sentiment" : y_train, 
                            "Source" : "Original"})
    
    combined_df = pd.concat([original_df,aug_df],ignore_index=True)

    # Shuffle rows (frac=1 means "return all rows in random order")
    df_final = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

    

    #update x_train and y_train
    x_train = df_final["Sentence"]
    # Always treat y_train as Series
    y_train = df_final["Sentiment"]
    
    print(f"Data augmented. Total samples: {len(x_train)}")
else:
    print("No augmentations added.")

Augmenting: 100%|██████████| 1378/1378 [00:01<00:00, 1076.08it/s]

Data augmented. Total samples: 5369





In [24]:
y_train.value_counts()

Sentiment
 0    2613
 1    2190
-1     566
Name: count, dtype: int64

In [25]:
y_train.shape

(5369,)

In [26]:
def train_and_log_model_with_tuning(model_class, model_name, vectorizer_type,
                                    x_train, x_test, y_train, y_test,
                                    experiment_id, param_distributions,
                                    n_iter_search, cv_folds, scoring_metric):
    """
    Trains and logs a model after performing RandomizedSearchCV for hyperparameter tuning.
    The best model from the search is logged.
    """
    run_name_prefix = f"{model_name}-{vectorizer_type}-Tuning"

    x_train = x_train.astype(str).fillna('')
    x_test = x_test.astype(str).fillna('')

    if model_class == XGBClassifier: #as classifier not accept negative labels
        y_train = y_train.map({0:0,1:1,-1:2})
        y_test  = y_test.map({0:0,1:1,-1:2})

    # Define the base vectorizer based on type
    if vectorizer_type == "bow":
        vectorizer_instance = CountVectorizer()
    elif vectorizer_type == "tfidf":
        vectorizer_instance = TfidfVectorizer()
    else:
        raise ValueError("Invalid vectorizer type")

    # Instantiate the classifier
    if model_class == XGBClassifier:
        classifier_instance = XGBClassifier(
            #use_label_encoder=False,  # Recommended to suppress future warnings
            eval_metric='mlogloss',   # Multi-class logloss for multi-class classification
            objective='multi:softmax', # For direct class predictions
            num_class=len(y.unique()), # Important: Set number of classes
            random_state=42
        )
    else:
        classifier_instance = model_class(random_state=42) # Setting random_state for reproducibility

    # Create the pipeline
    pipeline = Pipeline([
        ('vectorizer', vectorizer_instance),
        ('classifier', classifier_instance)
    ])

    # Initialize RandomizedSearchCV
    # We pass the pipeline here, and the param_distributions should be prefixed
    # e.g., 'classifier__n_estimators'
    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_distributions,
        n_iter=n_iter_search,
        cv=cv_folds,
        scoring=scoring_metric,
        random_state=42, # For reproducible search results
        n_jobs=1, #
        verbose=1
    )

    print(f"\nStarting RandomizedSearchCV for {model_name} with {vectorizer_type}...")
    random_search.fit(x_train, y_train)

    # Get the best model found by the search
    best_pipeline = random_search.best_estimator_
    best_params = random_search.best_params_
    best_score = random_search.best_score_

    # Make predictions with the best model
    y_pred = best_pipeline.predict(x_test)

    # Calculate final metrics on the test set
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Start MLflow run for logging the BEST model from the search
    with mlflow.start_run(experiment_id=experiment_id, run_name=f"{run_name_prefix}-BestModel"):

        mlflow.log_param("model_name", model_name)
        mlflow.log_param("vectorizer_type", vectorizer_type)
        mlflow.log_param("tuning_method", "RandomizedSearchCV")
        mlflow.log_param("n_iter_search", n_iter_search)
        mlflow.log_param("cv_folds", cv_folds)
        mlflow.log_param("scoring_metric", scoring_metric)

        # Log all best parameters found by the search
        for k, v in best_params.items():
            mlflow.log_param(k, v)

        # Log metrics (best CV score and test set performance)
        mlflow.log_metric(f"best_cv_{scoring_metric}", best_score)
        mlflow.log_metric("test_accuracy", acc)
        mlflow.log_metric("test_f1_score_weighted", f1)

         # Directly log the full classification report as a text file
        full_report_str = classification_report(y_test, y_pred)
        mlflow.log_text(full_report_str, "classification_report.txt")

        print("\nClassification Report:\n", full_report_str)



        # Log vectorizer specific parameters (from the best pipeline's vectorizer)
        best_vectorizer = best_pipeline.named_steps['vectorizer']
        mlflow.log_param("vectorizer_vocab_size", len(best_vectorizer.vocabulary_))
        if hasattr(best_vectorizer, "get_params"):
            for k, v in best_vectorizer.get_params().items():
                # Filter out parameters that aren't typically interesting to log at top level
                if not (k.startswith("input") or k.startswith("dtype")):
                    mlflow.log_param(f"vectorizer_{k}", v)


        # Log the entire best pipeline
        # Signature for the entire pipeline: Input is raw text, Output is prediction
        pipeline_input_example = x_train[:2].astype('str').fillna('').to_list() # Ensure input example is raw text
        pipeline_output_example = best_pipeline.predict(pipeline_input_example)
        pipeline_signature = infer_signature(pipeline_input_example, pipeline_output_example)

        mlflow.sklearn.log_model(
            sk_model=best_pipeline,
            name= "text_classification_pipeline",
            signature=pipeline_signature,
            input_example=pipeline_input_example
        )

        print(f"\n--- Best Model from Tuning ---")
        print(f"Model: {model_name}, Vectorizer: {vectorizer_type}")
        print(f"Best CV Score ({scoring_metric}): {best_score:.4f}")
        print(f"Test Accuracy: {acc:.4f}, Test F1 Score (weighted): {f1:.4f}")
        print(f"Best Parameters: {best_params}")
        print(f"MLflow Run ID: {mlflow.active_run().info.run_id}")

In [27]:
# Random Forest parameters 
rf_param_distributions_bow = {
    'vectorizer__max_df': np.arange(0.7, 1.0, 0.05),
    'vectorizer__min_df': [1, 5, 10],
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'classifier__n_estimators': [100, 200, 300, 500],
    'classifier__max_depth': [10, 20, 30, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_features': ['sqrt', 'log2', 0.5, 0.7],
    'classifier__class_weight': [None, 'balanced']
}

rf_param_distributions_tfidf = {
    'vectorizer__max_df': np.arange(0.7, 1.0, 0.05),
    'vectorizer__min_df': [1, 5, 10],
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'vectorizer__use_idf': [True, False],
    'classifier__n_estimators': [100, 250, 400],
    'classifier__max_depth': [15, 25, None],
    'classifier__min_samples_split': [2, 5, 8],
    'classifier__min_samples_leaf': [1, 2, 3],
    'classifier__max_features': ['sqrt', 0.6, 0.8],
    'classifier__class_weight': [None, 'balanced']
}

In [28]:
lr_param_distributions_tfidf = {
    'vectorizer__max_df': np.arange(0.7, 1.0, 0.05),
    'vectorizer__min_df': [1, 5, 10],
    'vectorizer__ngram_range': [(1, 1), (1, 2),(1,3)],        
    'classifier__C': np.logspace(-3, 2, 6), # Example: 0.001, 0.01, 0.1, 1, 10, 100
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear','lbfgs', 'saga'], # Solvers supporting l1/l2
    'classifier__max_iter': [100, 200, 500], # Increase if convergence warnings appear
    'classifier__class_weight': [None, 'balanced']
}

lr_param_distributions_bow = {
    'vectorizer__max_df': np.arange(0.7, 1.0, 0.05),
    'vectorizer__min_df': [1, 5, 10],
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],         
    'classifier__C': np.logspace(-4, 3, 8), # Broader range for C
    'classifier__penalty': ['l2'], # Common choice for Logistic Regression
    'classifier__solver': ['lbfgs', 'newton-cg', 'sag'], # Solvers supporting only l2
    'classifier__max_iter': [100, 300, 1000],
    'classifier__class_weight': [None, 'balanced']
}


In [29]:
xgb_param_distributions_bow = {
    'vectorizer__max_df': np.arange(0.7, 0.99, 0.05),
    'vectorizer__min_df': [1, 5, 10],
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.01, 0.05, 0.1],
    'classifier__max_depth': [3, 5, 7],
    'classifier__subsample': [0.7, 0.9],
    'classifier__colsample_bytree': [0.7, 0.9],
    'classifier__gamma': [0, 0.1, 0.2] # Min loss reduction to make a split
}

xgb_param_distributions_tf_idf = {
    'vectorizer__max_df': np.arange(0.7, 0.99, 0.05),
    'vectorizer__min_df': [1, 5, 10],
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'vectorizer__use_idf': [True, False], # Important for TF-IDF
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.01, 0.05, 0.1],
    'classifier__max_depth': [3, 5, 7],
    'classifier__subsample': [0.7, 0.9],
    'classifier__colsample_bytree': [0.7, 0.9],
    'classifier__gamma': [0, 0.1, 0.2] # Min loss reduction to make a split
}


In [30]:
print("--- Running Random Forest Tuning ---")
train_and_log_model_with_tuning(
    model_class=RandomForestClassifier,
    model_name="RandomForestClassifier",
    vectorizer_type="bow",
    x_train=x_train, y_train=y_train,
    x_test=x_test, y_test=y_test,
    experiment_id=experiment_Id,
    param_distributions=rf_param_distributions_bow,
    n_iter_search=5, 
    cv_folds=5,      
    scoring_metric='f1_weighted'
)

--- Running Random Forest Tuning ---

Starting RandomizedSearchCV for RandomForestClassifier with bow...
Fitting 5 folds for each of 5 candidates, totalling 25 fits

Classification Report:
               precision    recall  f1-score   support

          -1       0.80      0.59      0.67        94
           0       0.86      0.95      0.90       871
           1       0.89      0.73      0.80       366

    accuracy                           0.86      1331
   macro avg       0.85      0.76      0.79      1331
weighted avg       0.87      0.86      0.86      1331



  "inputs": [
    "poyry plc additional informat.... Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: 'int' object has no attribute 'lower'



--- Best Model from Tuning ---
Model: RandomForestClassifier, Vectorizer: bow
Best CV Score (f1_weighted): 0.7850
Test Accuracy: 0.8648, Test F1 Score (weighted): 0.8602
Best Parameters: {'vectorizer__ngram_range': (1, 2), 'vectorizer__min_df': 5, 'vectorizer__max_df': np.float64(0.75), 'classifier__n_estimators': 500, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 1, 'classifier__max_features': 'sqrt', 'classifier__max_depth': 30, 'classifier__class_weight': 'balanced'}
MLflow Run ID: 7357223ab46643dcbf76ebf6934927a4
🏃 View run RandomForestClassifier-bow-Tuning-BestModel at: http://ec2-43-205-211-96.ap-south-1.compute.amazonaws.com:5000/#/experiments/918242205499858319/runs/7357223ab46643dcbf76ebf6934927a4
🧪 View experiment at: http://ec2-43-205-211-96.ap-south-1.compute.amazonaws.com:5000/#/experiments/918242205499858319


In [31]:
train_and_log_model_with_tuning(
    model_class=RandomForestClassifier,
    model_name="RandomForestClassifier",
    vectorizer_type="tfidf",
    x_train=x_train, y_train=y_train,
    x_test=x_test, y_test=y_test,
    experiment_id=experiment_Id,
    param_distributions=rf_param_distributions_tfidf,
    n_iter_search=5,
    cv_folds=5,
    scoring_metric='f1_weighted'
)


Starting RandomizedSearchCV for RandomForestClassifier with tfidf...
Fitting 5 folds for each of 5 candidates, totalling 25 fits

Classification Report:
               precision    recall  f1-score   support

          -1       0.56      0.57      0.57        94
           0       0.88      0.90      0.89       871
           1       0.84      0.79      0.82       366

    accuracy                           0.85      1331
   macro avg       0.76      0.76      0.76      1331
weighted avg       0.85      0.85      0.85      1331



  "inputs": [
    "poyry plc additional informat.... Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: 'int' object has no attribute 'lower'



--- Best Model from Tuning ---
Model: RandomForestClassifier, Vectorizer: tfidf
Best CV Score (f1_weighted): 0.8127
Test Accuracy: 0.8490, Test F1 Score (weighted): 0.8487
Best Parameters: {'vectorizer__use_idf': True, 'vectorizer__ngram_range': (1, 2), 'vectorizer__min_df': 10, 'vectorizer__max_df': np.float64(0.75), 'classifier__n_estimators': 250, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 3, 'classifier__max_features': 0.6, 'classifier__max_depth': None, 'classifier__class_weight': 'balanced'}
MLflow Run ID: 55ea5d9c60534d6f86a7592d0a0b111d
🏃 View run RandomForestClassifier-tfidf-Tuning-BestModel at: http://ec2-43-205-211-96.ap-south-1.compute.amazonaws.com:5000/#/experiments/918242205499858319/runs/55ea5d9c60534d6f86a7592d0a0b111d
🧪 View experiment at: http://ec2-43-205-211-96.ap-south-1.compute.amazonaws.com:5000/#/experiments/918242205499858319


In [32]:
train_and_log_model_with_tuning(
    model_class=LogisticRegression,
    model_name="LogisticRegression",
    vectorizer_type="bow",
    x_train=x_train, y_train=y_train,
    x_test=x_test, y_test=y_test,
    experiment_id=experiment_Id,
    param_distributions=lr_param_distributions_bow,
    n_iter_search=10,
    cv_folds=5,
    scoring_metric='f1_weighted'
)


Starting RandomizedSearchCV for LogisticRegression with bow...
Fitting 5 folds for each of 10 candidates, totalling 50 fits

Classification Report:
               precision    recall  f1-score   support

          -1       0.75      0.51      0.61        94
           0       0.85      0.95      0.90       871
           1       0.89      0.73      0.80       366

    accuracy                           0.86      1331
   macro avg       0.83      0.73      0.77      1331
weighted avg       0.86      0.86      0.85      1331



  "inputs": [
    "poyry plc additional informat.... Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: 'int' object has no attribute 'lower'



--- Best Model from Tuning ---
Model: LogisticRegression, Vectorizer: bow
Best CV Score (f1_weighted): 0.8959
Test Accuracy: 0.8573, Test F1 Score (weighted): 0.8517
Best Parameters: {'vectorizer__ngram_range': (1, 2), 'vectorizer__min_df': 1, 'vectorizer__max_df': np.float64(0.8500000000000001), 'classifier__solver': 'lbfgs', 'classifier__penalty': 'l2', 'classifier__max_iter': 100, 'classifier__class_weight': 'balanced', 'classifier__C': np.float64(10.0)}
MLflow Run ID: 2cf09150f5e84f449176605483f3c45e
🏃 View run LogisticRegression-bow-Tuning-BestModel at: http://ec2-43-205-211-96.ap-south-1.compute.amazonaws.com:5000/#/experiments/918242205499858319/runs/2cf09150f5e84f449176605483f3c45e
🧪 View experiment at: http://ec2-43-205-211-96.ap-south-1.compute.amazonaws.com:5000/#/experiments/918242205499858319


In [33]:
train_and_log_model_with_tuning(
    model_class=LogisticRegression,
    model_name="LogisticRegression",
    vectorizer_type="tfidf",
    x_train=x_train, y_train=y_train,
    x_test=x_test, y_test=y_test,
    experiment_id=experiment_Id,
    param_distributions=lr_param_distributions_tfidf,
    n_iter_search=10,
    cv_folds=5,
    scoring_metric='f1_weighted'
)


Starting RandomizedSearchCV for LogisticRegression with tfidf...
Fitting 5 folds for each of 10 candidates, totalling 50 fits


35 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/root/mlops_projects/FinancialSentiment_prediction/fin_ops/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/root/mlops_projects/FinancialSentiment_prediction/fin_ops/lib/python3.10/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/root/mlops_projects/FinancialSentiment_prediction/fin_ops/lib/python3.10/site-packages/sklearn/pipeline.py", line 663, in fit
    self._final_estimator.fit(Xt, y, **last_step_p


Classification Report:
               precision    recall  f1-score   support

          -1       0.41      0.48      0.44        94
           0       0.87      0.80      0.84       871
           1       0.69      0.80      0.74       366

    accuracy                           0.78      1331
   macro avg       0.66      0.69      0.67      1331
weighted avg       0.79      0.78      0.78      1331



  "inputs": [
    "poyry plc additional informat.... Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: 'int' object has no attribute 'lower'



--- Best Model from Tuning ---
Model: LogisticRegression, Vectorizer: tfidf
Best CV Score (f1_weighted): 0.7819
Test Accuracy: 0.7776, Test F1 Score (weighted): 0.7821
Best Parameters: {'vectorizer__ngram_range': (1, 2), 'vectorizer__min_df': 10, 'vectorizer__max_df': np.float64(0.7), 'classifier__solver': 'saga', 'classifier__penalty': 'l2', 'classifier__max_iter': 500, 'classifier__class_weight': 'balanced', 'classifier__C': np.float64(10.0)}
MLflow Run ID: faf0a817165448ae8259351bc30d11f8
🏃 View run LogisticRegression-tfidf-Tuning-BestModel at: http://ec2-43-205-211-96.ap-south-1.compute.amazonaws.com:5000/#/experiments/918242205499858319/runs/faf0a817165448ae8259351bc30d11f8
🧪 View experiment at: http://ec2-43-205-211-96.ap-south-1.compute.amazonaws.com:5000/#/experiments/918242205499858319


In [34]:
train_and_log_model_with_tuning(
    model_class=LogisticRegression,
    model_name="LogisticRegression",
    vectorizer_type="tfidf",
    x_train=x_train, y_train=y_train,
    x_test=x_test, y_test=y_test,
    experiment_id=experiment_Id,
    param_distributions=lr_param_distributions_tfidf,
    n_iter_search=10,
    cv_folds=5,
    scoring_metric='f1_weighted'
)


Starting RandomizedSearchCV for LogisticRegression with tfidf...
Fitting 5 folds for each of 10 candidates, totalling 50 fits


35 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/root/mlops_projects/FinancialSentiment_prediction/fin_ops/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/root/mlops_projects/FinancialSentiment_prediction/fin_ops/lib/python3.10/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/root/mlops_projects/FinancialSentiment_prediction/fin_ops/lib/python3.10/site-packages/sklearn/pipeline.py", line 663, in fit
    self._final_estimator.fit(Xt, y, **last_step_p


Classification Report:
               precision    recall  f1-score   support

          -1       0.41      0.48      0.44        94
           0       0.87      0.80      0.84       871
           1       0.69      0.80      0.74       366

    accuracy                           0.78      1331
   macro avg       0.66      0.69      0.67      1331
weighted avg       0.79      0.78      0.78      1331



  "inputs": [
    "poyry plc additional informat.... Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: 'int' object has no attribute 'lower'



--- Best Model from Tuning ---
Model: LogisticRegression, Vectorizer: tfidf
Best CV Score (f1_weighted): 0.7819
Test Accuracy: 0.7776, Test F1 Score (weighted): 0.7821
Best Parameters: {'vectorizer__ngram_range': (1, 2), 'vectorizer__min_df': 10, 'vectorizer__max_df': np.float64(0.7), 'classifier__solver': 'saga', 'classifier__penalty': 'l2', 'classifier__max_iter': 500, 'classifier__class_weight': 'balanced', 'classifier__C': np.float64(10.0)}
MLflow Run ID: 919eb9080d864d7f96d58629bb888bfc
🏃 View run LogisticRegression-tfidf-Tuning-BestModel at: http://ec2-43-205-211-96.ap-south-1.compute.amazonaws.com:5000/#/experiments/918242205499858319/runs/919eb9080d864d7f96d58629bb888bfc
🧪 View experiment at: http://ec2-43-205-211-96.ap-south-1.compute.amazonaws.com:5000/#/experiments/918242205499858319


In [36]:
train_and_log_model_with_tuning(
    model_class=XGBClassifier,
    model_name="XGBoostClassifier",
    vectorizer_type="bow",
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
    experiment_id=experiment_Id, # Use the obtained experiment_id
    param_distributions= xgb_param_distributions_bow, #param_distributions,
    n_iter_search=10, 
    cv_folds=5,      
    scoring_metric='f1_weighted'
)


Starting RandomizedSearchCV for XGBoostClassifier with bow...
Fitting 5 folds for each of 10 candidates, totalling 50 fits

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.96      0.91       871
           1       0.92      0.78      0.84       366
           2       0.86      0.54      0.67        94

    accuracy                           0.88      1331
   macro avg       0.88      0.76      0.81      1331
weighted avg       0.88      0.88      0.88      1331



  "inputs": [
    "poyry plc additional informat.... Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: 'int' object has no attribute 'lower'



--- Best Model from Tuning ---
Model: XGBoostClassifier, Vectorizer: bow
Best CV Score (f1_weighted): 0.8157
Test Accuracy: 0.8820, Test F1 Score (weighted): 0.8773
Best Parameters: {'vectorizer__ngram_range': (1, 3), 'vectorizer__min_df': 5, 'vectorizer__max_df': np.float64(0.7), 'classifier__subsample': 0.9, 'classifier__n_estimators': 100, 'classifier__max_depth': 7, 'classifier__learning_rate': 0.1, 'classifier__gamma': 0.1, 'classifier__colsample_bytree': 0.9}
MLflow Run ID: 089a0509612e4053beba6c0429cbd26d
🏃 View run XGBoostClassifier-bow-Tuning-BestModel at: http://ec2-43-205-211-96.ap-south-1.compute.amazonaws.com:5000/#/experiments/918242205499858319/runs/089a0509612e4053beba6c0429cbd26d
🧪 View experiment at: http://ec2-43-205-211-96.ap-south-1.compute.amazonaws.com:5000/#/experiments/918242205499858319


In [35]:
train_and_log_model_with_tuning(
    model_class=XGBClassifier, 
    model_name="XGBoostClassifier",
    vectorizer_type="tfidf",
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
    experiment_id=experiment_Id, # Use the obtained experiment_id
    param_distributions= xgb_param_distributions_tf_idf, #param_distributions,
    n_iter_search=10, 
    cv_folds=5,      
    scoring_metric='f1_weighted'
)


Starting RandomizedSearchCV for XGBoostClassifier with tfidf...
Fitting 5 folds for each of 10 candidates, totalling 50 fits

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.97      0.91       871
           1       0.93      0.77      0.84       366
           2       0.84      0.52      0.64        94

    accuracy                           0.88      1331
   macro avg       0.88      0.75      0.80      1331
weighted avg       0.88      0.88      0.88      1331



  "inputs": [
    "poyry plc additional informat.... Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: 'int' object has no attribute 'lower'



--- Best Model from Tuning ---
Model: XGBoostClassifier, Vectorizer: tfidf
Best CV Score (f1_weighted): 0.7935
Test Accuracy: 0.8813, Test F1 Score (weighted): 0.8760
Best Parameters: {'vectorizer__use_idf': False, 'vectorizer__ngram_range': (1, 2), 'vectorizer__min_df': 10, 'vectorizer__max_df': np.float64(0.8500000000000001), 'classifier__subsample': 0.9, 'classifier__n_estimators': 200, 'classifier__max_depth': 5, 'classifier__learning_rate': 0.05, 'classifier__gamma': 0.2, 'classifier__colsample_bytree': 0.9}
MLflow Run ID: b4db4060c54d476bb6892bcd1ed112c4
🏃 View run XGBoostClassifier-tfidf-Tuning-BestModel at: http://ec2-43-205-211-96.ap-south-1.compute.amazonaws.com:5000/#/experiments/918242205499858319/runs/b4db4060c54d476bb6892bcd1ed112c4
🧪 View experiment at: http://ec2-43-205-211-96.ap-south-1.compute.amazonaws.com:5000/#/experiments/918242205499858319
