In [2]:
# MLflow experiments UI: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB, GaussianNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, balanced_accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.utils.class_weight import compute_class_weight
import mlflow
import mlflow.sklearn
import os

In [7]:
# Point MLflow at the tracking server. The MLFLOW_TRACKING_URI environment variable,
# when set, overrides the default below so the notebook can run against another server
# without editing this cell.
mlflow.set_tracking_uri(
    os.environ.get(
        "MLFLOW_TRACKING_URI",
        "http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/",
    )
)

In [8]:
# Select the MLflow experiment that every run logged below is recorded under.
# NOTE(review): the name is spelled "Navie" (not "Naive"). MLflow creates an experiment
# when the name does not exist yet, so correcting the spelling would start a new one.
experiment_name = "Navie_bayes_experiment"
mlflow.set_experiment(experiment_name) 

2025/09/09 13:09:32 INFO mlflow.tracking.fluent: Experiment with name 'Navie_bayes_experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://s3bucmlflow/639483181163671325', creation_time=1757403573455, experiment_id='639483181163671325', last_update_time=1757403573455, lifecycle_stage='active', name='Navie_bayes_experiment', tags={}>

In [9]:
# Location of the labelled sentences. Set FINANCIAL_DATA_CSV to run the notebook on a
# machine where the dataset lives somewhere other than the original /root checkout.
DATA_PATH = os.environ.get(
    "FINANCIAL_DATA_CSV",
    "/root/mlops_projects/FinancialSentiment_prediction/Datasets/Financial_data.csv",
)
df = pd.read_csv(DATA_PATH)

In [10]:
# Preview the first rows of the loaded data (Sentence text and its Sentiment label).
df.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,neutral
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",neutral
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [11]:
# (rows, columns) of the raw data before any cleaning.
df.shape

(5842, 2)

In [12]:
# Number of sentences per sentiment label, to check class balance.
df["Sentiment"].value_counts()

Sentiment
neutral     3747
positive    1561
negative     534
Name: count, dtype: int64

In [13]:
# Same class distribution expressed as percentages (proportions scaled by 100).
df["Sentiment"].value_counts(normalize=True)*100

Sentiment
neutral     64.138993
positive    26.720301
negative     9.140705
Name: proportion, dtype: float64

In [14]:
# Inspect five random rows; no random_state is given, so the rows differ on each run.
df.sample(5)

Unnamed: 0,Sentence,Sentiment
4792,GSK joins China trade push as UK trumpets heal...,neutral
3139,The combined value of the orders is almost EUR...,neutral
5179,Our customers come from the following countrie...,neutral
5768,"Simultaneously , Alma Media has purchased a 35...",neutral
3435,Alfa group will have 43.9 % of voting stock in...,positive


In [15]:
# Count rows flagged as duplicates of an earlier row.
df.duplicated().sum()

np.int64(520)

In [16]:
# Remove the duplicated rows counted above. Rebinding (instead of inplace=True) leaves the
# previously loaded frame object untouched and keeps the cell chainable.
df = df.drop_duplicates()

In [17]:
# (rows, columns) after duplicate removal.
df.shape

(5322, 2)

In [18]:
# Separate the sentence text from its label, then hold out 20% of the rows as a test set.
# stratify=y keeps the class proportions equal in both splits; random_state fixes the split.
x, y = df["Sentence"], df["Sentiment"]
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [19]:
# Encode the string labels as integers: negative -> 0, neutral -> 1, positive -> 2.
# Any label missing from the mapping becomes NaN, which is also what happens to every
# label if this cell is executed a second time on the already-encoded series.
sentiment_map = {'positive': 2, 'neutral': 1, 'negative': 0}
y_train, y_test = (labels.map(sentiment_map) for labels in (y_train, y_test))

In [20]:
# Enhanced text preprocessing for financial sentiment

# Words removed from NLTK's English stop-word list so they are kept in the text.
_SENTIMENT_KEEP_WORDS = {'not', 'no', 'never', 'very', 'bullish', 'bearish', 'buy', 'sell', 'strong', 'weak', 'profit', 'loss', 'growth'}

# Filled on first use so the lemmatizer and stop-word set are built once instead of once
# per sentence (preprocess_text is applied row by row to the whole dataset).
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared ``(lemmatizer, stop_words)`` pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        stop_words = set(stopwords.words('english')) - _SENTIMENT_KEEP_WORDS
        # Both entries are stored together so a failed lookup never leaves a half-filled cache.
        _PREPROCESS_RESOURCES.update(lemmatizer=WordNetLemmatizer(), stop_words=stop_words)
    return _PREPROCESS_RESOURCES['lemmatizer'], _PREPROCESS_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalise one sentence into a space-joined string of lemmatized tokens.

    Steps, in order: trim and lowercase; delete URLs and every character that is not a
    word character, whitespace, '.' or ',' (so symbols such as '$' and '%' are dropped
    while digits are kept); collapse whitespace runs; tokenize; lemmatize each token;
    drop English stop words except the sentiment-bearing ones in _SENTIMENT_KEEP_WORDS.

    Args:
        text: The raw sentence. ``None`` and NaN (a missing CSV cell) yield an empty
            string; any other non-string value is converted with ``str`` first.

    Returns:
        The cleaned tokens joined by single spaces. Punctuation tokens such as '.' and
        ',' are kept as separate tokens.

    NOTE(review): relies on the NLTK tokenizer, WordNet and stop-word data being
    installed; nothing in the visible part of the notebook downloads them.
    """
    # `text != text` is only true for NaN, which pandas uses for missing values.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).strip().lower()
    # Keep numbers, commas and periods; remove URLs and all other special characters.
    text = re.sub(r'http\S+|[^\w\s\d.,]', '', text)
    # Collapse whitespace runs left behind by the removals, then trim the ends again.
    text = re.sub(r'\s+', ' ', text).strip()
    lemmatizer, stop_words = _get_preprocess_resources()
    # Lemmatize first, then filter, so the stop-word test sees the lemmatized form.
    lemmas = (lemmatizer.lemmatize(word) for word in word_tokenize(text))
    return ' '.join(word for word in lemmas if word not in stop_words)


In [21]:
# Sanity-check the preprocessing on a single training sentence.
# x_train[4256] looks the row up by its index label, so this fails if that row is not
# part of the training split.
res_1 = preprocess_text(x_train[4256])
print(res_1)

raute , headquartered nastola , finland , technology company serving wood product industry worldwide .


In [22]:
# Clean every training sentence.
# NOTE(review): this overwrites x_train, so re-running the cell feeds already-cleaned
# text through preprocess_text again.
x_train = x_train.apply(preprocess_text)

In [23]:
# Spot-check one cleaned training sentence, looked up by its index label.
x_train[3443]

'aim increase sale least one fifth 2006 .'

In [24]:
# Print the first five characters of the first ten training rows.
# NOTE(review): x_train has already been run through preprocess_text at this point, so
# despite the "Original Row" label these are cleaned rows. `i` is the loop position
# (0-9), not the row's index label.
for i, text in enumerate(x_train.head(10)):
    print(f"Original Row {i}: '{text[:5]}...'")

Original Row 0: 'compa...'
Original Row 1: 'inves...'
Original Row 2: 'stora...'
Original Row 3: 'fiska...'
Original Row 4: 'finla...'
Original Row 5: 'liqui...'
Original Row 6: 'refil...'
Original Row 7: 'nd no...'
Original Row 8: 'resid...'
Original Row 9: 'teles...'


In [25]:
# Apply the same cleaning to the held-out test sentences.
# NOTE(review): this overwrites x_test, so re-running the cell preprocesses cleaned text again.
x_test = x_test.apply(preprocess_text)

In [26]:
from sklearn.utils import shuffle

# Synonym-based augmentation of the under-represented classes, applied to the training
# split only. Augmentation_available records whether it succeeded.
# NOTE(review): the cell rebinds x_train / y_train, so re-running it augments the
# already-augmented data again; re-create the split first.
# NOTE(review): no random seed is set for the augmenter here, so the generated text
# presumably differs between runs.
Augmentation_available = False
try:
    # Imported inside the try so that a missing nlpaug installation takes the
    # "proceeding without augmentation" path below instead of aborting the cell.
    import nlpaug.augmenter.word as naw

    # Initialize augmentor
    aug = naw.SynonymAug(aug_src='wordnet', aug_p=0.3, aug_max=5)

    augmented_texts = []
    augmented_labels = []

    # Fraction of each class to augment: every negative (0) sample and the first half
    # of the positive (2) samples. Neutral (1), the majority class, is left as is.
    augment_fraction = {0: 1, 2: 0.5}
    for label, mult in augment_fraction.items():
        class_samples = x_train[y_train == label]

        for text in class_samples[:int(mult * len(class_samples))]:
            try:
                new_text = aug.augment(text)
                # The augmenter may return a list of strings; flatten it to one string.
                if isinstance(new_text, list):
                    new_text = " ".join(new_text)
                augmented_texts.append(new_text)
                augmented_labels.append(label)
            except Exception as e:
                # A single sentence that cannot be augmented is skipped, not fatal.
                print(f"Augmentation failed for label {label}: {e}")

    # Convert to Series
    aug_series = pd.Series(augmented_texts)
    label_series = pd.Series(augmented_labels)

    # Combine with original training data; both sides are re-indexed identically so
    # texts and labels stay aligned row for row.
    x_train = pd.concat([x_train, aug_series], ignore_index=True)
    y_train = pd.concat([y_train, label_series], ignore_index=True)

    # Shuffle texts and labels together so augmented rows are not grouped at the end.
    x_train, y_train = shuffle(x_train, y_train, random_state=42)

    print("✅ Augmentation complete. New shapes:", x_train.shape, y_train.shape)
    Augmentation_available = True

except Exception as e:
    print(f"Augmentation failed: {e}. Proceeding without augmentation.")


✅ Augmentation complete. New shapes: (5142,) (5142,)


In [27]:
# Inspect five random rows of the training text after augmentation and shuffling.
x_train.sample(5)

1417    aldata solution , global company engaged suppl...
3967    barclays plc lse barc nyse bcs , credit agrico...
3236                       payment date march 25 , 2010 .
4934               trx long frame up. macd hybridization.
5141    finnish investment group neomarkka oyj hel nem...
dtype: object

In [28]:
# Class distribution of the training labels, as percentages.
y_train.value_counts(normalize=True)*100

1    54.200700
2    34.091793
0    11.707507
Name: proportion, dtype: float64

In [29]:
# Absolute number of training samples per encoded class.
y_train.value_counts()

1    2787
2    1753
0     602
Name: count, dtype: int64

In [30]:
# Derive per-class weights from the training labels, then tilt them further.
classes = np.unique(y_train)
print(classes)
# 'balanced' weights are inversely proportional to class frequency.
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
print(class_weights)
# Extra multipliers in class order (negative, neutral, positive): negative is doubled,
# positive is raised by half, and neutral (the majority class) is left unchanged.
_extra_boost = np.array([2.0, 1.0, 1.5])
class_weights = class_weights * _extra_boost
class_weight_dict = {cls: weight for cls, weight in zip(classes, class_weights)}
print(class_weight_dict)

[0 1 2]
[2.84717608 0.61499821 0.97775242]
{np.int64(0): np.float64(5.694352159468439), np.int64(1): np.float64(0.6149982059562253), np.int64(2): np.float64(1.4666286366229322)}


In [35]:
# Function to train and evaluate Naive Bayes variants with MLflow tracking
def _log_confusion_matrix(cm, label_names, title, cm_path):
    """Render ``cm`` as a heatmap, log the PNG to the active MLflow run, then clean up."""
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_names, yticklabels=label_names, ax=ax)
        ax.set_title(title)
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
        fig.savefig(cm_path)
        mlflow.log_artifact(cm_path)
    finally:
        # Always release the figure and the temporary file, even if saving or logging failed.
        plt.close(fig)
        if os.path.exists(cm_path):
            os.remove(cm_path)


def train_evaluate_nb(model, model_name, vectorizer, vectorizer_name, is_gaussian=False):
    """Tune, evaluate and log one Naive Bayes / vectorizer combination in its own MLflow run.

    Reads the notebook-level ``x_train``, ``x_test``, ``y_train``, ``y_test`` and
    ``Augmentation_available`` variables rather than taking them as arguments.

    Args:
        model: Unfitted Naive Bayes estimator; GridSearchCV fits clones of it.
        model_name: Label used in the run name, logged parameters, artifacts and printouts.
        vectorizer: Text vectorizer exposing ``max_features``, ``ngram_range``, ``min_df``
            and ``max_df``. It is (re)fitted on ``x_train`` by this call.
        vectorizer_name: Label for the vectorizer, used like ``model_name``.
        is_gaussian: When True the features are converted to dense arrays and no
            ``alpha`` grid is searched.

    Returns:
        ``(best_model, vectorizer, accuracy)`` on success, or ``(None, None, 0)`` when
        any step raises; the error is printed instead of propagated.

    NOTE(review): the vectorizer is fitted on the whole training split before
    cross-validation, so the vocabulary is built from the validation folds as well.
    NOTE(review): only the estimator is logged as a model; it expects vectorized input
    and the fitted vectorizer is not stored with the run.
    """
    # Pin the label order so report rows and matrix axes always line up with the names,
    # including when a class is missing from y_test or from the predictions.
    label_ids = [0, 1, 2]
    label_names = ['negative', 'neutral', 'positive']
    try:
        with mlflow.start_run(run_name=f"{model_name}_{vectorizer_name}_data-aug_02"):
            # Log parameters
            mlflow.log_param("model_name", model_name)
            mlflow.log_param("vectorizer", vectorizer_name)
            mlflow.log_param("max_features", vectorizer.max_features)
            mlflow.log_param("ngram_range", vectorizer.ngram_range)
            mlflow.log_param("min_df", vectorizer.min_df)
            mlflow.log_param("max_df", vectorizer.max_df)
            mlflow.log_param("augmentation", Augmentation_available)

            X_train_vec = vectorizer.fit_transform(x_train)
            X_test_vec = vectorizer.transform(x_test)

            # Convert to dense array for GaussianNB
            if is_gaussian:
                X_train_vec = X_train_vec.toarray()
                X_test_vec = X_test_vec.toarray()

            # Grid search over the smoothing strength; GaussianNB gets an empty grid,
            # i.e. a single cross-validated fit with its default settings.
            param_grid = {'alpha': [0.001, 0.01, 0.05, 0.1, 0.5, 1.0]} if not is_gaussian else {}
            grid_search = GridSearchCV(model, param_grid, cv=3, scoring='balanced_accuracy', n_jobs=1)
            grid_search.fit(X_train_vec, y_train)

            # Log best parameters
            if not is_gaussian:
                mlflow.log_param("best_alpha", grid_search.best_params_.get('alpha', 'N/A'))

            # Best model
            best_model = grid_search.best_estimator_
            print(f"\nBest parameters for {model_name} with {vectorizer_name}: {grid_search.best_params_ if not is_gaussian else 'N/A'}")

            # Predict
            y_pred = best_model.predict(X_test_vec)

            # Evaluate
            accuracy = accuracy_score(y_test, y_pred)
            balanced_acc = balanced_accuracy_score(y_test, y_pred)
            class_report = classification_report(y_test, y_pred, labels=label_ids, target_names=label_names, output_dict=True)

            # Log metrics
            mlflow.log_metric("accuracy", accuracy)
            mlflow.log_metric("balanced_accuracy", balanced_acc)
            for label, metrics in class_report.items():
                # Scalar entries (the overall accuracy) are skipped; only per-class and
                # averaged rows carry precision / recall / f1.
                if isinstance(metrics, dict):
                    mlflow.log_metric(f"f1_score_{label}", metrics['f1-score'])
                    mlflow.log_metric(f"precision_{label}", metrics['precision'])
                    mlflow.log_metric(f"recall_{label}", metrics['recall'])

            # Log model
            mlflow.sklearn.log_model(best_model, f"model_{model_name}_{vectorizer_name}")

            # Save confusion matrix
            cm = confusion_matrix(y_test, y_pred, labels=label_ids)
            _log_confusion_matrix(
                cm,
                label_names,
                f'Confusion Matrix - {model_name} with {vectorizer_name}',
                f"cm_{model_name}_{vectorizer_name}.png",
            )

            # Print results
            print(f"\nResults for {model_name} with {vectorizer_name}:")
            print(f"Accuracy: {accuracy:.4f}")
            print(f"Balanced Accuracy: {balanced_acc:.4f}")
            print("\nClassification Report:")
            print(classification_report(y_test, y_pred, labels=label_ids, target_names=label_names))

            return best_model, vectorizer, accuracy
    except Exception as e:
        # Best effort: one failing combination must not stop the remaining runs.
        print(f"Error training {model_name} with {vectorizer_name}: {e}")
        return None, None, 0


In [36]:
# Vectorizers under comparison. All share the same vocabulary settings: at most 3000
# features, unigrams and bigrams, no minimum document count, and terms occurring in more
# than 80% of documents are ignored.
_vectorizer_settings = dict(max_features=3000, ngram_range=(1, 2), min_df=1, max_df=0.8)
bow_vectorizer = CountVectorizer(**_vectorizer_settings)
tfidf_vectorizer = TfidfVectorizer(**_vectorizer_settings)
# Presence/absence variant of the bag of words (counts clipped to 0/1).
binary_bow_vectorizer = CountVectorizer(binary=True, **_vectorizer_settings)

In [37]:
# Naive Bayes variants to compare, as (estimator, display name, is_gaussian) tuples.
# class_prior is meant to hold prior probabilities, so the boosted class weights are
# rescaled to sum to 1. Dividing by a constant keeps the relative weighting between
# classes, so the class with the highest score is the same as with the raw weights.
class_priors = class_weights / class_weights.sum()
models = [
    (MultinomialNB(class_prior=class_priors), "MultinomialNB", False),
    # NOTE(review): scikit-learn documents class_prior as not used by ComplementNB;
    # it is passed here only to mirror the other estimators.
    (ComplementNB(class_prior=class_priors), "ComplementNB", False),
    (BernoulliNB(class_prior=class_priors), "BernoulliNB", False),
    (GaussianNB(), "GaussianNB", True)
]

In [38]:
# Run every model with both Bag of Words and TF-IDF features and remember the combination
# with the highest test accuracy. A failed run returns (None, None, 0); it is detected
# with an explicit `is not None` check instead of relying on the estimator's truthiness.
# NOTE(review): the winner is chosen on plain accuracy, while the grid search inside
# train_evaluate_nb optimises balanced accuracy.
best_model, best_vectorizer, best_accuracy = None, None, 0
for model, model_name, is_gaussian in models:
    print(f"\nTraining {model_name} with Bag of Words...")
    nb_bow, bow_vec, acc_bow = train_evaluate_nb(model, model_name, bow_vectorizer, "Bag of Words", is_gaussian)
    if nb_bow is not None and acc_bow > best_accuracy:
        best_model, best_vectorizer, best_accuracy = nb_bow, bow_vec, acc_bow

    print(f"\nTraining {model_name} with TF-IDF...")
    nb_tfidf, tfidf_vec, acc_tfidf = train_evaluate_nb(model, model_name, tfidf_vectorizer, "TF-IDF", is_gaussian)
    if nb_tfidf is not None and acc_tfidf > best_accuracy:
        best_model, best_vectorizer, best_accuracy = nb_tfidf, tfidf_vec, acc_tfidf


Training MultinomialNB with Bag of Words...

Best parameters for MultinomialNB with Bag of Words: {'alpha': 0.1}





Results for MultinomialNB with Bag of Words:
Accuracy: 0.6441
Balanced Accuracy: 0.6307

Classification Report:
              precision    recall  f1-score   support

    negative       0.22      0.58      0.32        76
     neutral       0.83      0.64      0.72       697
    positive       0.59      0.67      0.63       292

    accuracy                           0.64      1065
   macro avg       0.55      0.63      0.56      1065
weighted avg       0.72      0.64      0.67      1065

🏃 View run MultinomialNB_Bag of Words_data-aug_02 at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/639483181163671325/runs/398c376396494c9fbd11d8b2ae421cdf
🧪 View experiment at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/639483181163671325

Training MultinomialNB with TF-IDF...

Best parameters for MultinomialNB with TF-IDF: {'alpha': 0.01}





Results for MultinomialNB with TF-IDF:
Accuracy: 0.5164
Balanced Accuracy: 0.5839

Classification Report:
              precision    recall  f1-score   support

    negative       0.18      0.63      0.28        76
     neutral       0.84      0.43      0.57       697
    positive       0.46      0.69      0.55       292

    accuracy                           0.52      1065
   macro avg       0.49      0.58      0.47      1065
weighted avg       0.69      0.52      0.54      1065

🏃 View run MultinomialNB_TF-IDF_data-aug_02 at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/639483181163671325/runs/b723de94698b4734967e9f91347c8cf0
🧪 View experiment at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/639483181163671325

Training ComplementNB with Bag of Words...

Best parameters for ComplementNB with Bag of Words: {'alpha': 1.0}





Results for ComplementNB with Bag of Words:
Accuracy: 0.7333
Balanced Accuracy: 0.6828

Classification Report:
              precision    recall  f1-score   support

    negative       0.37      0.58      0.45        76
     neutral       0.85      0.76      0.80       697
    positive       0.64      0.71      0.67       292

    accuracy                           0.73      1065
   macro avg       0.62      0.68      0.64      1065
weighted avg       0.76      0.73      0.74      1065

🏃 View run ComplementNB_Bag of Words_data-aug_02 at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/639483181163671325/runs/5e5ae2b8def041529617a9a2c7b0fbda
🧪 View experiment at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/639483181163671325

Training ComplementNB with TF-IDF...

Best parameters for ComplementNB with TF-IDF: {'alpha': 1.0}





Results for ComplementNB with TF-IDF:
Accuracy: 0.7277
Balanced Accuracy: 0.6811

Classification Report:
              precision    recall  f1-score   support

    negative       0.37      0.59      0.46        76
     neutral       0.85      0.76      0.80       697
    positive       0.63      0.70      0.66       292

    accuracy                           0.73      1065
   macro avg       0.62      0.68      0.64      1065
weighted avg       0.75      0.73      0.74      1065

🏃 View run ComplementNB_TF-IDF_data-aug_02 at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/639483181163671325/runs/f61c08b7b67443c5b5cebc33c01290bc
🧪 View experiment at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/639483181163671325

Training BernoulliNB with Bag of Words...

Best parameters for BernoulliNB with Bag of Words: {'alpha': 0.05}





Results for BernoulliNB with Bag of Words:
Accuracy: 0.7108
Balanced Accuracy: 0.6314

Classification Report:
              precision    recall  f1-score   support

    negative       0.27      0.47      0.35        76
     neutral       0.83      0.76      0.79       697
    positive       0.65      0.66      0.66       292

    accuracy                           0.71      1065
   macro avg       0.59      0.63      0.60      1065
weighted avg       0.74      0.71      0.72      1065

🏃 View run BernoulliNB_Bag of Words_data-aug_02 at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/639483181163671325/runs/23698747c20942269c20f16992426544
🧪 View experiment at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/639483181163671325

Training BernoulliNB with TF-IDF...

Best parameters for BernoulliNB with TF-IDF: {'alpha': 0.05}





Results for BernoulliNB with TF-IDF:
Accuracy: 0.7108
Balanced Accuracy: 0.6314

Classification Report:
              precision    recall  f1-score   support

    negative       0.27      0.47      0.35        76
     neutral       0.83      0.76      0.79       697
    positive       0.65      0.66      0.66       292

    accuracy                           0.71      1065
   macro avg       0.59      0.63      0.60      1065
weighted avg       0.74      0.71      0.72      1065

🏃 View run BernoulliNB_TF-IDF_data-aug_02 at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/639483181163671325/runs/7eea78ff95764006a28d895f85e5bbc6
🧪 View experiment at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/639483181163671325

Training GaussianNB with Bag of Words...

Best parameters for GaussianNB with Bag of Words: N/A





Results for GaussianNB with Bag of Words:
Accuracy: 0.4460
Balanced Accuracy: 0.4499

Classification Report:
              precision    recall  f1-score   support

    negative       0.12      0.26      0.17        76
     neutral       0.76      0.34      0.47       697
    positive       0.37      0.75      0.49       292

    accuracy                           0.45      1065
   macro avg       0.42      0.45      0.38      1065
weighted avg       0.61      0.45      0.45      1065

🏃 View run GaussianNB_Bag of Words_data-aug_02 at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/639483181163671325/runs/f3d63a50d6f04761a129a2e2fdd33fe8
🧪 View experiment at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/639483181163671325

Training GaussianNB with TF-IDF...

Best parameters for GaussianNB with TF-IDF: N/A





Results for GaussianNB with TF-IDF:
Accuracy: 0.4629
Balanced Accuracy: 0.4440

Classification Report:
              precision    recall  f1-score   support

    negative       0.12      0.25      0.16        76
     neutral       0.75      0.39      0.51       697
    positive       0.37      0.69      0.48       292

    accuracy                           0.46      1065
   macro avg       0.41      0.44      0.39      1065
weighted avg       0.60      0.46      0.48      1065

🏃 View run GaussianNB_TF-IDF_data-aug_02 at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/639483181163671325/runs/393e635e38ff4d0aae5f1e68ac93b16e
🧪 View experiment at: http://ec2-65-0-103-88.ap-south-1.compute.amazonaws.com:5000/#/experiments/639483181163671325
