In [None]:
import mlflow
import pandas as pd
import mlflow.sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np

# Fetch every NLTK resource the preprocessing helpers rely on:
#   - 'stopwords' is read by remove_stop_words (stopwords.words("english"));
#     without it a fresh environment raises LookupError on first use.
#   - 'wordnet' backs WordNetLemmatizer in lemmatization.
# Downloads are skipped when the package is already up to date.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\visha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [32]:
# Load the raw review data from the working directory.
# The trailing expression displays the frame's (rows, columns).
df = pd.read_csv(filepath_or_buffer="IMDB.csv")
df.shape

(1000, 2)

In [33]:
# Work on a random subset of at most 500 rows to keep the experiment fast.
# random_state pins the draw so Restart & Run All selects the same rows (and
# therefore reproduces the same metrics); min() keeps the cell from raising
# when the input file holds fewer than 500 rows.
df = df.sample(n=min(500, len(df)), random_state=42)
df.shape

(500, 2)

In [34]:
# Persist the sampled subset (without the index column) and preview it.
df.to_csv(path_or_buf="data.csv", index=False)
df.head(n=5)

Unnamed: 0,review,sentiment
825,How can anyone argue the fact that Urban Cowbo...,positive
932,"This should be re-named ""Everybody Loves Sebas...",negative
541,I spent 5 hours drenched in this film. Nothing...,positive
593,"Now I love Bela Lugosi,don't get me wrong,he i...",negative
680,"Growing up with the Beast Wars transformers, I...",negative


## Data preprocessing

In [28]:
# Define text preprocessing functions

def lemmatization(text):
    """Replace each whitespace-separated token of ``text`` with its lemma.

    Every token is passed through ``WordNetLemmatizer.lemmatize`` with the
    lemmatizer's default arguments. Tokens are re-joined with single spaces,
    so any run of whitespace in the input collapses to one space.
    """
    wnl = WordNetLemmatizer()
    return " ".join(wnl.lemmatize(token) for token in text.split())

def remove_stop_words(text):
    """Drop English stop words from ``text``.

    The text is split on whitespace and each token is compared, exactly and
    case-sensitively, against NLTK's English stop-word list. Surviving tokens
    are re-joined with single spaces.
    """
    english_stops = set(stopwords.words("english"))
    kept_tokens = filter(lambda token: token not in english_stops, text.split())
    return " ".join(kept_tokens)

def removing_numbers(text):
    """Return ``text`` with every digit character stripped out.

    A character is dropped when ``str.isdigit`` is true for it; all other
    characters, including whitespace, are kept in their original order.
    """
    return ''.join(ch for ch in text if not ch.isdigit())

def lower_case(text):
    """Lower-case every whitespace-separated token of ``text``.

    Tokens are re-joined with single spaces, so leading/trailing whitespace
    is removed and internal runs of whitespace collapse to one space.
    """
    return " ".join(map(str.lower, text.split()))

def removing_punctuations(text):
    """Delete every character listed in ``string.punctuation`` from ``text``.

    Only those ASCII punctuation characters are removed; letters, digits,
    whitespace and any other characters pass through unchanged.
    """
    # A translation table mapping each punctuation character to None deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

def removing_urls(text):
    """Strip URLs from ``text``.

    A URL is a run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``. Matching is case-sensitive and the
    surrounding whitespace is left as it was.
    """
    url_matcher = re.compile(r"http[s]?://\S+|www\.\S+")
    return url_matcher.sub("", text)
    
def normalize_text(df):
    """Normalize the ``review`` column of ``df`` and return the result.

    The cleaning steps are applied to every review in this order:

    1. ``lower_case``            - lower-case and collapse whitespace
    2. ``removing_urls``         - strip http(s):// and www. links
    3. ``remove_stop_words``     - drop English stop words
    4. ``removing_numbers``      - drop digit characters
    5. ``removing_punctuations`` - drop ASCII punctuation
    6. ``lemmatization``         - reduce tokens to their lemma

    URL removal has to run before punctuation removal: once ``://`` and ``.``
    are stripped, the URL pattern can no longer match and link fragments
    would leak into the vocabulary. It runs after lower-casing because the
    pattern only matches lower-case ``http``/``www``.

    Args:
        df: DataFrame with a ``review`` column of strings.

    Returns:
        A new DataFrame with the normalized ``review`` column. The input
        frame is left untouched, so re-running a cell never compounds
        changes on a frame that an earlier cell displayed.

    Raises:
        Exception: any error from a cleaning step is printed and re-raised.
    """
    cleaning_steps = (
        lower_case,
        removing_urls,
        remove_stop_words,
        removing_numbers,
        removing_punctuations,
        lemmatization,
    )
    try:
        # Work on a copy so the caller's frame is not mutated.
        normalized = df.copy()
        reviews = normalized['review']
        for step in cleaning_steps:
            reviews = reviews.apply(step)
        normalized['review'] = reviews
        return normalized
    except Exception as e:
        print(f'Error during text normalization: {e}')
        raise



In [29]:
# Run the text-cleaning pipeline over the reviews and preview the result.
df = df.pipe(normalize_text)
df.head(n=5)

Unnamed: 0,review,sentiment
449,currently playing german film festival austral...,positive
203,take place harlem black owned nightclub deal g...,negative
583,wow strange film david lynch movie surprise we...,positive
306,watched love life holiday filmed film festival...,positive
543,like watching right like oh totally awesomeful...,positive


In [36]:
# Class balance check: number of reviews per sentiment label.
df.loc[:, 'sentiment'].value_counts()

sentiment
negative    264
positive    236
Name: count, dtype: int64

In [40]:
# Keep only rows whose label is one of the two expected sentiment classes.
x = df['sentiment'].isin(['positive','negative'])
# .copy() makes the filtered frame independent of its parent, so assigning to
# its columns afterwards cannot raise SettingWithCopyWarning.
df = df[x].copy()

In [41]:
# Encode the string labels as integers for the classifier.
# Values missing from the mapping become NaN, so this cell is meant to run
# exactly once per kernel session.
sentiment_codes = {'positive':1, 'negative':0}
df['sentiment'] = df['sentiment'].map(sentiment_codes)
df.head(n=5)

Unnamed: 0,review,sentiment
825,How can anyone argue the fact that Urban Cowbo...,1
932,"This should be re-named ""Everybody Loves Sebas...",0
541,I spent 5 hours drenched in this film. Nothing...,1
593,"Now I love Bela Lugosi,don't get me wrong,he i...",0
680,"Growing up with the Beast Wars transformers, I...",0


In [42]:
# Missing-value count per column (isna is the same check as isnull).
df.isna().sum()

review       0
sentiment    0
dtype: int64

## Feature Engineering

# Target vector: the sentiment column of the prepared frame.
y = df['sentiment']

# Bag-of-words features capped at a 100-term vocabulary.
# NOTE(review): the vocabulary is fitted on all rows before the train/test
# split, so the held-out reviews influence which terms are kept - confirm
# this is acceptable for a baseline.
vectorizer = CountVectorizer(max_features=100)
X = vectorizer.fit_transform(raw_documents=df['review'])

In [44]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=25,
)

In [45]:
import dagshub

# Connect this session to the DagsHub repository with its MLflow integration
# switched on.
dagshub.init(
    repo_owner='vishalchauhan91196',
    repo_name='MLOps-Production-System',
    mlflow=True,
)

# Send runs to the repository's MLflow tracking server and select the
# experiment to log under. The trailing set_experiment call also displays the
# Experiment object it returns.
mlflow.set_tracking_uri("https://dagshub.com/vishalchauhan91196/MLOps-Production-System.mlflow")
mlflow.set_experiment("Logistic Regression Baseline")

2025/12/16 14:48:24 INFO mlflow.tracking.fluent: Experiment with name 'Logistic Regression Baseline' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/411e33a2042041a99bbf434ce54f4725', creation_time=1765876706045, experiment_id='0', last_update_time=1765876706045, lifecycle_stage='active', name='Logistic Regression Baseline', tags={}>

In [None]:
import logging
import os
import time

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

logging.info("Starting MLflow run...")

# Train the Logistic Regression baseline inside a single MLflow run, recording
# the preprocessing parameters, the evaluation metrics and the fitted model.
with mlflow.start_run():
    start_time = time.time()

    try:
        logging.info("Logging preprocessing parameters...")
        # Read the logged values from the objects actually used for training
        # rather than repeating literals: the hold-out fraction comes from the
        # real split sizes and the vocabulary cap from the fitted vectorizer,
        # so the tracked parameters cannot drift from the experiment.
        n_train, n_test = X_train.shape[0], X_test.shape[0]
        mlflow.log_params({"vectorizer": "Bag of Words",
                           "max_features": vectorizer.max_features,
                           "test_size": round(n_test / (n_train + n_test), 2),
                           "model": "Logistic Regression"})

        logging.info("Initializing Logistic Regression model...")
        model = LogisticRegression(max_iter=1000)

        logging.info("Fitting the model...")
        model.fit(X_train, y_train)
        logging.info("Model training complete.")

        logging.info("Making predictions...")
        y_pred = model.predict(X_test)

        logging.info("Calculating evaluation metrics...")
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        logging.info("Logging evaluation metrics...")
        mlflow.log_metrics({"accuracy": accuracy,
                            "precision": precision,
                            "recall": recall,
                            "f1_score": f1})

        logging.info("Saving and logging the model...")
        mlflow.sklearn.log_model(model, "model")

        # Log execution time
        end_time = time.time()
        logging.info(f"Model training and logging completed in {end_time - start_time:.2f} seconds.")

        logging.info(f"Accuracy: {accuracy}")
        logging.info(f"Precision: {precision}")
        logging.info(f"Recall: {recall}")
        logging.info(f"F1 Score: {f1}")

    except Exception as e:
        logging.error(f"An error occurred: {e}", exc_info=True)
        # Re-raise so the exception leaves the `with` block: MLflow then ends
        # the run as FAILED instead of recording a half-logged run as
        # FINISHED, and the notebook cell visibly fails.
        raise