In [40]:
import mlflow
import pandas as pd
import mlflow.sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
import pandas as pd
import re
import string 
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
import numpy as np
import os

In [3]:
df=pd.read_csv("IMDB.csv")
df=df.sample(500)
df.to_csv('data.csv',index=False)
df.head()


Unnamed: 0,review,sentiment
566,"Having read another review, I thought this mov...",negative
667,I saw the premier of this movie during the 200...,positive
2,The `plot' of this film contains a few holes y...,negative
302,"DOCTEUR PETIOT, starring Michel Serrault, is a...",positive
679,This show was so exhausting to watch and there...,positive


In [10]:
def lemmatization(text):
    lemmatizer=WordNetLemmatizer()
    text=text.split()
    text=[lemmatizer.lemmatize(word) for word in text]
    return " ".join(text)

def remove_stop_words(text):
    stop_words=set(stopwords.words("english"))
    text=[word for word in str(text).split() if word not in stop_words]
    return " ".join(text)

def removing_numbers(text):
    text=''.join([char for char in text if not char.isdigit()])
    return text 

def lower_case(text):
    text=text.split()
    text=[word.lower() for word in text]
    return " ".join(text)

def removing_punctuations(text):
    text=re.sub('[%s]' % re.escape(string.punctuation),' ',text)
    text=re.sub('\s+',' ',text).strip()
    return text 

def removing_urls(text):
    url_pattern=re.compile(r'https?://S+|www\.\S+')
    return url_pattern.sub(r'',text)

def normalize_text(df):
    try:
        df['review']=df['review'].apply(lower_case)
        df['review']=df['review'].apply(remove_stop_words)
        df['review']=df['review'].apply(removing_numbers)
        df['review']=df['review'].apply(removing_punctuations)
        df['review']=df['review'].apply(removing_urls)
        df['review']=df['review'].apply(lemmatization)
        return df 
    
    except Exception as e:
        print(f"Error during text normalization {e}")
        raise

  text=re.sub('\s+',' ',text).strip()


In [11]:
df=normalize_text(df)
df.head()

Unnamed: 0,review,sentiment
566,read another review thought movie would actual...,negative
667,saw premier movie phoenix film festival impres...,positive
2,plot film contains hole could drive massive tr...,negative
302,docteur petiot starring michel serrault brutal...,positive
679,show exhausting watch there s two number drown...,positive


In [12]:
df['sentiment'].value_counts()

sentiment
negative    265
positive    235
Name: count, dtype: int64

In [13]:
x=df['sentiment'].isin(['positive','negative'])
df=df[x]

In [14]:
df['sentiment']=df['sentiment'].map({'positive':1,'negative':0})
df.head()

Unnamed: 0,review,sentiment
566,read another review thought movie would actual...,0
667,saw premier movie phoenix film festival impres...,1
2,plot film contains hole could drive massive tr...,0
302,docteur petiot starring michel serrault brutal...,1
679,show exhausting watch there s two number drown...,1


In [15]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [16]:
vectorizer=CountVectorizer(max_features=100)
X=vectorizer.fit_transform(df['review'])
y=df['sentiment']

In [21]:
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.20,random_state=40)

In [47]:

mlflow.set_tracking_uri("file:///C:/Users/ZAID/Desktop/MLops/Movie-Analysis-MLOPS/mlruns")
mlflow.set_experiment("LogisticRegression Baseline")

2025/05/26 01:06:00 INFO mlflow.tracking.fluent: Experiment with name 'LogisticRegression Baseline' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///C:/Users/ZAID/Desktop/MLops/Movie-Analysis-MLOPS/mlruns/208724811623814255', creation_time=1748201760181, experiment_id='208724811623814255', last_update_time=1748201760181, lifecycle_stage='active', name='LogisticRegression Baseline', tags={}>

In [34]:
import mlflow
import os
import logging
import time
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score

In [48]:
import mlflow.sklearn


logging.basicConfig(level=logging.INFO,format="%(asctime)s - %(levelname)s - %(message)s")
logging.info("Starting mlflow run")


with mlflow.start_run():
    start_time=time.time()
    try:
        logging.info("Logging preprocessing parameters ....")
        mlflow.log_param("vectorizer","Bag of words")
        mlflow.log_param("num_features",100)
        mlflow.log_param("test_size",0.25)

        logging.info("Initalizing logistic model")
        model=LogisticRegression(max_iter=100)

        logging.info("Fitting the model")
        model.fit(X_train,y_train)
        logging.info("Model training completed")

        logging.info("Logging model parameters")
        mlflow.log_param("model","LogisticRegression")
        
        y_pred=model.predict(X_test)

        logging.info("Calcualtion evaluation metrics")
        accuracy=accuracy_score(y_test,y_pred)
        precision=precision_score(y_test,y_pred)
        recall=recall_score(y_test,y_pred)
        f1=f1_score(y_test,y_pred)

        logging.info("Logging the model")
        mlflow.sklearn.log_model(model,"model")

        logging.info(f"Accuracy : {accuracy}")
        logging.info(f"Precision : {precision}")
        logging.info(f"Recall : {recall}")
        logging.info(f"f1 score : {f1}")

    except Exception as e:
        logging.error(f"An error occured while trainig model {e}")



2025-05-26 01:06:04,605 - INFO - Starting mlflow run
2025-05-26 01:06:04,699 - INFO - Logging preprocessing parameters ....
2025-05-26 01:06:04,710 - INFO - Initalizing logistic model
2025-05-26 01:06:04,712 - INFO - Fitting the model
2025-05-26 01:06:04,734 - INFO - Model training completed
2025-05-26 01:06:04,735 - INFO - Logging model parameters
2025-05-26 01:06:04,741 - INFO - Calcualtion evaluation metrics
2025-05-26 01:06:04,752 - INFO - Logging the model
2025-05-26 01:06:11,468 - INFO - Accuracy : 0.59
2025-05-26 01:06:11,469 - INFO - Precision : 0.5827814569536424
2025-05-26 01:06:11,470 - INFO - Recall : 0.4656084656084656
2025-05-26 01:06:11,471 - INFO - f1 score : 0.5176470588235295
