In [1]:
import pandas as pd


In [2]:
pd.read_csv("data.csv")

Unnamed: 0,reviews,sentiment
0,Every great gangster movie has under-currents ...,positive
1,"I just saw this film last night, and I have to...",positive
2,This film is mildly entertaining if one neglec...,negative
3,Quentin Tarantino's partner in crime Roger Ava...,negative
4,I sat through this on TV hoping because of the...,negative
...,...,...
495,I was really disappointed by this movie. Great...,negative
496,"This is a great example of a good, dumb movie....",positive
497,Do you know that they want to escavate the Moo...,negative
498,I really wanted to like The Pillow Book. Intri...,negative


In [2]:
from src.constens import CONFIG
import os
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import scipy.sparse
import warnings
nltk.download('stopwords')
nltk.download('wordnet')

def remove_stop_words(text):
    stop_words = set(stopwords.words("english"))
    return " ".join([word for word in text.split() if word not in stop_words])

def remove_number(text):
    return re.sub(r'\d+', '', text)

def lower(text):
    return text.lower()

def lemmatization(text):
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(word) for word in text.split())

def remove_punctuation(text):
    return re.sub(f"[{re.escape(string.punctuation)}]", " ", text)

def remove_url(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def normalize_text(df):
    try:
        return df["reviews"].apply(lower)\
                            .apply(remove_punctuation)\
                            .apply(remove_stop_words)\
                            .apply(remove_number)\
                            .apply(lemmatization)\
                            .apply(remove_url)
    except Exception as e:
        print(f"Error during normalization: {e}")

def load_data(file_path):
    df = pd.read_csv(file_path)
    df["reviews"] = normalize_text(df)
    df = df[df["sentiment"].isin(["positive", "negative"])]
    df['sentiment'] = df["sentiment"].map({"positive": 1, "negative": 0})
    return df

df = load_data("data.csv")
df

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\suraj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\suraj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,reviews,sentiment
0,every great gangster movie current human drama...,1
1,saw film last night say loved every minute tak...,1
2,film mildly entertaining one neglect acknowled...,0
3,quentin tarantino partner crime roger avary co...,0
4,sat tv hoping name would worth time dear gussi...,0
...,...,...
495,really disappointed movie great actor potentia...,0
496,great example good dumb movie high art mean sc...,1
497,know want escavate moon real geneve debated pr...,0
498,really wanted like pillow book intriguing stor...,0


In [4]:
df.isnull().sum()

reviews      0
sentiment    0
dtype: int64

In [6]:
empty_rows = df['reviews'] == ''
print(df[empty_rows])


Empty DataFrame
Columns: [reviews, sentiment]
Index: []


In [7]:
print(df["reviews"].duplicated().sum())


0


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(df["reviews"])
print(vectorizer.get_feature_names_out())


['aag' 'abandon' 'abandoned' ... 'zu' 'zuckers' 'étc']


In [10]:
print(df['sentiment'].value_counts())


sentiment
0    269
1    231
Name: count, dtype: int64


In [22]:
from src.constants import CONFIG
import os
import pandas as pd
import numpy as np
import dagshub
import mlflow
import mlflow.sklearn
import re
import string
from src.logger import logging
from src.Exception import MyException
import time
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
from xgboost import XGBClassifier
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import scipy.sparse
import warnings
import sys

warnings.filterwarnings("ignore")
nltk.download('stopwords')
nltk.download('wordnet')
logging.info("nltk libraries downloading done")

# try:
#     mlflow.set_tracking_uri(CONFIG["mlflow_tracking_uri"])
#     dagshub.init(repo_owner=CONFIG["dagshub_repo_owner"], repo_name=CONFIG["dagshub_repo_name"], mlflow=True)

#     mlflow.set_experiment(CONFIG["experiment_name"])
#     logging.info("MLflow and Dagshub initialized successfully.")
# except Exception as e:
#     logging.error(f"Error initializing MLflow and Dagshub: {e}")
#     raise MyException(f"Initialization error: {e}")

def remove_stop_words(text):
    try:
        stop_words = set(stopwords.words("english"))
        return " ".join([word for word in text.split() if word not in stop_words])
    except Exception as e:
        raise MyException(e, sys)

def remove_number(text):
    try:
        return re.sub(r'\d+', '', text)
    except Exception as e:
        raise MyException(e, sys)

def lower(text):
    try:
        return text.lower()
    except Exception as e:
        raise MyException(e, sys)
    
def lemmatization(text):
    try:
        lemmatizer = WordNetLemmatizer()
        return " ".join(lemmatizer.lemmatize(word) for word in text.split())
    except Exception as e:
        raise MyException(e, sys)
    
def remove_punctuation(text):
    try:
        return re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    except Exception as e:
        raise MyException(e, sys)

def remove_url(text):
    try:
        return re.sub(r'https?://\S+|www\.\S+', '', text)
    except Exception as e:
        raise MyException(e, sys)

def normalize_text(df):
    try:
        logging.info("Text normalization initialized")
        df["reviews"] = df["reviews"].apply(lambda x: lower(x))\
                                    .apply(lambda x: remove_punctuation(x))\
                                    .apply(lambda x: remove_stop_words(x))\
                                    .apply(lambda x: remove_number(x))\
                                    .apply(lambda x: lemmatization(x))\
                                    .apply(lambda x: remove_url(x))
        logging.info("Text normalization completed")
        return df
    except Exception as e:
        raise MyException(e, sys)


def load_data(file_path):
    try:
        df = pd.read_csv(file_path)
        logging.info("Data loading completed")

        df = normalize_text(df)

        df = df[df["sentiment"].isin(["positive", "negative"])]
        df['sentiment'] = df["sentiment"].map({"positive": 1, "negative": 0})
        logging.info("Target column encoding completed")
        return df
    except Exception as e:
        raise MyException(e, sys)



df = load_data("data.csv")


[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading wordnet: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[2025-03-18 09:52:29,492] root - INFO - nltk libraries downloading done


[2025-03-18 09:52:29,511] root - INFO - Data loading completed
[2025-03-18 09:52:29,517] root - INFO - Text normalization initialized
[2025-03-18 09:52:29,846] root - INFO - Text normalization completed
[2025-03-18 09:52:29,846] root - INFO - Target column encoding completed


In [14]:
df["sentiment"].value_counts()

sentiment
0    269
1    231
Name: count, dtype: int64

In [13]:
df = df[df["sentiment"].isin(["positive", "negative"])]
df['sentiment'] = df["sentiment"].map({"positive": 1, "negative": 0})

In [19]:
df = pd.read_csv("data.csv")
df.head()


Unnamed: 0,reviews,sentiment
0,Every great gangster movie has under-currents ...,positive
1,"I just saw this film last night, and I have to...",positive
2,This film is mildly entertaining if one neglec...,negative
3,Quentin Tarantino's partner in crime Roger Ava...,negative
4,I sat through this on TV hoping because of the...,negative


In [20]:
df = df[df["sentiment"].isin(["positive", "negative"])]

df['sentiment'] = df["sentiment"].map({"positive": 1, "negative": 0})

In [21]:
df

Unnamed: 0,reviews,sentiment
0,Every great gangster movie has under-currents ...,1
1,"I just saw this film last night, and I have to...",1
2,This film is mildly entertaining if one neglec...,0
3,Quentin Tarantino's partner in crime Roger Ava...,0
4,I sat through this on TV hoping because of the...,0
...,...,...
495,I was really disappointed by this movie. Great...,0
496,"This is a great example of a good, dumb movie....",1
497,Do you know that they want to escavate the Moo...,0
498,I really wanted to like The Pillow Book. Intri...,0
