In [1]:
#Load the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize

import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split

reviews = pd.read_csv("reviews.csv")
reviews['sentiment'] = reviews['sentiment'].apply(lambda x: 0 if x == "negative" else 1)
reviews.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [None]:
X_train, X_test, y_train, y_test = train_test_split(reviews['review'],
                                                    reviews['sentiment'],
                                                    test_size=0.2,
                                                    random_state=42)

reviews_train = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
reviews_test = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)

print("len(reviews_train), len(reviews_test)", len(reviews_train), len(reviews_test))
reviews_train.head()

len(reviews_train), len(reviews_test) 40000 10000


Unnamed: 0,review,sentiment
0,That's what I kept asking myself during the ma...,0
1,I did not watch the entire movie. I could not ...,0
2,A touching love story reminiscent of In the M...,1
3,This latter-day Fulci schlocker is a totally a...,0
4,"First of all, I firmly believe that Norwegian ...",0


In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import ToktokTokenizer
from bs4 import BeautifulSoup
import string
import re
import nltk

# Download stopwords if not already downloaded
# nltk.download('stopwords')
# nltk.download('punkt')

tokenizer = ToktokTokenizer()
stopword_list = stopwords.words('english') + list(string.punctuation)

def clean_reviews(text):

    # Remove HTML tags
    def strip_html(text):
        soup = BeautifulSoup(text, "html.parser")
        return soup.get_text()

    # Remove text within square brackets
    def remove_between_square_brackets(text):
        return re.sub(r'\[[^]]*\]', '', text)
    
    # Remove special characters and optionally digits
    def remove_special_characters(text, remove_digits=True):
        pattern = r'[^a-zA-Z0-9\s]' if remove_digits else r'[^a-zA-Z\s]'
        return re.sub(pattern, '', text)

    # Perform stemming
    def simple_stemmer(text):
        ps = nltk.PorterStemmer()
        text = ' '.join([ps.stem(word) for word in text.split()])
        return text
    
    def remove_stopwords(text, is_lower_case=False):
        tokens = tokenizer.tokenize(text)
        tokens = [token.strip() for token in tokens]
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
        # filtered_tokens = ' '.join(filtered_tokens)
        return filtered_tokens

    # Clean text step by step
    # cleaned_text = strip_html(text)
    # cleaned_text = remove_between_square_brackets(cleaned_text)
    # cleaned_text = remove_special_characters(cleaned_text)
    # cleaned_text = simple_stemmer(cleaned_text) 
    cleaned_text = remove_stopwords(text)

    return cleaned_text

# Example usage
reviews_train['review'] = reviews_train['review'].apply(clean_reviews)
reviews_test['review'] = reviews_test['review'].apply(clean_reviews)

reviews_train.head()



Unnamed: 0,review,sentiment
0,"[kept, asking, many, fights, screaming, matche...",0
1,"[watch, entire, movie., could, watch, entire, ...",0
2,"[touching, love, story, reminiscent, In, Mood...",1
3,"[latter-day, Fulci, schlocker, totally, abysma...",0
4,"[First, firmly, believe, Norwegian, movies, co...",0


### Naive Bayes Model

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Step 1: Create the Vocabulary
corpus = reviews_train['review'].apply(lambda x: ' '.join(x))  

vectorizer = CountVectorizer(min_df=15, max_df=0.1) 
X_train_wc = vectorizer.fit_transform(corpus)  # Fit and transform the corpus

vocab_counts = pd.DataFrame(X_train_wc.toarray(), columns=vectorizer.get_feature_names_out())
reviews_train = pd.concat([reviews_train, vocab_counts], axis=1)


reviews_train = reviews_train.loc[:, ~reviews_train.columns.duplicated()]

vocab = vectorizer.get_feature_names_out().tolist()
n_vocab = len(vocab)


p_0 = reviews_train[reviews_train['sentiment'] == 0].shape[0] / reviews_train.shape[0]
p_1 = 1- p_0


In [5]:
n_0 = reviews_train.loc[reviews_train['sentiment'] == 0, 'review'].apply(len).sum()
n_1 = reviews_train.loc[reviews_train['sentiment'] == 1, 'review'].apply(len).sum()

# Step 3: Initialize Parameters
parameters_0 = {word:0 for word in vocab}
parameters_1 = {word:0 for word in vocab}


In [6]:
# Step 4: Calculate Parameters for Each Word
import numpy as np

for word in vocab:
    
    n_word_given_0 = reviews_train.loc[reviews_train['sentiment'] == 0, word].sum()
    n_word_given_1 = reviews_train.loc[reviews_train['sentiment'] == 1, word].sum()

    if type(n_word_given_0) == np.int64 and type(n_word_given_1) == np.int64:
        # # Apply Laplace smoothing
        parameters_0[word] = (n_word_given_0 + 1) / (n_0 + n_vocab)
        parameters_1[word] = (n_word_given_1 + 1) / (n_1 + n_vocab)

In [8]:






# Results
import math

def predict(text):

    cleaned_text = clean_reviews(text)
    log_p_0_given_x = math.log(p_0)
    log_p_1_given_x = math.log(p_1)
    for word in cleaned_text:
        log_p_0_given_x += math.log(parameters_0.get(word, 1))  # Thêm giá trị nhỏ để tránh log(0)
        log_p_1_given_x += math.log(parameters_1.get(word, 1))
    
    return 0 if log_p_0_given_x > log_p_1_given_x else 1


reviews_test['predict'] = reviews_test['review'].apply(predict)
reviews_test['accuracy'] = reviews_test['predict'] == reviews_test['sentiment']
reviews_test['accuracy'].value_counts(normalize=True)

ValueError: math domain error

accuracy
True     0.500975
False    0.499025
Name: proportion, dtype: float64

In [None]:
text = "This movie is very good !"
prediction = predict(text)
print("prediction:", prediction)

prediction: 0
