In [2]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'amazon-kindle-book-review-for-sentiment-analysis:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1569305%2F2583418%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240904%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240904T233137Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D00b07a3d9a650e6c246abf63d6b3a613b3fd9a4c2330f683acc011f39f3d5642ccc9f7231cc558c98379ef4dcee3e4c869d5e4eef10b476814abe81681b363384f49867ecf6d35b05af3c260fa5085748499db47fb878b2923a9b448806f9957f82a47aa95423afb9d5fe6f3e9f145c06afb596b2bfb6896ab495a221c457a4677c73a8eaaa63cb0d741c3ecfd686dc31cd579fe35031a192abe3dc4ec7616bf3c92a2c38ad64ecd709a16625e636ddc626be4c16c9c3744abf145cc1510462e1d08128680a1b9ad10e1f8b95967783d8d6c3f4216cdc1eeffd26af82c36011b0b7ee133ee0a91def6a90ebce834ed68dc29333c38bccc528d045024e7848947'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading amazon-kindle-book-review-for-sentiment-analysis, 6686485 bytes compressed
Downloaded and uncompressed: amazon-kindle-book-review-for-sentiment-analysis
Data source import complete.


In [29]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer



In [30]:
# loading our dataset

data=pd.read_csv('../input/amazon-kindle-book-review-for-sentiment-analysis/preprocessed_kindle_review .csv')

In [31]:
data.head()

Unnamed: 0.1,Unnamed: 0,rating,reviewText,summary
0,0,5,This book was the very first bookmobile book I...,50 + years ago...
1,1,1,"When I read the description for this book, I c...",Boring! Boring! Boring!
2,2,5,I just had to edit this review. This book is a...,Wiggleliscious/new toy ready/!!
3,3,5,I don't normally buy 'mystery' novels because ...,Very good read.
4,4,5,"This isn't the kind of book I normally read, a...",Great Story!


In [32]:
data =data[['reviewText','rating']]

In [33]:
data.head()

Unnamed: 0,reviewText,rating
0,This book was the very first bookmobile book I...,5
1,"When I read the description for this book, I c...",1
2,I just had to edit this review. This book is a...,5
3,I don't normally buy 'mystery' novels because ...,5
4,"This isn't the kind of book I normally read, a...",5


In [34]:
data.shape

(12000, 2)

In [35]:
data.isnull().sum()

Unnamed: 0,0
reviewText,0
rating,0


In [36]:
data['rating'].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
5,3000
4,3000
1,2000
3,2000
2,2000


**Preprocessing**

In [37]:
data['rating'] = data['rating'].apply( lambda x:1 if x<3 else 1)

In [38]:
data['rating'].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
1,12000


In [39]:
data.head()

Unnamed: 0,reviewText,rating
0,This book was the very first bookmobile book I...,1
1,"When I read the description for this book, I c...",1
2,I just had to edit this review. This book is a...,1
3,I don't normally buy 'mystery' novels because ...,1
4,"This isn't the kind of book I normally read, a...",1


In [40]:
from bs4 import BeautifulSoup

In [41]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize components
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Preprocessing function: returns a cleaned, tokenized, lemmatized text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization
    words = word_tokenize(text)

    # Remove stopwords
    words = [word for word in words if word not in stop_words]

    # Lemmatization
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    # Join the lemmatized words back into a single string
    preprocessed_text = ' '.join(lemmatized_words)

    return preprocessed_text

data = pd.DataFrame(data)

# Apply the preprocessing function to each review and replace the original text
data['reviewText'] = data['reviewText'].apply(preprocess_text)

# Display the preprocessed DataFrame
print(data)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                              reviewText  rating
0      book first bookmobile book bought school book ...       1
1      read description book couldnt wait read downlo...       1
2      edit review book believe got right updated rew...       1
3      dont normally buy mystery novel dont like howe...       1
4      isnt kind book normally read although try limi...       1
...                                                  ...     ...
11995  read certain passage twicetypos wish built rel...       1
11996  expected yet interesting book usually dont rea...       1
11997  dragon knight world knight ride dragon slay wi...       1
11998  since story short hard say much without giving...       1
11999  amazing collection info symbol culture around ...       1

[12000 rows x 2 columns]


In [42]:
# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize components
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Preprocessing function to clean and lemmatize text
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize the text
    words = word_tokenize(text)

    # Remove stopwords
    words = [word for word in words if word not in stop_words]

    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]


    preprocessed_text = ' '.join(lemmatized_words)

    return preprocessed_text


data['reviewText'] = data['reviewText'].apply(preprocess_text)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [43]:
data.head()

Unnamed: 0,reviewText,rating
0,book first bookmobile book bought school book ...,1
1,read description book couldnt wait read downlo...,1
2,edit review book believe got right updated rew...,1
3,dont normally buy mystery novel dont like howe...,1
4,isnt kind book normally read although try limi...,1


In [44]:
data['rating'].head()

Unnamed: 0,rating
0,1
1,1
2,1
3,1
4,1


In [75]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test= train_test_split(data['reviewText'] ,data['rating'],
                                                 test_size = 0.2)

In [76]:
X_train.shape

(9600,)

In [77]:
Y_train.shape

(9600,)

In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X_trian_tfidf = tfidf.fit_transform(X_train).toarray()

X_test_tfidf = tfidf.transform(X_test).toarray()

In [83]:
from sklearn.naive_bayes import GaussianNB
nb_model_bow = GaussianNB().fit(X_train_bow,Y_train)
nb_model_tfidf = GaussianNB().fit(X_trian_tfidf,Y_train)

In [88]:
from sklearn.metrics import confusion_matrix,accuracy_score

y_pred_bow =nb_model_bow.predict(X_test_bow)

In [90]:
y_pred_tfidf =nb_model_tfidf.predict(X_test_tfidf)

In [93]:
confusion_matrix(Y_test,y_pred_bow)

array([[2400]])

In [92]:
print("BOW accuracy:" ,accuracy_score(Y_test,y_pred_bow))

BOW accuracy: 1.0
