### Importing necessary libraries

In [32]:
import pandas as pd
import numpy as np

# Data Loading

In [33]:
df = pd.read_csv("Reviews.csv")  

### First Few Rows

In [34]:
# Display the first few rows
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


### Data Structure

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568428 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB


### Columns

In [36]:
df.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

### Shape

In [37]:
df.shape

(568454, 10)

### Data Description

In [38]:
df.describe()

Unnamed: 0,Id,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time
count,568454.0,568454.0,568454.0,568454.0,568454.0
mean,284227.5,1.743817,2.22881,4.183199,1296257000.0
std,164098.679298,7.636513,8.28974,1.310436,48043310.0
min,1.0,0.0,0.0,1.0,939340800.0
25%,142114.25,0.0,0.0,4.0,1271290000.0
50%,284227.5,0.0,1.0,5.0,1311120000.0
75%,426340.75,2.0,2.0,5.0,1332720000.0
max,568454.0,866.0,923.0,5.0,1351210000.0


## Checking for missing values

In [39]:
df.isnull().sum()

Id                         0
ProductId                  0
UserId                     0
ProfileName               26
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64

In [40]:
# Word Cloud


# Data Preprocessing

### Handling Negation Properly
Instead of outright removing such words, you can process them to preserve their sentiment-altering role. Here's how:

- a. Keep Negations Intact
Retain negation words during preprocessing to ensure their influence on sentiment is captured.

- b. Negation Handling with Adjacent Words
You can append the negation word to the following term (e.g., not good â†’ not_good). This helps models treat it as a unique sentiment word.

In [29]:
import nltk
from nltk.corpus import stopwords
import re

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Define the negation handling function
def handle_negations(text):
    # Replace specific negation contractions with expanded forms
    text = re.sub(r"\b(can't|cannot)\b", "can_not", text)
    text = re.sub(r"\b(won't)\b", "will_not", text)
    text = re.sub(r"\b(don't)\b", "do_not", text)
    text = re.sub(r"\b(doesn't)\b", "does_not", text)
    text = re.sub(r"\b(didn't)\b", "did_not", text)
    text = re.sub(r"\b(haven't)\b", "have_not", text)
    text = re.sub(r"\b(hadn't)\b", "had_not", text)
    text = re.sub(r"\b(wouldn't)\b", "would_not", text)
    text = re.sub(r"\b(shouldn't)\b", "should_not", text)
    text = re.sub(r"\b(mustn't)\b", "must_not", text)
    text = re.sub(r"\b(mightn't)\b", "might_not", text)
    text = re.sub(r"\b(needn't)\b", "need_not", text)

    # Combine general "not" with the next word if applicable
    text = re.sub(r"\b(not|no|nor)\s+(\w+)", r"not_\2", text)
    
    return text

# Define the main text cleaning function
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Apply negation handling before removing punctuation
    text = handle_negations(text)
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Remove extra whitespaces
    text = text.strip()
    
    # Tokenize text
    words = text.split()
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    
    # Add custom stop words specific to your dataset
    custom_stopwords = set(['product', 'amazon', 'would', 'one', 'also', 'could', 'like', 'get', 'use', 'really', 'good', 'great'])
    all_stopwords = stop_words.union(custom_stopwords)
    
    words = [word for word in words if word not in all_stopwords]
    
    # Rejoin words
    cleaned_text = ' '.join(words)
    
    return cleaned_text

# Apply the cleaning function to the 'Text' column
df['Cleaned_Text'] = df['Text'].apply(clean_text)

# Preview the cleaned text
print(df[['Text', 'Cleaned_Text']].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yeshw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                Text  \
0  I have bought several of the Vitality canned d...   
1  Product arrived labeled as Jumbo Salted Peanut...   
2  This is a confection that has been around a fe...   
3  If you are looking for the secret ingredient i...   
4  Great taffy at a great price.  There was a wid...   

                                        Cleaned_Text  
0  bought several vitality canned dog food produc...  
1  arrived labeled jumbo salted peanutsthe peanut...  
2  confection around centuries light pillowy citr...  
3  looking secret ingredient robitussin believe f...  
4  taffy price wide assortment yummy taffy delive...  


In [31]:
df[["Text", "Cleaned_Text"]].head(25)

Unnamed: 0,Text,Cleaned_Text
0,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...
1,Product arrived labeled as Jumbo Salted Peanut...,arrived labeled jumbo salted peanutsthe peanut...
2,This is a confection that has been around a fe...,confection around centuries light pillowy citr...
3,If you are looking for the secret ingredient i...,looking secret ingredient robitussin believe f...
4,Great taffy at a great price. There was a wid...,taffy price wide assortment yummy taffy delive...
5,I got a wild hair for taffy and ordered this f...,got wild hair taffy ordered five pound bag taf...
6,This saltwater taffy had great flavors and was...,saltwater taffy flavors soft chewy candy indiv...
7,This taffy is so good. It is very soft and ch...,taffy soft chewy flavors amazing definitely re...
8,Right now I'm mostly just sprouting this so my...,right im mostly sprouting cats eat grass love ...
9,This is a very healthy dog food. Good for thei...,healthy dog food digestion small puppies dog e...


In [12]:
import nltk
from nltk.corpus import stopwords
import re

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Define a function for text cleaning
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Remove extra whitespaces
    text = text.strip()
    
    # Tokenize text
    words = text.split()
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    
    # Add custom stop words specific to your dataset
    custom_stopwords = set(['product', 'amazon', 'would', 'one', 'also', 'could', 'like', 'get', 'use', 'really', 'good', 'great'])
    all_stopwords = stop_words.union(custom_stopwords)
    
    words = [word for word in words if word not in all_stopwords]
    
    # Rejoin words
    cleaned_text = ' '.join(words)
    
    return cleaned_text

# Apply the cleaning function to the 'Text' column
df['Cleaned_Text'] = df['Text'].apply(clean_text)

# Preview the cleaned text
print(df[['Text', 'Cleaned_Text']].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yeshw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                Text  \
0  I have bought several of the Vitality canned d...   
1  Product arrived labeled as Jumbo Salted Peanut...   
2  This is a confection that has been around a fe...   
3  If you are looking for the secret ingredient i...   
4  Great taffy at a great price.  There was a wid...   

                                        Cleaned_Text  
0  bought several vitality canned dog food produc...  
1  arrived labeled jumbo salted peanutsthe peanut...  
2  confection around centuries light pillowy citr...  
3  looking secret ingredient robitussin believe f...  
4  taffy price wide assortment yummy taffy delive...  


### Stemming and Lemmatization
Applying lemmatization can help reduce words to their base forms, which can further improve the quality of the word cloud and analysis

In [13]:
from nltk.stem import WordNetLemmatizer

# Download WordNet data if not already downloaded
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

def clean_text_lemmatization(text):
    # Previous cleaning steps
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.strip()
    words = text.split()
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    custom_stopwords = set(['product', 'amazon', 'would', 'one', 'also', 'could', 'like', 'get', 'use', 'really', 'good', 'great'])
    all_stopwords = stop_words.union(custom_stopwords)
    words = [word for word in words if word not in all_stopwords]
    
    # Lemmatization
    words = [lemmatizer.lemmatize(word) for word in words]
    
    cleaned_text = ' '.join(words)
    
    return cleaned_text

# Apply the cleaning function with lemmatization
df['Cleaned_Text'] = df['Text'].apply(clean_text_lemmatization)

# Preview the cleaned text
print(df[['Text', 'Cleaned_Text']].head())


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yeshw\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\yeshw\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


                                                Text  \
0  I have bought several of the Vitality canned d...   
1  Product arrived labeled as Jumbo Salted Peanut...   
2  This is a confection that has been around a fe...   
3  If you are looking for the secret ingredient i...   
4  Great taffy at a great price.  There was a wid...   

                                        Cleaned_Text  
0  bought several vitality canned dog food produc...  
1  arrived labeled jumbo salted peanutsthe peanut...  
2  confection around century light pillowy citrus...  
3  looking secret ingredient robitussin believe f...  
4  taffy price wide assortment yummy taffy delive...  


### Benefits of Keeping Negations
- Improved Model Performance:
Retaining negations preserves the polarity of the sentiment, improving sentiment classification accuracy.
- Enhanced Word Embeddings:
Models like Word2Vec or TF-IDF can capture "not_good" as a single term, distinct from "good".
- BERT and Advanced Models:
Context-aware models like BERT naturally handle negations better when they are kept in the text.

In [None]:
from collections import Counter

# Combine all cleaned text
all_words = ' '.join(df['Cleaned_Text']).split()

# Get word frequencies
word_freq = Counter(all_words)

# Define frequency thresholds
min_freq = 5  # Adjust based on your dataset
max_freq = 10000

# Filter words in 'Cleaned_Text'
def filter_freq_words(text):
    words = text.split()
    filtered_words = [word for word in words if min_freq <= word_freq[word] <= max_freq]
    return ' '.join(filtered_words)

df['Cleaned_Text'] = df['Cleaned_Text'].apply(filter_freq_words)
