In [2]:
!pip install pandas numpy nltk scikit-learn tqdm




# Dataset Loading and Preparation 

In [3]:
import pandas as pd

# Load the dataset
# NOTE: machine-specific absolute path -- replace with your actual file path
csv_path = r"C:\Users\HP\Downloads\cleaned_amazon_reviews_2017_2018.csv"
df = pd.read_csv(csv_path)

# Report the frame's (rows, columns) shape, then its column labels
print(df.shape)
print(df.columns)



(57449, 6)
Index(['asin', 'reviewText', 'summary', 'price', 'brand', 'title'], dtype='object')


In [4]:
df.head()  # Preview the first 5 records (head() defaults to n=5)

Unnamed: 0,asin,reviewText,summary,price,brand,title
0,B000050FDY,Cleans the razor heads just fine. I found it a...,Razor cleanliness.,$11.94,Braun,Braun Clean &amp; Renew Refill Cartridges CCR ...
1,B000050FDY,As described. Good price. Delivered on time.,Good price. Delivered on time,$11.94,Braun,Braun Clean &amp; Renew Refill Cartridges CCR ...
2,B000050FDY,works as advertised,works as advertised,$11.94,Braun,Braun Clean &amp; Renew Refill Cartridges CCR ...
3,B000050FDY,A little pricey for the short time they last.,They work well - a little pricey,$11.94,Braun,Braun Clean &amp; Renew Refill Cartridges CCR ...
4,B000050FDY,Great Cleaning soultion,Five Stars,$11.94,Braun,Braun Clean &amp; Renew Refill Cartridges CCR ...


In [5]:
# Drop entries with missing review text or summary
# (the star rating is parsed from the summary text in a later cell)
required_text_cols = ['reviewText', 'summary']
df = df.dropna(subset=required_text_cols)


In [6]:

# Keep only the first occurrence of each (reviewText, summary) pair so that
# exact repeats of the same review do not bias the data
dedup_keys = ['reviewText', 'summary']
df = df.drop_duplicates(subset=dedup_keys, keep='first')


In [7]:

# Lowercase the review text
reviews_lowercased = df['reviewText'].str.lower()
df['reviewText'] = reviews_lowercased



In [8]:
# Strip punctuation: every character that is neither a word character
# (letter, digit, underscore) nor whitespace is replaced by a single space
non_word_non_space = r'[^\w\s]'
df['reviewText'] = df['reviewText'].str.replace(non_word_non_space, ' ', regex=True)


In [9]:
# Lowercase the summary column (the star_mapping keys are all lowercase)
summary_lowercased = df['summary'].str.lower()
df['summary'] = summary_lowercased

import re

# Mapping of text to numeric rating
star_mapping = {
    'one star': 1,
    'two stars': 2,
    'three stars': 3,
    'four stars': 4,
    'five stars': 5,
    'one stars': 1,    # In case of plural mismatch
    'two star': 2,
    'three star': 3,
    'four star': 4,
    'five star': 5,
}

# Function to extract numeric star rating from text
def extract_star_rating(text):
    """Return the numeric star rating named in ``text``, or None.

    The keys of ``star_mapping`` are tried in insertion order and the rating
    of the first key that occurs as a whole phrase is returned. Matching is
    case-sensitive, so the text is expected to be lowercased already.

    Args:
        text: Summary text to scan. Non-string values (e.g. NaN) are treated
            as containing no rating.

    Returns:
        The mapped integer rating, or None when no key matches.
    """
    if not isinstance(text, str):
        return None
    for key, rating in star_mapping.items():
        # \b anchors keep a key from matching inside longer words, e.g.
        # 'one star' hiding in "someone started".
        if re.search(r'\b' + re.escape(key) + r'\b', text):
            return rating
    return None  # If no match, return None

# Derive a rating for every summary and store it as a new column
star_ratings = df['summary'].apply(extract_star_rating)
df['star_review'] = star_ratings

# Discard rows whose summary did not yield a rating
rating_cols = ['star_review']
df = df.dropna(subset=rating_cols)

# Cast the remaining ratings to int
df['star_review'] = df['star_review'].astype(int)

# Show the first few summaries next to their extracted rating
preview_cols = ['summary', 'star_review']
print(df[preview_cols].head())

      summary  star_review
4  five stars            5
5  five stars            5
6  five stars            5
7  four stars            4
8  five stars            5


In [10]:
# Map star ratings to categorical classes
def rating_to_class(rating):
    """Bucket a star rating into 'low', 'medium' or 'high'.

    Ratings equal to 1 or 2 map to 'low', a rating equal to 3 maps to
    'medium', and every other value maps to 'high'.
    """
    if rating in (1, 2):
        return 'low'
    return 'medium' if rating == 3 else 'high'

# Classify every star rating, then report how many reviews fall in each class
rating_classes = df['star_review'].apply(rating_to_class)
df['rating_class'] = rating_classes
class_counts = df['rating_class'].value_counts()
print(class_counts)

rating_class
high      11580
low        2850
medium     1227
Name: count, dtype: int64


In [11]:
# Show the column labels, which now include the derived
# star_review and rating_class columns
df.columns

Index(['asin', 'reviewText', 'summary', 'price', 'brand', 'title',
       'star_review', 'rating_class'],
      dtype='object')

In [12]:
# Preview the first ten rows, including the star_review and rating_class columns
df.head(n=10)

Unnamed: 0,asin,reviewText,summary,price,brand,title,star_review,rating_class
4,B000050FDY,great cleaning soultion,five stars,$11.94,Braun,Braun Clean &amp; Renew Refill Cartridges CCR ...,5,high
5,B000050FDY,great product and price,five stars,$11.94,Braun,Braun Clean &amp; Renew Refill Cartridges CCR ...,5,high
6,B000050FDY,great replacement kit,five stars,$11.94,Braun,Braun Clean &amp; Renew Refill Cartridges CCR ...,5,high
7,B000050FDY,expensive but it works just as advertised,four stars,$11.94,Braun,Braun Clean &amp; Renew Refill Cartridges CCR ...,4,high
8,B000050FDY,excellent value,five stars,$11.94,Braun,Braun Clean &amp; Renew Refill Cartridges CCR ...,5,high
9,B000050FDY,great product,four stars,$11.94,Braun,Braun Clean &amp; Renew Refill Cartridges CCR ...,4,high
10,B000050FDY,doesn t last long,three stars,$11.94,Braun,Braun Clean &amp; Renew Refill Cartridges CCR ...,3,medium
12,B000050FDY,thank you,five stars,$11.94,Braun,Braun Clean &amp; Renew Refill Cartridges CCR ...,5,high
13,B000050FDY,used them all the time,five stars,$11.94,Braun,Braun Clean &amp; Renew Refill Cartridges CCR ...,5,high
16,B000050FDY,fast and perfect,five stars,$11.94,Braun,Braun Clean &amp; Renew Refill Cartridges CCR ...,5,high


# LIWC-like feature extraction

In [14]:
# LIWC-like dictionaries: one plain word list per category.
# All entries are lowercase single words.

# First-person singular pronouns
i_words_list = [
    "i", "me", "my", "mine", "myself",
]

# Positive tone
positive_words = [
    "good", "great", "excellent", "amazing",
    "love", "fantastic", "happy", "satisfied",
]

# Negative tone
negative_words = [
    "bad", "terrible", "awful", "poor",
    "hate", "disappointed", "angry", "worst",
]

# Social references
social_words = [
    "we", "us", "our", "they",
    "their", "friends", "family", "everyone",
]

# Cognitive processes
cognitive_words = [
    "because", "think", "know", "reason",
    "why", "understand", "thought",
]

# Allure
allure_words = [
    "free", "win", "sale", "new", "now",
    "offer", "best", "guarantee", "exclusive",
]

# Moralization
moralization_words = [
    "should", "must", "ought", "right", "wrong",
    "honor", "shame", "deserve", "honest",
]


In [1]:
import numpy
import nltk
# Record the library versions this notebook was run with
print(f"NumPy version: {numpy.__version__}")
print(f"NLTK version: {nltk.__version__}")


NumPy version: 1.24.3
NLTK version: 3.8.1


In [15]:
import nltk
# Fetch NLTK's 'punkt' tokenizer data so nltk.word_tokenize is available.
# count_category_words below tokenizes with plain str.split(), so this
# download only matters if the word_tokenize alternative is used instead.
nltk.download('punkt')  

def count_category_words(text, word_list):
    """Return how many whitespace-separated tokens of ``text`` are in ``word_list``.

    Every occurrence is counted, so a repeated word contributes once per
    appearance. Tokens are compared exactly as they appear, so ``text`` should
    already be lowercased and stripped of punctuation.
    """
    return sum(1 for token in text.split() if token in word_list)

# Alternatively, one could use nltk.word_tokenize(text) for more robust tokenization.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [17]:
from tqdm import tqdm
tqdm.pandas()  # enables progress_apply on pandas Series

# Output column -> LIWC-like word list it is counted against.
# Insertion order fixes the order in which the columns are added to df.
liwc_feature_words = {
    'i_words_count': i_words_list,
    'positive_tone_count': positive_words,
    'negative_tone_count': negative_words,
    'social_words_count': social_words,
    'cognitive_proc_count': cognitive_words,
    'allure_count': allure_words,
    'moralization_count': moralization_words,
}

# Create one count column per category; each pass shows its own progress bar
for feature_col, category_words in liwc_feature_words.items():
    df[feature_col] = df['reviewText'].progress_apply(
        # Bind the current word list as a default so the lambda does not
        # depend on the loop variable after this iteration.
        lambda review, words=category_words: count_category_words(review, words)
    )


100%|██████████| 15657/15657 [00:00<00:00, 163056.97it/s]
100%|██████████| 15657/15657 [00:00<00:00, 204803.42it/s]
100%|██████████| 15657/15657 [00:00<00:00, 156164.31it/s]
100%|██████████| 15657/15657 [00:00<00:00, 146322.71it/s]
100%|██████████| 15657/15657 [00:00<00:00, 194808.15it/s]
100%|██████████| 15657/15657 [00:00<00:00, 154534.16it/s]
100%|██████████| 15657/15657 [00:00<00:00, 172736.81it/s]


In [19]:
# Preview the first twenty rows together with the new LIWC-like count columns
df.head(n=20)

Unnamed: 0,asin,reviewText,summary,price,brand,title,star_review,rating_class,i_words_count,positive_tone_count,negative_tone_count,social_words_count,cognitive_proc_count,allure_count,moralization_count
4,B000050FDY,great cleaning soultion,five stars,$11.94,Braun,Braun Clean &amp; Renew Refill Cartridges CCR ...,5,high,0,1,0,0,0,0,0
5,B000050FDY,great product and price,five stars,$11.94,Braun,Braun Clean &amp; Renew Refill Cartridges CCR ...,5,high,0,1,0,0,0,0,0
6,B000050FDY,great replacement kit,five stars,$11.94,Braun,Braun Clean &amp; Renew Refill Cartridges CCR ...,5,high,0,1,0,0,0,0,0
7,B000050FDY,expensive but it works just as advertised,four stars,$11.94,Braun,Braun Clean &amp; Renew Refill Cartridges CCR ...,4,high,0,0,0,0,0,0,0
8,B000050FDY,excellent value,five stars,$11.94,Braun,Braun Clean &amp; Renew Refill Cartridges CCR ...,5,high,0,1,0,0,0,0,0
9,B000050FDY,great product,four stars,$11.94,Braun,Braun Clean &amp; Renew Refill Cartridges CCR ...,4,high,0,1,0,0,0,0,0
10,B000050FDY,doesn t last long,three stars,$11.94,Braun,Braun Clean &amp; Renew Refill Cartridges CCR ...,3,medium,0,0,0,0,0,0,0
12,B000050FDY,thank you,five stars,$11.94,Braun,Braun Clean &amp; Renew Refill Cartridges CCR ...,5,high,0,0,0,0,0,0,0
13,B000050FDY,used them all the time,five stars,$11.94,Braun,Braun Clean &amp; Renew Refill Cartridges CCR ...,5,high,0,0,0,0,0,0,0
16,B000050FDY,fast and perfect,five stars,$11.94,Braun,Braun Clean &amp; Renew Refill Cartridges CCR ...,5,high,0,0,0,0,0,0,0
