# Loading and Preprocessing Data

In [2]:
import pandas as pd
# Reading JSON file into a DataFrame
df = pd.read_json( 'Arts_Crafts_and_Sewing_5.json',lines=True)
# Displaying a random sample of 10 rows from the DataFrame
df.sample(10)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
419026,4,True,"12 10, 2017",A15Y0YKCS6HGY5,B0007ZFXXM,,Karen S. Kuhn,Works great but don't be a dork and forget to ...,Works great but don't be a dork and forget to ...,1512864000,,
369796,4,True,"06 8, 2016",A3PQPFIKAXWT9Y,B00USK61JG,,J. Stumpfel,I've found these folders to be hit or miss. T...,"Pretty, but...",1465344000,2.0,
451484,5,True,"02 28, 2018",A16CIJIFCWTLDR,B007IT72JS,,Idaho customer,A lot of craft twine for the price. Would buy...,Five Stars,1519776000,,
305845,5,True,"10 31, 2016",A1FV2BTGRS0N5P,B00E1V0AMW,,Mrs. B,Amazing needles!! I have tried them all and t...,Great Purchase!!,1477872000,,
485590,5,True,"11 9, 2015",A33WVG6HEB57ZH,B00ZA2GHNM,,Enid Reeves,I'm very pleased with this die. It cuts beauti...,Five Stars,1447027200,,
250314,5,True,"02 26, 2015",A2IRIRJTKJLKSW,B00728ZBNE,,Doc,Just perfect what I wanted.,Just perfect what I wanted.,1424908800,,
180614,2,True,"04 15, 2015",A25L8W25QMVNY1,B002EZTO7O,,Silvertabby,"They seem like nice bobbins, but don't fit the...",They seem like nice bobbins,1429056000,,
53732,1,True,"05 10, 2016",A31EXR76DVXSL6,B000YFMU0O,{'Color:': ' Silver'},Rita Schmeichel,WOULD NOT SPRAY OUT!,WOULD NOT SPRAY OUT!,1462838400,,
371022,4,True,"01 5, 2016",A34NG1606O9337,B00V3L1JOQ,,georgia,I love the intensity of these colors in fact I...,I love the intensity of these colors in fact I...,1451952000,,
341098,3,True,"09 28, 2015",A3GFMPW6SNFXHY,B00KZEJ14W,,Lucinda,The jump rings are fine. It's the container th...,Container cheaply made,1443398400,2.0,


# Sentiment Analysis Setup

In [3]:
# Importing the preprocessing module from the scikit-learn library
from sklearn import preprocessing
# Importing the Natural Language Toolkit (NLTK)
import nltk
# Downloading the opinion lexicon dataset from NLTK
nltk.download('opinion_lexicon')

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


True

# Sentiment Analysis Function

In [4]:
# Importing the opinion_lexicon dataset from the NLTK corpus module
from nltk.corpus import opinion_lexicon
# Importing the word_tokenize function from the NLTK tokenize module
from nltk.tokenize import word_tokenize

In [5]:
# Printing the total number of words in the opinion_lexicon dataset
print('Total number of words in opinion lexicon', len(opinion_lexicon.words()))
# Printing examples of positive words in the opinion_lexicon dataset
print('Examples of positive words in opinion lexicon',opinion_lexicon.positive()[:10])
# Printing examples of negative words in the opinion_lexicon dataset
print('Examples of negative words in opinion lexicon',opinion_lexicon.negative()[:10])

Total number of words in opinion lexicon 6789
Examples of positive words in opinion lexicon ['a+', 'abound', 'abounds', 'abundance', 'abundant', 'accessable', 'accessible', 'acclaim', 'acclaimed', 'acclamation']
Examples of negative words in opinion lexicon ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably', 'abominate', 'abomination', 'abort', 'aborted']


# Applying Sentiment Analysis

In [6]:
# Downloading the 'punkt' tokenizer from NLTK, which is required for word tokenization
nltk.download('punkt')
# Renaming the column 'reviewText' to 'text' for consistency and clarity
df.rename(columns={"reviewText": "text"}, inplace=True)
# Assigning positive and negative scores for sentiment analysis
pos_score = 1
neg_score = -1
# Creating an empty dictionary to store word scores for sentiment analysis
word_dict = {}

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Data Exploration

In [7]:
# Iterating through each positive word in the opinion lexicon and adding it to the word dictionary
for word in opinion_lexicon.positive():
    word_dict[word] = pos_score

In [8]:
# Iterating through each negative word in the opinion lexicon and adding it to the word dictionary
for word in opinion_lexicon.negative():
    word_dict[word] = neg_score

In [9]:
# Define a function to calculate the sentiment score of a given text using Bing Liu's lexicon.
def bing_liu_score(text):
    sentiment_score = 0 # Initialize the sentiment score to zero.
    bag_of_words = word_tokenize(text.lower()) # Tokenize the text into individual words and convert to lowercase.
# Iterate over each word in the bag of words.
    for word in bag_of_words:
        # Check if the word exists in the word dictionary.
        if word in word_dict:
            sentiment_score += word_dict[word] # Add the sentiment score of the word to the overall sentiment score.
            return sentiment_score # Return the final sentiment score of the text.

In [10]:
# Fill missing values in the 'text' column with 'no review'
df['text'].fillna('no review', inplace=True)
# Apply the bing_liu_score function to calculate sentiment scores for each text in the 'text' column and create a new column 'Bing_Liu_Score'
df['Bing_Liu_Score'] = df['text'].apply(bing_liu_score)

In [13]:
# Displaying the first 10 rows of selected columns ('overall', 'text', 'Bing_Liu_Score')
df[['overall',"text", 'Bing_Liu_Score']].head(10)

Unnamed: 0,overall,text,Bing_Liu_Score
0,4,Contains some interesting stitches.,1.0
1,5,I'm a fairly experienced knitter of the one-co...,1.0
2,4,Great book but the index is terrible. Had to w...,1.0
3,5,I purchased the Kindle edition which is incred...,1.0
4,5,Very well laid out and very easy to read.\n\nT...,1.0
5,5,"Beginning her career as a freelance knitter, M...",1.0
6,5,This is a terrific stitch handbook (and I have...,1.0
7,4,The book needs to be coil bound. The content i...,1.0
8,5,I really am enjoying this book! I like the siz...,1.0
9,5,Just received this book and looked over it cov...,1.0


In [14]:
# Grouping the DataFrame by the 'overall' column and calculating the mean of 'Bing_Liu_Score' for each group
df.groupby('overall').agg({'Bing_Liu_Score':'mean'})

Unnamed: 0_level_0,Bing_Liu_Score
overall,Unnamed: 1_level_1
1,-0.139887
2,0.174485
3,0.458573
4,0.770115
5,0.888414
