# Loading and Preprocessing Data

In [1]:
import pandas as pd

# Reading CSV file

In [3]:
# Path of the reviews CSV file to load (adjust to where the file lives on your machine)
DATA_PATH = "C:/Users/DELL/Desktop/TextBasedAnalysis/Arts_Crafts_and_sewing_5.csv"
# Fixed seed so the random preview below shows the same rows on every run
SAMPLE_SEED = 42

# Read the CSV file into a DataFrame
df = pd.read_csv(DATA_PATH)
# Display a reproducible random sample of 20 rows from the DataFrame
df.sample(20, random_state=SAMPLE_SEED)

Unnamed: 0,overall,verified,reviewerID,asin,reviewText,summary
271839,5,True,A1LGR60X7C7XK9,B009LIOFYY,great price,Five Stars
429635,5,True,A2K8W77RIPF169,B001CE5DPK,awesome product,Five Stars
49244,4,True,A3V8LM88C2H6KO,B000XZW32K,"Nice stencils. Not super thick, but they work....","Not super thick, but they work"
40791,5,True,A1IG3JJUDX1IZK,B000W5R6UA,Best on the market.,recommended
240681,5,True,ASX2QEYZICOKZ,B005V9V6L2,Great Pens....Beautiful Color.....Smooth to co...,Five Stars
278831,5,True,A3LUOEPUGF50VO,B00AWMFR86,love it. Thanks,Five Stars
83276,5,True,A39GJJT0H2BZH9,B00177X9OO,Bought these for my first foray into these typ...,Good quality for a reasonable price
48306,4,True,A3F1798DQORRM1,B000XZTOCC,I would buy again. Good yarn for projects.,Good yarn for projects
290080,5,True,A18D1949K5C029,B00BWJCCI6,"I absolutely love this machine, I stubbled acr...",LOVE
491,5,True,A32UJ87I6YGQMJ,B00002X2FX,"good product, great price",Five Stars


In [4]:
#!pip install scikit-learn

# Sentiment Analysis Setup

In [5]:
# Libraries used by the lexicon-based sentiment analysis:
# scikit-learn's preprocessing module, NLTK itself, the opinion lexicon
# corpus reader and the word tokenizer.
from sklearn import preprocessing
import nltk
from nltk.corpus import opinion_lexicon
from nltk.tokenize import word_tokenize

# Fetch the opinion lexicon corpus data through the NLTK downloader
nltk.download('opinion_lexicon')

# Report how many words the lexicon holds in total
print('Total number of words in opinion lexicon', len(opinion_lexicon.words()))
# Preview the first ten positive words of the lexicon
print('Examples of positive words in opinion lexicon', opinion_lexicon.positive()[:10])
# Preview the first ten negative words of the lexicon
print('Examples of negative words in opinion lexicon', opinion_lexicon.negative()[:10])

Total number of words in opinion lexicon 6789
Examples of positive words in opinion lexicon ['a+', 'abound', 'abounds', 'abundance', 'abundant', 'accessable', 'accessible', 'acclaim', 'acclaimed', 'acclamation']
Examples of negative words in opinion lexicon ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably', 'abominate', 'abomination', 'abort', 'aborted']


[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


# Applying Sentiment Analysis

In [6]:
# Build the word -> score lookup that is used to score the review text.

# Tokenizer data needed by word_tokenize. Recent NLTK releases (3.9+) load the
# 'punkt_tab' resource instead of 'punkt', so both are requested here; on a
# version whose index lacks one of them the downloader only reports an error
# and returns False, it does not raise.
nltk.download('punkt')
nltk.download('punkt_tab')

# The following cells refer to the review text column as 'Modules'.
# Re-assigning (instead of inplace=True) keeps the cell safe to re-run:
# rename ignores the mapping once 'reviewText' no longer exists.
df = df.rename(columns={"reviewText": "Modules"})

# Score contributed by each positive / negative lexicon word
pos_score = 1
neg_score = -1

# Positive words are inserted first, then negative words; a word that the
# lexicon lists under both polarities therefore keeps the negative score.
word_dict = dict.fromkeys(opinion_lexicon.positive(), pos_score)
word_dict.update(dict.fromkeys(opinion_lexicon.negative(), neg_score))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Data Exploration

# Function to Calculate Sentiment Score using Bing Liu's Lexicon

In [7]:
# Function that scores a piece of text against the opinion lexicon
def bing_liu_score(Modules):
    """Return the lexicon-based sentiment score of a piece of text.

    The text is lower-cased and split into tokens with ``word_tokenize``.
    Every token found in ``word_dict`` contributes its stored score; tokens
    that are not in the dictionary contribute nothing.

    Parameters
    ----------
    Modules : str
        The text to score (the name matches the DataFrame column it is
        applied to).

    Returns
    -------
    int
        Sum of the ``word_dict`` scores of all tokens in the text. Returns 0
        for an empty string and for non-string input such as ``None`` or
        NaN, instead of failing on ``.lower()``.
    """
    # Missing values reach here as float NaN / None; treat them as neutral
    if not isinstance(Modules, str):
        return 0
    # Tokenize the lower-cased text and add up the score of each known word
    return sum(word_dict.get(token, 0) for token in word_tokenize(Modules.lower()))

# Data Preprocessing and Sentiment Analysis

In [8]:
# Replace missing values in the 'Modules' column with a placeholder text.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on the selected column is a chained in-place modification, which pandas
# warns about and which leaves `df` unchanged under Copy-on-Write.
df['Modules'] = df['Modules'].fillna('no review')
# Create the 'Bing_Liu_Score' column by applying bing_liu_score to every value of the 'Modules' column
df['Bing_Liu_Score'] = df['Modules'].apply(bing_liu_score)

# Displaying Selected Columns for Analysis

In [11]:
# Preview the first 5 rows, limited to the rating, reviewer ID, review text and computed score
df.loc[:, ['overall', 'reviewerID', 'Modules', 'Bing_Liu_Score']].head()

Unnamed: 0,overall,reviewerID,Modules,Bing_Liu_Score
0,4,AIE8N9U317ZBM,Contains some interesting stitches.,1
1,5,A3ECOW0TWLH9V6,I'm a fairly experienced knitter of the one-co...,22
2,4,A278N8QX9TY2OS,Great book but the index is terrible. Had to w...,0
3,5,A123W8HIK76XCN,I purchased the Kindle edition which is incred...,4
4,5,A2A6MZ2QB4AE0L,Very well laid out and very easy to read.\n\nT...,5


# Grouping and Aggregating Sentiment Scores

In [12]:
# Average 'Bing_Liu_Score' for each value of the 'overall' column
df.groupby('overall')[['Bing_Liu_Score']].mean()

Unnamed: 0_level_0,Bing_Liu_Score
overall,Unnamed: 1_level_1
1,-0.255049
2,0.566098
3,1.158796
4,2.027999
5,2.129986
