# Introduction to Pandas

In [8]:
import pandas as pd

# Reading CSV File into a Pandas DataFrame

In [9]:
# Reading CSV file into a DataFrame
df=pd.read_csv("/Users/yathisha/Downloads/AmazonReviews.csv")
# Displaying a random sample of 20 rows from the DataFrame
df.sample(20)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
244174,244175,B0018CIUBA,AGPDKQIFYQ1YK,"Quick Vic ""Quick Vic""",0,2,3,1326931200,Dog loves 'em,Got these but they shorted me on the number th...
99173,99174,B000UXW95G,AFA92I5WLZ1AG,"E. Yberg ""Ed in Halesite, NY""",0,0,5,1284940800,delicious and convenient...,"not only are these noodles delicious, they fit..."
391640,391641,B006G7WC1G,A3L6YSYB1ITURN,JDrive,0,0,5,1347235200,"Ja, mon!",Big fat beans roasted just right....easy to ge...
53638,53639,B0016687F2,ACQLJJ7MVNVMH,Vegidude,0,0,4,1306022400,Lean Body Shake - Strawberry,Great tasting product with very good mixabilit...
311722,311723,B000EVG8H4,A1S0QN1U07RPIO,Rad Reader,5,5,5,1267228800,Too Yummy to be GF!!,We have tried many GF breads ... both prepared...
124748,124749,B0045X7H9A,A3V1A3C9DTLPME,Peter Faden,3,4,4,1311292800,Works well and tasted pretty good.,I bought these after reading some of the other...
316644,316645,B004FEN3GK,A8WRN6S6XGAG9,"Brockeim ""Playful Literary Adventurer""",0,0,5,1300838400,The Same Crackers Found in Campfire S'Mores,"Lingering evenings by campfires, sitting on ol..."
564689,564690,B00017LF24,A13FTUPT8BPSDL,linda947,0,0,5,1346544000,Diamond Crystal Popcorn and Nut Salt,I love this salt! It is absolutely excellent o...
246512,246513,B002D4DY8G,A2RX62V4E2BF5Z,"Celeste ""Vodka, Apple Pucker & Sweet 'n' Sour""",2,2,5,1260662400,Tasty,I wish Gevalia came with an option for whole b...
540323,540324,B000C5YWZW,A2EDB3JN5NR01A,Brandon R.,10,10,1,1320969600,Vanilla beans from Halaleveryday were dry and ...,I purchased 6 vanilla beans from Halaleveryday...


In [10]:
#!pip install scikit-learn

# Importing Modules from scikit-learn and NLTK

In [11]:
# Importing the preprocessing module from the scikit-learn library
from sklearn import preprocessing
# Importing the Natural Language Toolkit (NLTK)
import nltk
# Downloading the opinion lexicon dataset from NLTK
nltk.download('opinion_lexicon')
# Importing the opinion_lexicon dataset from the NLTK corpus module
from nltk.corpus import opinion_lexicon
# Importing the word_tokenize function from the NLTK tokenize module
from nltk.tokenize import word_tokenize
# Printing the total number of words in the opinion_lexicon dataset
print('Total number of words in opinion lexicon', len(opinion_lexicon.words()))
# Printing examples of positive words in the opinion_lexicon dataset
print('Examples of positive words in opinion lexicon',
      opinion_lexicon.positive()[:10])
# Printing examples of negative words in the opinion_lexicon dataset
print('Examples of negative words in opinion lexicon',
      opinion_lexicon.negative()[:10])

Total number of words in opinion lexicon 6789
Examples of positive words in opinion lexicon ['a+', 'abound', 'abounds', 'abundance', 'abundant', 'accessable', 'accessible', 'acclaim', 'acclaimed', 'acclamation']
Examples of negative words in opinion lexicon ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably', 'abominate', 'abomination', 'abort', 'aborted']


[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /Users/yathisha/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


# Assigning Positive and Negative Scores

In [12]:
# Let's create a dictionary which we can use for scoring our review text

# Downloading punkt from NLTK library
nltk.download('punkt')

# Renaming the column 'reviewText' to 'Modules' in the DataFrame
df.rename(columns={"reviewText": "Modules"}, inplace=True)

# Assigning positive and negative scores
pos_score = 1
neg_score = -1

# Initializing an empty dictionary
word_dict = {}
 
# Adding the positive words to the dictionary
for word in opinion_lexicon.positive():
        word_dict[word] = pos_score
      
# Adding the negative words to the dictionary
for word in opinion_lexicon.negative():
        word_dict[word] = neg_score

[nltk_data] Downloading package punkt to /Users/yathisha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Initializing Sentiment Score

In [13]:
#Creating a fuunction text
def bing_liu_score(Modules):
    #Initializing the sentiment score
    sentiment_score = 0
    #Tokenizing the input text into words and convert them to lowercase
    bag_of_words = word_tokenize(Modules.lower())
     # creating loop to check each word in the bag of words
    for word in bag_of_words:
        #Checking if the word exists in the sentiment dictionary
        if word in word_dict:
            # If the word exist, adding its sentiment score to the sentiment score
            sentiment_score += word_dict[word]
    return sentiment_score  #Returning the sentiment score for the text

# Handling Missing Values in the DataFrame

In [14]:
# Fill NaN values in the 'Summary' column
df['Summary'].fillna('no review', inplace=True)
#creating new column 'Bing_Liu_Score' to store the scores by applying  bing_liu_score to calculate sentiment scores for Review column
df['Bing_Liu_Score'] = df['Summary'].apply(bing_liu_score)

# Showing the First 5 Rows with Specific Columns

In [15]:
# Displaying the first 5 rows of the DataFrame with few columns
df[['Summary',"Id","Summary", 'Bing_Liu_Score']].head(5)

Unnamed: 0,Summary,Id,Summary.1,Bing_Liu_Score
0,Good Quality Dog Food,1,Good Quality Dog Food,1
1,Not as Advertised,2,Not as Advertised,0
2,"""Delight"" says it all",3,"""Delight"" says it all",1
3,Cough Medicine,4,Cough Medicine,0
4,Great taffy,5,Great taffy,1


# Calculating Mean Sentiment Score 

In [16]:
# Grouping the DataFrame by the 'Overall_Rating' column and calculating the mean of 'Bing_Liu_Score' for each group
df.groupby('Summary').agg({'Bing_Liu_Score':'mean'})

Unnamed: 0_level_0,Bing_Liu_Score
Summary,Unnamed: 1_level_1
! It's HOT !,1.0
! Mmmmmmmmmm !,0.0
! WARNING ! Not for those who can't handle the heat!,-1.0
!! NASTY !!,-1.0
!!!,0.0
...,...
~~~TRIO OF ALL TIME FAVORITES~~~,0.0
«:::D:::» «:::E:::» «:::L:::» «:::I:::» «:::C:::» «:::I:::» «:::O:::» «:::U:::» «:::S:::»,0.0
"«:::G:::» «:::R:::» «:::E:::» «:::A:::» «:::T:::» ""Everyday"" Hot Sauce!",1.0
"½ pounds, not 1.5",0.0


# Calculating Mean Sentiment Score for Each Group based on ID

In [17]:
# Grouping the DataFrame by the 'Overall_Rating' column and calculating the mean of 'Bing_Liu_Score' for each group
df.groupby('Id').agg({'Bing_Liu_Score':'mean'})

Unnamed: 0_level_0,Bing_Liu_Score
Id,Unnamed: 1_level_1
1,1.0
2,0.0
3,1.0
4,0.0
5,1.0
...,...
568450,0.0
568451,-1.0
568452,1.0
568453,2.0
