In [108]:
import csv
import nltk
from nltk.tokenize import sent_tokenize,word_tokenize
#nltk.download('averaged_perceptron_tagger')

In [109]:
#function that loads a lexicon of positive words to a set and returns the set
def loadLexicon(fname):
    newLex=set()
    lex_conn=open(fname)
    
    #add every word in the file to the set
    for line in lex_conn:
        newLex.add(line.strip())# remember to strip to remove the lin-change character
    lex_conn.close()

    return newLex


In [133]:
def getOpinions(input_file, feature_num):
    
    #load the positive and negative lexicons into sets
    posLex=loadLexicon('positive-words.txt')
    negLex=loadLexicon('negative-words.txt')

    noun_sentiment={}#maps each noun to the number of times it appears in the same sentence as a positive or negative word
        
    fin=open(input_file,encoding='utf8')

    reader=csv.reader(fin)
    
    for line in reader: # for each review

        text,rating=line # get the text and rating
    
        sentences=sent_tokenize(text) # split the review into sentences

        for sentence in sentences: # for each sentence
            
            words=word_tokenize(sentence) # split the review into words # word_tokenize allows you to separate words from puncuation
        
            tagged_words=nltk.pos_tag(words) # POS tagging for the words in the sentence
    
            nouns_in_sentence=set() # set of all the nouns in the sentence
        
            sentiment_word_count=0 # number of positive or negative words in the sentence
        
            #https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    
            for tagged_word in tagged_words:
            
                if tagged_word[1].startswith('NN'): # if the word is a noun

                    noun=tagged_word[0].lower() # lower case the noun
                    
                    if len(noun)<3:continue # ignore nouns with less than 3 characters
                                
                    nouns_in_sentence.add(noun) # add the noun to the set
                
                if tagged_word[1].startswith('JJ') and (tagged_word[0] in posLex or tagged_word[0] in negLex): 
                    sentiment_word_count+=1
                  
                    
                    
                prev_word=tagged_word
        
            for noun in nouns_in_sentence: # for each noun that we found in the sentence
                noun_sentiment[noun]=noun_sentiment.get(noun,0)+sentiment_word_count

    fin.close()

    #sort noun based on their total polarity counts (pos+neg)
    # sorting the nouns so that highest sentiment count is at the top
    sorted_nouns=sorted(noun_sentiment.items(),key=lambda x:x[1],reverse=True)

    
    #get the top feature_num features
    top=sorted_nouns[:feature_num]

    return top

In [134]:
result=getOpinions('amazonreviews.csv',100)

for noun in result:
    print(noun)
    
    
    

('sound', 147)
('headphones', 139)
('quality', 92)
('anc', 83)
('sennheiser', 72)
('bose', 61)
('bass', 46)
('noise', 41)
('case', 37)
('momentum', 36)
('music', 35)
('sony', 34)
('battery', 32)
('app', 31)
('way', 27)
('head', 25)
('bit', 24)
('features', 22)
('bluetooth', 20)
('set', 19)
('leather', 19)
('feature', 18)
('pair', 17)
('cancelling', 16)
('comfort', 16)
('everything', 16)
('cancellation', 16)
('volume', 16)
('calls', 16)
('highs', 16)
('hearing', 16)
('time', 15)
('design', 15)
('range', 15)
('ears', 15)
('office', 14)
('thing', 14)
('button', 14)
('life', 14)
('ear', 14)
('price', 14)
('buttons', 13)
('controls', 13)
('soundstage', 13)
('mids', 13)
('on/off', 13)
('px7', 13)
('headphone', 12)
('use', 12)
('headband', 12)
('anything', 12)
('phone', 11)
('lot', 11)
('pause', 11)
('audio', 11)
('build', 11)
('call', 11)
('connectivity', 11)
('microphone', 11)
('people', 11)
('sounds', 11)
('devices', 10)
('cups', 10)
('isolation', 10)
('opinion', 10)
('voice', 10)
('modes'

"""
The function should accept 3 parameters:
input_path: the path to the   'amazonreviews.csv' file that we also used in class.
index1: the row number that corresponds to a specific review in the given csv (first row would have index1=0)
index2: the row number that corresponds to another specific review in the given csv

The function should return a list of all nouns for which the review at index1 expresses the opposite opinion than the review at index2.
You should compute the opinion that a review expresses on a noun as follows:

Let P be the number of positive words that appear in the same sentence \
as the noun in the review

Let N be the number of negative words that appear in the same sentence \
as the noun in the review
If P>N, then the opinion of the review on the noun is positive.
If P<N, then the opinion of the review on the noun is negative
If P==N, then the opinion of the review on the noun is neutral.
"""

In [144]:
def parse(input_path,index1,index2):

    #load the positive and negative lexicons into sets
    posLex=loadLexicon('positive-words.txt')
    negLex=loadLexicon('negative-words.txt')

    noun_sentiment={}  #maps each noun to the number of times it appears in the same sentence as a positive or negative word
        
    fin=open(input_path,encoding='utf8')

    reader=csv.reader(fin)
    
    for line in reader: # for each review

        text,rating=line # get the text and rating
    
        sentences=sent_tokenize(text) # split the review into sentences

        for sentence in sentences: # for each sentence
            
            words=word_tokenize(sentence) # split the review into words # word_tokenize allows you to separate words from puncuation
        
            tagged_words=nltk.pos_tag(words) # POS tagging for the words in the sentence
    
            nouns_in_sentence=set() # set of all the nouns in the sentence
        
            sentiment_word_count=0 # number of positive or negative words in the sentence
        
            #https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    
            for tagged_word in tagged_words:
            
                if tagged_word[1].startswith('NN'): # if the word is a noun

                    noun=tagged_word[0].lower() # lower case the noun
                    
                    if len(noun)<3:continue # ignore nouns with less than 3 characters
                                
                    nouns_in_sentence.add(noun) # add the noun to the set
                
                if tagged_word[1].startswith('JJ') and (tagged_word[0] in posLex or tagged_word[0] in negLex): 
                    sentiment_word_count+=1
                  
                    
                    
                prev_word=tagged_word
        
            for noun in nouns_in_sentence: # for each noun that we found in the sentence
                noun_sentiment[noun]=noun_sentiment.get(noun,0)+sentiment_word_count

    fin.close()

    #sort noun based on their total polarity counts (pos+neg)
    # sorting the nouns so that highest sentiment count is at the top
    sorted_nouns=sorted(noun_sentiment.items(),key=lambda x:x[1],reverse=True)

    # slice to show first index and second index
    first=sorted_nouns[index1]
    second=sorted_nouns[index2]
    sentiment = []
    if first[1] < second[1]:
        sentiment = first
    elif first[1] > second[1]:
        sentiment = second
    elif first[1] == second[1]:
        sentiment = zip(first,second)
    return sentiment

In [145]:
# Test case for neutral
result=parse('amazonreviews.csv',19,20)

for noun in result:
    print(noun)

('set', 'leather')
(19, 19)


In [146]:
# Test case for positive
result=parse('amazonreviews.csv',0,1)

for noun in result:
    print(noun)

headphones
139


In [147]:
# Test case for negative
result=parse('amazonreviews.csv',0,100)

for noun in result:
    print(noun)

device
7
