# Sentiment Analysis
## We'll feed the neural network with movie reviews and positive/negative labels and then see if it is able to learn and predict whether a particular review/text is positive or negative
### Let us first load the data and store it in lists


In [1]:
#Curating Dataset
file=open("reviews.txt",'r')
reviews=list(map(lambda x:x[:-1],file.readlines()))
file.close()
file = open('labels.txt','r') 
labels = list(map(lambda x:x[:-1].upper(),file.readlines()))
file.close()

# The neural network
## We'll define a class for the network,
### - The constructor preprocesses and initializes the data/variables.  
### - For reducing the noise in data (in this case), 
   - We try to eliminate words from our training vocabulary that do not provide much sentiment(or predictive power).
     For example,words such as "the","is","and",etc.
   - We also try to remove words that occur rarely in the reviews,since their quantity is insufficient to train.
     
   >For these purposes,we use positive to negative ratios,and for that we count the number of times a word appears in both 
 posivite and negative reviews,and store them separately.We also calculate the total count of a word in the reviews.
 Then if the word occurs more than a particular threshold we calculate its positive to negative ratio.
 We add words to the vocabulary that occur more than the minimum count,hence eliminating words that are less frequent and if 
 the word is present in our positive to negative ratios we add only those that have value greater than our cutoff,hence 
 eliminating the common words with not much predictive power.
   - The input layer only storing whether the word is present or not,and not the number of times it occur. 
     Since most common words in the review are just " ","is",etc.,and if we multiply their occurance with the weights,
     it is probably a bad idea.
     
### - For improving the efficiency of the network,

   - We take only those words from a particular review for consideration during running/forward propagation that are present in
     our vocab.
   - for our hidden layer,we consider the weights for only the non-zero items and while training we only update the weights that 
     were used while propagating forward
     
   >Hence,reducing the time consumed.

In [2]:
import numpy as np
from collections import Counter
class SentiNetwork:
    def __init__(self,reviews,labels,polarity_cutoff = 0.1,min_count = 10,hidden_nodes=10,alpha=0.1):
        np.random.seed(1)
        self.preprocess(reviews,labels,polarity_cutoff,min_count)
        self.inputnodes=len(self.r_vocab)
        self.hidden_nodes=hidden_nodes
        self.alpha=alpha
        self.output_nodes=1
        self.w_ih=np.zeros((self.inputnodes,self.hidden_nodes))
        self.w_ho=np.random.normal(0.0,self.output_nodes**-0.5,(self.hidden_nodes,self.output_nodes))
        self.lay_1=np.zeros((1,self.hidden_nodes))
    
    def preprocess(self,reviews,labels,polarity_cutoff, min_count):
       
        
        positive_counts = Counter()
        negative_counts = Counter()
        total_counts = Counter()

        for i in range(len(reviews)):
            if(labels[i] == 'POSITIVE'):
                for word in reviews[i].split(" "):
                    positive_counts[word] += 1
                    total_counts[word] += 1
            else:
                for word in reviews[i].split(" "):
                    negative_counts[word] += 1
                    total_counts[word] += 1

        pos_neg_ratios = Counter()

        for term,cnt in list(total_counts.most_common()):
            if(cnt >= 50):
                pos_neg_ratio = positive_counts[term] / float(negative_counts[term]+1)
                pos_neg_ratios[term] = pos_neg_ratio

        for word,ratio in pos_neg_ratios.most_common():
            if(ratio > 1):
                pos_neg_ratios[word] = np.log(ratio)
            else:
                pos_neg_ratios[word] = -np.log((1 / (ratio + 0.01)))
     
        review_vocab = set()
        for review in reviews:
            for word in review.split(" "):
                if(total_counts[word] > min_count):
                    if(word in pos_neg_ratios.keys()):
                        if((pos_neg_ratios[word] >= polarity_cutoff) or (pos_neg_ratios[word] <= -polarity_cutoff)):
                            review_vocab.add(word)
                    else:
                        review_vocab.add(word)
        self.r_vocab=list(review_vocab)
        lvocab=set()
        for l in labels:
            lvocab.add(l)
        self.l_vocab=lvocab
        
        self.r_vocabsize=len(self.r_vocab)
        self.l_vocabsize=len(self.l_vocab)
        
        self.w2index={} #for storing the index of the word in the vocabulary
        for i,w in enumerate(self.r_vocab):
            self.w2index[w]=i
        self.l2index={}
        for i,l in enumerate(self.l_vocab):
            self.l2index[l]=i
    def update_l1(self,review): #adding only weights of non-zero items
        self.lay_1*=0
        for ind in review:
             self.lay_1+=self.w_ih[ind]
    def get_t_l(self,label):
        if label=="POSITIVE":
            return 1
        elif label=="NEGATIVE":
            return 0
    def s(self,x):
        return 1/(1+np.exp(-x))
    def sd(self,o):
        return o*(1-o)
    def train(self,treviews_r,tlabels):
        i=0
        treviews=list()
        for r in treviews_r:
            ind=set()
            for w in r.split(" "):
                if w in self.w2index.keys():
                    ind.add(self.w2index[w])
            treviews.append(list(ind))
        
        assert(len(treviews)==len(tlabels))
        c=0
        for r,l in zip(treviews,tlabels):
            
            self.update_l1(r)
            output=self.s(self.lay_1.dot(self.w_ho))
            
            error=self.get_t_l(l)-output
            errordelta=error*self.sd(output)
            errorhid=errordelta.dot(self.w_ho.T)
            
            
            for ind in r:
                self.w_ih[ind]+=errorhid[0]*self.alpha #ignoring the x in the equation since x=1
            self.w_ho+=self.lay_1.T.dot(errordelta)*self.alpha
            
            i+=1
            if output>=0.5 and l=="POSITIVE":
                c+=1
            elif output<0.5 and l=="NEGATIVE":
                c+=1
            if i>0 and i%(i/10)==0:
                print("\rAccuracy : "+str(c * 100 / float(i+1))[:4]+" Trained percent : "+ str(100 * i/float(len(treviews)))[:4],end=" ")    
    
                  
    def test(self,te_r,te_l):
        c,i=0,0
        for x,y in zip(te_r,te_l):
            pred=self.run(x)
            i+=1
            if pred==y:
                c+=1
            if i>0 and i%(i/10)==0:
                print("\r Accuracy: "+str(c * 100 / float(i+1))[:4]+" Completion % : "+ str(100 * i/float(len(te_r)))[:4],end=" ")   
    def run(self,review):
        self.lay_1*=0
        ind=set()
        for w in review.lower().split(" "):
            if w in self.w2index.keys():
                ind.add(self.w2index[w])
            #output hidden ,since we take 1 if item present ,therefore,considering only those present:
        self.update_l1(ind)
        output=self.s(self.lay_1.dot(self.w_ho))
        if output[0] >= 0.5:
            return "POSITIVE"
        else:
            return "NEGATIVE"


## Let's now train the model,
### We'll reserve last 1000 reviews for test and we'll train with remaining.
#### the "*3" is basically the epoch.


In [3]:
nt = SentiNetwork(reviews[:-1000],labels[:-1000], alpha=0.1)
nt.train(reviews[:-1000]*3,labels[:-1000]*3)

Accuracy : 89.4 Trained percent : 100.                                                                                                       

# Now testing the network

In [4]:
nt.test(reviews[-1000:],labels[-1000:])

 Accuracy: 85.5 Completion % : 100. 

## Now let's test our network with some random reviews.
### For this, I have grabbed a snippet of the review for The Wolf of Wall Street with a rating of 9/10 from IMDB.Let's see what the network predicts:

In [5]:
st="""Scorsese has always managed to elicit astounding performances from his actors, and his fifth collaboration with Leonardo DiCaprio results in one of the most charismatic, despicable, offensive and captivating characters to ever appear on screen. As financial bad boy Belfort, DiCaprio swaggers from scene to scene ingesting eye popping amounts of narcotics, groping and fondling nearly every female within reach, and spouting more profanity in three hours than an entire season of The Sopranos. Belfort is the kind of person that any sane person would detest in real life, but thanks to Scorses and DiCaprio, we cant take our eyes off him.
"""
nt.run(st)

'POSITIVE'

### It predicts correctly. Now let's check with a snippet of review with rating 3/10:

In [6]:
st2="""The plot is nothing original, the rise and fall of a powerful man, have we not seen that so many times before ? Similar movies include Blow, Scarface and even the Spanish film starring Javier Bardem called Golden Balls. Overlong run time filled with lame rubbish over the top sex and drug scenes that seemed to take forever. I really cannot find anything good about this film, except a few laugh out loud scenes and a promising start that took a strong curve downwards as the movie progressed, 2 1/2 hours of torture."""
nt.run(st2)

'NEGATIVE'

### and it predicts correctly again.