In [14]:
# The purpose of this notebook is to test sentiment_mod.py,
# the file which brings in everything needed for the sentiment analysis
# through loaded pickles, classes and functions.

In [4]:
import nltk # Generic NLTK library
# Sentence tokenizer, word tokenizer, and an unsupervised sentence tokenizer which can be trained
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer 
# Stemming tool 
from nltk.stem import PorterStemmer
# Text of all the state of union speeches
from nltk.corpus import state_union
# Text of all stop words
from nltk.corpus import stopwords
# Lemmatizer
from nltk.stem import WordNetLemmatizer
# Frequency distribution
from nltk import FreqDist

import numpy as np
import scipy
import matplotlib.pyplot as plt

from nltk.classify.scikitlearn import SklearnClassifier

import random 
import io
import pickle

from nltk.classify import ClassifierI
from statistics import mode

In [5]:
# Ensemble classifier: combines several trained classifiers by majority vote
class VoteClassifier(ClassifierI):
    """Classify by taking the most common label among several classifiers.

    Each wrapped classifier must expose ``classify(features)``; the ensemble
    asks every one of them for a label and reports the mode of those labels.
    """

    def __init__(self, *classifiers):
        """Store the classifiers whose votes will be combined.

        Args:
            *classifiers: any number of objects with a ``classify(features)``
                method.
        """
        self._classifiers = classifiers

    def _collect_votes(self, features):
        """Return the list of labels, one per wrapped classifier, for ``features``."""
        return [c.classify(features) for c in self._classifiers]

    def classify(self, features):
        """Return the label chosen by the largest number of classifiers.

        Raises:
            statistics.StatisticsError: if the ensemble holds no classifiers.
                NOTE(review): on Python < 3.8 ``statistics.mode`` also raises
                this on a tie (possible with an even number of voters); on
                3.8+ it returns the first of the tied labels.
        """
        return mode(self._collect_votes(features))

    def confidence(self, features):
        """Return the fraction of classifiers that agree with the winning label.

        This is the share of votes cast for whichever label ``mode`` picks
        (not specifically the "positive" label), so it lies in (0, 1].

        Raises:
            statistics.StatisticsError: under the same conditions as
                ``classify``.
        """
        votes = self._collect_votes(features)
        winning_votes = votes.count(mode(votes))
        return winning_votes / len(votes)

###  Loading Documents from pickles

In [None]:
# Backup code - in case the pickles are not available

In [9]:
# ### Using an example of POS tagging based sentiment analysis

# import io # Harrison's code did not work for reading the files
# # Got this format of importing text from stackoverflow
# short_pos=io.open("/Users/vputcha/Documents_Venkat/Kaggle/NLTK/positive.txt",encoding='latin-1').read()
# short_neg=io.open("/Users/vputcha/Documents_Venkat/Kaggle/NLTK/negative.txt",encoding='latin-1').read()

# documents=[]
# for r in short_pos.split('\n'): # splitting document by new line
#     documents.append((r,"pos")) # tuple- text,sentiment appended
# for r in short_neg.split('\n'):
#     documents.append((r,"neg"))


# # j is adjective, r is adverb and v is verb
# # allowed word types=["J","R","V"]
# allowed_word_types=["J"] # Only allowing adjectives
# all_words=[]
# for p in short_pos.split('\n'): # extracting all separate lines from positive tagged documents
#     documents.append((p,"pos")) # appending all lines from positive tagged documents
#     words=word_tokenize(p) # word tokenizing from the lines
#     pos=nltk.pos_tag(words) # getting the POS of the words
    
#     for w in pos: # w[1][0] gives the first letter of POS of the word
#         if w[1][0] in allowed_word_types: # checking for adjectives
#             all_words.append(w[0].lower())# w[0] is the word; w[1] is its POS tag
 

# for p in short_neg.split('\n'): # extracting all separate lines from negative tagged documents
#     documents.append((p,"neg")) # appending all lines from negative tagged documents
#     words=word_tokenize(p) # word tokenizing from the lines
#     pos=nltk.pos_tag(words) # getting the POS of the words
    
#     for w in pos: # w[1][0] gives the first letter of POS of the word
#         if w[1][0] in allowed_word_types: # checking for adjectives
#             all_words.append(w[0].lower())# w[0] is the word; w[1] is its POS tag

# # saving documents to pickles

# save_documents=open("/Users/vputcha/Documents_Venkat/Kaggle/NLTK/pickled_algos/documents.pickle","wb")
# pickle.dump(documents,save_documents)
# save_documents.close()

# all_words=nltk.FreqDist(all_words)
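# word_features=list(all_words.keys())[:5000] # first 5000 keys (NOTE: these are the most frequent words only if FreqDist keys are frequency-ordered - verify, or use most_common(5000))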

# save_word_features=open("/Users/vputcha/Documents_Venkat/Kaggle/NLTK/pickled_algos/word_features5k.pickle","wb")
# pickle.dump(word_features,save_word_features)
# save_word_features.close()

# # check if the features are in top words
# def find_features(document):
#     words=word_tokenize(document)
#     features={}
#     for w in word_features:
#         features[w]=(w in words)
    
#     return features

# # building featuresets
# featuresets=[(find_features(rev),category) for (rev,category) in documents]

# random.shuffle(featuresets) #shuffling feature sets for train test

# save_featuresets= open("/Users/vputcha/Documents_Venkat/Kaggle/NLTK/pickled_algos/featuresets.pickle","wb")
# pickle.dump(featuresets,save_featuresets)
# save_featuresets.close()

In [12]:
# Directory that holds every pickle read by this cell.
# NOTE(review): absolute, machine-specific path - adjust for your environment.
PICKLE_DIR="/Users/vputcha/Documents_Venkat/Kaggle/NLTK/pickled_algos"


def load_pickle(filename):
    """Return the object stored in the pickle file PICKLE_DIR/filename.

    The file is opened in a ``with`` block so the handle is closed even if
    unpickling raises.

    SECURITY: ``pickle.load`` can execute arbitrary code while loading;
    only use this on pickle files you created yourself.
    """
    with open(PICKLE_DIR+"/"+filename,"rb") as pickle_file:
        return pickle.load(pickle_file)


# Labelled documents, the feature vocabulary and the prebuilt featuresets
documents=load_pickle("documents.pickle")
word_features=load_pickle("word_features5k.pickle")
featuresets=load_pickle("featuresets.pickle")


def find_features(document):
    """Map each word in ``word_features`` to whether it occurs in ``document``.

    Args:
        document: raw text; it is split into tokens with ``word_tokenize``.

    Returns:
        dict of {feature word: bool}, with one entry per word in
        ``word_features``.
    """
    # A set makes each membership test O(1) instead of scanning the token list
    words=set(word_tokenize(document))
    return {w:(w in words) for w in word_features}


# featuresets are loaded from the pickle above, so they are neither rebuilt
# from `documents` nor reshuffled here.
# First 10000 featuresets are used for training, the remainder for testing.
training_set=featuresets[:10000]
testing_set=featuresets[10000:]

# Trained classifiers, one pickle per algorithm
MNB_clasifier=load_pickle("MNB_classifier5k.pickle")
BernoulliNB_clasifier=load_pickle("BernoulliNB_classifier5k.pickle")
LogisticRegression_clasifier=load_pickle("LogisticRegression_classifier5k.pickle")
SGDClassifier_clasifier=load_pickle("SGDClassifier_classifier5k.pickle")
SVC_clasifier=load_pickle("SVC_classifier5k.pickle")
LinearSVC_clasifier=load_pickle("LinearSVC_classifier5k.pickle")

# NuSVC is currently left out of the ensemble (its loading code was disabled):
# NuSVC_classifier=load_pickle("NuSVC_classifier5k.pickle")


# Ensemble that votes across all of the classifiers loaded above
voted_classifier = VoteClassifier(
                                  MNB_clasifier,
                                  BernoulliNB_clasifier,
                                  LogisticRegression_clasifier,
                                  SGDClassifier_clasifier,
                                  SVC_clasifier,
                                  LinearSVC_clasifier)

In [13]:
# Function to predict sentiment of text and confidence of prediction
def sentiment(text):
    """Return the ensemble's predicted label for ``text`` and its confidence.

    Args:
        text: raw text to classify; it is turned into a feature dict by
            ``find_features``.

    Returns:
        Tuple ``(label, confidence)`` where ``label`` comes from
        ``voted_classifier.classify`` and ``confidence`` from
        ``voted_classifier.confidence`` for the same features.
    """
    feats=find_features(text)

    # Both calls must use `voted_classifier`; the former `voted_classifer`
    # spelling raised NameError.
    return voted_classifier.classify(feats),voted_classifier.confidence(feats)