In [1]:
import nltk
from spellchecker import SpellChecker
from nltk.stem import WordNetLemmatizer
from symspellpy import SymSpell, Verbosity
import string
import re
from nltk.tokenize.treebank import TreebankWordDetokenizer
import pandas as pd 
import numpy as np 
from textblob import TextBlob
import text2emotion as te
import pdb
from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from os import getcwd
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import time
import joblib

stop_words = stopwords.words('english')
lemma = WordNetLemmatizer()
#spell = SpellChecker()
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
 
nltk.download('stopwords')
nltk.download('twitter_samples')

# get the sets of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# split the data into two pieces, one for training and one for testing (validation set)
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

train_x = train_pos + train_neg
test_x = test_pos + test_neg

# avoid assumptions about the length of all_positive_tweets
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))
     
##clean symbols
def deEmojify(text):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"                 # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', text)

def clean_data(data):
    lower_msg = "".join([i.lower() for i in data if i not in string.punctuation]) #lower all charracter
    msg = deEmojify(lower_msg)                                                    ##clear emoji
    msg = re.sub('[^A-Za-z]+', ' ', msg)                                          ##clean symbol
    msg = re.sub('(.x?)http.*?(.*?)', ' ', msg)                                   #clean url
    return msg

def clr_stop_words(words):  ##clear stop words
    filter_words = []
    for w in words:
        if w not in stop_words:
            filter_words.append(w)
    return filter_words  

def spell_checker(words):
    mispelled = []
    for w in words:
        text = sym_spell.lookup(w,Verbosity.CLOSEST,max_edit_distance=2, include_unknown=True)
        for t in text:
            mispelled.append(t._term)
            
    return mispelled

def lemma_words(words):
    lemma_word = []
    for w in words:
        text = lemma.lemmatize(w)
        lemma_word.append(text) 
    return lemma_word   

def process_tweet(tweet):
    # '''
    # Input:
    #     tweet: a string containing a tweet
    # Output:
    #     tweets_clean: a list of words containing the processed tweet
    # '''
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
            word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean


def test_lookup(func):
    freqs = {('sad', 0): 4,
             ('happy', 1): 12,
             ('oppressed', 0): 7}
    word = 'happy'
    label = 1
    if func(freqs, word, label) == 12:
        return 'SUCCESS!!'
    return 'Failed Sanity Check!'


def lookup(freqs, word, label):
    # '''
    # Input:
    #     freqs: a dictionary with the frequency of each pair (or tuple)
    #     word: the word to look up
    #     label: the label corresponding to the word
    # Output:
    #     n: the number of times the word with its corresponding label appears.
    # '''
    n = 0  # freqs.get((word, label), 0)

    pair = (word, label)
    if (pair in freqs):
        n = freqs[pair]

    return n

#################################################
#Process data
#Implementing helper functions

def count_tweets(result, tweets, ys):
    # '''
    # Input:
    #     result: a dictionary that will be used to map each pair to its frequency
    #     tweets: a list of tweets
    #     ys: a list corresponding to the sentiment of each tweet (either 0 or 1)
    # Output:
    #     result: a dictionary mapping each pair to its frequency
    # '''

    for y, tweet in zip(ys, tweets):
        for word in process_tweet(tweet):
            # define the key, which is the word and label tuple
            pair = (word,y)

            # if the key exists in the dictionary, increment the count
            if pair in result:
                result[pair] += 1

            # else, if the key is new, add it to the dictionary and set the count to 1
            else:
                result[pair] = 1
    return result

##################################################
#Train model using naive bayes 
freqs = count_tweets({}, train_x, train_y)

def train_naive_bayes(freqs, train_x, train_y):
    # '''
    # Input: 
    #     freqs: dictionary from (word, label) to how often the word appears
    #     train_x: a list of tweets
    #     train_y: a list of labels correponding to the tweets (0,1)
    # Output: 
    #     logprior: the log prior. (equation 3 above)
    #     loglikelihood: the log likelihood of you Naive bayes equation. (equation 6 above)
    # '''
    loglikelihood = {}
    logprior = 0 
    
   
    # calculate V, the number of unique words in the vocabulary
    vocab = set([pair[0] for pair in freqs.keys()])
    V = len(vocab)
    
    # calculate N_pos, N_neg, V_pos, V_neg
    N_pos=N_neg=V_pos=V_neg=0
    for pair in freqs.keys():
        # if the label is positive (greater than zero)
        if pair[1] > 0:
            # increment the count of unique positive words by 1
            V_pos += 1
            
            # Increment the number of positive words by the count for this (word, label) pair
            N_pos += freqs[pair]
        
        # else, the label is negative
        else: 
            # increment the count of unique negative words by 1
            V_neg += 1
            
            # increment the number of negative words by the count for this (word,label) pair
            N_neg += freqs[pair]

    # Calculate D, the number of documents
    D = train_y.shape[0]
    
    # Calculate D_pos, the number of positive documents
    D_pos = train_y[train_y == 1].shape[0]
    
    # Calculate D_neg, the number of negative documents
    D_neg = train_y[train_y == 0].shape[0]
    
    # Calculate logprior
    logprior  = np.log(D_pos / D) - np.log(D_neg / D)
    
    # For each word in the vocabulary...
    for word in vocab:
        # get the positive and negative frequency of the word
        freq_pos = freqs.get((word, 1), 0)
        freq_neg = freqs.get((word, 0), 0)
        
        # calculate the probability that each word is positive, and negative
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)
     
        # calculate the log likelihood of the word
        loglikelihood[word] = np.log(p_w_pos / p_w_neg)

    
    return logprior, loglikelihood
start = time.time()
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
stop = time.time()
print(logprior)
print(len(loglikelihood))

###############################################
#Test naive bayes
def naive_bayes_predict(tweet, logprior, loglikelihood):
    # '''
    # Input:
    #     tweet: a string
    #     logprior: a number
    #     loglikelihood: a dictionary of words mapping to numbers
    # Output:
    #     p: the sum of all the logliklihoods of each word in the tweet (if found in the dictionary) + logprior (a number)

    # '''
    # process the tweet to get a list of words
    word_l = process_tweet(tweet)

    # initialize probability to zero
    p = 0

    # add the logprior
    p += logprior

    for word in word_l:

        # check if the word exists in the loglikelihood dictionary
        if word in loglikelihood:
            # add the log likelihood of that word to the probability
            p += loglikelihood[word]

    return p


# # Experiment with your own tweet.
# my_tweet = 'She smiled.'
# p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
# print('The expected output is', p)

y_hats = []
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    # """
    # Input:
    #     test_x: A list of tweets
    #     test_y: the corresponding labels for the list of tweets
    #     logprior: the logprior
    #     loglikelihood: a dictionary with the loglikelihoods for each word
    # Output:
    #     accuracy: (# of tweets classified correctly)/(total # of tweets)
    # """
    accuracy = 0  # return this properly


    for tweet in test_x:
        # if the prediction is > 0
        if naive_bayes_predict(tweet, logprior, loglikelihood) > 0:
            # the predicted class is 1
            y_hat_i = 1
        else:
            # otherwise the predicted class is 0
            y_hat_i = 0

        # append the predicted class to the list y_hats
        y_hats.append(y_hat_i)

    # error is the average of the absolute values of the differences between y_hats and test_y
    error = sum(abs(test_y-y_hats))/len(y_hats)

    # Accuracy is 1 minus the error
    accuracy = 1-error

    return accuracy

# for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
#     # print( '%s -> %f' % (tweet, naive_bayes_predict(tweet, logprior, loglikelihood)))
#     p = naive_bayes_predict(tweet, logprior, loglikelihood)
# #     print(f'{tweet} -> {p:.2f} ({p_category})')
#     print(f'{tweet} -> {p:.2f}')
    
# Feel free to check the sentiment of your own tweet below
my_tweet = 'you are bad :('
naive_bayes_predict(my_tweet, logprior, loglikelihood)

######################################
#Filter words by ratio of positive to negative counts

def get_ratio(freqs, word):
    # '''
    # Input:
    #     freqs: dictionary containing the words
    #     word: string to lookup

    # Output: a dictionary with keys 'positive', 'negative', and 'ratio'.
    #     Example: {'positive': 10, 'negative': 20, 'ratio': 0.5}
    # '''
    pos_neg_ratio = {'positive': 0, 'negative': 0, 'ratio': 0.0}
    # use lookup() to find positive counts for the word (denoted by the integer 1)
    pos_neg_ratio['positive'] = lookup(freqs, word, 1)

    # use lookup() to find negative counts for the word (denoted by integer 0)
    pos_neg_ratio['negative'] = lookup(freqs, word, 0)

    # calculate the ratio of positive to negative counts for the word
    pos_neg_ratio['ratio'] = (pos_neg_ratio['positive']+1)/(pos_neg_ratio['negative']+1)
    return pos_neg_ratio

def get_words_by_threshold(freqs, label, threshold):
    # '''
    # Input:
    #     freqs: dictionary of words
    #     label: 1 for positive, 0 for negative
    #     threshold: ratio that will be used as the cutoff for including a word in the returned dictionary
    # Output:
    #     word_set: dictionary containing the word and information on its positive count, negative count, and ratio of positive to negative counts.
    #     example of a key value pair:
    #     {'happi':
    #         {'positive': 10, 'negative': 20, 'ratio': 0.5}
    #     }
    # '''
    word_list = {}

    for key in freqs.keys():
        word, _ = key

        # get the positive/negative ratio for a word
        pos_neg_ratio = get_ratio(freqs, word)

        # if the label is 1 and the ratio is greater than or equal to the threshold...
        if label == 1 and pos_neg_ratio['ratio']>= threshold:

            # Add the pos_neg_ratio to the dictionary
            word_list[word] = pos_neg_ratio

        # If the label is 0 and the pos_neg_ratio is less than or equal to the threshold...
        elif label == 0 and pos_neg_ratio['ratio'] <= threshold:

            # Add the pos_neg_ratio to the dictionary
            word_list[word] = pos_neg_ratio

        # otherwise, do not include this word in the list (do nothing)

    return word_list

# Test  function: find negative words at or below a threshold
get_words_by_threshold(freqs, label=0, threshold=0.05)

# Test  function; find positive words at or above a threshold
get_words_by_threshold(freqs, label=1, threshold=10)

##############################################
# #Error analysis
# # Some error analysis done for you
# print('Truth Predicted Tweet')
# for x, y in zip(test_x, test_y):
#     y_hat = naive_bayes_predict(x, logprior, loglikelihood)
#     if y != (np.sign(y_hat) > 0):
#         print('%d\t%0.2f\t%s' % (y, np.sign(y_hat) > 0, ' '.join(
#             process_tweet(x)).encode('ascii', 'ignore')))

acscore = []
acprecision = []
acrecall = []
acF1 = []
acTrainingTime = []

trainingTime = stop - start
print(f"Training time: {trainingTime}s")
print("Naive Bayes accuracy = %0.4f" %
      (test_naive_bayes(test_x, test_y, logprior, loglikelihood)))


# Making the confusion matrix and calculating accuracy score
cm = confusion_matrix(test_y, y_hats)
ac = accuracy_score(test_y, y_hats)
precision = precision_score(test_y, y_hats)
recall = recall_score(test_y, y_hats)
f1 = f1_score(test_y, y_hats)

acscore.append(ac)
acprecision.append(precision)
acrecall.append(recall)
acF1.append(f1)
acTrainingTime.append(trainingTime)

print(cm)
print('Accuracy score: {0:0.4f}'.format(ac))
print('Precision score: {0:0.4f}'.format(precision))
print('Recall score: {0:0.4f}'.format(recall))
print('F1 score: {0:0.4f}'.format(f1))

[nltk_data] Downloading package stopwords to C:\Users\HP
[nltk_data]     ZBook\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\HP
[nltk_data]     ZBook\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\HP
[nltk_data]     ZBook\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\HP
[nltk_data]     ZBook\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to C:\Users\HP
[nltk_data]     ZBook\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


0.0
9084
Training time: 0.0488734245300293s
Naive Bayes accuracy = 0.9940
[[996   4]
 [  8 992]]
Accuracy score: 0.9940
Precision score: 0.9960
Recall score: 0.9920
F1 score: 0.9940


In [18]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tkinter import *
import re
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

def clearAll() :
	negativeField.delete(0, END)
	neutralField.delete(0, END)
	positiveField.delete(0, END)
	overallField.delete(0, END)
	polarityField.delete(0, END)
	textArea.delete(1.0, END)

def clearResult() :
	negativeField.delete(0, END)
	neutralField.delete(0, END)
	positiveField.delete(0, END)
	overallField.delete(0, END)
	polarityField.delete(0, END)

def detect_sentiment():
	# get a whole input content from text box
	sentence = textArea.get("1.0", "end")
	
	# Create a SentimentIntensityAnalyzer object.
	sid_obj = SentimentIntensityAnalyzer()
	
	# polarity_scores method of SentimentIntensityAnalyzer
	# object gives a sentiment dictionary.
	# which contains pos, neg, neu, and compound scores.
	sentiment_dict = sid_obj.polarity_scores(sentence)
	
	msgValue = sentence.lower()
	msgValue = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", msgValue)
	msgValue = " ".join([word for word in msgValue.split() if word not in (stop)])
	
	p = naive_bayes_predict(msgValue, logprior, loglikelihood)
	polarityField.insert(10, p)
	
	string = str(round(sentiment_dict['neg']*100,2)) + "% Negative"
	negativeField.insert(10, string)
	
	string = str(round(sentiment_dict['neu']*100,2)) + "% Neutral"
	neutralField.insert(10, string)
	
	string = str(round(sentiment_dict['pos']*100,2)) +"% Positive"
	positiveField.insert(10, string)
	
	# decide sentiment as positive, negative and neutral
	if sentiment_dict['compound'] >= 0.05 :
		string = "Positive"
	
	elif sentiment_dict['compound'] <= - 0.05 :
		string = "Negative"
	
	else :
		string = "Neutral"
	
	overallField.insert(10, string)

# Driver Code
if __name__ == "__main__" :
	
	# Create a GUI window
	gui = Tk()
	gui.config(background = "Light Blue")
	gui.title("Sentiment Detector")
	gui.geometry("360x615")
    
	# create a label : Enter Your Task
	enterText = Label(gui, text = "Enter Your Sentence", bg = "light blue", font=20)

	# create a text area for the root
	# with lunida 13 font
	# text area is for writing the content
	textArea = Text(gui, height = 5, width = 37, font = "lucida 13")

	# create a Submit Button
	check = Button(gui, text = "Check Sentiment", fg = "White", bg = "Green", 
                   command = lambda:[clearResult(),detect_sentiment()],
                   height = 1, width = 30,font=1)

	# Create a negative : label
	polarity = Label(gui, text = "Sentence Polarity: ", bg = "light blue", font=20)
	negative = Label(gui, text = "Sentence Negativity: ", bg = "light blue", font=20)
	neutral = Label(gui, text = "Sentence Neutrality: ", bg = "light blue", font=20)
	positive = Label(gui, text = "Sentence Positivity: ", bg = "light blue", font=20)
	overall = Label(gui, text = "Sentence Overall Sentiment: ", bg = "light blue", font=20)

	# create text entry box
	polarityField = Entry(gui,width=20,font=20,justify='center',background='light blue')
	negativeField = Entry(gui,width=20,font=20,justify='center',background='light blue')
	neutralField = Entry(gui,width=20,font=20,justify='center',background='light blue')
	positiveField = Entry(gui,width=20,font=20,justify='center',background='light blue')
	overallField = Entry(gui,width=17,font=('Georgia 20'),justify='center',background='White')

	# create Buttons
	clear = Button(gui, text = "Reset", fg = "Black", bg = "Pink", command = clearAll, height = 1, width = 15)
	Exit = Button(gui, text = "Exit", fg = "White", bg = "Red", command = gui.destroy, height = 1, width = 15)

	# grid method is used for placing
	# the widgets at respective positions
	# in table like structure.
	enterText.grid(row = 1, column = 2)
	
	textArea.grid(row = 2, column = 2, padx = 10, sticky = W)
	
	check.grid(row = 3, column = 2)
	
	polarity.grid(row = 5, column = 2)
	polarityField.grid(row = 6, column = 2)
    
	negative.grid(row = 7, column = 2)
	negativeField.grid(row = 8, column = 2)
    
	neutral.grid(row = 9, column = 2)
	neutralField.grid(row = 10, column = 2)
    
	positive.grid(row = 11, column = 2)
	positiveField.grid(row = 12, column = 2)
    
	overall.grid(row = 14, column = 2)
	overallField.grid(row = 15, column = 2)

	clear.grid(row = 17, column = 2)
	Exit.grid(row = 18, column = 2)

	col_count, row_count = gui.grid_size()

	for col in range(col_count):
		gui.grid_columnconfigure(col, minsize=0)

	for row in range(row_count):
		gui.grid_rowconfigure(row, minsize=20)
    
	# start the GUI
	gui.mainloop()

[nltk_data] Downloading package stopwords to C:\Users\HP
[nltk_data]     ZBook\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
