<h1><center>Sentiment Analysis</center></h1>


### 1. Datasets: ISEAR
### 2. Features: Bag of Words over Unigrams, Bigrams and Trigrams
### 3. Classifiers: Naive Bayes Classifier, Maximum Entropy Classifier and Support Vector Classifier

#### Importing modules

In [1]:
# Import nltk modules
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.util import ngrams

# Import Sklearn Libraries
from sklearn.svm import LinearSVC, SVC
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB

# Import other modules
import numpy as np # To Make the list and array computations easy
import pandas # To read the data from the excel sheet
import time # To keep track of the time taken in training and testing
import math # To use the log function for maximum likelihood probabilities
import random # To shuffle the data for different epochs
import threading # To interact with GUI
import SentimentGUI as gui # GUI class

#### Initialization

In [2]:
# English stop-word set from the NLTK corpus; consulted by removeStopWords.
stop_words = set(stopwords.words('english'))
# Shared WordNet lemmatizer instance; used by lemmatize_words.
lmtzr = WordNetLemmatizer()
# Shared Porter stemmer instance; used by root_words.
ps = PorterStemmer()
# n-gram order handed to get_ngrams / get_allngrams.
n = 1 ## Can be 1,2,3
# Number of leading samples used for training in each split.
train_count = 6500
# Number of samples immediately after the training slice used for testing.
test_count = 1000
# Number of shuffle/train/test rounds run by run_classifier and the *Model functions.
epochs = 20

#### Reading the data from the excel sheet

In [3]:
def read_file(filename):
    """Load the data set from an Excel sheet.

    Reads the ``'SIT'`` and ``'Field1'`` columns of *filename* and returns
    them as two plain Python lists, in that order. The caller in this file
    treats the first as the sentences and the second as their labels.
    """
    frame = pandas.read_excel(filename)
    sit_column = frame['SIT'].values.tolist()
    field1_column = frame['Field1'].values.tolist()
    return sit_column, field1_column

#### Tokenizing the sentence

In [4]:
def tokenize_sentence(sentence):
    """Split *sentence* into a list of tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(sentence)
    return tokens

#### Lower Case

In [5]:
def lower_case(tokens):
    """Lower-case every token in place and return the same list object."""
    for position, token in enumerate(tokens):
        tokens[position] = token.lower()
    return tokens

#### Removing Stop-words
{‘ourselves’, ‘hers’, ‘between’, ‘yourself’, ‘but’, ‘again’, ‘there’, ‘about’, ‘once’, ‘during’, ‘out’, ‘very’, ‘having’, ‘with’, ‘they’, ‘own’, ‘an’, ‘be’, ‘some’, ‘for’, ‘do’, ‘its’, ‘yours’, ‘such’, ‘into’, ‘of’, ‘most’, ‘itself’, ‘other’, ‘off’, ‘is’, ‘s’, ‘am’, ‘or’, ‘who’, ‘as’, ‘from’, ‘him’, ‘each’, ‘the’, ‘themselves’, ‘until’, ‘below’, ‘are’, ‘we’, ‘these’, ‘your’, ‘his’, ‘through’, ‘don’, ‘nor’, ‘me’, ‘were’, ‘her’, ‘more’, ‘himself’, ‘this’, ‘down’, ‘should’, ‘our’, ‘their’, ‘while’, ‘above’, ‘both’, ‘up’, ‘to’, ‘ours’, ‘had’, ‘she’, ‘all’, ‘no’, ‘when’, ‘at’, ‘any’, ‘before’, ‘them’, ‘same’, ‘and’, ‘been’, ‘have’, ‘in’, ‘will’, ‘on’, ‘does’, ‘yourselves’, ‘then’, ‘that’, ‘because’, ‘what’, ‘over’, ‘why’, ‘so’, ‘can’, ‘did’, ‘not’, ‘now’, ‘under’, ‘he’, ‘you’, ‘herself’, ‘has’, ‘just’, ‘where’, ‘too’, ‘only’, ‘myself’, ‘which’, ‘those’, ‘i’, ‘after’, ‘few’, ‘whom’, ‘t’, ‘being’, ‘if’, ‘theirs’, ‘my’, ‘against’, ‘a’, ‘by’, ‘doing’, ‘it’, ‘how’, ‘further’, ‘was’, ‘here’, ‘than’}

In [6]:
def removeStopWords(tokens):
    """Return the tokens that are not in the module-level ``stop_words`` set.

    Order is preserved. The comparison is an exact (case-sensitive) match
    against the set; no normalisation is applied here.
    """
    kept = []
    for word in tokens:
        if word not in stop_words:
            kept.append(word)
    return kept

#### Lemmatizing the words

In [7]:
def lemmatize_words(tokens):
    """Return a new list with each token passed through the shared lemmatizer."""
    return list(map(lmtzr.lemmatize, tokens))

#### Stemming the words

In [8]:
def root_words(tokens):
    """Return a new list with each token reduced by the shared Porter stemmer."""
    return list(map(ps.stem, tokens))

#### Get n-grams word list

In [9]:
def get_ngrams(tokens,n):
    """Return the n-grams of *tokens* as a list of lists (one list per n-gram)."""
    grams = []
    for gram in ngrams(tokens, n):
        grams.append(list(gram[:n]))
    return grams

In [10]:
## Return all the ngrams from unigram to n-gram
def get_allngrams(tokens,n):
    """Return every k-gram of *tokens* for k = 1..n, as one flat list of lists.

    All unigrams come first, then all bigrams, and so on up to *n*.
    """
    return [
        list(gram[:size])
        for size in range(1, n + 1)
        for gram in ngrams(tokens, size)
    ]

#### Remove words of length 2 or less

In [11]:
def remove_small_words(tokens):
    """Return only the tokens longer than two characters, order preserved."""
    return [token for token in tokens if len(token) > 2]

#### Processing of data

In [12]:
def data_processing(sentence_list,label_list):
    """Turn raw sentences into per-sentence feature lists.

    Each sentence is tokenised, stripped of stop-words, lemmatised and
    filtered of short words; its n-grams (order taken from the module-level
    ``n``) are then sorted internally and flattened.

    Returns a pair ``(feature_list, vertical_stack)``: ``feature_list`` holds
    one flat list of features per sentence, ``vertical_stack`` is the
    concatenation of all of them. ``label_list`` is accepted but not used.
    """
    feature_list = []
    vertical_stack = []
    for sentence in sentence_list:
        words = removeStopWords(tokenize_sentence(sentence))

        # Lemmatisation is the active choice; root_words(words) would be the
        # stemming alternative.
        words = remove_small_words(lemmatize_words(words))

        # get_allngrams(words, n) would yield every order from 1 to n instead.
        sentence_features = []
        for gram in get_ngrams(words, n):
            gram.sort()
            vertical_stack.extend(gram)
            sentence_features.extend(gram)
        feature_list.append(sentence_features)

    return feature_list, vertical_stack

#### Unique Entries

In [13]:
def unique_entry(vertical_stack):
    """Return the distinct entries of *vertical_stack* in first-seen order.

    A set tracks what has already been emitted, so the whole pass is linear
    instead of re-scanning the output list for every entry. Entries must be
    hashable (they are strings everywhere in this file).
    """
    seen = set()
    processed_set = []
    for entry in vertical_stack:
        if entry not in seen:
            seen.add(entry)
            processed_set.append(entry)
    return processed_set

#### Scoring of features

In [14]:
def score_features(feature_list, processed_set, list_class, label_list):
    """Count feature occurrences per class and score every (class, feature) pair.

    Parameters
    ----------
    feature_list : list of feature lists, one per sentence.
    processed_set : list of distinct features; its order defines the columns.
    list_class : list of class labels; its order defines the rows.
    label_list : label of each sentence, aligned with ``feature_list``.

    Returns
    -------
    (weighted_log_likelihood, weighted_log_likelihood_score)
        Two float arrays of shape ``(len(list_class), len(processed_set))``.
        The first holds raw occurrence counts. The second holds, per cell,
        ``P(w|c) * log(P(w|c) / P(w|not c))``; when the feature never occurs
        outside the class the score is ``P(w|c) * 10`` instead, and it stays
        0 when the feature never occurs in the class.

    Raises
    ------
    KeyError
        If a label or feature is missing from ``list_class`` / ``processed_set``.
    """
    n_class = len(list_class)  # sized from the data rather than a fixed 7
    n_feature = len(processed_set)

    # First-occurrence maps give list.index() semantics with O(1) lookups.
    class_row = {}
    for idx, label in enumerate(list_class):
        class_row.setdefault(label, idx)
    feature_col = {}
    for idx, feature in enumerate(processed_set):
        feature_col.setdefault(feature, idx)

    weighted_log_likelihood = np.zeros((n_class, n_feature))
    weighted_log_likelihood_score = np.zeros((n_class, n_feature))

    for features, label in zip(feature_list, label_list):
        row = class_row[label]
        for feature in features:
            weighted_log_likelihood[row][feature_col[feature]] += 1

    col_sum = np.sum(weighted_log_likelihood, axis = 0)
    row_sum = np.sum(weighted_log_likelihood, axis = 1)
    total_sum = np.sum(row_sum)

    for i in range(n_class):
        # Total feature mass of every class except i.
        other_total = total_sum - row_sum[i]
        for j in range(n_feature):
            count = weighted_log_likelihood[i][j]
            # Guard the divisions so an empty class (or a single-class data
            # set) yields 0 instead of NaN.
            p_w_c = count / row_sum[i] if row_sum[i] else 0.0
            p_w_notc = (col_sum[j] - count) / other_total if other_total else 0.0

            if p_w_notc == 0:
                # Feature does not occur in any other class.
                weighted_log_likelihood_score[i][j] = p_w_c * 10
                continue

            ratio = p_w_c / p_w_notc
            if ratio > 0:
                weighted_log_likelihood_score[i][j] = p_w_c * math.log(ratio)
    return weighted_log_likelihood, weighted_log_likelihood_score
    

#### Top Scorers List

In [15]:
def top_scorers(weighted_log_likelihood_score):
    """Return each feature's best score and best class, highest score first.

    For every column of the score matrix the maximum value and the row index
    where it occurs are collected into a structured array with fields
    ``'score'`` (float) and ``'class'`` (int). The array is returned sorted
    by ``'score'`` in descending order, so position ``i`` of the result no
    longer corresponds to column ``i`` of the input.
    """
    best_scores = weighted_log_likelihood_score.max(0)
    best_classes = weighted_log_likelihood_score.argmax(0)

    ranked = np.empty(len(best_scores), dtype=[('score', float), ('class', int)])
    ranked['score'] = best_scores
    ranked['class'] = best_classes

    # Ascending sort on 'score', then flip to get descending order.
    ranked.sort(order='score')
    return ranked[::-1]

#### Choosing only the features that have a significant score

In [16]:
def filtered_words(top_contendants, processed_set, threshold=0.0009999):
    """Keep the entries of *processed_set* whose paired score exceeds *threshold*.

    ``top_contendants[i][0]`` is taken as the score of ``processed_set[i]``;
    the two sequences are paired purely by position.

    NOTE(review): ``top_scorers`` in this file returns its array sorted by
    score, so when its output is passed straight in, position ``i`` no longer
    refers to ``processed_set[i]`` and this keeps the first k words rather
    than the k high-scoring ones. Confirm with the caller whether an
    unsorted, column-aligned array should be supplied instead.

    Parameters
    ----------
    top_contendants : sequence of records whose first item is a score.
    processed_set : list of features, aligned by position with the scores.
    threshold : minimum score (exclusive) a feature needs to be kept;
        defaults to the previously hard-coded 0.0009999.
    """
    return [
        feature
        for i, feature in enumerate(processed_set)
        if top_contendants[i][0] > threshold
    ]
    

#### Bag of Words

In [17]:
def bag_of_words(feature_list, new_processed_set):
    """Build one count vector per sentence over the selected vocabulary.

    Parameters
    ----------
    feature_list : list of feature lists, one per sentence.
    new_processed_set : list of vocabulary features; its order defines the
        vector positions. Features must be hashable.

    Returns
    -------
    list of lists of int
        ``result[s][k]`` is how often ``new_processed_set[k]`` occurs in
        sentence ``s``. Features outside the vocabulary are ignored.
    """
    vocab_size = len(new_processed_set)

    # One-off position map (first occurrence wins, like list.index) so each
    # token costs a dict lookup instead of two scans of the vocabulary.
    column_of = {}
    for idx, feature in enumerate(new_processed_set):
        column_of.setdefault(feature, idx)

    word_frequency = []
    for features in feature_list:
        word_in_sentence = [0] * vocab_size
        for feature in features:
            key = column_of.get(feature)
            if key is not None:
                word_in_sentence[key] += 1
        word_frequency.append(word_in_sentence)

    return word_frequency

In [18]:
def bag_of_words_single(feature_list, new_processed_set):
    """Build the count vector of a single sentence over the selected vocabulary.

    Parameters
    ----------
    feature_list : flat list of the sentence's features.
    new_processed_set : list of vocabulary features; its order defines the
        vector positions. Features must be hashable.

    Returns
    -------
    list of int
        ``result[k]`` is how often ``new_processed_set[k]`` occurs in
        ``feature_list``. Features outside the vocabulary are ignored.
    """
    # Position map (first occurrence wins, like list.index) replaces the
    # per-token membership scan and index scan.
    column_of = {}
    for idx, feature in enumerate(new_processed_set):
        column_of.setdefault(feature, idx)

    word_in_sentence = [0] * len(new_processed_set)
    for feature in feature_list:
        key = column_of.get(feature)
        if key is not None:
            word_in_sentence[key] += 1

    return word_in_sentence

#### Training and Testing Data

### Naive Bayes Classifier

In [19]:
class NB:
    """Count-based Naive-Bayes-style classifier over bag-of-words vectors.

    ``fit`` accumulates per-class feature counts and normalises each class
    row to sum to 1. ``predict`` scores a class as the product, over the
    non-zero features of the sample, of ``count * P(feature | class)`` and
    returns the class with the largest product.

    ``max_iter`` and ``method`` are stored but not used by this class, and
    the class priors in ``probability_class`` are computed by ``fit`` but not
    consulted by ``predict``.
    """

    def __init__(self, max_iter = 1, method = "Normal"):
        self.max_iter = max_iter
        self.method = method
        self.n_feature = 0
        self.n_class = 0
        self.n_train = 0
        self.score = 0
        self.list_class = []
        self.probability_class = {}

    def fit(self, train_X, train_Y):
        """Estimate per-class feature distributions from the training data.

        ``train_X`` is a sequence of equal-length count vectors and
        ``train_Y`` the matching sequence of labels (it must support
        ``.count``, i.e. be a list or tuple).
        """
        self.train_X = train_X
        self.train_Y = train_Y
        self.n_feature = len(train_X[0])
        self.list_class = list(set(train_Y))
        self.n_class = len(self.list_class)
        self.n_train = len(train_Y)

        # Relative frequency of each label in the training set.
        self.probability_class = {
            label: train_Y.count(label) / self.n_train for label in self.list_class
        }

        # Raw feature counts per class; normalised below.
        self.probability_feature_class = [
            [0.0 for col in range(self.n_feature)] for row in range(self.n_class)
        ]

        # Resolve each label's row once instead of once per feature.
        class_row = {label: idx for idx, label in enumerate(self.list_class)}
        for sample, label in zip(train_X, train_Y):
            class_counts = self.probability_feature_class[class_row[label]]
            for j in range(self.n_feature):
                class_counts[j] += sample[j]

        # Normalise each class row so it sums to 1.
        for class_counts in self.probability_feature_class:
            total = sum(class_counts)
            if total == 0:
                # A class whose samples are all-zero vectors has nothing to
                # normalise; keep the zeros instead of dividing by zero.
                continue
            for j in range(self.n_feature):
                class_counts[j] /= total

    def predict(self, test_X):
        """Return the label whose likelihood product is largest for one sample.

        ``test_X`` is a single count vector. On ties the class that appears
        first in ``list_class`` wins.
        """
        max_proximal = []
        for class_probs in self.probability_feature_class:
            temp_prob = 1
            for j in range(self.n_feature):
                if test_X[j] != 0:
                    temp_prob *= test_X[j] * class_probs[j]
            max_proximal.append(temp_prob)

        return self.list_class[max_proximal.index(max(max_proximal))]

    def score_(self, test_X, test_Y):
        """Return (and store in ``self.score``) the accuracy on a test set."""
        correct = 0
        for i in range(len(test_X)):
            if self.predict(test_X[i]) == test_Y[i]:
                correct += 1

        self.score = correct/len(test_X)
        return self.score


### Maximum Entropy

In [20]:
class ME:
    """Entropy-weighted classifier over bag-of-words vectors.

    ``fit`` builds per-class feature counts and, for each (class, feature)
    pair with a non-zero count, stores
    ``P(feature) * P(class | feature) * log P(class | feature)`` in
    ``self.entropy``; all other cells stay at ``-inf``. ``predict`` sums the
    negated finite entries of the features present in a sample and returns
    the class with the largest total.

    ``max_iter`` and ``method`` are stored but not used by this class.
    """

    def __init__(self, max_iter = 1, method = "Normal"):
        self.max_iter = max_iter
        self.method = method
        self.n_feature = 0
        self.n_class = 0
        self.n_train = 0
        self.score = 0
        self.list_class = []

    def fit(self, train_X, train_Y):
        """Compute the per-class, per-feature entropy terms from training data.

        ``train_X`` is a sequence of equal-length count vectors and
        ``train_Y`` the matching sequence of labels.
        """
        self.train_X = train_X
        self.train_Y = train_Y
        self.n_feature = len(train_X[0])
        self.list_class = list(set(train_Y))
        self.n_class = len(self.list_class)
        self.n_train = len(train_Y)

        # Raw feature counts per class.
        self.probability_feature_class = np.zeros((self.n_class, self.n_feature))

        # Resolve each label's row once instead of once per feature.
        class_row = {label: idx for idx, label in enumerate(self.list_class)}
        for sample, label in zip(train_X, train_Y):
            class_counts = self.probability_feature_class[class_row[label]]
            for j in range(self.n_feature):
                class_counts[j] += sample[j]

        feature_sum = np.sum(self.probability_feature_class, axis = 0)
        self.sample_size = np.sum(feature_sum)
        self.entropy = np.full((self.n_class, self.n_feature), float("-inf"))

        for i in range(self.n_feature):
            if feature_sum[i] == 0:
                # Feature never seen in training: leave its column at -inf
                # rather than computing 0/0.
                continue
            feature_prb = feature_sum[i] / self.sample_size
            for j in range(self.n_class):
                prob_y_x = (1.0 * self.probability_feature_class[j][i]) / feature_sum[i]
                if prob_y_x > 0:
                    # log(1) would zero the term; nudge just below 1 so a
                    # feature exclusive to one class still contributes.
                    if prob_y_x == 1:
                        prob_y_x = 0.99999999999

                    self.entropy[j][i] = feature_prb * prob_y_x * math.log(prob_y_x)

    def predict(self, test_X):
        """Return the label with the largest summed negated entropy for one sample.

        ``test_X`` is a single count vector; only whether a feature is
        non-zero matters, not its count.
        """
        max_proximal = [0.0] * self.n_class

        for j in range(self.n_feature):
            if test_X[j] != 0:
                for i in range(self.n_class):
                    if self.entropy[i][j] != float("-inf"):
                        max_proximal[i] += (-1.0 * self.entropy[i][j])

        return self.list_class[np.argmax(max_proximal)]

    def score_(self, test_X, test_Y):
        """Print the number of correct predictions and return the accuracy.

        The accuracy is also stored in ``self.score``.
        """
        correct = 0.0
        for i in range(len(test_X)):
            if self.predict(test_X[i]) == test_Y[i]:
                correct += 1
        print(correct)
        self.score = correct/len(test_X)
        return self.score


### Support Vector Classifier

In [21]:
class Svc:
    """Majority-class voting classifier over bag-of-words vectors.

    ``fit`` records, for every feature, the class with the largest total
    count (``feature_argmax``) and that total (``feature_max``).
    ``predict`` lets each non-zero feature of a sample vote for its class
    with weight ``count * feature_max`` and returns the class with the most
    votes.

    ``max_iter`` and ``method`` are stored but not used by this class.
    """

    def __init__(self, max_iter = 1, method = "Normal"):
        self.max_iter = max_iter
        self.method = method
        self.n_feature = 0
        self.n_class = 0
        self.n_train = 0
        self.score = 0
        self.list_class = []

    def fit(self, train_X, train_Y):
        """Accumulate per-class feature totals and keep the winner per feature."""
        self.train_X = train_X
        self.train_Y = train_Y
        self.n_feature = len(train_X[0])
        self.list_class = list(set(train_Y))
        self.n_class = len(self.list_class)
        self.n_train = len(train_Y)

        # Per-class feature totals, one row per class.
        self.probability_feature_class = np.zeros((self.n_class, self.n_feature))

        for sample_idx in range(self.n_train):
            sample = train_X[sample_idx]
            class_totals = self.probability_feature_class[
                self.list_class.index(train_Y[sample_idx])
            ]
            for feature_idx in range(self.n_feature):
                class_totals[feature_idx] += sample[feature_idx]

        # For each feature: the largest class total and which class owns it.
        self.feature_max = np.max(self.probability_feature_class, axis = 0)
        self.feature_argmax = np.argmax(self.probability_feature_class, axis = 0)

    def predict(self, test_X):
        """Return the label that collects the most weighted votes for one sample."""
        votes = [0.0] * self.n_class
        for feature_idx in range(self.n_feature):
            count = test_X[feature_idx]
            if count == 0:
                continue
            votes[self.feature_argmax[feature_idx]] += count * self.feature_max[feature_idx]

        return self.list_class[np.argmax(votes)]

    def score_(self, test_X, test_Y):
        """Print the number of correct predictions and return the accuracy.

        The accuracy is also stored in ``self.score``.
        """
        hits = 0.0
        for sample_idx, sample in enumerate(test_X):
            if self.predict(sample) == test_Y[sample_idx]:
                hits += 1
        print(hits)
        self.score = hits/len(test_X)
        return self.score


### Standard Models

#### SVC

In [22]:
def svmModel(word_frequency, label_list):
    """Evaluate scikit-learn's ``LinearSVC`` over ``epochs`` random splits.

    Every epoch shuffles the (vector, label) pairs, trains on the first
    ``train_count`` of them, scores on the next ``test_count`` and prints
    the rounded accuracy and elapsed time. Nothing is returned.
    """
    svc = LinearSVC()

    score = []
    combined_data = list(zip(word_frequency, label_list))

    for i in range(epochs):
        print("Epoch", i+1)
        start_time = time.time()

        # Re-shuffle so each epoch sees a different train/test split.
        random.shuffle(combined_data)
        word_frequency, label_list = zip(*combined_data)

        train_X = word_frequency[:train_count]
        test_X = word_frequency[train_count:train_count+test_count]
        train_Y = label_list[:train_count]
        test_Y = label_list[train_count:train_count + test_count]

        svc.fit(train_X, train_Y)
        score.append(round(svc.score(test_X, test_Y), 4))
        print("Score", score[i])

        end_time = time.time()
        print("Training took: ", round(end_time - start_time,4))

#### Maximum Entropy

In [23]:
def entropyModel(word_frequency, label_list):
    """Evaluate scikit-learn's ``LogisticRegression`` over ``epochs`` random splits.

    The model is created with ``C = 1e5``. Every epoch shuffles the
    (vector, label) pairs, trains on the first ``train_count`` of them,
    scores on the next ``test_count`` and prints the rounded accuracy and
    elapsed time. Nothing is returned.
    """
    logreg = linear_model.LogisticRegression(C = 1e5)

    score = []
    combined_data = list(zip(word_frequency, label_list))

    for i in range(epochs):
        print("Epoch", i+1)
        start_time = time.time()

        # Re-shuffle so each epoch sees a different train/test split.
        random.shuffle(combined_data)
        word_frequency, label_list = zip(*combined_data)

        train_X = word_frequency[:train_count]
        test_X = word_frequency[train_count:train_count+test_count]
        train_Y = label_list[:train_count]
        test_Y = label_list[train_count:train_count + test_count]

        logreg.fit(train_X, train_Y)
        score.append(round(logreg.score(test_X, test_Y), 4))
        print("Score", score[i])

        end_time = time.time()
        print("Training took: ", round(end_time - start_time,4))

#### Naive Bayes

In [24]:
def nbModel(word_frequency, label_list):
    """Evaluate scikit-learn's ``GaussianNB`` over ``epochs`` random splits.

    Every epoch shuffles the (vector, label) pairs, trains on the first
    ``train_count`` of them, scores on the next ``test_count`` and prints
    the rounded accuracy and elapsed time. Nothing is returned.
    """
    gnb = GaussianNB()

    score = []
    combined_data = list(zip(word_frequency, label_list))

    for i in range(epochs):
        print("Epoch", i+1)
        start_time = time.time()

        # Re-shuffle so each epoch sees a different train/test split.
        random.shuffle(combined_data)
        word_frequency, label_list = zip(*combined_data)

        train_X = word_frequency[:train_count]
        test_X = word_frequency[train_count:train_count+test_count]
        train_Y = label_list[:train_count]
        test_Y = label_list[train_count:train_count + test_count]

        gnb.fit(train_X, train_Y)
        score.append(round(gnb.score(test_X, test_Y), 4))
        print("Score", score[i])

        end_time = time.time()
        print("Training took: ", round(end_time - start_time,4))

### Interaction with GUI

In [25]:
def gui_process(sentence, new_processed_set, classifier):
    """Classify one sentence typed into the GUI and show the result.

    The sentence goes through the same tokenise / stop-word / lemmatise /
    short-word / n-gram steps as the training data. Its distinct features
    are printed, turned into a vector over ``new_processed_set`` and handed
    to ``classifier.predict`` as a one-row batch; the prediction is printed
    and passed to ``gui.show_image``. Nothing happens after the first print
    when no features survive preprocessing.

    Because only the distinct features are vectorised, every entry of the
    vector is 0 or 1.

    NOTE(review): ``predict`` receives a list containing one vector, which
    matches how the scikit-learn classifier is used by the caller; the NB /
    ME / Svc classes in this file index a single flat vector instead.
    """
    token = tokenize_sentence(sentence)
    token = removeStopWords(token)
    token = lemmatize_words(token)
    token = remove_small_words(token)
    # get_allngrams(token, n) would yield every order from 1 to n instead.
    ngram = get_ngrams(token,n)

    vertical_stack1 = []
    for gram in ngram:
        gram.sort()
        vertical_stack1.extend(gram)
    processed_set1 = unique_entry(vertical_stack1)
    print(processed_set1)

    if processed_set1:
        word_frequency = bag_of_words_single(processed_set1, new_processed_set)
        predicted_Y = classifier.predict([word_frequency])
        print(predicted_Y)
        gui.show_image(predicted_Y)


#### Run the Classifier for 20 epochs

In [26]:
def run_classifier(word_frequency, label_list, classifier = None):
    """Train and score a classifier over ``epochs`` random splits.

    Parameters
    ----------
    word_frequency : sequence of count vectors.
    label_list : labels aligned with ``word_frequency``.
    classifier : object exposing ``fit(train_X, train_Y)`` and
        ``score_(test_X, test_Y)``. When omitted a fresh ``NB()`` is created
        for this call, so no instance is shared between calls.

    Every epoch shuffles the pairs, trains on the first ``train_count`` of
    them, scores on the next ``test_count`` and prints the elapsed time and
    score. Nothing is returned.
    """
    if classifier is None:
        classifier = NB()

    score = []
    combined_data = list(zip(word_frequency, label_list))

    for i in range(epochs):
        print("Epoch", i+1)
        start_time = time.time()

        # Re-shuffle so each epoch sees a different train/test split.
        random.shuffle(combined_data)
        word_frequency, label_list = zip(*combined_data)

        train_X = word_frequency[:train_count]
        test_X = word_frequency[train_count:train_count+test_count]
        train_Y = label_list[:train_count]
        test_Y = label_list[train_count:train_count + test_count]

        classifier.fit(train_X, train_Y)
        score.append(classifier.score_(test_X, test_Y))
        end_time = time.time()
        print("Time", round(end_time - start_time,4), "Score", score[i])

### Main Function

In [27]:
def sentiment_extraction(
    data_path="/Users/abhianshusingla/Documents/Multi-Sentiment Analysis/data/new_data.xlsx",
    poll_interval=0.05,
):
    """Train the sentiment model, then classify sentences coming from the GUI.

    Parameters
    ----------
    data_path : Excel file handed to ``read_file``; defaults to the path
        that was previously hard-coded.
    poll_interval : seconds to sleep between polls of ``gui.getSentence()``.

    The data is preprocessed, features are scored and filtered, and a
    ``LinearSVC`` is trained on the first ``train_count`` vectors (in file
    order, without shuffling) and scored on the next ``test_count``; the
    accuracy is printed. The function then loops forever, calling
    ``gui_process`` whenever the sentence returned by the GUI changes, so it
    never returns and is meant to run on its own thread.
    """
    # #### Calling Processing functions
    sentence_list,label_list = read_file(data_path)
    list_class = list(set(label_list))
    feature_list, vertical_stack = data_processing(sentence_list,label_list)
    processed_set = unique_entry(vertical_stack)
    # Only the scores are needed below; the raw counts are discarded.
    _, weighted_log_likelihood_score = score_features(feature_list, processed_set,list_class,label_list)
    top_contendants = top_scorers(weighted_log_likelihood_score)
    new_processed_set = filtered_words(top_contendants, processed_set)
    word_frequency = bag_of_words(feature_list, new_processed_set)

    print("Start Training")

    # Optional experiments, kept disabled:
#     print("### Naive Bayes ###")
#     run_classifier(word_frequency, label_list, classifier = NB())

#     print("### Maximum Entropy ###")
#     run_classifier(word_frequency, label_list, classifier = ME())

#     print("### Support Vector Machine ###")
#     run_classifier(word_frequency, label_list, classifier = Svc())

#     print("### Standard SVM ###")
#     svmModel(word_frequency, label_list)

#     print("### Standard Entropy ###")
#     entropyModel(word_frequency, label_list)

#     print("### Standard Naive Bayes ###")
#     nbModel(word_frequency, label_list)

    # Training
    train_X = word_frequency[:train_count]
    test_X = word_frequency[train_count:train_count+test_count]
    train_Y = label_list[:train_count]
    test_Y = label_list[train_count:train_count + test_count]

    classifier = LinearSVC()
    classifier.fit(train_X, train_Y)
    score = classifier.score(test_X, test_Y)
    print(score)

    sentence = ""
    while(True):
        new_sentence = gui.getSentence()
        if(sentence != new_sentence):
            sentence = new_sentence
            gui_process(sentence, new_processed_set,classifier)
        # Yield the CPU between polls instead of spinning in a tight loop.
        time.sleep(poll_interval)


#### Thread Part

In [28]:
# Run the sentiment pipeline on a background daemon thread, so it does not
# keep the process alive once the main thread finishes.
Sentiment_thread = threading.Thread(target=sentiment_extraction, daemon=True)
Sentiment_thread.start()

# Start the GUI from the main thread.
gui.start()

Start Training
0.507
['Passed', 'exam']
['joy']
[]
[]
['Mad']
['anger']
['Mad']
['anger']
['Mad']
['anger']
['Mad']
['anger']
['Mad']
['anger']
['Mad']
['anger']
['Mad']
['anger']
['Mad']
['anger']
['Mad']
['anger']
['Mad']
['anger']
['Mad', 'fri']
['anger']
['Mad', 'frie']
['anger']
['Mad', 'frien']
['anger']
['Mad', 'friend']
['anger']
[]
[]
['Say']
['joy']
['Sayi']
['joy']
['Sayin']
['joy']
['Saying']
['joy']
['Saying']
['joy']
['Saying']
['joy']
['Saying']
['joy']
['Saying', 'goo']
['joy']
['Saying', 'good']
['joy']
['Saying', 'good-']
['joy']
['Saying', 'good-b']
['joy']
['Saying', 'good-by']
['joy']
['Saying', 'good-bye']
['sadness']
['Saying', 'good-bye']
['sadness']
['Saying', 'good-bye']
['sadness']
['Saying', 'good-bye']
['sadness']
['Saying', 'good-bye']
['sadness']
['Saying', 'good-bye']
['sadness']
['Saying', 'good-bye']
['sadness']
['Saying', 'good-bye']
['sadness']
['Saying', 'good-bye']
['sadness']
['Saying', 'good-bye']
['sadness']
['Saying', 'good-bye']
['sadness']
['

Exception in Tkinter callback
Traceback (most recent call last):
  File "/Users/abhianshusingla/anaconda3/lib/python3.6/tkinter/__init__.py", line 1699, in __call__
    return self.func(*args)
  File "/Users/abhianshusingla/anaconda3/lib/python3.6/tkinter/__init__.py", line 2057, in destroy
    Misc.destroy(self)
  File "/Users/abhianshusingla/anaconda3/lib/python3.6/tkinter/__init__.py", line 589, in destroy
    self.tk.deletecommand(name)
_tkinter.TclError: can't delete Tcl command
