## Import Statements

In [206]:
import gensim.downloader as api
import numpy as np
import re
import csv
import pandas as pd
import pprint
import string
import nltk
import sys
import spacy

from IPython.display import HTML
from nltk.corpus import wordnet 
from sklearn.manifold import TSNE
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize

# Relative locations of the dataset folder and of the helper scripts that
# live next to the data.
pathToDatasets = '../datasets/'
pathToDataScripts = '../datasets/scripts/'
# Path to a word2vec binary; not referenced by any cell visible in this notebook.
filePath = '../datasets/GoogleNews-vectors-negative300.bin'

# Put the scripts folder first on the import path so the `cleanDataset`
# module imported just below can be resolved.
sys.path.insert(0, pathToDataScripts)
from cleanDataset import tokenize_words 


# Inject a CSS rule into the notebook page so the `.container` element
# spans the full browser width.
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Downloading binaries and models


In [207]:
# Ask whether the embedding model and the NLTK resources should be (re)loaded.
print("Should I reload the model?")
tstString = input()
# A "no" answer is only honoured when a model is already in memory. On a fresh
# kernel `word_vectors` does not exist yet, and skipping the load would make
# every later cell fail with a NameError.
if "no" in tstString.lower() and "word_vectors" in globals():
    print(" didnt reload model! ")
else:
    print("loading the model!")
    word_vectors = api.load("glove-wiki-gigaword-100")
    nltk.download('vader_lexicon')
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')
    nltk.download('wordnet')
    print("Model Loaded!")

Should I reload the model?
no
 didnt reload model! 


## Global Variables and Global Objects

In [208]:
# VADER analyzer shared by every cell that scores a sentence or a word.
senty = SentimentIntensityAnalyzer()

# Container used for `word in vocabulary` membership tests.
# gensim >= 4 replaced `KeyedVectors.vocab` with `key_to_index`; fall back to
# the old attribute so the notebook keeps working on gensim 3.x as well.
vocabulary = getattr(word_vectors, "key_to_index", None)
if vocabulary is None:
    vocabulary = word_vectors.vocab

# spaCy >= 3 no longer resolves the 'en' shortcut and raises OSError for it;
# try the shortcut first (spaCy 2.x), then the explicit package name.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')


# Number of nearest neighbours requested per word from the embedding model.
NUMBER_OF_ALTERNATIVES = 5
# Passed as `skiprows` / `nrows` when reading the tweets CSV.
TWEET_START = 104
NUM_OF_TWEETS = 5


# VERBOSE_PRINTING = True
VERBOSE_PRINTING = False

# USE_SPACY = False
USE_SPACY = True

COLOR_PRINTING = True
#COLOR_PRINTING = False

# PRINT_NEUTRAL = True
PRINT_NEUTRAL = False

# Characters removed from text before tokenizing (see cleanAndTokenizeText).
punctuation = r"\"#$%&'+-/;<=>?@[\]*^_`{|}~"

## Class for Sentences


In [209]:
from SentenceClass import *

## Utility Code

In [210]:
def printStrings(sentenceObj):
    '''
    Display every generated sentence of `sentenceObj` with its compound
    sentiment score, coloured relative to the original sentiment.

    black -> score equals the original score or is exactly 0.0
    green -> score is higher than the original
    red   -> score is lower than the original
    '''
    candidates = generateHTMLObjectsFromSentence(sentenceObj)
    baseline = sentenceObj.ogSentiment
    for candidate in candidates:
        plainText = candidate.getSentence()
        markup = candidate.getHTML()
        score = senty.polarity_scores(plainText)['compound']

        if score == baseline or score == 0.0:
            colour = 'black'
        elif score > baseline:
            colour = 'green'
        elif score < baseline:
            # score != 0.0 is already guaranteed by the first branch
            colour = 'red'
        else:
            continue
        displayText("{0}: {1}".format(markup, score), colour)

                
#             elif(sentimentOfNewString == 0.0):
#                 if(PRINT_NEUTRAL==True):
#                     displayText("{0}: {1}".format(htmlText,sentimentOfNewString), 'DarkGray');
#                 else:
#                     continue


def cstr(s, color='black', italics=False):
    '''
    Wrap `s` in a coloured (and optionally italic) HTML `<text>` element.

    When COLOR_PRINTING is off the value is returned as a plain string with
    no markup at all, regardless of `italics`.
    '''
    if not COLOR_PRINTING:
        return "{}".format(s)
    inner = "<i>{0}</i>".format(s) if italics else s
    return "<text style=color:{}>{}</text>".format(color, inner)

def displayText(text, color='black'):
    '''
    Show `text` in the notebook: as coloured HTML when COLOR_PRINTING is on,
    otherwise through a plain print.
    '''
    if not COLOR_PRINTING:
        print("{}".format(text))
    else:
        display(HTML(cstr(text, color)))
    
    
def cleanAndTokenizeText(text):
    '''
    Lower-case `text`, drop every character listed in the module-level
    `punctuation` string and return the NLTK word tokens of what is left.
    '''
    lowered = text.lower()
    stripped = "".join(ch for ch in lowered if ch not in punctuation)
    return word_tokenize(stripped)

def getPOSTags(tweet):
    '''
    Return a list of (token, tag) pairs for the token list `tweet`.

    With USE_SPACY off the tags come from `nltk.pos_tag`. With USE_SPACY on
    they are spaCy's coarse `pos_` tags.

    Callers index the result by position in `tweet`, so the output must hold
    exactly one pair per input token. Joining the tokens and letting spaCy
    re-tokenize the string does not guarantee that (spaCy may split a token
    that the NLTK tokenizer kept whole). The Doc is therefore built directly
    from the given words and only the pipeline components are run on it.
    '''
    if not USE_SPACY:
        return nltk.pos_tag(tweet)

    from spacy.tokens import Doc  # local import keeps this block self-contained

    doc = Doc(nlp.vocab, words=list(tweet))
    for _name, component in nlp.pipeline:
        doc = component(doc)
    # the expected format is (text, tag)
    return [(token.text, token.pos_) for token in doc]
    

def getAntonymsOfWords(word):
    '''
    Collect WordNet antonyms for `word`.

    Words missing from `vocabulary` yield an empty list. For every lemma of
    every synset only the first antonym is kept; duplicates are dropped.
    '''
    if word not in vocabulary:
        return []

    found = set()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            opposites = lemma.antonyms()
            if opposites == []:
                continue
            found.add(opposites[0].name())

    if not found and VERBOSE_PRINTING:
        print("No antonyms found for word {0}".format(word))
    return list(found)

def returnReplacementsForWord(word):
    '''
    Return candidate replacements for `word`: its nearest neighbours in the
    embedding model followed by any WordNet antonyms.

    A word outside `vocabulary` yields an empty list (and a printed notice).
    '''
    if word not in vocabulary:
        print(" --- {0} not in vocabulary ---".format(word))
        return []

    neighbours = word_vectors.most_similar(word, topn=NUMBER_OF_ALTERNATIVES)
    candidates = [entry[0] for entry in neighbours]
    if not candidates:
        print(" --- No replacements for word {0} ---".format(word))

    opposites = getAntonymsOfWords(word)
    if opposites != []:
        candidates.extend(opposites)
        if VERBOSE_PRINTING:
            print("Some antonyms for word {0} are {1}".format(word, opposites[:3]))
    return candidates
    
def posApprovedReplacements(alternativeWords, userTokens, indexOfToken):
    '''
    Keep only the alternatives that receive the same POS tag as the original
    token when substituted at `indexOfToken` in `userTokens`.

    Returns a list (possibly empty) in the order of `alternativeWords`.
    `userTokens` itself is not modified; substitutions happen on a copy.
    '''
    if alternativeWords == []:
        return []

    trialTokens = userTokens[:]
    originalTag = getPOSTags(trialTokens)[indexOfToken][1]
    originalWord = userTokens[indexOfToken]

    accepted = []
    for candidate in alternativeWords:
        trialTokens[indexOfToken] = candidate
        candidateTag = getPOSTags(trialTokens)[indexOfToken][1]
        if str(candidateTag) != str(originalTag):
            continue
        if VERBOSE_PRINTING:
            print("Word {0}[{1}] replaced with {2}[{3}]".format(originalWord, originalTag, candidate, candidateTag))
        accepted.append(candidate)

    if VERBOSE_PRINTING and accepted == []:
        print("No POS words found for word {} with tag {}".format(originalWord, originalTag))
    return accepted
    
def generateHTMLObjectsFromSentence(sentenceObj):
    '''
    Build one SentenceWithHTML per final sentence of `sentenceObj`.

    Each sentence is re-tokenized and the tokens at the indices present in
    `sentenceObj.indexToSetOfWords` are wrapped in blue italic brackets.
    '''
    finalSentences = sentenceObj.getFinalSentences()
    changedIndices = list(sentenceObj.indexToSetOfWords.keys())

    wrapped = []
    for sentence in finalSentences:
        tokens = cleanAndTokenizeText(sentence)
        for position in changedIndices:
            tokens[position] = cstr("[{0}]".format(tokens[position]), "blue", italics=True)
        wrapped.append(SentenceWithHTML(sentence, ' '.join(tokens)))
    return wrapped
    
    
    

## Sentence Chunking and Appending

In [211]:
def helper_combine(mainList, myList):
    '''
    Helper for combineSentenceChunks.

    Prepends every chunk of `myList` to every chunk of `mainList` and returns
    the concatenations, grouped by `myList` entry.
    '''
    combined = []
    for prefix in myList:
        for suffix in mainList:
            if VERBOSE_PRINTING:
                print("Combining {0} with {1}".format(' '.join(prefix), ' '.join(suffix)))
            combined.append(prefix + suffix)
    return combined

def combineSentenceChunks(wholeSentence, dictOfChunks):
    '''
        Uses the helper_combine function to generate all possible combinations of the chunks
        and their alternatives.

        The logic is to start from the end of the sentence and prepend each possible chunk of
        the previous key to every combination built so far.

        The helper function lets us reuse the list of already generated combinations and keep
        extending it key by key.

        To understand the logic better, take a look at this gist:
        https://gist.github.com/sunnyMiglani/cf85407a9e6928237b1436cc2bc95fa4

        wholeSentence is accepted for interface compatibility but is not used.
        dictOfChunks maps a token index to the list of chunks starting at that index.
        Returns a list of token lists; an empty dict yields an empty list.
    '''
    if not dictOfChunks:
        # Nothing to combine; avoids indexing into an empty key list.
        return []

    reversedKeys = sorted(dictOfChunks.keys(), reverse=True)
    mainArr = dictOfChunks[reversedKeys[0]]
    for key in reversedKeys[1:]:
        mainArr = helper_combine(mainArr, dictOfChunks[key])
    return mainArr
        
def generateSentenceChunks(wholeSentence, keyToChange, nextKey, listOfMyAlternatives):
    '''
        Builds the chunk of the sentence running from `keyToChange` up to (not including)
        `nextKey`, once per alternative word and once more with the original word.

        Example:
        "I really <hate> hot chocolate, but I <prefer> hot coffee"
        Calling generateSentenceChunks will create an example chunk:
            - "<altWordForHate> hot chocolate , but I "

        Remember to prepend the first stretch of the sentence to the first key's chunks
        for proper use!
    '''
    candidates = list(listOfMyAlternatives)
    candidates.append(wholeSentence[keyToChange])

    chunks = []
    for candidate in candidates:
        variant = wholeSentence[:]
        variant[keyToChange] = candidate
        chunk = variant[keyToChange:nextKey]
        if VERBOSE_PRINTING:
            print("Generated : {}".format(chunk))
        chunks.append(chunk)
    return chunks
    
def returnCombinationsOfStrings(sentenceObj):
    '''
    Generate every combination of the per-index alternatives stored on
    `sentenceObj`, store the resulting sentences on it and display them.

    Steps:
      1. tokenize the original sentence;
      2. for each index that has alternatives, build the chunks that run from
         that index to the next such index (or to the end of the sentence);
      3. prepend the untouched start of the sentence to the first chunks;
      4. combine the chunks, join each combination into a string, replace the
         object's final sentences with them and print them.

    Returns the same `sentenceObj`. If no index has alternatives the object is
    returned untouched and nothing is printed.
    '''
    indexToWordDict = sentenceObj.indexToSetOfWords
    tokenizedSentence = cleanAndTokenizeText(sentenceObj.ogSentence)

    sortedKeys = sorted(indexToWordDict.keys())
    if not sortedKeys:
        # No replaceable word: nothing to combine (previously an IndexError).
        return sentenceObj

    sentenceChunks = {}
    for position, key in enumerate(sortedKeys):
        isLast = position + 1 >= len(sortedKeys)
        nextKey = len(tokenizedSentence) if isLast else sortedKeys[position + 1]
        sentenceChunks[key] = generateSentenceChunks(tokenizedSentence, key, nextKey, indexToWordDict[key])

    firstKey = sortedKeys[0]
    if firstKey != 0:
        # The tokens before the first replaceable word belong to no chunk yet.
        leadingTokens = tokenizedSentence[0:firstKey]
        sentenceChunks[firstKey] = [leadingTokens + chunk for chunk in sentenceChunks[firstKey]]

    finalOptions = combineSentenceChunks(tokenizedSentence, sentenceChunks)
    finalSentences = [' '.join(tokens) for tokens in finalOptions]

    sentenceObj.resetFinalSentences()
    sentenceObj.addFinalSentences(finalSentences)

    printStrings(sentenceObj)
    return sentenceObj

In [212]:
def getAlternativeSentences(sentenceObj):
    '''
    For every sentiment-bearing token of the original sentence, look up
    replacement words, keep those with a matching POS tag and record them on
    `sentenceObj`.

    Per token:
      - tokens whose own compound score is 0.0 get no replacements;
      - accepted replacements are registered with addAlternativesByIndex;
      - one SentenceWithHTML per replacement (single-word substitution) is
        built and handed to addAlternativeStrings.

    addAlternativeStrings is also called with an empty list for neutral
    tokens, but not for tokens skipped because no replacement was found.

    Returns the same `sentenceObj`.
    '''
    sentenceTokens = cleanAndTokenizeText(sentenceObj.ogSentence)

    for ind, word in enumerate(sentenceTokens):
        alternativeSentenceWithHTML = []

        if senty.polarity_scores(word)['compound'] != 0.0:
            tempReplacements = returnReplacementsForWord(word)  # get embedding based relations
            if tempReplacements == []:
                print("No replacements found at all for word {0}".format(word))
                continue
            replacements = posApprovedReplacements(tempReplacements[:], sentenceTokens[:], ind)
            if replacements == []:
                print(" -- No POS approved words! -- for word {0}\n some non-POS:{1}".format(word, tempReplacements[:4]))
                continue
            sentenceObj.addAlternativesByIndex(ind, replacements)

            # Generate new sentences by switching that word; each variant
            # starts from a fresh copy so no substitution leaks into the next.
            for newWord in replacements:
                plainTokens = sentenceTokens[:]
                plainTokens[ind] = newWord
                htmlFriendlyTokens = sentenceTokens[:]
                htmlFriendlyTokens[ind] = cstr("[{0}]".format(newWord), "blue", italics=True)
                alternativeSentenceWithHTML.append(
                    SentenceWithHTML(' '.join(plainTokens), ' '.join(htmlFriendlyTokens))
                )
        sentenceObj.addAlternativeStrings(alternativeSentenceWithHTML)

    return sentenceObj




In [213]:
def extractTwitterDataset():
    '''
    Read NUM_OF_TWEETS rows from cleanedTweets.csv after skipping the first
    TWEET_START lines of the file, and return them as a NumPy array.

    NOTE(review): with an integer `skiprows` the first line that is not
    skipped is parsed as the header row, so it never appears in the returned
    data - confirm that this is the intended window.
    '''
    csvPath = pathToDatasets + 'cleanedTweets.csv'
    frame = pd.read_csv(csvPath, skiprows=TWEET_START, nrows=NUM_OF_TWEETS)
    return frame.values


def runThroughTweets():
    '''
    Run the replacement pipeline over the tweets returned by
    extractTwitterDataset.

    Tweets with a compound sentiment of 0 are skipped. For the others the
    original tweet and score are printed, alternatives are generated, and -
    when any exist - every combination is built and displayed.
    '''
    for row in extractTwitterDataset():
        tweet = row[0]
        mainSentiment = senty.polarity_scores(tweet)['compound']
        if mainSentiment == 0:
            continue
        print("\n {0}:{1}\n".format(tweet,mainSentiment))
        sentenceObj = Sentence(tweet, mainSentiment)
        sentenceObj = getAlternativeSentences(sentenceObj)
        # Test for None/empty BEFORE touching the value: slicing first made
        # the None check unreachable (None[:] raises TypeError).
        alternateTweets = sentenceObj.alternateSentences
        if not alternateTweets:
            print(" -- No new Strings generated ---\n\n")
            continue
        sentenceObj = returnCombinationsOfStrings(sentenceObj)
    
# runThroughTweets()

## Testing Code


In [214]:
# Sample word for ad-hoc experiments with testOneWord.
specificWord = "good"
def testOneWord(word=""):
    '''Print `word`; an empty string is silently ignored.'''
    if word != "":
        print(word)

In [215]:
def specificString(textString=""):
    '''
    Run the replacement pipeline on a single sentence.

    Empty or None input is ignored. A sentence with a compound sentiment of 0
    is reported and skipped. Otherwise the sentence and its score are printed,
    alternatives are generated and - when any exist - every combination is
    built and displayed.

    The former module-level `specificString = ""` assignment was removed: it
    was overwritten by this definition on the very next statement.
    '''
    if textString is None or textString == "":
        return
    mainSentiment = senty.polarity_scores(textString)['compound']
    if mainSentiment == 0:
        print("{} \n No sentiment found in sentence".format(textString))
        return
    print("\n {0}:{1}\n".format(textString,mainSentiment))
    sentenceObj = Sentence(textString, mainSentiment)
    sentenceObj = getAlternativeSentences(sentenceObj)
    # Test for None/empty BEFORE touching the value: slicing first made the
    # None check unreachable (None[:] raises TypeError).
    alternateTweets = sentenceObj.alternateSentences
    if not alternateTweets:
        print(" -- No new Strings generated ---\n\n")
        return
    sentenceObj = returnCombinationsOfStrings(sentenceObj)
    


# Try the pipeline on one hand-written, positively scored sentence.
specificString("This is such a good sentence!")


 This is such a good sentence!:0.5411

