## Import Statements

In [40]:
import gensim.downloader as api
import numpy as np
import re
import csv
import pandas as pd
import pprint
import string
import nltk
import sys
import spacy

from IPython.display import HTML
from nltk.corpus import wordnet 
from sklearn.manifold import TSNE
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize

# Locations of the datasets and of the helper scripts, relative to this notebook.
pathToDatasets = '../datasets/'
pathToDataScripts = '../datasets/scripts/'
filePath = '../datasets/GoogleNews-vectors-negative300.bin'

# Make the helper scripts importable. Guarded so that re-running this cell
# does not keep stacking duplicate entries onto sys.path.
if pathToDataScripts not in sys.path:
    sys.path.insert(0, pathToDataScripts)
from cleanDataset import tokenize_words 

# `display` lives in IPython.display; importing it from IPython.core.display
# is deprecated (and removed in recent IPython releases).
from IPython.display import display, HTML
# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

## Downloading binaries and models


In [52]:
# Load the pre-trained "glove-wiki-gigaword-100" vectors through gensim's
# downloader API; the result is used for vocabulary checks and most_similar().
word_vectors = api.load("glove-wiki-gigaword-100")
# NLTK resources used by this notebook.
# vader_lexicon: needed by SentimentIntensityAnalyzer.
nltk.download('vader_lexicon')
# punkt: needed by word_tokenize.
nltk.download('punkt')
# averaged_perceptron_tagger: presumably for nltk.pos_tag, which is only
# referenced in a commented-out line of getPOSTags -- TODO confirm still needed.
nltk.download('averaged_perceptron_tagger')
# wordnet: needed for the antonym lookup via nltk.corpus.wordnet.
nltk.download('wordnet')



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Global Variables and Global Objects

In [87]:
# Sentiment scorer shared by every cell below.
senty = SentimentIntensityAnalyzer()

# Membership test target for "is this word known to the embedding model?".
# gensim >= 4.0 exposes the vocabulary as `key_to_index`; older releases only
# have `vocab`. Only `word in vocabulary` is used, so either mapping works.
try:
    vocabulary = word_vectors.key_to_index
except AttributeError:
    vocabulary = word_vectors.vocab

# spaCy pipeline used for part-of-speech tagging. The 'en' shortcut is tried
# first to keep the original behaviour; spaCy >= 3.0 no longer supports
# shortcut names and raises OSError, in which case the packaged model name is used.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')


# How many nearest neighbours to request from the embedding model per word.
NUMBER_OF_ALTERNATIVES = 7
# Value passed as `skiprows` when reading the tweets CSV.
TWEET_START = 50
# Value passed as `nrows` when reading the tweets CSV.
NUM_OF_TWEETS = 10
# Set to True to print per-word diagnostics from the helper functions.
VERBOSE_PRINTING = False


# Characters stripped from text before tokenizing (see cleanAndTokenizeText).
punctuation = r"\"#$%&'()+-/:;<=>?@[\]*^_`{|}~"

## Class for Sentences


In [105]:
class Sentence:
    '''
        Container for one original sentence, its sentiment score and every
        alternative generated from it.

        Attributes set in __init__:
            ogSentence          -- the original sentence as passed in
            ogSentiment         -- the sentiment score of the original sentence
            indexToSetOfWords   -- dict: token index -> set of replacement words
            alternateSentences  -- list of generated plain-text sentences
            alternateHTMLs      -- list of generated HTML-highlighted sentences
            finalShiftSentences -- list of sentences selected as final output
    '''

    # Class-level default kept for backward compatibility; __init__ always
    # shadows it with a per-instance value.
    ogSentence = []

    def __init__(self, sentence, sentiment):
        self.ogSentence = sentence
        self.ogSentiment = sentiment
        self.indexToSetOfWords = {}
        self.alternateSentences = []
        self.alternateHTMLs = []
        self.finalShiftSentences = []

    def addAlternativesByIndex(self, index, listOfAlternatives):
        '''
            Adds the list of possible alternative words that 
            can be used per word based on the index of the word in the tokenized 
            sentence. (from cleanAndTokenizeText())

            Repeated calls for the same index merge into the existing set.
        '''
        # Merge into the set stored for this index. The previous code called
        # .union on the dict itself, which raised AttributeError.
        self.indexToSetOfWords.setdefault(index, set()).update(listOfAlternatives)

    def addAlternativeStrings(self, strings):
        '''
            Appends one sentence (str) or several (iterable of str) to
            alternateSentences, preserving order.
        '''
        if isinstance(strings, str):
            # No de-duplication here: alternateSentences must stay parallel
            # to alternateHTMLs.
            self.alternateSentences.append(strings)
        else:
            self.alternateSentences.extend(strings)

    def addHTMLStrings(self, htmls):
        '''
            Appends one HTML string (str) or several (iterable of str) to
            alternateHTMLs, preserving order.
        '''
        if isinstance(htmls, str):
            self.alternateHTMLs.append(htmls)
        else:
            self.alternateHTMLs.extend(htmls)

    def addFinalSentences(self, sentences):
        '''
            Appends one sentence (str) or several (iterable of str) to
            finalShiftSentences. Existing entries are kept.
        '''
        if isinstance(sentences, str):
            self.finalShiftSentences.append(sentences)
        else:
            self.finalShiftSentences.extend(sentences)
            

## Utility Code

In [112]:
def cstr(s, color='black'):
    '''
        Wraps `s` in a <text> tag whose inline style sets the given colour
        and returns the resulting HTML snippet as a string.
    '''
    template = "<text style=color:{}>{}</text>"
    return template.format(color, s)

def cleanAndTokenizeText(text):
    '''
        Lower-cases `text`, drops every character listed in the global
        `punctuation` string and returns the word_tokenize() result.
    '''
    lowered = text.lower()
    stripped = ''.join(ch for ch in lowered if ch not in punctuation)
    return word_tokenize(stripped)

def getPOSTags(tweet):
    '''
        Joins the tokens in `tweet` with single spaces, runs the spaCy
        pipeline `nlp` over the result and returns a list of
        (token text, coarse POS tag) tuples.
    '''
    # NOTE(review): spaCy tokenizes the joined string itself, so the returned
    # list may not line up one-to-one with the input tokens -- confirm callers
    # that index into the result by input position.
    tagged = []
    for token in nlp(' '.join(tweet)):
        # (text, tag) order is what the callers expect.
        tagged.append((token.text, token.pos_))
    return tagged
    

def getAntonymsOfWords(word):
    '''
        Returns a list of WordNet antonym names for `word`.

        Words missing from the global `vocabulary` yield an empty list.
        For every lemma of every synset only the first antonym is kept, and
        duplicates are collapsed through a set.
    '''
    if word not in vocabulary:
        return []

    found = set()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            opposites = lemma.antonyms()
            if opposites == []:
                continue
            found.add(opposites[0].name())

    if len(found) == 0 and VERBOSE_PRINTING:
        print("No antonyms found for word {0}".format(word))
    return list(found)

def listReplacements(word):
    '''
        Builds the candidate replacement list for `word`: the
        NUMBER_OF_ALTERNATIVES most similar words from `word_vectors`,
        followed by any antonyms returned by getAntonymsOfWords().

        Returns an empty list (after printing a notice) when `word` is not
        in the global `vocabulary`.
    '''
    if word not in vocabulary:
        print(" --- {0} not in vocabulary ---".format(word))
        return []

    neighbours = word_vectors.most_similar(word, topn=NUMBER_OF_ALTERNATIVES)
    # Each neighbour is indexable; element 0 is the word itself.
    candidates = [neighbour[0] for neighbour in neighbours]
    if candidates == []:
        print(" --- No replacements for word {0} ---".format(word))

    opposites = getAntonymsOfWords(word)
    if opposites != []:
        candidates.extend(opposites)
        if VERBOSE_PRINTING:
            print("Some antonyms for word {0} are {1}".format(word, opposites[:3]))
    return candidates
    
def posApprovedReplacements(alternativeWords, userTokens, indexOfToken):
    '''
        Filters `alternativeWords` down to those that, when substituted at
        position `indexOfToken` of `userTokens`, receive the same POS tag
        (from getPOSTags) as the original word at that position.

        `userTokens` itself is not modified; substitutions are made on a copy.
        Returns the accepted words in their original order.
    '''
    if alternativeWords == []:
        return []

    trialTokens = userTokens[:]
    originalTag = getPOSTags(trialTokens)[indexOfToken][1]
    originalWord = userTokens[indexOfToken]

    accepted = []
    for candidate in alternativeWords:
        # Swap the candidate in and re-tag the whole sentence.
        trialTokens[indexOfToken] = candidate
        candidateTag = getPOSTags(trialTokens)[indexOfToken][1]
        if str(candidateTag) != str(originalTag):
            continue
        if VERBOSE_PRINTING:
            print("Word {0}[{1}] replaced with {2}[{3}]".format(originalWord, originalTag, candidate, candidateTag))
        accepted.append(candidate)
    return accepted
        
    
    
def getAlternativeSentences(sentenceObj):
    '''
        Generates alternative versions of sentenceObj.ogSentence by replacing,
        one at a time, every token whose own compound sentiment score is
        non-zero with each of its POS-approved replacements.

        For every such token the approved words are recorded through
        addAlternativesByIndex(), and for every replacement both a plain
        sentence and an HTML version (replacement shown in blue italics inside
        brackets) are stored on `sentenceObj`.

        Returns the same `sentenceObj`.
    '''
    tokens = cleanAndTokenizeText(sentenceObj.ogSentence)

    for position, token in enumerate(tokens):
        plainVariants = []
        htmlVariants = []

        if senty.polarity_scores(token)['compound'] != 0.0:
            # Embedding neighbours plus antonyms.
            candidates = listReplacements(token)
            if candidates == []:
                print("No replacements found at all for word {0}".format(token))
                continue

            approved = posApprovedReplacements(candidates[:], tokens[:], position)
            if approved == []:
                print(" -- No POS approved words! -- for word {0}\n some non-POS:{1}".format(token, candidates[:4]))
                continue
            sentenceObj.addAlternativesByIndex(position, approved)

            # Everything around the token stays fixed; only the slot changes.
            before = tokens[:position]
            after = tokens[position + 1:]
            for substitute in approved:
                highlighted = cstr(" <i>[{0}]</i>".format(substitute), "blue")
                plainVariants.append(' '.join(before + [substitute] + after))
                htmlVariants.append(' '.join(before + [highlighted] + after))

        sentenceObj.addAlternativeStrings(plainVariants)
        sentenceObj.addHTMLStrings(htmlVariants)
    return sentenceObj



## Testing Code


In [90]:
# Sample word kept around for ad-hoc experiments with testOneWord().
specificWord = "good"
def testOneWord(word=""):
    '''
        Prints `word` unless it is the empty string. Always returns None.
    '''
    if word != "":
        print(word)


In [113]:
def specificString(textString=""):
    '''
        Runs the whole pipeline on a single sentence: scores it, generates
        alternative sentences, then selects and displays first the more
        positive and then the more negative alternatives.

        Does nothing for an empty or None input, and stops early (with a
        printed notice) when the sentence has no sentiment or no alternatives
        could be generated.

        Relies on shiftSentiment() and printStrings(), which are defined in
        the "Main Cell" section further down this notebook; that cell must
        have been executed before this function is called.
    '''
    if textString is None or textString == "":
        return
    mainSentiment = senty.polarity_scores(textString)['compound']
    if mainSentiment == 0:
        print("No sentiment found in sentence")
        return
    print("\n {0}:{1}\n".format(textString, mainSentiment))
    sentenceObj = Sentence(textString, mainSentiment)
    sentenceObj = getAlternativeSentences(sentenceObj)

    if not sentenceObj.alternateSentences:
        print(" -- No new Strings generated ---\n\n")
        return
    # NOTE(review): shiftSentiment() appends to finalShiftSentences, so the
    # second printStrings() call also re-displays the sentences selected by
    # the first pass -- confirm that this is intended.
    sentenceObj = shiftSentiment(sentenceObj, True)
    printStrings(sentenceObj)
    sentenceObj = shiftSentiment(sentenceObj, False)
    printStrings(sentenceObj)
    
specificString("You are such a horrible person for looking at this poster")
specificString("You are such a good and nice person for looking at this poster!")
specificString("Thus, lack of relevant papers compromises the efforts and results of this review.")


 You are such a horrible person for looking at this poster:-0.5423




 You are such a good and nice person for looking at this poster!:0.7177




 Thus, lack of relevant papers compromises the efforts and results of this review.:-0.3182



No tweets found when trying to do Positive=True




## Main Cell


In [110]:
def shiftSentiment(sentenceObj, positive=True):
    '''
        Selects, from sentenceObj.alternateSentences, the sentences whose
        compound sentiment moved in the requested direction relative to
        sentenceObj.ogSentiment, and appends them to the object's final
        sentences through addFinalSentences().

        positive == True  -> keep sentences scoring strictly higher
        positive == False -> keep sentences scoring strictly lower
        Sentences scoring exactly the original sentiment are never kept.

        Prints a notice when nothing qualifies. Returns the same `sentenceObj`.
    '''
    mainSentiment = sentenceObj.ogSentiment
    correctTweets = []

    for tweet in sentenceObj.alternateSentences:
        newSenty = senty.polarity_scores(tweet)['compound']

        # Equality comparisons (not truthiness) are kept on purpose so that
        # values other than True/False keep selecting nothing, as before.
        if positive == True:
            wanted = newSenty > mainSentiment
        elif positive == False:
            wanted = newSenty < mainSentiment
        else:
            wanted = False

        if wanted:
            correctTweets.append(tweet)

    if correctTweets == []:
        print("\n\nNo tweets found when trying to do Positive={}\n\n".format(positive))

    sentenceObj.addFinalSentences(correctTweets)
    return sentenceObj
        
def printStrings(sentenceObj):
    '''
        Displays every sentence in sentenceObj.finalShiftSentences as coloured
        HTML together with its compound sentiment score:

            DarkGray -- score equals the original sentiment, or is 0
            green    -- score is higher than the original sentiment
            red      -- score is lower than the original sentiment

        The HTML shown for a sentence is the entry of alternateHTMLs that sits
        at the same position as that sentence in alternateSentences; if no
        such entry exists the plain sentence is shown instead.
    '''
    mainSentiment = sentenceObj.ogSentiment

    # alternateHTMLs is parallel to alternateSentences, NOT to
    # finalShiftSentences (a filtered subset), so look the HTML up by sentence
    # text instead of by position in the filtered list.
    htmlBySentence = {}
    for plain, html in zip(sentenceObj.alternateSentences, sentenceObj.alternateHTMLs):
        htmlBySentence.setdefault(plain, html)

    for alteredTweet in sentenceObj.finalShiftSentences:
        sentimentOfNewString = senty.polarity_scores(alteredTweet)['compound']
        htmlText = htmlBySentence.get(alteredTweet, alteredTweet)

        if sentimentOfNewString == mainSentiment or sentimentOfNewString == 0:
            colour = 'DarkGray'
        elif sentimentOfNewString > mainSentiment:
            colour = 'green'
        else:
            colour = 'red'
        display(HTML(cstr("{0}: {1}".format(htmlText, sentimentOfNewString), colour)))

In [114]:
def runThroughTweets():
    '''
        Reads a window of rows from cleanedTweets.csv and, for every tweet
        with a non-zero compound sentiment, generates alternative sentences,
        selects the more negative ones and displays them.

        Only the negative shift (positive=False) is run here.
    '''
    csvPath = pathToDatasets + 'cleanedTweets.csv'
    # NOTE(review): with an integer `skiprows` and default header handling,
    # pandas is expected to treat the first non-skipped line as the header
    # row, so that row would not be processed as a tweet -- confirm intended.
    rows = pd.read_csv(csvPath, nrows=NUM_OF_TWEETS, skiprows=TWEET_START).values

    for row in rows:
        text = row[0]  # first column holds the tweet text
        sentiment = senty.polarity_scores(text)['compound']
        if sentiment == 0:
            continue
        print("\n {0}:{1}\n".format(text, sentiment))

        sentenceObj = getAlternativeSentences(Sentence(text, sentiment))
        if len(sentenceObj.alternateSentences) == 0:
            print(" -- No new Strings generated ---\n\n")
            continue

        printStrings(shiftSentiment(sentenceObj, False))
    
runThroughTweets()


 broadband plan a massive broken promise  via  still waiting for broadband we are :-0.2023




  wow tons of replies from you may have to unfollow so i can see my friends tweets you are scrolling the feed a lot. :0.7845




 put vacation photos online a few yrs ago. pc crashed and now i forget the name of the site. :-0.2263



No tweets found when trying to do Positive=False



 i need a hug :0.4767




  not sure what they are only that they are pos! as much as i want to i dont think can trade away company assets sorry andy! :-0.2134




  i hate when that happens... :-0.5719




 i have a sad feeling that dallas is not going to show up  i gotta say though you would think more shows would use music from the game. mmm:-0.3818




 where did u move to  i thought u were already in sd.  hmmm. random u found me. glad to hear yer doing well.:0.6249

