## Import Statements

In [1]:
import gensim.downloader as api
import numpy as np
import re
import csv
import pandas as pd
import pprint
import string
import nltk
import sys
import spacy

from IPython.display import HTML
from nltk.corpus import wordnet 
from sklearn.manifold import TSNE
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize

pathToDatasets = '../datasets/'
pathToDataScripts = '../datasets/scripts/'
filePath = '../datasets/GoogleNews-vectors-negative300.bin'

sys.path.insert(0, pathToDataScripts)
from cleanDataset import tokenize_words 


from IPython.core.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))

Imported clean dataset!


## Downloading binaries and models


In [2]:
print("Should I reload the model?")
tstString = input()
if("no" in tstString.lower()):
    print(" didnt reload model! ")
else:
    print("loading the model!");
    word_vectors = api.load("glove-wiki-gigaword-100")
    nltk.download('vader_lexicon')
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')
    nltk.download('wordnet')

Should I reload the model?
yes
loading the model!


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Global Variables and Global Objects

In [3]:
senty = SentimentIntensityAnalyzer()
vocabulary = word_vectors.vocab;

nlp = spacy.load('en')


NUMBER_OF_ALTERNATIVES = 5
TWEET_START = 104
NUM_OF_TWEETS = 5


# VERBOSE_PRINTING = True
VERBOSE_PRINTING = False

USE_SPACY = False
# USE_SPACY = True

COLOR_PRINTING = True
#COLOR_PRINTING = False



punctuation = r"\"#$%&'+-/;<=>?@[\]*.^_`{|}~"

## Class for Sentences


In [4]:
from SentenceClass import *

## Utility Code

In [5]:

def printStrings(sentenceObj):
    newStrings = sentenceObj.finalShiftSentences;
    mainSentiment = sentenceObj.ogSentiment;
    for ind, tSentence in enumerate(newStrings):
            alteredTweet = tSentence.getSentence()
            htmlText = tSentence.getHTML()
            sentimentOfNewString = senty.polarity_scores(alteredTweet)['compound']
            if(sentimentOfNewString == mainSentiment or sentimentOfNewString == 0):
                displayText("{0}: {1}".format(htmlText,sentimentOfNewString), 'DarkGray');
            elif(sentimentOfNewString > mainSentiment):
                displayText("{0}: {1}".format(htmlText,sentimentOfNewString),'green')
            else:
                displayText("{0}: {1}".format(htmlText,sentimentOfNewString),'red')

In [6]:
def cstr(s, color='black', italics=False):
    if(COLOR_PRINTING):
        if(italics):
            return cstr("<i>{0}</i>".format(s), color);
        return "<text style=color:{}>{}</text>".format(color, s)
    else:
        return "{}".format(s)

def displayText(text, color='black'):
    if(COLOR_PRINTING):
        display(HTML(cstr(text, color)));
        return
    print("{}".format(text));
    
    
def cleanAndTokenizeText(text):
    text = text.lower();
    newString = ""
    for char in text:
        if char not in punctuation:
            newString += char
    text = word_tokenize(newString)
    return text;

def getPOSTags(tweet):
    if(USE_SPACY == False):
        tags = nltk.pos_tag(tweet)
        return tags;    
    tweet = ' '.join(tweet)
    doc = nlp(tweet)
    tags = [(token.text, token.pos_) for token in doc] # since the format expected is [text,tag]
    return tags;
    

def getAntonymsOfWords(word):
    if(word not in vocabulary):
        return []
    setOfAntonyms = set()
    for syn in wordnet.synsets(word):
        for l in syn.lemmas():
            anton = l.antonyms()
            if(anton!=[]):
                setOfAntonyms.add(anton[0].name())
    if(len(setOfAntonyms) == 0):
        if(VERBOSE_PRINTING): print("No antonyms found for word {0}".format(word))
    return list(setOfAntonyms)

def listReplacements(word):
    if(word not in vocabulary):
        print(" --- {0} not in vocabulary ---".format(word))
        return []
    possibleReplacements = [word[0] for word in word_vectors.most_similar(word,topn=NUMBER_OF_ALTERNATIVES)]
    if(possibleReplacements == []):
        print(" --- No replacements for word {0} ---".format(word))
    antonyms = getAntonymsOfWords(word)
    if(antonyms != []):
        possibleReplacements.extend(antonyms)
        if(VERBOSE_PRINTING): print("Some antonyms for word {0} are {1}".format(word, antonyms[:3]))
        return possibleReplacements
    return possibleReplacements
    
def posApprovedReplacements(alternativeWords, userTokens, indexOfToken):
    if(alternativeWords == []):
        return []
    tempTokens = userTokens[:]
    POSTokens = getPOSTags(tempTokens)
    validWords = []
    
    mainTag = POSTokens[indexOfToken][1]
    mainWord = userTokens[indexOfToken]
    
    for ind,word in enumerate(alternativeWords):
        tempTokens[indexOfToken] = word
        posTags = getPOSTags(tempTokens)
        newTag = (posTags[indexOfToken])[1]
        
        if(str(newTag) == str(mainTag)):
            if(VERBOSE_PRINTING): print("Word {0}[{1}] replaced with {2}[{3}]".format(mainWord, mainTag, word,newTag))
            validWords.append(word)
    if(validWords == [] and VERBOSE_PRINTING):
        print("No POS words found for word {} with tag {}".format(mainWord, mainTag));
    return validWords
        
    
    
def getAlternativeSentences(sentenceObj):
    mainSentence = sentenceObj.ogSentence;
    mainSentiment = sentenceObj.ogSentiment;

    sentenceTokens = cleanAndTokenizeText(mainSentence)

    for ind, word in enumerate(sentenceTokens):
        alternativeSentenceWithHTML = []

        score = senty.polarity_scores(word)['compound']
        copyOfTokens = sentenceTokens[:]
        replacements = []
        if(score != 0.0):
            tempReplacements = listReplacements(word) # get embedding based relations
            if(tempReplacements == []):
                print("No replacements found at all for word {0}".format(word))
                continue
            replacements = posApprovedReplacements(tempReplacements[:], copyOfTokens[:], ind)
            if(replacements == []):
                print(" -- No POS approved words! -- for word {0}\n some non-POS:{1}".format(word, tempReplacements[:4]))
                continue
            sentenceObj.addAlternativesByIndex(ind, replacements)

            ## Generate new sentences by switching that word
            for newWord in replacements:
                htmlFriendlyTokens = copyOfTokens[:]
                copyOfTokens[ind] = newWord
                htmlFriendlyTokens[ind] = cstr("[{0}]".format(newWord), "blue", italics=True);
                newString = ' '.join(copyOfTokens)
                tSentence = SentenceWithHTML(newString, ' '.join(htmlFriendlyTokens))
                alternativeSentenceWithHTML.append(tSentence)
        sentenceObj.addAlternativeStrings(alternativeSentenceWithHTML)
    return sentenceObj

def shiftSentiment(sentenceObj, positive=True):
    
    actualTweet = sentenceObj.ogSentence;
    mainSentiment = sentenceObj.ogSentiment;
    alternateTweets = sentenceObj.alternateSentences;
    
    
    happiestTweet = ""
    saddestTweet = ""
    happiestScore = -sys.maxsize - 1
    saddestScore = sys.maxsize
    correctTweets = []
    
    for tSentence in alternateTweets:
        tweet = tSentence.getSentence();
        newSenty = senty.polarity_scores(tweet)['compound']
        
        if(newSenty < saddestScore):
            saddestTweet = tSentence
            saddestScore = newSenty
        if(newSenty > happiestScore):
            happiestTweet = tSentence
            happiestScore = newSenty
            
        if(newSenty == mainSentiment):
            continue
        if(positive == True):
            if(newSenty > mainSentiment):
                correctTweets.append(tSentence)
                continue
            
            elif(newSenty < mainSentiment):
                continue
                # Grab happiest tweet and if it's not "", then generate more happy tweets from it
        if(positive == False):
            if(newSenty < mainSentiment):
                correctTweets.append(tSentence)
                continue
            
            elif(newSenty > mainSentiment):
                continue
                # grab happiest tweet, and if it's not "", then generate more happy tweets from it
                
    if(correctTweets == []):
        print("\n\nNo tweets found when trying to do Positive={}\n\n".format(positive))
    sentenceObj.resetFinalSentences();
    sentenceObj.addFinalSentences(correctTweets);
    return sentenceObj;


def helper_Combine(mainList, myList):
    newList = []
    for val in myList:
        for mainVal in mainList:
            if(VERBOSE_PRINTING): print("Combining {0} with {1}".format(' '.join(val), ' '.join(mainVal)));
            newList.append(val + mainVal);
    return newList;


def combineSentenceChunks(wholeSentence, dictOfChunks):
    reversedKeys = sorted(dictOfChunks.keys(), reverse=True)
    completeSentences = [];
    mainArr = dictOfChunks[reversedKeys[0]]
    for ind in range(1, len(reversedKeys)):
        key = reversedKeys[ind];
        mainArr = helper_Combine(mainArr, dictOfChunks[key]);
        
    return mainArr;
    
        
def generateSentenceChunks(wholeSentence, keyToChange, nextKey, listOfMyAlternatives):
    newList = list(listOfMyAlternatives)
    newList.append(wholeSentence[keyToChange]);
    generatedSentences = []
    generatedSentenceObjects = []
    for myAlt in newList:
        newSentence = wholeSentence[:]
        newSentence[keyToChange] = myAlt
        htmlSentence = newSentence[:]
        htmlSentence[keyToChange] = cstr("[{0}]".format(newWord), "blue", italics=True);
        if(VERBOSE_PRINTING): print("Generated : {}".format(newSentence[keyToChange:nextKey]))
        generatedSentences.append(newSentence[keyToChange:nextKey]);
        generatedSentenceObjects.append(SentenceWithHTML(' '.join(htmlSentence),' '.join(newSentence[keyToChange:nextKey])))
        
        
    return generatedSentences, generatedSentenceObjects
    
    
def printAllPossibleStrings(sentenceObj):
    indexToWordDict = sentenceObj.indexToSetOfWords;
    originalSentence = sentenceObj.ogSentence;
    tokenizedSentence = cleanAndTokenizeText(originalSentence)
    reversedKeys = sorted(indexToWordDict.keys(), reverse=True)
    dictAlternatives  = {}

    keys = sorted(indexToWordDict.keys())
    sentenceChunks = {}
    htmlChunks = {}
    print("Keys : {0}".format(keys))
    
    for ind in range(0,len(keys)):
        key = keys[ind]
        nextKey = keys[ind+1] if ind+1 < len(keys) else len(tokenizedSentence)
        sentenceChunks[key], htmlChunks[key] = generateSentenceChunks(tokenizedSentence, key, nextKey, indexToWordDict[key])
    
    if(keys[0] != 0):
        newList = []
        for thislist in sentenceChunks[keys[0]]:
            newList.append(tokenizedSentence[0:keys[0]] + thislist)
        sentenceChunks[keys[0]] = newList;
        
    print("Sentence Chunks generated!")
        
        
    finalOptions = combineSentenceChunks(tokenizedSentence, sentenceChunks)
    
    
    for val in finalOptions:
        sentence = ' '.join(val)
        score = senty.polarity_scores(sentence)['compound']
        if(score == 0.0):
            continue
        print("{} : {}".format(sentence, score))
        
#     for val in totalList:
#         print(' '.join(val))


In [7]:
specificString = ""
def specificString(textString=""):
    if(textString == "" or textString == None):
        return
    mainSentiment = senty.polarity_scores(textString)['compound']
    if(mainSentiment == 0):
        print("{} \n No sentiment found in sentence".format(textString));
        return;
    print("\n {0}:{1}\n".format(textString,mainSentiment))   
    sentenceObj = Sentence(textString, mainSentiment)
    sentenceObj = getAlternativeSentences(sentenceObj)
    alternateTweets = (sentenceObj.alternateSentences)[:]
    if(alternateTweets == [] or alternateTweets == None):
        print(" -- No new Strings generated ---\n\n")
        return
    printStrings(sentenceObj)
    printAllPossibleStrings(sentenceObj)

# specificString("I really hate hot chocolate, but i enjoy hot coffee, and I'm serious")
specificString("Keep calm and carry on debugging till it works");
# specificString("would rather the first party send bad messages than the 3rd party send mixed ones  sophmore year all over again")


 Keep calm and carry on debugging till it works:0.3182

Keys : [1]


NameError: name 'newWord' is not defined

In [None]:
def runThroughTweets():
    
    tweets_df = pd.read_csv( pathToDatasets + 'cleanedTweets.csv' , nrows=NUM_OF_TWEETS, skiprows=TWEET_START)

    tweets = tweets_df.values

    listOfObjects = []
    for counter,tweet in enumerate(tweets):
        tweet = tweet[0]
        mainSentiment = senty.polarity_scores(tweet)['compound']
        if(mainSentiment == 0):
            continue
        print("\n {0}:{1}\n".format(tweet,mainSentiment))   
        sentenceObj = Sentence(tweet, mainSentiment)
        sentenceObj = getAlternativeSentences(sentenceObj)
        alternateTweets = (sentenceObj.alternateSentences)[:]
        if(alternateTweets == [] or alternateTweets == None):
            print(" -- No new Strings generated ---\n\n")
            continue
        sentenceObj = shiftSentiment(sentenceObj, True);
        printStrings(sentenceObj)
        sentenceObj = shiftSentiment(sentenceObj, False);
        printStrings(sentenceObj)
        printAllPossibleStrings(sentenceObj)
    
runThroughTweets()

## Testing Code


In [None]:
specificWord = "good"
def testOneWord(word=""):
    if(word==""):
        return
    print(word)

## Main Cell
