In [1]:
import gensim
from sklearn.manifold import TSNE
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import gensim.downloader as api
import numpy as np
import re
import csv

import pandas as pd
import pprint

import string
import nltk


from IPython.display import HTML


In [2]:
pathToDatasets = '../datasets/'
filePath = '../datasets/GoogleNews-vectors-negative300.bin'
word_vectors = api.load("glove-wiki-gigaword-100")
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [10]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize



senty = SentimentIntensityAnalyzer()
vocabulary = word_vectors.vocab;


NUMBER_OF_ALTERNATIVES = 5
TWEET_START = 0
NUM_OF_TWEETS = 10

In [4]:
def cstr(s, color='black'):
    return "<text style=color:{}>{}</text>".format(color, s)

## Utility Code

In [5]:
def cleanAndTokenizeText(text):
    text = text.lower()
    newString = ""
    for char in text:
        if char not in string.punctuation:
            newString += char
    text = word_tokenize(newString)
    return text;

def listReplacements(word):
    if(word not in vocabulary):
        return []
    possibleReplacements = [word[0] for word in word_vectors.most_similar(word,topn=NUMBER_OF_ALTERNATIVES)]
    return possibleReplacements

def getPOSTags(tweet):
    tags = nltk.pos_tag(tweet)
    return(tags)

## NLP Utility Code

In [6]:
def listReplacements(word):
    if(word not in vocabulary):
        return []
    possibleReplacements = [word[0] for word in word_vectors.most_similar(word,topn=NUMBER_OF_ALTERNATIVES)]
    return possibleReplacements


def posApprovedReplacements(alternativeWords, userTokens, indexOfToken):
    if(alternativeWords == []):
        return []
    tempTokens = userTokens[:]
    truePOSTokens = getPOSTags(tempTokens)
    validWords = []
    
    mainTag = truePOSTokens[indexOfToken][1]
    mainWord = userTokens[indexOfToken]
    
    for ind,word in enumerate(alternativeWords):
        tempTokens[indexOfToken] = word
        posTags = getPOSTags(tempTokens)
        newTag = (posTags[indexOfToken])[1]

        if(str(newTag) == str(mainTag)):
            print("Word {0}[{1}] replaced with {2}[{3}]".format(mainWord, mainTag, word,newTag))
            validWords.append(word)
    return validWords
        

In [7]:
def getAlternativeSentences(tweet, sentimentOfTweet):
    userInputTokens = cleanAndTokenizeText(tweet)
    
   
    alternativeStrings = []
    for ind,word in enumerate(userInputTokens):
        
        score = senty.polarity_scores(word)['compound'] # get the aggregated score!
        newUserTokens = userInputTokens[:]
        
        if(score != 0.0):
            replacements = listReplacements(word)       
            replacements = posApprovedReplacements(replacements[:], newUserTokens[:], ind)
            if(replacements == []):
                continue          
            print("Word changed: {0}".format(word));
            for newWord in replacements:
                
                newUserTokens[ind] = cstr(" <i>[{0}]</i> ".format(newWord), 'blue');
                newString = ' '.join(newUserTokens)
                sentimentOfNewString = senty.polarity_scores(newString)['compound']
             
            
                if(sentimentOfNewString == 0):
                    continue
                
                alternativeStrings.append(newString)
    return alternativeStrings;

## Main Cells

In [18]:
def runThroughTweets():

    tweets_df = pd.read_csv( pathToDatasets + 'cleanedTweets.csv' , nrows=NUM_OF_TWEETS, skiprows=TWEET_START)

    tweets = tweets_df.values

    
    for counter,tweet in enumerate(tweets):
        tweet = tweet[0]
        mainSentiment = senty.polarity_scores(tweet)['compound']
        if(mainSentiment == 0):
            continue
       
        newStrings = getAlternativeSentences(tweet, mainSentiment)
        if(newStrings == [] or newStrings == None):
            continue
        print("\nNUM({2}) {0}:{1}\n".format(tweet,mainSentiment, counter))
        for alteredTweet in newStrings:
            sentimentOfNewString = senty.polarity_scores(alteredTweet)['compound']
            if((sentimentOfNewString) >= (mainSentiment)):
                display(HTML(cstr("{0}: {1}".format(alteredTweet,sentimentOfNewString),'green')))
            else:
                display(HTML(cstr("{0}: {1}".format(alteredTweet,sentimentOfNewString),'red')))
    
    
runThroughTweets()

  - awww that's a bummer.  you shoulda got david carr of third day to do it. ;d
Word bummer[NN] replaced with copout[NN]
Word bummer[NN] replaced with crapshoot[NN]
Word bummer[NN] replaced with stinker[NN]
Word bummer[NN] replaced with misnomer[NN]
Word bummer[NN] replaced with hogwash[NN]
Word changed: bummer
is upset that he can't update his facebook by texting it... and might cry as a result  school today also. blah!
Word upset[JJ] replaced with surprised[JJ]
Word changed: upset
Word cry[VB] replaced with shout[VB]
Word cry[VB] replaced with scream[VB]
Word changed: cry
Word blah[VBD] replaced with moo[VBD]
Word blah[VBD] replaced with goy[VBD]
Word blah[VBD] replaced with huh[VBD]
Word blah[VBD] replaced with doo[VBD]
Word changed: blah

NUM(1) is upset that he can't update his facebook by texting it... and might cry as a result  school today also. blah!:-0.75



 i dived many times for the ball. managed to save 50%  the rest go out of bounds
Word save[VB] replaced with saved[VB]
Word save[VB] replaced with help[VB]
Word save[VB] replaced with put[VB]
Word changed: save
my whole body feels itchy and like its on fire 
Word itchy[VBP] replaced with scratchy[VBP]
Word itchy[VBP] replaced with droopy[VBP]
Word changed: itchy
Word fire[NN] replaced with firing[NN]
Word fire[NN] replaced with attack[NN]
Word fire[NN] replaced with explosion[NN]
Word changed: fire

NUM(3) my whole body feels itchy and like its on fire :-0.25



 no it's not behaving at all. i'm mad. why am i here? because i can't see you all over there. 
Word no[DT] replaced with any[DT]
Word changed: no
Word mad[JJ] replaced with crazy[JJ]
Word changed: mad

NUM(4)  no it's not behaving at all. i'm mad. why am i here? because i can't see you all over there. :-0.6597



 not the whole crew 
need a hug 
Word hug[NN] replaced with hugs[NN]
Word hug[NN] replaced with kiss[NN]
Word hug[NN] replaced with hugged[NN]
Word hug[NN] replaced with goodbye[NN]
Word changed: hug
 hey  long time no see! yes.. rains a bit only a bit  lol  i'm fine thanks  how's you ?
Word no[DT] replaced with any[DT]
Word changed: no
Word yes[RB] replaced with yeah[RB]
Word yes[RB] replaced with i[RB]
Word changed: yes
Word lol[JJ] replaced with coxhill[JJ]
Word lol[JJ] replaced with bellowed[JJ]
Word lol[JJ] replaced with kaboom[JJ]
Word lol[JJ] replaced with shouts[JJ]
Word changed: lol
Word fine[JJ] replaced with enough[JJ]
Word fine[JJ] replaced with good[JJ]
Word changed: fine

NUM(7)  hey  long time no see! yes.. rains a bit only a bit  lol  i'm fine thanks  how's you ?:0.68



 nope they didn't have it 
 que me muera ? 
