### Data Import and Package Setup

In [16]:
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import bigrams, trigrams
import json
from pprint import pprint
import os

### Initialization

In [17]:
# load json data
dir_path = os.path.dirname(os.path.realpath(""))
with open(dir_path + '/eecs337-w2019-group19/data/gg2013.json') as f:
    data = json.load(f)

stopWords = set(stopwords.words('english'))
twitterwords = {"http", "rt", "goldenglobes", "golden", "globes", "RT", "Golden", "Globes", "GoldenGlobes"}
stopWords = stopWords.union(twitterwords)
keyword = ['present', 'presents', 'presented', 'presenting', 'presenter', 'presenters', 
           'introduce', 'introducing', 'introduces', 'introduce']

### Combined tweets

In [18]:
def contains(a, b):
    a_set = set(a)
    b_set = set(b)
    if len(a_set.intersection(b_set)) > 0: 
        return True
    else:
        return False   

def tweetTokenContain(dataset, keywords, stopwords):
    tweetnumber = []
    tweetlist = []
    i = 0

    for i in range(len(dataset) - 1):
        tweettextToken = []
        words = word_tokenize(dataset[i]['text'])
        words_clean = [token for token in words if token not in stopwords and token.isalpha()]
        
        if contains(words_clean, keywords) == True:
            tweetnumber.append(i)
            tweetlist.append(words_clean)
        
        i += 1
        
    return([tweetnumber, tweetlist])

In [19]:
tweet_present_tokenized = tweetTokenContain(data, keyword, stopWords)
tct = tweet_present_tokenized[0]
tweet_present_tokenized_list = tweet_present_tokenized[1]

In [20]:
def tweetPrint(dataset, tweetnumber):
    return [dataset[i]['text'] for i in tweetnumber]

tweet_present = tweetPrint(data, tct)
len(tweet_present)

1556

In [21]:
def bigramsDict(tweetlist):

    common_bigrams = {}
    
    for sentence in tweetlist:
        bigrm_in_tweet = list(bigrams(sentence))
        for bg in bigrm_in_tweet:
            if bg in common_bigrams.keys():
                common_bigrams[bg] += 1
            else:
                common_bigrams[bg] = 1
    
    d_view = [ (v,k) for k,v in common_bigrams.items() ]
    return d_view

common_bigrams_dict = bigramsDict(tweet_present_tokenized_list)

### Show results

In [22]:
def printDictbyCount(dict_view, maximum = 30):
    d_view.sort(reverse=True) # natively sort tuples by first element
    counter = 0
    
    for v,k in d_view:
        pprint("%s: %d" % (k,v))
        counter+=1

        if counter > maximum:
            break

printDictbyCount(common_bigrams_dict)

"('Bill', 'Clinton'): 269"
"('present', 'award'): 122"
"('saw', 'present'): 97"
"('KevalBaxi', 'saw'): 97"
"('Alyssaeinman', 'KevalBaxi'): 97"
"('introduces', 'Lincoln'): 59"
"('Clinton', 'introduces'): 59"
"('Clinton', 'introduce'): 57"
"('introducing', 'Lincoln'): 50"
"('Kristen', 'Wiig'): 48"
"('Robert', 'Pattinson'): 45"
"('Mitt', 'Romney'): 45"
"('Will', 'Ferrell'): 44"
"('Romney', 'introduce'): 44"
"('Les', 'Miserables'): 44"
"('Clinton', 'introducing'): 44"
"('President', 'Bill'): 43"
"('win', 'Oscar'): 42"
"('making', 'impossible'): 42"
"('impossible', 'win'): 42"
"('Now', 'Mitt'): 42"
"('Lincoln', 'making'): 42"
"('LOLGOP', 'Now'): 42"
"('introduce', 'Les'): 41"
"('President', 'Clinton'): 41"
"('Jodie', 'Foster'): 41"
"('politicoroger', 'Bill'): 40"
"('Clinton', 'presenting'): 39"
"('standing', 'ovation'): 37"
"('introduce', 'Lincoln'): 35"
"('award', 'presenting'): 34"


In [23]:
def checkFullNameFormat(inputTuple2):
    if inputTuple2[0][0].isupper() == True and inputTuple2[1][0].isupper() == True:
        return True
    else:
        return False

In [24]:
def extractPseudoNames(input_dict, lowbound = 10):

    pseudo_namelist = []
    
    for wordpair in input_dict:
        if wordpair[0] >= lowbound and checkFullNameFormat(wordpair[1]) == True:
            name_like = wordpair[1][0].lower() + " " + wordpair[1][1].lower()
            pseudo_namelist.append(name_like)
        
    return(pseudo_namelist)

In [25]:
pseudo_namelist = extractPseudoNames(common_bigrams_dict, lowbound = 10)
pseudo_namelist

['george clooney',
 'christian bale',
 'tina fey',
 'amy poehler',
 'bradley cooper',
 'kate hudson',
 'motion picture',
 'maggie smith',
 'cnnshowbiz best',
 'smith downton',
 'downton abbey',
 'abbey she',
 'les miserables',
 'robert pattinson',
 'amy tina',
 'salma hayek',
 'aduralde who',
 'mel gibson',
 'president bill',
 'bill clinton',
 'best picture',
 'huffpostent bill',
 'president clinton',
 'united states',
 'states bill',
 'mitt romney',
 'cnnshowbiz president',
 'will ferrell',
 'kristen wiig',
 'wiig will',
 'will ferrel',
 'lolgop now',
 'now mitt',
 'ferrell kristen',
 'president united',
 'tommy lee',
 'lee jones',
 'wow now',
 'it president',
 'robert downey',
 'downey jr',
 'stallone schwarzenegger',
 'sylvester stallone',
 'arnold schwarzenegger',
 'best foreign',
 'cecil demille',
 'jodie foster',
 'alyssaeinman kevalbaxi',
 'notbillwalton batman',
 'wolverine globe',
 'globe a',
 'julia roberts']

### Main method of understanding sentences

In [26]:
# Method 1: Co-occurrence of names and keywords
def cooccur_tweets(names, keywordlist):
    tweetnumber = []
    i = 0

    for i in range(len(data) - 1):
        tweetsentence = data[i]['text'].lower()
        for name in names:
            tweetsplit = tweetsentence.split(name)
            if len(tweetsplit) > 1:
                for part in tweetsplit:
                    if len(part) > 1:
                        parttoken = word_tokenize(part)
                        if contains(parttoken, keywordlist) == True:
                            tweetnumber.append(i)
        i += 1
    
    return(tweetnumber)

# Method 2: Appearance of keywords immediately or 1 word after names (Subject + Verb structure)
def subjverb_tweets(names, keywordlist):
    tweetnumber = []
    i = 0

    for i in range(len(data) - 1):
        tweetsentence = data[i]['text'].lower()
        for name in names:
            tweetsplit = tweetsentence.split(name)
            if len(tweetsplit) > 1:
                for part in tweetsplit:
                    if len(part) > 1:
                        parttoken = word_tokenize(part)
                        if len(parttoken) > 1:
                            immediateWords = [parttoken[0], parttoken[1]]
                        else:
                            immediateWords = [parttoken[0]]
                        
                        if contains(immediateWords, keywordlist) == True:
                            tweetnumber.append(i)
        i += 1
    
    return(tweetnumber)

In [27]:
#coc = cooccur_tweets(pseudo_namelist, keyword)
subv = subjverb_tweets(pseudo_namelist, keyword)

In [28]:
tweetPrint(data, subv)

['Bradley Cooper and Kate Hudson present the nominees for Best Supporting Actor in a Motion Picture. :-) #GoldenGlobes',
 'Bradley Cooper and Kate Hudson present Christoph Waltz with Best Supporting Actor #GoldenGlobes',
 'Kate Hudson presenting an award, looking AMAZING and knowing it, was just about the sexiest thing ever. #GoldenGlobes',
 "RT @usweekly: when was the last time downton abby's maggie smith was present in Hollywood to accept an award? #goldenglobes",
 "RT @usweekly: when was the last time downton abby's maggie smith was present in Hollywood to accept an award? #goldenglobes",
 "RT @usweekly: when was the last time downton abby's maggie smith was present in Hollywood to accept an award? #goldenglobes",
 'RT @ADuralde: Who needs writers, when you can just have Salma Hayek introduce with, "Something about the best, eh...."? #goldenglobes',
 'RT @ADuralde: Who needs writers, when you can just have Salma Hayek introduce with, "Something about the best, eh...."? #goldenglobes

### Test if presenters look right

In [29]:
presenterData = [data[i] for i in subv]
tweet_presenter_tokenized = tweetTokenContain(presenterData, keyword, stopWords)
presenter_tokenized_list = tweet_presenter_tokenized[1]
pr_common_bigrams = bigramsDict(presenter_tokenized_list)
extractPseudoNames(pr_common_bigrams, lowbound = 10)

['aduralde who',
 'salma hayek',
 'president bill',
 'bill clinton',
 'president clinton',
 'kristen wiig',
 'wiig will',
 'will ferrell',
 'lolgop now',
 'now mitt',
 'mitt romney',
 'les miserables',
 'ferrell kristen',
 'spielberg president',
 'is lincoln',
 'mel gibson',
 'robert pattinson',
 'robert downey',
 'downey jr',
 'jodie foster',
 'christian bale',
 'pics robert']