### Data Import and Package Setup

In [3]:
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import bigrams, trigrams
import json
from pprint import pprint
import os

### Initialization

In [4]:
# load json data
dir_path = os.path.dirname(os.path.realpath(""))
with open(dir_path + '/eecs337-w2019-group19/data/gg2013.json') as f:
    data = json.load(f)

stopWords = set(stopwords.words('english'))
twitterwords = {"http", "rt", "goldenglobes", "golden", "globes", "RT", "Golden", "Globes", "GoldenGlobes"}
stopWords = stopWords.union(twitterwords)
keyword = ['present', 'presenting', 'presented', 'presents', 'presenter', 'presenters', 
           'introduce', 'introducing', 'introduces', 'introduced',
           'announce', 'announcing', 'announces', 'announces']

### Combined tweets

In [5]:
def contains(a, b):
    a_set = set(a)
    b_set = set(b)
    if len(a_set.intersection(b_set)) > 0: 
        return True
    else:
        return False   

def tweetTokenContain(dataset, keywords, stopwords):
    tweetnumber = []
    tweetlist = []
    i = 0

    for i in range(len(dataset) - 1):
        tweettextToken = []
        words = word_tokenize(dataset[i]['text'])
        words_clean = [token for token in words if token not in stopwords and token.isalpha()]
        
        if contains(words_clean, keywords) == True:
            tweetnumber.append(i)
            tweetlist.append(words_clean)
        
        i += 1
        
    return([tweetnumber, tweetlist])

In [6]:
tweet_present_tokenized = tweetTokenContain(data, keyword, stopWords)
tct = tweet_present_tokenized[0]
tweet_present_tokenized_list = tweet_present_tokenized[1]

In [7]:
def tweetPrint(dataset, tweetnumber):
    return [dataset[i]['text'] for i in tweetnumber]

tweet_present = tweetPrint(data, tct)
len(tweet_present)

1759

In [8]:
def bigramsDict(tweetlist):

    common_bigrams = {}
    
    for sentence in tweetlist:
        bigrm_in_tweet = list(bigrams(sentence))
        for bg in bigrm_in_tweet:
            if bg in common_bigrams.keys():
                common_bigrams[bg] += 1
            else:
                common_bigrams[bg] = 1
    
    d_view = [ (v,k) for k,v in common_bigrams.items() ]
    return d_view

common_bigrams_dict = bigramsDict(tweet_present_tokenized_list)

In [9]:
def trigramsDict(tweetlist):

    common_trigrams = {}
    
    for sentence in tweetlist:
        trigrm_in_tweet = list(trigrams(sentence))
        for trg in trigrm_in_tweet:
            if trg in common_trigrams.keys():
                common_trigrams[trg] += 1
            else:
                common_trigrams[trg] = 1
    
    d_view = [ (v,k) for k,v in common_trigrams.items() ]
    return d_view

common_trigrams_dict = trigramsDict(tweet_present_tokenized_list)

### Show results

In [10]:
def printDictbyCount(dict_view, maximum = 30):
    dict_view.sort(reverse=True) # natively sort tuples by first element
    counter = 0
    
    for v,k in dict_view:
        pprint("%s: %d" % (k,v))
        counter+=1

        if counter > maximum:
            break

printDictbyCount(common_bigrams_dict)

"('Bill', 'Clinton'): 297"
"('present', 'award'): 122"
"('saw', 'present'): 97"
"('KevalBaxi', 'saw'): 97"
"('Alyssaeinman', 'KevalBaxi'): 97"
"('Jodie', 'Foster'): 73"
"('introduces', 'Lincoln'): 59"
"('Clinton', 'introduces'): 59"
"('Clinton', 'introduce'): 57"
"('Robert', 'Pattinson'): 53"
"('introducing', 'Lincoln'): 50"
"('Kristen', 'Wiig'): 49"
"('Will', 'Ferrell'): 45"
"('Mitt', 'Romney'): 45"
"('Romney', 'introduce'): 44"
"('President', 'Bill'): 44"
"('Les', 'Miserables'): 44"
"('Clinton', 'introducing'): 44"
"('win', 'Oscar'): 42"
"('making', 'impossible'): 42"
"('impossible', 'win'): 42"
"('Now', 'Mitt'): 42"
"('Lincoln', 'making'): 42"
"('LOLGOP', 'Now'): 42"
"('introduce', 'Les'): 41"
"('President', 'Clinton'): 41"
"('politicoroger', 'Bill'): 40"
"('standing', 'ovation'): 39"
"('Clinton', 'presenting'): 39"
"('Amy', 'Poehler'): 39"
"('introduce', 'Lincoln'): 35"


In [11]:
def checkFullNameFormat(inputTupleN):
    sizeInput = len(inputTupleN)
    if sizeInput == 2:
        if inputTupleN[0][0].isupper() == True and inputTupleN[1][0].isupper() == True:
            return True
    elif sizeInput == 3:
        if inputTupleN[0][0].isupper() == True and inputTupleN[1][0].isupper() == True and inputTupleN[1][0].isupper() == True:
            return True
    else:
        return False

In [18]:
def extractPseudoNames(input_dict, lowbound = 5):

    pseudo_namelist = []
    
    for wordpair in input_dict:
        if wordpair[0] >= lowbound and checkFullNameFormat(wordpair[1]) == True:
            name_like = wordpair[1][0].lower() + " " + wordpair[1][1].lower()
            pseudo_namelist.append(name_like)
        
    return(pseudo_namelist)

def extractPseudoNamesTri(input_dict, lowbound = 5):

    pseudo_namelist = []
    
    for wordpair in input_dict:
        if wordpair[0] >= lowbound and checkFullNameFormat(wordpair[1]) == True:
            name_like = wordpair[1][0].lower() + " " + wordpair[1][1].lower() + " " + wordpair[1][2].lower()
            pseudo_namelist.append(name_like)
        
    return(pseudo_namelist)

In [13]:
def createPossibleTrigrams(bigram_text_list):
    trigramsList = []
    
    for x in bigram_text_list:
        for y in bigram_text_list:
            x_split = x.split(" ")
            #print(x_split)
            y_split = y.split(" ")
            if (x_split[1] == y_split[0]) and (x_split[0] != y_split[1]):
                trigram = x + " " + y_split[1]
                trigramsList.append(trigram)
            elif (x_split[0] == y_split[1]) and (x_split[1] != y_split[0]):
                trigram = y_split[0] + " " + x
                trigramsList.append(trigram)
    
    return set(trigramsList)

In [14]:
pseudo_namelist = extractPseudoNames(common_bigrams_dict, lowbound = 5)
pseudo_namelist

['bill clinton',
 'alyssaeinman kevalbaxi',
 'jodie foster',
 'robert pattinson',
 'kristen wiig',
 'will ferrell',
 'mitt romney',
 'president bill',
 'les miserables',
 'now mitt',
 'lolgop now',
 'president clinton',
 'amy poehler',
 'tina fey',
 'robert downey',
 'best picture',
 'wolverine globe',
 'notbillwalton batman',
 'globe a',
 'christian bale',
 'united states',
 'wiig will',
 'president united',
 'cnnshowbiz president',
 'wow now',
 'poehler tina',
 'it president',
 'states bill',
 'maggie smith',
 'george clooney',
 'downton abbey',
 'smith downton',
 'jeremy renner',
 'abbey she',
 'salma hayek',
 'cnnshowbiz best',
 'vanityfair amy',
 'tommy lee',
 'lee jones',
 'julia roberts',
 'best motion',
 'stallone schwarzenegger',
 'salmon fishing',
 'mel gibson',
 'huffpostent bill',
 'ferrell kristen',
 'motion picture',
 'fishing yemen',
 'best foreign',
 'arnold schwarzenegger',
 'amanda seyfried',
 'yahootv jeremy',
 'gretel as',
 'downey jr',
 'sylvester stallone',
 'quen

In [25]:
ptg = createPossibleTrigrams(pseudo_namelist)

In [30]:
pseudo_namelist2 = extractPseudoNamesTri(common_trigrams_dict, lowbound = 5)
nameset = set(pseudo_namelist2)
nameset.intersection(ptg)

{'achievement award lindsay',
 'added lq video',
 'amy poehler tina',
 'award lindsay lohan',
 'best actress tv',
 'best foreign film',
 'best motion picture',
 'best screenplay djangounchained',
 'bradley cooper kate',
 'christian bale i',
 'cnnshowbiz best supporting',
 'cnnshowbiz president clinton',
 'cooper kate hudson',
 'downton abbey she',
 'drop dead gorg',
 'ferrell kristen wiig',
 'fey amy poehler',
 'george w bush',
 'huffpostent bill clinton',
 'i christian bale',
 'if i christian',
 'it president united',
 'jewamerprincess omg is',
 'joshmalina umm republican',
 'kristen wiig will',
 'kristin wiig will',
 'latimesent robert pattinson',
 'lifetime achievement award',
 'lincolnmovie best motion',
 'lincolnmovie bill clinton',
 'lolgop now mitt',
 'lq video rob',
 'maggie smith downton',
 'motion picture drama',
 'now mitt romney',
 'omg is bill',
 'pagetopremiere robert pattinson',
 'pattinson amanda seyfried',
 'pics robert pattinson',
 'poehler tina fey',
 'president bill

Missing from names: Jennifer Garner, Megan Fox, Jonah Hill, Jimmy Fallon, Kiefer Sutherland

Missing from tweets: Halle Berry, Sacha Baron Cohen, Salma Hayek, Paul Rudd, Don Cheadle, Eva Longoria, Jessica Alba

Confusing names: Jennifer Lawrence vs. JLo

No info: Lucy Liu, Debra Messing, Aziz Ansari, Jason Bateman, Kristen Bell, John Krasinski

### Main method of understanding sentences

In [31]:
# Method 1: Co-occurrence of names and keywords
def cooccur_tweets(names, keywordlist):
    tweetnumber = []
    i = 0

    for i in range(len(data) - 1):
        tweetsentence = data[i]['text'].lower()
        for name in names:
            tweetsplit = tweetsentence.split(name)
            if len(tweetsplit) > 1:
                for part in tweetsplit:
                    if len(part) > 1:
                        parttoken = word_tokenize(part)
                        if contains(parttoken, keywordlist) == True:
                            tweetnumber.append(i)
        i += 1
    
    return(tweetnumber)

# Method 2: Appearance of keywords immediately or 1 word after names (Subject + Verb structure)
def subjverb_tweets(names, keywordlist):
    tweetnumber = []
    i = 0

    for i in range(len(data) - 1):
        tweetsentence = data[i]['text'].lower()
        for name in names:
            tweetsplit = tweetsentence.split(name)
            if len(tweetsplit) > 1:
                for part in tweetsplit:
                    if len(part) > 1:
                        parttoken = word_tokenize(part)
                        if len(parttoken) > 1:
                            immediateWords = [parttoken[0], parttoken[1]]
                        else:
                            immediateWords = [parttoken[0]]
                        
                        if contains(immediateWords, keywordlist) == True:
                            tweetnumber.append(i)
        i += 1
    
    return(tweetnumber)

In [32]:
#coc = cooccur_tweets(pseudo_namelist, keyword)
subv = subjverb_tweets(pseudo_namelist, keyword)

In [33]:
#tweetPrint(data, subv)

### Test if presenters look right

In [34]:
presenterData = [data[i] for i in subv]
tweet_presenter_tokenized = tweetTokenContain(presenterData, keyword, stopWords)
presenter_tokenized_list = tweet_presenter_tokenized[1]
pr_common_bigrams = bigramsDict(presenter_tokenized_list)
presenters = extractPseudoNames(pr_common_bigrams, lowbound = 10)

In [35]:
from classifier import *

In [36]:
tweet_dic = get_and_classify_tweets('./data/gg2013.json', 1000000, gg2013_categories)

Initializing database...
Database initialized with 174643 tweets.
Parsing awards...
Classifying tweets...
1000 tweets classified...
2000 tweets classified...
3000 tweets classified...
4000 tweets classified...
5000 tweets classified...
6000 tweets classified...
7000 tweets classified...
8000 tweets classified...
9000 tweets classified...
10000 tweets classified...
11000 tweets classified...


In [37]:
tweet_dic[u'Best Performance by an Actor in a Supporting Role in a Series, Limited Series or Motion Picture Made for Television']

['Best actor in a TV Series: Damian Lewis for Homeland #GoldenGlobes',
 'Winner Actor TV Series: Damien Lewis #GoldenGlobes #joachimgg',
 'Damian Lewis won Best Actor for a Television Series #Goldenglobes #Congrats 😎👍❤🎬🎥🎭',
 'Best actor Damian Lewis for Homeland. Series.  #GoldenGlobes',
 'Damien Lewis wins Best Actor in a TV Series for his role in Homeland #GoldenGlobes',
 "#GoldenGlobes Best Actor, TV series: Homeland's Damian Lewis - after his Emmy win, he's on a roll with this role *insert lame drum kick*",
 'And the award for Best Actor in TV series goes to Damian Lewis from homeland he deserves it.  #GoldenGlobes',
 "#GoldenGlobes Best Actor, TV series: Homeland's Damian Lewis!! YES!!!",
 'Damian Lewis, best series actor :D #GoldenGlobes',
 "RT @NadiaNeophytou: #GoldenGlobes Best Actor, TV series: Homeland's Damian Lewis - after his Emmy win, he's on a roll with this role *insert lame drum kick*",
 "RT @MTLDriveFor25: Bryan Cranston seriously didn't win best actor in a TV series?

In [38]:
awardcat = tweet_dic.keys()

In [39]:
#awardcat

In [40]:
def tweetTextContain(tweetTextList, keywords):
    tweetnumber = set([])
    i = 0

    for i in range(len(tweetTextList) - 1):
        
        for keyword in keywords:
            if keyword in tweetTextList[i].lower():
                tweetnumber = tweetnumber.union([i])
        
        i += 1
        
    return list(tweetnumber)

In [67]:
def subjverb_tweets(tweetdata, names, keywordlist):
    nameslist = []
    additionalNamesParsing = []
    i = 0

    for i in range(len(tweetdata)):
        tweetsentence = "OT: " + tweetdata[i].lower()
        for name in names:
            tweetsplit = tweetsentence.split(name)
            if len(tweetsplit) > 1:
                for part in tweetsplit:
                    if len(part) > 1:
                        parttoken = word_tokenize(part)
                        if len(parttoken) > 1:
                            immediateWords = [parttoken[0], parttoken[1]]
                        else:
                            immediateWords = [parttoken[0]]
                        
                        if contains(immediateWords, keywordlist) == True:
                            nameslist.append(name)
                            sentenceB4 = tweetsplit[0]
                            additionalNamesParsing.append(sentenceB4)
        i += 1
        
    for b4string in additionalNamesParsing:
        for name in names:
            if name in b4string:
                nameslist.append(name)
    
    #return nameslist
    return nameslist

In [74]:
def nameCountValidate(inputList):
    freqDict = {x:inputList.count(x) for x in inputList}
    #print(freqDict)
    names = freqDict.keys()
    numCounts = len(inputList)
    numNames = len(names)
    names_del_list = []
    
    for name in names:
        if freqDict[name] / numCounts < 1 / numNames:
            #print("To delete " + name)
            #print("Occurance percentage: " + str(freqDict[name] / numCounts))
            #print("Threshold Occurance: " + str(1 / numNames))
            names_del_list.append(name)
    
    #print(names_del_list)
    for name in names_del_list:
        del freqDict[name]
        
    return freqDict

In [76]:
for award in awardcat:
    tweetSeq = tweetTextContain(tweet_dic[award], keyword)
    category_tweet = [tweet_dic[award][i] for i in tweetSeq]
    print()
    print(award + ":")
    tweetInfo = subjverb_tweets(category_tweet, pseudo_namelist, keyword)
    presenter_info = nameCountValidate(tweetInfo)
    presenter_info_list = presenter_info.keys()
    for presenter in presenter_info:
        presenter_proper = presenter.title()
        print(presenter_proper)


Best Motion Picture - Drama:
Bill Clinton

Best Motion Picture - Musical or Comedy:
Lee Jones

Best Performance by an Actress in a Motion Picture - Drama:
George Clooney

Best Performance by an Actor in a Motion Picture - Drama:
Jennifer Lawrence
Bradley Cooper

Best Performance by an Actress in a Motion Picture - Musical or Comedy:
Kristen Wiig
Will Ferrell
Will Ferrel

Best Performance by an Actor in a Motion Picture - Musical or Comedy:

Best Performance by an Actress in a Supporting Role in any Motion Picture:

Best Performance by an Actor in a Supporting Role in any Motion Picture:
Kate Hudson
Bradley Cooper

Best Director - Motion Picture:

Best Screenplay - Motion Picture:
Amanda Seyfried
Robert Pattinson

Best Motion Picture - Animated:

Best Motion Picture - Foreign Language:
Arnold Schwarzenegger
Sylvester Stallone

Best Original Score - Motion Picture:

Best Original Song - Motion Picture:

Best Television Series - Drama:

Best Television Series - Musical or Comedy:

Best T

tweetInfo = subjverb_tweets(category_tweet, pseudo_namelist, keyword)

#test_name_list = ['dustin hoffman']

tweetSeq = tweetTextContain(tweet_dic['Best Original Score - Motion Picture'], keyword)
category_tweet = [tweet_dic['Best Original Score - Motion Picture'][i] for i in tweetSeq]
print(category_tweet)
tweetInfo = subjverb_tweets(category_tweet, pseudo_namelist, keyword)
tweetInfo

presenter_info = nameCountValidate(tweetInfo)
presenter_info 

for presenter_name in presenters:
    appearance = 0
    for tweetnumber in tweetSeq2:
        #print(tweet_dic[u'Cecil B. deMille Award'][tweetnumber])
        if presenter_name in tweet_dic[u'Cecil B. deMille Award'][tweetnumber].lower():
            print(presenter_name)
            appearance += 1

bd_present_text = bd_present[1]
pr_common_bigrams = bigramsDict(presenter_tokenized_list)
presenters = extractPseudoNames(pr_common_bigrams, lowbound = 10)

appearance