In [161]:
import re
import nltk
import pandas as pd
import numpy as np
from google.cloud import language
import random

In [10]:
language_client = language.Client()

In [177]:
def googleAnnotate(fileName, nrows=None, saveFile=False):
    '''
    Takes a csv file in the form of [speaker, dialogue]
    Pass each dialogue into google language API
    Attach the output to the dataframe: [speaker, dialogue, sentences, sentiment, entities, tokens]
    sentences contain a dict {content, begin(index), (sentiment)score, (sentiment)magnitude} for each sentence
    sentiment contain {score, magnitude} for the entire dialogue
    entities contain {name, type, metadata, salience, mentions} for each entity
    tokens contain {content, begin(index), pos(partofspeech), index(edge index), label(edge label), lemma} for each token
    nrows can be used to read small number of rows for testing
    '''
    language_client = language.Client()
    df = pd.read_csv(fileName, names=['speaker', 'dialogue'], nrows=nrows)
    df['sentences'] = None
    df['sentiment'] = None
    df['entities'] = None
    df['tokens'] = None
    for i in range(len(df)):
        document = language_client.document_from_text(df.loc[i, 'dialogue'])
        annText = document.annotate_text()
        df.set_value(i, 'sentences', [{'content':x.content, 'begin':x.begin, 'score':x.sentiment.score, 
                                       'magnitude':x.sentiment.magnitude} for x in annText.sentences])
        df.set_value(i, 'sentiment', {'score':annText.sentiment.score, 'magnitude':annText.sentiment.magnitude})
        df.set_value(i, 'entities', [{'name':x.name, 'type':x.entity_type, 'meta':x.metadata, 
                                      'salience':x.salience, 'mentions':x.mentions} for x in annText.entities])
        df.set_value(i, 'tokens', [{'content':x.text_content, 'begin':x.text_begin, 'pos':x.part_of_speech, 
                                    'index':x.edge_index, 'label':x.edge_label, 'lemma':x.lemma} for x in annText.tokens])
    if saveFile:
        df.to_csv(fileName[:-4] + '_gapi.csv')
        #print(fileName[:-4])
    return df

def simpleRE(tokens):
    relation = []
    nsubj = -1
    verb = -1
    for i, token in enumerate(tokens):
        if token['label'] == 'NSUBJ':
            nsubj = i
            verb = token['index']
        if 'OBJ' in token['label'] and token['index'] == verb:
            relation.append({'verb':tokens[verb]['content'], 'noun':tokens[nsubj]['content'], 'obj':tokens[i]['content']})
        
    if relation:
        return relation
    else:
        return None

def REPrecision(df, num=50):
    goodRE = 0.0
    for i in range(num):
        t = df[df.relations.notnull()].sample(1)
        print 'DIALOGUE:', t.dialogue
        print 'RELATION:', random.choice(t.relations.values[0])
        x = raw_input('Is this a good relation?')
        if not x or x[0] == 'y':
            goodRE += 1
    print 'precision: ', goodRE/num
        

In [171]:
df = googleAnnotate('prep_scripts/thor_tw.csv',saveFile=True)
df.head()

Unnamed: 0,speaker,dialogue,sentences,sentiment,entities,tokens
0,narrator,first lines,"[{u'content': u'first lines', u'begin': 0, u's...","{u'score': 0, u'magnitude': 0}","[{u'salience': 1, u'meta': {}, u'type': u'OTHE...","[{u'index': 1, u'begin': 0, u'pos': u'ADJ', u'..."
1,Jane Foster,Wait for it.,"[{u'content': u'Wait for it.', u'begin': 0, u'...","{u'score': 0, u'magnitude': 0}",[],"[{u'index': 0, u'begin': 0, u'pos': u'VERB', u..."
2,Darcy Lewis,Can I turn on the radio?,"[{u'content': u'Can I turn on the radio?', u'b...","{u'score': 0, u'magnitude': 0}","[{u'salience': 1, u'meta': {}, u'type': u'OTHE...","[{u'index': 2, u'begin': 0, u'pos': u'VERB', u..."
3,Jane Foster,No!,"[{u'content': u'No!', u'begin': 0, u'score': -...","{u'score': -0.4, u'magnitude': 0.4}",[],"[{u'index': 0, u'begin': 0, u'pos': u'X', u'la..."
4,Erik Selvig,"Jane, you can't keep doing this.","[{u'content': u'Jane, you can't keep doing thi...","{u'score': -0.6, u'magnitude': 0.6}","[{u'salience': 1, u'meta': {u'mid': u'/m/05d4r...","[{u'index': 5, u'begin': 0, u'pos': u'NOUN', u..."


In [47]:
print sum(df.entities.apply(len)), 'entities identified'

entList = df.entities.apply(lambda x: [ent['name'] for ent in x]).tolist()
entSet = set([val for x in entList for val in x])
print len(entSet), 'unique entities identified'

df['totalSent'] = df.sentiment.apply(lambda x: x['score']*x['magnitude'])

print('*'*50)
df_totalSent = df[['speaker', 'totalSent']].groupby('speaker').sum().reset_index()
print('good guys from the script:')
print(df_totalSent[df_totalSent.totalSent >= 0].sort_values(by='totalSent', ascending=False))
print('*'*50)
print('bad guys from the script:')
print(df_totalSent[df_totalSent.totalSent < 0].sort_values(by='totalSent'))

1222 entities identified
539 unique entities identified
**************************************************
good guys from the script:
            speaker  totalSent
9       Jane Foster       1.90
16             Thor       1.27
3       Erik Selvig       1.24
5            Frigga       0.76
15              Sif       0.58
19       Young Thor       0.57
10      King Laufey       0.49
12        Nick Fury       0.49
8             Hogun       0.43
14  Pet Store Clerk       0.21
1     Agent Coulson       0.13
18       Young Loki       0.03
20         narrator       0.01
0   Admission Nurse       0.00
**************************************************
bad guys from the script:
               speaker  totalSent
13                Odin      -1.95
4              Fandral      -0.91
11                Loki      -0.71
7             Heimdall      -0.65
6   Frost Giant Sentry      -0.49
17            Volstagg      -0.12
2          Darcy Lewis      -0.01


In [173]:
#basic relationship extraction
df['relations'] = None
for i in range(len(df)):
    df.set_value(i, 'relations', simpleRE(df.loc[i, 'tokens']))

In [114]:
df.head(10)

Unnamed: 0,speaker,dialogue,sentences,sentiment,entities,tokens,totalSent,relations
0,narrator,first lines,"[{u'content': u'first lines', u'begin': 0, u's...","{u'score': 0, u'magnitude': 0}","[{u'salience': 1, u'meta': {}, u'type': u'OTHE...","[{u'index': 1, u'begin': 0, u'pos': u'ADJ', u'...",0.0,
1,Jane Foster,Wait for it.,"[{u'content': u'Wait for it.', u'begin': 0, u'...","{u'score': 0, u'magnitude': 0}",[],"[{u'index': 0, u'begin': 0, u'pos': u'VERB', u...",0.0,
2,Darcy Lewis,Can I turn on the radio?,"[{u'content': u'Can I turn on the radio?', u'b...","{u'score': 0, u'magnitude': 0}","[{u'salience': 1, u'meta': {}, u'type': u'OTHE...","[{u'index': 2, u'begin': 0, u'pos': u'VERB', u...",0.0,
3,Jane Foster,No!,"[{u'content': u'No!', u'begin': 0, u'score': -...","{u'score': -0.4, u'magnitude': 0.4}",[],"[{u'index': 0, u'begin': 0, u'pos': u'X', u'la...",-0.16,
4,Erik Selvig,"Jane, you can't keep doing this.","[{u'content': u'Jane, you can't keep doing thi...","{u'score': -0.6, u'magnitude': 0.6}","[{u'salience': 1, u'meta': {u'mid': u'/m/05d4r...","[{u'index': 5, u'begin': 0, u'pos': u'NOUN', u...",-0.36,
5,Jane Foster,The last seventeen occurrences had been predic...,[{u'content': u'The last seventeen occurrences...,"{u'score': -0.2, u'magnitude': 0.2}","[{u'salience': 0.57084084, u'meta': {}, u'type...","[{u'index': 3, u'begin': 0, u'pos': u'DET', u'...",-0.04,
6,Erik Selvig,"Jane, you're an astrophysicist, not some storm...","[{u'content': u'Jane, you're an astrophysicist...","{u'score': 0.2, u'magnitude': 0.2}","[{u'salience': 0.83948606, u'meta': {u'mid': u...","[{u'index': 3, u'begin': 0, u'pos': u'NOUN', u...",0.04,
7,Jane Foster,"I'm telling you, there's a connection between ...","[{u'content': u'I'm telling you, there's a con...","{u'score': 0, u'magnitude': 1.1}","[{u'salience': 0.35660744, u'meta': {}, u'type...","[{u'index': 2, u'begin': 0, u'pos': u'PRON', u...",0.0,"[{u'obj': u'you', u'verb': u'telling', u'noun'..."
8,Erik Selvig,I thought you said it was a subtle aurora!,[{u'content': u'I thought you said it was a su...,"{u'score': 0.1, u'magnitude': 0.1}","[{u'salience': 1, u'meta': {}, u'type': u'LOCA...","[{u'index': 1, u'begin': 0, u'pos': u'PRON', u...",0.01,
9,Jane Foster,[to Darcy] Go!,"[{u'content': u'[to Darcy] Go!', u'begin': 0, ...","{u'score': 0.2, u'magnitude': 0.2}","[{u'salience': 1, u'meta': {}, u'type': u'PERS...","[{u'index': 4, u'begin': 0, u'pos': u'PUNCT', ...",0.04,


In [152]:
for i in range(50):
    t = df[df.relations.notnull()].sample(1)
    print 'dialogue:', t.dialogue
    #print len(t.relations)
    for relation in t.relations.values[0]:
        print relation

dialogue: 124    Father! Let's finish them together!
Name: dialogue, dtype: object
{'obj': u'them', 'verb': u'finish', 'noun': u"'s"}
dialogue: 291    Oh, so you own a satellite now?
Name: dialogue, dtype: object
{'obj': u'satellite', 'verb': u'own', 'noun': u'you'}
dialogue: 432    Then you'll open the Bifrost to no one. Until ...
Name: dialogue, dtype: object
{'obj': u'Bifrost', 'verb': u'open', 'noun': u'you'}
{'obj': u'damage', 'verb': u'repaired', 'noun': u'I'}
dialogue: 425    Or perhaps someone has found a way to hide tha...
Name: dialogue, dtype: object
{'obj': u'way', 'verb': u'found', 'noun': u'someone'}
{'obj': u'me', 'verb': u'wish', 'noun': u'he'}
dialogue: 453    Thor would do the same for us.
Name: dialogue, dtype: object
{'obj': u'same', 'verb': u'do', 'noun': u'Thor'}
dialogue: 225    At least he's been banished, not dead. Which i...
Name: dialogue, dtype: object
{'obj': u'them', 'verb': u'told', 'noun': u'guard'}
dialogue: 366    He committed a crime! He's in jail!
Na

In [179]:
REPrecision(df)

DIALOGUE: 530    Can you see her?
Name: dialogue, dtype: object
RELATION: {'obj': u'her', 'verb': u'see', 'noun': u'you'}
Is this a good relation?n
DIALOGUE: 246    No. In the aftermath of the battle, I went int...
Name: dialogue, dtype: object
RELATION: {'obj': u'baby', 'verb': u'found', 'noun': u'I'}
Is this a good relation?n
DIALOGUE: 423    I turned my gaze upon you in Jotunheim but cou...
Name: dialogue, dtype: object
RELATION: {'obj': u'gaze', 'verb': u'turned', 'noun': u'I'}
Is this a good relation?n
DIALOGUE: 393    Farewell. [he turns and leaves just as Agent C...
Name: dialogue, dtype: object
RELATION: {'obj': u'room', 'verb': u'enters', 'noun': u'Coulson'}
Is this a good relation?y
DIALOGUE: 365    No. I'm gonna fly out. [he leaves her] [after ...
Name: dialogue, dtype: object
RELATION: {'obj': u'her', 'verb': u'leaves', 'noun': u'he'}
Is this a good relation?n
DIALOGUE: 447    Our dearest friend banished. Loki on the thron...
Name: dialogue, dtype: object
RELATION: {'obj': 