In [1]:
import json
import requests
class StanfordCoreNLP:
    '''
    Wrapper for Starford Corenlp Restful API
    annotators:"truecase,tokenize,ssplit,pos,lemma,ner,regexner,parse,depparse,openie,coref,kbp,sentiment"
    nlp = StanfordCoreNLP()
    output = nlp.annotate(text, properties={ 'annotators':'kbp','outputFormat': 'json',})
    output.keys()
    dict_keys(['sentences', 'corefs'])
    
    '''

    def __init__(self, host='127.0.0.1', port='9000'):
        self.host = host
        self.port = port

    def annotate(self, data, properties=None, lang='en'):
        self.server_url = 'http://'+self.host+':'+self.port
        properties['outputFormat'] = 'json'
        try:
            res = requests.post(self.server_url,
                                params={'properties': str(properties),
                                        'pipelineLanguage':lang},
                                data=data, 
                                headers={'Connection': 'close'})
            return res.json()
        except Exception as e:
            print(e)

In [2]:
snlp = StanfordCoreNLP(host='10.0.1.7')

In [71]:
from gensim.models import KeyedVectors
from gensim.similarities.index import AnnoyIndexer
wv = KeyedVectors.load_word2vec_format('glove.6B.300d.bin', binary=True)
annoy_index = AnnoyIndexer()
annoy_index.load('glove.6B.300d.index')
annoy_index.model = wv



In [113]:
vocab_disasters_and_accdients = dict(wv.most_similar(positive=['disaster','accident',], topn=25000, indexer=annoy_index))

In [114]:
list(vocab_disasters_and_accdients.items())[-10:]

[('teutoburg', 0.2927718758583069),
 ('scranton', 0.2927703857421875),
 ('steffi', 0.29277002811431885),
 ('constructive', 0.2927693724632263),
 ('so-called', 0.2927691340446472),
 ('floodlight', 0.2927657961845398),
 ('worldnews@ap.org', 0.2927653193473816),
 ('g/mol', 0.2927647829055786),
 ('binga', 0.29276204109191895),
 ('single-goal', 0.2927611470222473)]

In [115]:
import dask.bag as db

In [135]:
def safe_load(text):
    try:
        return json.loads(text)
    except:
        return {}

In [136]:
news_event = db.read_text('./news_event.jsonl').map(safe_load)

In [143]:
def category_filter(sample):
    if not sample:
        return False
    return sample['event']['category'].startswith('disaster')

In [144]:
news_event_disasters_and_accdients = news_event.filter(category_filter)

In [139]:
news_event_disasters_and_accdients.take(1)[0]['event']

{'category': 'disaster and accident',
 'date': '2010-07-01',
 'description': 'Hurricane Alex, the first hurricane of the 2010 Atlantic hurricane season, makes landfall in northeastern Mexico as a Category\xa02 hurricane on the Saffir-Simpson Hurricane Scale with winds of 105\xa0mph (165\xa0km/h), and causes tornadoes that force people into shelters in southern Texas.',
 'id': '59fbc9fa60b18848c5a4ce61',
 'media': ['National Hurricane Center', 'The Australian', 'Aljazeera'],
 'title': ''}

In [12]:
sample = news_event_disasters_and_accdients.take(1)[0]

In [13]:
sample.keys()

dict_keys(['event', 'tweets'])

In [14]:
sample['event']

{'category': 'disaster and accident',
 'date': '2010-07-01',
 'description': 'Hurricane Alex, the first hurricane of the 2010 Atlantic hurricane season, makes landfall in northeastern Mexico as a Category\xa02 hurricane on the Saffir-Simpson Hurricane Scale with winds of 105\xa0mph (165\xa0km/h), and causes tornadoes that force people into shelters in southern Texas.',
 'id': '59fbc9fa60b18848c5a4ce61',
 'media': ['National Hurricane Center', 'The Australian', 'Aljazeera'],
 'title': ''}

In [15]:
len(sample['tweets'])

60

In [16]:
sample['tweets'][0]

{'boe_cosine': 0.9158096833057836,
 'created_at': '2010-06-29 13:22:38',
 'data_name': 'Jacqui Jeras',
 'favorites': 0,
 'has_url': False,
 'hashtags': 0,
 'id': '17331748501',
 'is_reply': False,
 'is_retweet': False,
 'lang': 'en',
 'mentions': 0,
 'replies': 0,
 'retweets': 1,
 'screen_name': 'JacquiJerasTV',
 'text': 'Alex nearly a hurricane; winds 70mph. Texas will start to feel TS force winds tomorrow. Heavy rains expected 5-10". Landfall near border.',
 'userbadges': 'Verifizierter Account'}

In [17]:
demo_event = sample['event']['description']

In [18]:
demo_tweet = sample['tweets'][0]['text']

In [19]:
demo = 'I like and hate him.'

In [20]:
out = snlp.annotate(demo_event, properties={ 'annotators':'ner,depparse','outputFormat': 'json',})

In [21]:
out.keys()

dict_keys(['sentences'])

In [22]:
len(out['sentences'])

1

In [23]:
out['sentences'][0].keys()

dict_keys(['index', 'basicDependencies', 'enhancedDependencies', 'enhancedPlusPlusDependencies', 'entitymentions', 'tokens'])

In [24]:
out['sentences'][0]['index']

0

In [None]:
tokens_dict = {}
[tokens_dict.update({t['index']:{'lemma':t['lemma'].lower(),'word':t['word'],'pos':t['pos']}}) for t in out['sentences'][0]['tokens']]

In [None]:
tokens_dict

In [None]:
list(tokens_dict.values())

In [28]:
entity = [{'text':e['text'].lower(),'ner':e['ner'].lower(),'tokenBegin':e['tokenBegin']+1,'tokenEnd':e['tokenEnd'],} for e in out['sentences'][0]['entitymentions']]

In [None]:
entity

In [30]:
NER_ = ['person','ordinal','number','date']

In [31]:
entity_dict = {}
span_entity = []
entity_triples = []
for e in entity:
    #entity_dict.update({e['tokenEnd']:e['ner'] if e['ner'] in NER_ else \
    #                    [e['ner'],e['text'] if e['tokenEnd'] == e['tokenBegin']\
    #                     else ''.join([tokens_dict[i]['lemma'] for i in range(e['tokenBegin'],e['tokenEnd']+1)])]})
    entity_dict.update({e['tokenEnd']:e['ner'] if e['ner'] in NER_ else e['text']})#[e['ner'],e['text']]})
    if e['ner'] not in NER_:
        entity_triples.append(('/c/en/'+e['text'],'/r/tweet/IsA','/c/en/'+e['ner']))
    if e['tokenEnd'] != e['tokenBegin']:
        span_entity.append((e['tokenBegin'],e['tokenEnd']))

In [32]:
entity_dict

{1: 'hurricane',
 2: 'person',
 5: 'ordinal',
 6: 'hurricane',
 9: 'date',
 10: 'atlantic',
 11: 'hurricane',
 18: 'mexico',
 22: 'number',
 23: 'hurricane',
 27: 'hurricane',
 32: 'number',
 35: 'number',
 49: 'texas'}

In [33]:
span_entity

[]

In [34]:
entity_triples

[('/c/en/hurricane', '/r/tweet/IsA', '/c/en/cause_of_death'),
 ('/c/en/hurricane', '/r/tweet/IsA', '/c/en/cause_of_death'),
 ('/c/en/atlantic', '/r/tweet/IsA', '/c/en/location'),
 ('/c/en/hurricane', '/r/tweet/IsA', '/c/en/cause_of_death'),
 ('/c/en/mexico', '/r/tweet/IsA', '/c/en/country'),
 ('/c/en/hurricane', '/r/tweet/IsA', '/c/en/cause_of_death'),
 ('/c/en/hurricane', '/r/tweet/IsA', '/c/en/cause_of_death'),
 ('/c/en/texas', '/r/tweet/IsA', '/c/en/state_or_province')]

In [35]:
dep = out['sentences'][0]['enhancedPlusPlusDependencies']

In [None]:
dep

In [37]:
DEP_ = ['amod','punct','case','det','ROOT']

In [38]:
from nltk.corpus import stopwords
list_stopWords=list(set(stopwords.words('english')))

In [39]:
dep = [{'from':d['governor'],'dep':d['dep'],'to':d['dependent']} for d in dep if d['dep'] not in DEP_ \
       and (d['dependent'],d['governor']) not in span_entity and d['dependentGloss'] not in list_stopWords\
      and d['governorGloss'] not in list_stopWords]

In [None]:
dep

In [41]:
def dep_to_str(index):
    return entity_dict[index] if index in entity_dict.keys() else tokens_dict[index]['lemma']

In [42]:
dep_triples =[('/c/en/'+dep_to_str(d['to']),'/r/tweet/dep/'+d['dep'],'/c/en/'+dep_to_str(d['from'])) for d in dep]

In [None]:
dep_triples

In [44]:
from collections import defaultdict

In [45]:
fact_dict = defaultdict(dict)
for d in dep:
    if d['dep'].startswith('nsubj') or d['dep'].startswith('dobj'):
            fact_dict[d['from']].update({d['dep']:d['to']})

In [46]:
fact_dict

defaultdict(dict,
            {14: {'dobj': 15, 'nsubj': 2},
             40: {'dobj': 41, 'nsubj': 2},
             43: {'dobj': 44, 'nsubj': 41}})

In [47]:
fact_triples = []

In [48]:
for k,v in fact_dict.items():
    if 'nsubj' in v.keys() and 'dobj' in v.keys():
        fact_triples.append(('/c/en/'+dep_to_str(v['nsubj']),'/r/tweet/open/'+dep_to_str(k),'/c/en/'+dep_to_str(v['dobj'])))

In [49]:
fact_triples

[('/c/en/person', '/r/tweet/open/make', '/c/en/landfall'),
 ('/c/en/person', '/r/tweet/open/cause', '/c/en/tornado'),
 ('/c/en/tornado', '/r/tweet/open/force', '/c/en/people')]

In [50]:
triples = entity_triples + dep_triples + fact_triples

In [51]:
triples

[('/c/en/hurricane', '/r/tweet/IsA', '/c/en/cause_of_death'),
 ('/c/en/hurricane', '/r/tweet/IsA', '/c/en/cause_of_death'),
 ('/c/en/atlantic', '/r/tweet/IsA', '/c/en/location'),
 ('/c/en/hurricane', '/r/tweet/IsA', '/c/en/cause_of_death'),
 ('/c/en/mexico', '/r/tweet/IsA', '/c/en/country'),
 ('/c/en/hurricane', '/r/tweet/IsA', '/c/en/cause_of_death'),
 ('/c/en/hurricane', '/r/tweet/IsA', '/c/en/cause_of_death'),
 ('/c/en/texas', '/r/tweet/IsA', '/c/en/state_or_province'),
 ('/c/en/hurricane', '/r/tweet/dep/compound', '/c/en/person'),
 ('/c/en/person', '/r/tweet/dep/nsubj', '/c/en/make'),
 ('/c/en/person', '/r/tweet/dep/nsubj', '/c/en/cause'),
 ('/c/en/hurricane', '/r/tweet/dep/appos', '/c/en/person'),
 ('/c/en/date', '/r/tweet/dep/nummod', '/c/en/season'),
 ('/c/en/atlantic', '/r/tweet/dep/compound', '/c/en/season'),
 ('/c/en/hurricane', '/r/tweet/dep/compound', '/c/en/season'),
 ('/c/en/season', '/r/tweet/dep/nmod:of', '/c/en/hurricane'),
 ('/c/en/landfall', '/r/tweet/dep/dobj', '/c/

In [122]:
from collections import defaultdict
from nltk.corpus import stopwords
list_stopWords=list(set(stopwords.words('english')))
NER_ = ['person','ordinal','number','date']
DEP_ = ['amod','punct','case','det','ROOT','dep']
def dep_to_str(tokens_dict,entity_dict,index):
    return entity_dict[index] if index in entity_dict.keys() else tokens_dict[index]['lemma']

def annotate_sentences(text):
    out = snlp.annotate(text, properties={ 'annotators':'ner,depparse','outputFormat': 'json'})
    #import pdb;pdb.set_trace()
    triples = []
    for sentence in out['sentences']:
        #print(sentence['index'])
        triples.extend(get_triples(sentence))
    return triples
    
def get_triples(sentence):
    # tokens 
    tokens_dict = {}
    [tokens_dict.update({t['index']:{'lemma':t['lemma'].lower(),'word':t['word'],'pos':t['pos']}}) for t in sentence['tokens']]
    # entity 
    entity = [{'text':e['text'].lower(),'ner':e['ner'].lower(),'tokenBegin':e['tokenBegin']+1,'tokenEnd':e['tokenEnd'],}\
              for e in sentence['entitymentions']]
    entity_dict = {}
    span_entity = [] # combine in dep
    entity_triples = []
    for e in entity:
        entity_dict.update({e['tokenEnd']:e['ner'] if e['ner'] in NER_ else e['text']})#[e['ner'],e['text']]})
        if e['ner'] not in NER_:
            entity_triples.append(('/c/en/'+e['text'],'/r/tweet/IsA','/c/en/'+e['ner']))
            #print('/c/en/'+e['text'],'/r/tweet/IsA','/c/en/'+e['ner'])
        if e['tokenEnd'] != e['tokenBegin']:
            span_entity.append((e['tokenBegin'],e['tokenEnd']))
    # dep triples
    # vocab_disasters_and_accdients
    dep = sentence['enhancedPlusPlusDependencies']
    dep = [{'from':d['governor'],'dep':d['dep'],'to':d['dependent']} for d in dep if d['dep'] not in DEP_ \
       and (d['dependent'],d['governor']) not in span_entity and d['dependentGloss'] not in list_stopWords\
      and d['governorGloss'] not in list_stopWords]# and d['dependentGloss'] in vocab_disasters_and_accdients.keys()\
      #and d['governorGloss'] in vocab_disasters_and_accdients.keys()]
    dep_triples =[('/c/en/'+dep_to_str(tokens_dict,entity_dict,d['to']),'/r/tweet/dep/'+d['dep'],\
                   '/c/en/'+dep_to_str(tokens_dict,entity_dict,d['from'])) for d in dep]
    # fact_triples filter from dep
    fact_dict = defaultdict(dict)
    for d in dep:
        if d['dep'].startswith('nsubj') or d['dep'].startswith('dobj'):
                fact_dict[d['from']].update({d['dep']:d['to']})
    fact_triples = []
    for k,v in fact_dict.items():
        if 'nsubj' in v.keys() and 'dobj' in v.keys():
            fact_triples.append(('/c/en/'+dep_to_str(tokens_dict,entity_dict,v['nsubj']),\
                                 '/r/tweet/open/'+dep_to_str(tokens_dict,entity_dict,k),\
                                 '/c/en/'+dep_to_str(tokens_dict,entity_dict,v['dobj'])))
    # cat all triples
    triples = entity_triples + dep_triples + fact_triples
    return triples

In [123]:
demo_tweet = sample['tweets'][2]['text']

In [None]:
event_triples = []
for index,tweet in enumerate(sample['tweets']):
    print(index)
    try:
        print(tweet['text'])
        event_triples.extend(annotate_sentences(tweet['text']))
    except:
        pass

In [92]:
len(event_triples)

955

In [129]:
len(set(event_triples))

402

In [145]:
es = news_event_disasters_and_accdients.take(30000)

In [146]:
len(es)

4878

In [147]:
import codecs

In [148]:
triples = ['a','b']

In [149]:
f = codecs.open('triples.txt','a+',encoding='utf-8')

In [150]:
f.writelines(triples)

In [151]:
f.close()