In [None]:
## What's done here

"""
* Step 1. Converted frame1.7 raw data into a normalized form as follows:
      {'argInfoList': [(u"with foster parents duped into incubating the cuckoo 's egg", # text
                        'Other', # frame name
                         5, # left word index
                         15)], # right word index
       'targetInfo': (u'empath', 'Others_situation_as_stimulus', 4, 5), # same as above
       'text': u"It is easy to empathize with foster parents duped into incubating the cuckoo 's eggs ."} 
          # sentence text
 
* Step 2. Following Michael's tutorial at https://github.com/google/sling/issues/82 to directly read the
          corpus into commons, docschema, and created a .rec file from there.
"""

In [1]:
import dill
import sling
from nltk.corpus import framenet as fn
from nltk.stem.porter import PorterStemmer

porter_stemmer = PorterStemmer()
def lemmatize(word):
    return porter_stemmer.stem(word)

from collections import defaultdict

In [6]:
# lex2frame = dill.load(open("/usr/local/google/home/wangsu/SLING_DEV/temp/lex2frame.p", 'rb'))
# lexSet = set(lex2frame.keys())

### Save frame sentences (annotated)

In [8]:
fS1 = fn.sents()[0]
fS2 = fn.sents()[1]

In [3]:
fS1.keys()

[u'text',
 u'frame',
 u'LU',
 u'_ascii',
 u'frameAnnotation',
 u'Other',
 u'FE',
 u'POS_tagset',
 u'Sent',
 u'PT',
 'corpID',
 u'POS',
 u'Verb',
 u'_type',
 u'Target',
 'sentNo',
 u'annotationSet',
 u'GF',
 'aPos',
 'ID',
 'docID',
 'paragNo']

In [4]:
fS1['text']

u"It is easy to empathize with foster parents duped into incubating the cuckoo 's eggs ."

In [5]:
fS1['Target']

[(14, 23)]

In [7]:
fS1['FE']

([(24, 84, 'Other')], {'Experiencer': 'CNI'})

In [9]:
fS2['FE']

([(0, 5, 'Experiencer'), (22, 31, 'Situation')], {})

In [57]:
def char_span_to_word_span(text, leftCharIndex, rightCharIndex):
    """Convert char span to word span index."""
    token = text[leftCharIndex:rightCharIndex].split()
    textSplit = text.split()
    leftWordIndex, rightWordIndex = textSplit.index(token[0]), textSplit.index(token[-1])+1
    return leftWordIndex, rightWordIndex

def get_frame_sent_info():
    """Get a list of dictionaries where each contains frame annotation information.
    
    Example entry:
    {'argInfoList': [(u"with foster parents duped into incubating the cuckoo 's egg",
     'Other', 5, 15)], # tuple: text (target or arg), frame name, left word index, right word index.
     'targetInfo': (u'empath', 'Others_situation_as_stimulus', 4, 5),
     'text': u"It is easy to empathize with foster parents duped into incubating the cuckoo 's eggs ."}
    """
    sentInfoList = []
    errorLog = []
    for sentIndex, sent in enumerate(fn.sents()):
        if sentIndex!=0 and sentIndex%1000==0:
            print "... processed %d sentences" % (sentIndex)
        try:
            text = sent['text']
            leftTargetCharIndex, rightTargetCharIndex = sent['Target'][0]
            target = lemmatize(text[leftTargetCharIndex:rightTargetCharIndex])
            targetFrame = sent['frame']['name']
            leftTargetWordIndex, rightTargetWordIndex = char_span_to_word_span(text, 
                                                                               leftTargetCharIndex, 
                                                                               rightTargetCharIndex)
            targetInfo = (target, targetFrame, leftTargetWordIndex, rightTargetWordIndex)
            argInfoList = []
            for leftArgCharIndex,rightArgCharIndex,frame in sent['FE'][0]: # use the frame actually read.
                arg = lemmatize(text[leftArgCharIndex:rightArgCharIndex])
                argFrame = frame
                leftArgWordIndex, rightArgWordIndex = char_span_to_word_span(text,
                                                                             leftArgCharIndex,
                                                                             rightArgCharIndex)
                argInfo = (arg, argFrame, leftArgWordIndex, rightArgWordIndex)
                argInfoList.append(argInfo)
            sentInfo = {}
            sentInfo['text'] = text
            sentInfo['targetInfo'] = targetInfo
            sentInfo['argInfoList'] = argInfoList
            sentInfoList.append(sentInfo)
        except:
            errorLog.append(sentIndex)
    return sentInfoList, errorLog

In [83]:
%%time

frameV17SentInfoList, errorLog = get_frame_sent_info()

In [84]:
# a # 100 samples

In [59]:
frameSentSavePath = "/usr/local/google/home/wangsu/SLING_DEV/temp/sentInfo.p"
dill.dump(frameV17SentInfoList, open(frameSentSavePath, 'wb'))

In [60]:
frameV17SentInfoList[0]

{'argInfoList': [(u"with foster parents duped into incubating the cuckoo 's egg",
   'Other',
   5,
   15)],
 'targetInfo': (u'empath', 'Others_situation_as_stimulus', 4, 5),
 'text': u"It is easy to empathize with foster parents duped into incubating the cuckoo 's eggs ."}

### Create FrameV17 annotated corpus

In [69]:
frameV17SentInfoList[0]

{'argInfoList': [(u"with foster parents duped into incubating the cuckoo 's egg",
   'Other',
   5,
   15)],
 'targetInfo': (u'empath', 'Others_situation_as_stimulus', 4, 5),
 'text': u"It is easy to empathize with foster parents duped into incubating the cuckoo 's eggs ."}

In [80]:
def create_annotations_from_sent_info(sentInfoList, docPath):
    """Create a sling.Store object from corpus.
    
    NOTE: the stuff's pretty fast (although it's not directly picklable).
    NOTE: tutorial source: https://github.com/google/sling/issues/82
    """
    # Create commons Store
    print "Starting creation of commons Store ...\n"
    commons = sling.Store()
    docschema = sling.DocumentSchema(commons)

    isaFrame = commons["isa"]
    argFrame = commons["/saft/arg"]

    count = 0
    for sentInfo in sentInfoList:
        count += 1
        if count%10000==0:
            print "... processed %d sentInfo entries" % (count)
        _ = commons["/fn/"+sentInfo['targetInfo'][1]]
        for arg,frame,leftWordIndex,rightWordIndex in sentInfo['argInfoList']:
            _ = commons["/fn/"+frame]
    print "\nIn total %d sentInfo entries are processed!\n\n" % (count)

    commons.freeze()
    
    # Record docschema
    print "Starting annotations ...\n"
    out = sling.RecordWriter(docPath)
    recNo = 0
    for sentInfo in sentInfoList:
        store = sling.Store(commons)
        doc = sling.tokenize(str(sentInfo['text'].encode('utf-8')), store=store, schema=docschema)
        currArgFrameList = [store.frame({isaFrame:argFrame}) for _ in sentInfo['argInfoList']]
        currTargetFrameDict = {isaFrame:sentInfo['targetInfo'][1]}
        for currArgFrame in currArgFrameList:
            currTargetFrameDict[isaFrame] = currArgFrame
        currTargetFrame = store.frame(currTargetFrameDict)
        leftWordIndex, rightWordIndex = sentInfo['targetInfo'][2],sentInfo['targetInfo'][3]
        doc.add_mention(leftWordIndex,rightWordIndex).evoke(currTargetFrame)
        for currArgFrame,(_,_,leftWordIndex,rightWordIndex) in zip(currArgFrameList,sentInfo['argInfoList']):
            doc.add_mention(leftWordIndex,rightWordIndex).evoke(currArgFrame)
        doc.update()
        out.write(str(recNo), doc.frame.data(binary=True))
        recNo += 1
        if recNo%10000==0:
            print "... processed %d entries annotated" % (recNo)
    out.close()
    print "\nDocumentSchema creation done!\n\n"  

In [81]:
docPath = "/usr/local/google/home/wangsu/SLING_DEV/temp/fn_v17.rec"

In [82]:
%%time

create_annotations_from_sent_info(frameV17SentInfoList, docPath)

Starting creation of commons Store ...

... processed 10000 sentInfo entries
... processed 20000 sentInfo entries
... processed 30000 sentInfo entries
... processed 40000 sentInfo entries
... processed 50000 sentInfo entries
... processed 60000 sentInfo entries
... processed 70000 sentInfo entries
... processed 80000 sentInfo entries
... processed 90000 sentInfo entries
... processed 100000 sentInfo entries
... processed 110000 sentInfo entries
... processed 120000 sentInfo entries
... processed 130000 sentInfo entries
... processed 140000 sentInfo entries
... processed 150000 sentInfo entries
... processed 160000 sentInfo entries
... processed 170000 sentInfo entries
... processed 180000 sentInfo entries
... processed 190000 sentInfo entries
... processed 200000 sentInfo entries

In total 200575 sentInfo entries are processed!


Starting annotations ...

... processed 10000 entries annotated
... processed 20000 entries annotated
... processed 30000 entries annotated
... processed 4000