In [8]:
import numpy as np
import pandas as pd
import re
import pickle
import os
from IPython.core.debugger import set_trace


# Read text

In [2]:
def cleantxt(fname):
    """Normalise a note file in place and keep a copy of the original text.

    The original lines are appended to a backup file named ``<stem>_raw.txt``
    (or ``<fname>_raw`` when ``fname`` does not end in ``txt``).  ``fname`` is
    then rewritten with, for every line:

    * the trailing newline removed (tabs are kept, they mark the question level);
    * every double space turned into a tab when the line starts with a space;
    * the full-width characters '？' and '。' replaced by '?' and '.'.

    Parameters
    ----------
    fname : str
        Path of the text file to clean.

    Returns
    -------
    (content, cleanname) : (list of str, str)
        The cleaned lines (without newlines) and the path of the cleaned file,
        which is ``fname`` itself because the file is rewritten in place.
    """
    rawname = fname+'_raw'
    if fname[-3:]=='txt':
        rawname = fname[:-4]+'_raw.txt'

    with open(fname, 'r') as f:
        rawlines = f.readlines()
    # Back up from the list that was just read: iterating the file object again
    # after readlines() yields nothing, so the backup would stay empty.
    # Append mode is kept from the original, so re-running adds to the backup.
    with open(rawname, 'a') as rawf:
        rawf.writelines(rawlines)

    content = []
    for line in rawlines:
        # remove the newline at the end. keep the tab \t to mark level of question
        line = line.strip('\n')
        if line.startswith(' '): # transform white spaces into tab which is used to count levels
            line = line.replace('  ','\t')
        # deal with chinese characters (replace unconditionally so position 0 is covered too)
        line = line.replace('？','?').replace('。','.')
        content.append(line)

    # Rewrite the original file once, after all lines are cleaned; opening it
    # in 'w' mode per line would truncate it each time and keep only the last line.
    with open(fname, 'w') as wf:
        wf.writelines(line+'\n' for line in content)
    cleanname = fname
    return content,cleanname
        

# Text to quesnet
For paper notes, the first line should be the title of the paper, written as a line starting with `#`. The title is stored in the paper entity (the paper data structure) that the question network is linked to.




In [13]:
def getedge(head):
    """Split the head token of a note line into an edge label and a speaker.

    The edge label is the text after the first '_' of ``head``: up to '...'
    when both '...' and '_' occur past position 0 (group head), otherwise up
    to the last '_' (single node).  The shorthands 'a' / 'answer' (any case)
    become 'answer' and 'specify' becomes 'specification'.  The speaker is
    whatever follows an '@' found past position 0, or '' when there is none.

    Returns
    -------
    (ed, speaker) : (str, str)
    """
    underscore_at = head.find('_')
    dots_at = head.find('...')

    # get edge
    if dots_at > 0 and underscore_at > 0:
        # edge of a group head
        ed = head[underscore_at + 1:dots_at]
    else:
        # edge of single node
        ed = head[underscore_at + 1:head.rfind('_')]

    # unify the edge label names
    if ed == 'a' or ed.lower() == 'answer':
        ed = 'answer'  # rename the txt shorthand
    elif ed == 'specify':
        ed = 'specification'

    # get speaker: someone other than this paper -- usually me
    at_pos = head.find('@')
    speaker = head[at_pos + 1:] if at_pos > 0 else ''
    return ed, speaker
def node2label(node):
    """Split a node string into its label and its quoted content.

    Everything before the first double quote (when that quote is not the very
    first character) is the label; the rest, quote included, is the content.
    Without such a quote the whole string is the label and the content is "".

    Returns
    -------
    (label, content) : (str, str)
    """
    quote_at = node.find('"')
    if quote_at <= 0:
        return node, ""
    return node[:quote_at], node[quote_at:]

def txt2quesnet(outdir,combrepQ = True,fin='',nodedist=None,alledge=None):
    """Parse one note file into a question network and pickle it.

    File grammar, as read by this function:

    * An optional first line starting with '#' gives the paper title
      (characters from position 4 on are kept).
    * Blank lines are skipped; a line starting with '##' ends the parsing.
    * A line whose first character is a letter is a root node and restarts
      the per-level parent list.
    * Any other line starts with a head token (text before the first space).
      A single-token line ending in '...' opens a node group and fixes the
      edge/speaker for its members; a single-token line containing 'end_'
      closes the group; any other single-token line is reported and skipped.
      Otherwise the rest of the line is the node, and the edge/speaker come
      from ``getedge(head)`` or, for group members, from the group head.
    * The number of leading tabs is the node level; each node at level > 0 is
      linked to the most recent node one level above.

    Parameters
    ----------
    outdir : str
        Output directory prefix; it is concatenated directly with the file
        name, so it should end with a path separator.
    combrepQ : bool
        When True, a node whose label was already seen reuses the existing id
        instead of creating a new node.
    fin : str
        Path of the note file.  The file name (without a trailing '.txt') is
        the paper label; the text before its first digit is the author and
        the four characters from that digit on are the year.
    nodedist : dict, optional
        Node table to fill.  Its 'label', 'type', 'id', 'content', 'speaker'
        and 'level' lists are reset on every call.  A fresh dict is used when
        omitted.
    alledge : list, optional
        List that receives the edge dicts.  A fresh list is used when omitted;
        pass your own list to accumulate edges across calls.

    Returns
    -------
    dict
        The paper entity ('id', 'label', 'year', and when available 'author'
        and 'title').  The node DataFrame, edge DataFrame, node groups and
        paper entity are pickled to ``outdir + label + '_qnet.p'``.
    """
    # Mutable default arguments are shared between calls: with `alledge=[]`
    # the edges of every previously parsed paper leaked into the next one.
    if nodedist is None:
        nodedist = {}
    if alledge is None:
        alledge = []
    for key in ['label','type','id','content','speaker','level']:
        nodedist[key]=[]
    with open(fin,'r') as f:
        lines = f.readlines()

    # create a paper entity
    thisp = {'id':'Ep%d'%np.random.randint(100000)} #E=entity,p=paper
    #FUTURE: may use other ways to record the paper info
    fname = fin[fin.rfind('/')+1:] # get rid of the file directory
    print(fname)
    # str.strip('.txt') removes any of the characters '.', 't', 'x' from both
    # ends (e.g. 'test2015.txt' -> 'est2015'); only drop the actual suffix.
    thisp['label'] = fname[:-len('.txt')] if fname.endswith('.txt') else fname
    digit_at = next((k for k,s in enumerate(fname) if s.isdigit()), None)
    if digit_at is not None:
        thisp['author'] = fname[:digit_at]
        thisp['year'] = fname[digit_at:digit_at+4]
    else:
        # no digit in the file name: no author/year can be derived
        thisp['year'] = ''
    # if the first line is the paper name
    lstart=0 # which line to start reading
    if lines and lines[0][:1]=='#':
        thisp['title']=lines[0][4:].rstrip()
        lstart=1
    else:
        print('paper title not provided')

    # create multiple question threads from each root
    maxlevel = 20 # maximum 20 levels down the tree
    allngrps=[]
    for line in lines[lstart:]:

        if len(line.strip())<1: # skip empty lines
            continue
        elif line[:2]=='##':#ending sign
            break
        else: # a valid new line
            line = line.rstrip()
            if line[0].isalpha(): # a root node doesn't start with tab
                rootlist = [[] for k in range(maxlevel)]# restart a new root list
                node = line
                edge = ''
                spkr = ''
                head = ''
            else:
                head = line.split(' ')[0]
                if len(line.split(' '))==1: #either answer group head or end
                    if line[-3:]=='...': #group head
                        nodegrp = [] # for reading parallel nodes
                        hdedge,hdspkr = getedge(head)
                        continue
                    elif line.find('end_')>0:
                        allngrps.append(nodegrp)
                        continue
                    else:
                        print(line)
                        print("can't recognize your grammar...")
                        continue
                else: #all other normal nodes
                    node = " ".join(line.split(' ')[1:])
                    if head.find('...')<0: # a normal edge; otherwise a member of edge group, edge is given
                        edge,spkr = getedge(head)
                    else:
                        edge,spkr = hdedge,hdspkr #inherited from the group head

        # Construct the node dictionary, get nid
        level = int((len(line) - len(line.strip('\t')))/len('\t')) # an absolute level
        label,nodecontent=node2label(node)

        if combrepQ and label in nodedist['label']:
            nid = nodedist['id'][nodedist['label'].index(label)]
        else:
            if edge == 'answer':
                nodedist['type'].append('Answer')
                nid = 'a%d'%np.random.randint(100000)
            elif  edge =='hypothesis':
                nodedist['type'].append('hypothesis')
                nid = 'ah%d'%np.random.randint(100000)
            else:
                nid = 'q%d'%np.random.randint(100000)
                if level>0:
                    nodedist['type'].append('Question')
                else:
                    nodedist['type'].append('Root question')
            nodedist['label'].append(label)
            nodedist['id'].append(nid)
            nodedist['content'].append(nodecontent)
            nodedist['level'].append(level)
            #nodedist['reference'].append(reference)
            if len(spkr)==0:
                spkr = thisp['id'] # if not specified, default speaker is this paper
            nodedist['speaker'].append(spkr)

        if head.find('...')>0:
            nodegrp.append(nid)

        # Construct the edge dictionary
        rootlist[level]=nid
        if level>0:
            # NOTE(review): the two set_trace() calls below are interactive
            # debugging stops for malformed input (missing parent, self-loop).
            if len(rootlist[level-1])==0:
                print(node)
                set_trace()
            edgedict = {'start':rootlist[level-1],'end':nid,'label':edge,'id':'l%d'%np.random.randint(100000)}
            if rootlist[level-1] == nid:
                print(level)
                print(line)
                set_trace()
            alledge.append(edgedict)

    nodedf = pd.DataFrame(nodedist)
    edgedf = pd.DataFrame(alledge)
    #os.makedirs(filedir)

    outpath = outdir+thisp['label']+'_qnet.p'
    # use a context manager so the pickle file is flushed and closed
    with open(outpath,'wb') as pf:
        pickle.dump([nodedf,edgedf,allngrps,thisp],pf)
    print('generated database in:\n'+outpath)
    return thisp # output the paper entity. TODO: other entities mentioned in the nodes should also be added, so that return a full database

In [3]:
# Directory prefix under which the generated question-network pickles are written.
outdir='../database/curiosity/devpsych/'

In [19]:
# Parse one paper note into a question network; with combrepQ=False repeated
# labels are kept as separate nodes. The returned paper entity is kept in Epaper.
fname = '../txtnotes/papernotes/Stahl2015_observing.txt'
Epaper = txt2quesnet(outdir, combrepQ=False, fin=fname)

Stahl2015_observing.txt
generated database in:
../database/curiosity/devpsych/Stahl2015_observing_qnet.p


In [20]:
# Display the paper entity returned by txt2quesnet.
Epaper

{'id': 'Ep63386',
 'label': 'Stahl2015_observing',
 'author': 'Stahl',
 'year': '2015',
 'title': 'Observing the unexpected enhances infants’ learning and exploration'}

Next step: use the "pickle to networkx" notebook to visualize the network.

# Graph simplification

## Pure Q net

In [2]:
def getQid(qtext,Qdic):
    """Return the id registered for a question text, registering it if new.

    ``Qdic`` holds three parallel lists under 'label', 'id' and 'paper'.
    The text is compared as ``str(qtext)``.  For a known label the current
    paper is added to that entry's paper list (once) and the stored id is
    returned; otherwise a new 'Q<number>' id is appended together with the
    label and a one-element paper list.

    NOTE: ``paper`` is not a parameter; it is read from the enclosing
    (notebook-global) scope at call time and must be defined before calling.
    """
    key = str(qtext)
    labels = Qdic['label']
    if key not in labels:
        fresh_id = 'Q%d'%np.random.randint(100000)
        Qdic['id'].append(fresh_id)
        labels.append(key)
        Qdic['paper'].append([paper])
        return fresh_id

    pos = labels.index(key)
    sources = Qdic['paper'][pos]
    if paper not in sources:
        sources.append(paper)
    return Qdic['id'][pos]
def addedge(qids,etext,Edgedic):
    """Record an edge between two question ids, merging repeated edges.

    ``Edgedic`` holds parallel lists under 'startNend', 'id', 'label' and
    'paper'.  When ``qids`` is already in 'startNend', the current paper and
    the edge text are added to that entry's lists if not present yet;
    otherwise a new entry with an 'E<number>' id is appended.  Returns None.

    NOTE: ``paper`` is not a parameter; it is read from the enclosing
    (notebook-global) scope at call time and must be defined before calling.
    """
    pairs = Edgedic['startNend']
    if qids not in pairs:
        pairs.append(qids)
        Edgedic['id'].append('E%d'%np.random.randint(100000))
        Edgedic['label'].append([etext])
        Edgedic['paper'].append([paper])
        return

    pos = pairs.index(qids)
    sources = Edgedic['paper'][pos]
    if paper not in sources:
        sources.append(paper)
    texts = Edgedic['label'][pos]
    if etext not in texts:
        texts.append(etext)

In [21]:
# Build a question-only network: answer and hypothesis nodes (ids starting
# with 'a') are dropped, and a question that follows such a node is linked
# directly to the node that precedes it.
outdir='../database/curiosity/devpsych/'
# ignore both the answer and hypothesis
pnames = ['Bonawitz2011-the','Stahl2015_observing']
pname=pnames[1]
# NOTE: pickle.load can execute arbitrary code; only load files generated by this project.
with open(outdir+pname+'_qnet.p','rb') as pf: # context manager closes the file handle
    [nodedf,edgedf,allngrps,thisp] = pickle.load(pf)
Qedgs =[]

for row in edgedf.itertuples(index=False):
    if row.end[0]=='q':
        thisid = row.end
        if (row.start[0]=='a'): # a to q
            # first edge that ends at this answer gives the node it responds to
            prevqid = edgedf['start'].loc[edgedf['end']==row.start].values[0]
            eid = 'l%d'%np.random.randint(100000) # bridging edge gets a new id
            if prevqid[0]!='q':
                print('strange! Answer not following a question!')
                set_trace() # NOTE(review): interactive debugging stop
                continue
        else:
            prevqid = row.start
            eid = row.id
        elb = row.label

        edgedict = {'start':prevqid,'end':thisid,'label':elb,'id':eid}
        Qedgs.append(edgedict)
Qnodes = []
for node in nodedf.itertuples(index=False):
    if node.id[0]=='q':
        Qnodes.append(node)
Qndf = pd.DataFrame(Qnodes)
Qedf = pd.DataFrame(Qedgs)
with open(outdir+pname+'_Qonly.p','wb') as pf:
    pickle.dump([Qndf,Qedf],pf)

In [4]:
# only ignore the answer but not the hypothesis
# Keeps question ('q...') and hypothesis ('ah...') nodes; plain answers are
# dropped and whatever follows them is linked to the node before the answer.
# NOTE: relies on `outdir` and `pname` defined in the pure-Q-net cell, and
# writes to the same '_Qonly.p' file as that cell (the later run wins).
# NOTE: pickle.load can execute arbitrary code; only load files generated by this project.
with open(outdir+pname+'_qnet.p','rb') as pf: # context manager closes the file handle
    [nodedf,edgedf,allngrps,thisp] = pickle.load(pf)
Qedgs =[]

for row in edgedf.itertuples(index=False):
    if row.end[0]=='q' or row.end[:2]=='ah':
        thisid = row.end
        if (row.start[0]=='a' and row.start[1]!='h'): # a to q
            # first edge that ends at this answer gives the node it responds to
            prevqid = edgedf['start'].loc[edgedf['end']==row.start].values[0]
            eid = 'l%d'%np.random.randint(100000) # bridging edge gets a new id
            if prevqid[0]!='q':
                print('strange! Answer not following a question!')
                set_trace() # NOTE(review): interactive debugging stop
                continue
        else:
            prevqid = row.start
            eid = row.id
        elb = row.label

        edgedict = {'start':prevqid,'end':thisid,'label':elb,'id':eid}
        Qedgs.append(edgedict)
Qnodes = []
for node in nodedf.itertuples(index=False):
    # Test the node's own id: the previous `row.end[:2]` looked at the last
    # edge of the loop above, so hypotheses were kept or dropped wholesale.
    if node.id[0]=='q' or node.id[:2]=='ah':
        Qnodes.append(node)
Qndf = pd.DataFrame(Qnodes)
Qedf = pd.DataFrame(Qedgs)
with open(outdir+pname+'_Qonly.p','wb') as pf:
    pickle.dump([Qndf,Qedf],pf)

NameError: name 'pname' is not defined

## reduce levels

In [None]:
def simpg(nodedf,edgedf,level=1):
    """Cut a network down to its first levels.

    Parameters
    ----------
    nodedf : pandas.DataFrame
        Node table with at least the columns 'id' and 'level'.
    edgedf : pandas.DataFrame
        Edge table with at least the column 'start'.
    level : int
        Deepest node level to keep.

    Returns
    -------
    (newndf, newedf)
        The nodes whose level is <= ``level`` and a DataFrame of the edges
        whose start node has level <= ``level - 1``.  The edge rows come from
        ``itertuples()``, so the original index is carried along as a column.
    """
    kept_nodes = nodedf.loc[nodedf['level'] <= level]
    deepest_start = level - 1
    kept_edges = [
        edge for edge in edgedf.itertuples()
        # level of the first node whose id matches the edge's start
        if nodedf.loc[nodedf['id'] == edge.start]['level'].values[0] <= deepest_start
    ]
    return kept_nodes, pd.DataFrame(kept_edges)

In [None]:
# Keep only nodes down to level 4 and save the reduced network.
# NOTE: the variables are named *_l3 although level=4 is used; the names are
# kept so later cells that refer to them keep working.
[nodedf_l3,edgedf_l3] = simpg(nodedf,edgedf,level=4)

# context manager so the pickle file is flushed and closed
with open(outdir+pname+'_lv4.p','wb') as pf:
    pickle.dump([nodedf_l3,edgedf_l3,allngrps],pf)
