In [5]:
import os 
import spacy
import re
import pandas as pd

nlp = spacy.load('en_core_web_sm')

files = [f for f in os.listdir("data/") if f.endswith('txt')]
def readFiles(filename='5369.txt', dirname='data/'):
    return open(dirname+filename).read().replace('\n', ' ')
def stitch_words(text):
    return re.sub(r'- ',r'', text)
def getNC(doc):
    return [[chunk.text, chunk.root.text, chunk.root.dep_,
              chunk.root.head.text] for chunk in doc.noun_chunks]
def getNER(doc):
    return [[ents.text, ents.label_, ents.start, ents.end] for ents in doc.ents]

In [6]:
df = pd.DataFrame(files, columns=['filename'])
df['rawContent'] = [stitch_words(readFiles(i)) for i in df.filename]
df['doc'] = [nlp(i) for i in df.rawContent]
df['NC'] = [getNC(i) for i in df.doc]
df['NER'] = [getNER(i) for i in df.doc]
df.head()

Unnamed: 0,filename,rawContent,doc,NC,NER
0,1003.txt,ORIGINAL ARTICLE The Epidemiology of Panic Att...,"(ORIGINAL, ARTICLE, The, Epidemiology, of, Pan...","[[ORIGINAL ARTICLE, ARTICLE, ROOT, ARTICLE], [...","[[Panic Disorder, ORG, 8, 10], [Agoraphobia, G..."
1,1007.txt,ORIGINAL ARTICLE Lifetime Prevalence and Age-o...,"(ORIGINAL, ARTICLE, Lifetime, Prevalence, and,...","[[ORIGINAL ARTICLE Lifetime Prevalence, Preval...",[[the National Comorbidity Survey Replication ...
2,1005.txt,MarketWatch Medicaid Cost Containment And Acce...,"(MarketWatch, Medicaid, Cost, Containment, And...","[[MarketWatch Medicaid Cost Containment, Conta...",[[MarketWatch Medicaid Cost Containment And Ac...
3,1006.txt,Unmet Need for Personal Assistance Services: E...,"(Unmet, Need, for, Personal, Assistance, Servi...","[[Unmet, Unmet, nsubj, Need], [Personal Assist...","[[Hours, TIME, 11, 12], [Mitchell P. LaPlante,..."
4,1001.txt,ANNALS OF FAMILY MEDICINE WWW.ANNFAMMED.ORG ...,"(ANNALS, OF, FAMILY, MEDICINE, , WWW.ANNFAMME...","[[FAMILY MEDICINE, MEDICINE, pobj, OF], [AUGUS...","[[FAMILY MEDICINE, ORG, 2, 4], [1, CARDINAL, 9..."


In [7]:
df['NERF'] = [[e for e in i if e[1]=='ORG'] for i in df.NER]

In [8]:
from nltk.tokenize import sent_tokenize, word_tokenize

df['sents'] = [[nlp(j) for j in sent_tokenize(i)] for i in df.rawContent]
df.head()

Unnamed: 0,filename,rawContent,doc,NC,NER,NERF,sents
0,1003.txt,ORIGINAL ARTICLE The Epidemiology of Panic Att...,"(ORIGINAL, ARTICLE, The, Epidemiology, of, Pan...","[[ORIGINAL ARTICLE, ARTICLE, ROOT, ARTICLE], [...","[[Panic Disorder, ORG, 8, 10], [Agoraphobia, G...","[[Panic Disorder, ORG, 8, 10], [the National C...","[(ORIGINAL, ARTICLE, The, Epidemiology, of, Pa..."
1,1007.txt,ORIGINAL ARTICLE Lifetime Prevalence and Age-o...,"(ORIGINAL, ARTICLE, Lifetime, Prevalence, and,...","[[ORIGINAL ARTICLE Lifetime Prevalence, Preval...",[[the National Comorbidity Survey Replication ...,[[the National Comorbidity Survey Replication ...,"[(ORIGINAL, ARTICLE, Lifetime, Prevalence, and..."
2,1005.txt,MarketWatch Medicaid Cost Containment And Acce...,"(MarketWatch, Medicaid, Cost, Containment, And...","[[MarketWatch Medicaid Cost Containment, Conta...",[[MarketWatch Medicaid Cost Containment And Ac...,[[MarketWatch Medicaid Cost Containment And Ac...,"[(MarketWatch, Medicaid, Cost, Containment, An..."
3,1006.txt,Unmet Need for Personal Assistance Services: E...,"(Unmet, Need, for, Personal, Assistance, Servi...","[[Unmet, Unmet, nsubj, Need], [Personal Assist...","[[Hours, TIME, 11, 12], [Mitchell P. LaPlante,...",[[Charlene Harrington Disability Statistics Ce...,"[(Unmet, Need, for, Personal, Assistance, Serv..."
4,1001.txt,ANNALS OF FAMILY MEDICINE WWW.ANNFAMMED.ORG ...,"(ANNALS, OF, FAMILY, MEDICINE, , WWW.ANNFAMME...","[[FAMILY MEDICINE, MEDICINE, pobj, OF], [AUGUS...","[[FAMILY MEDICINE, ORG, 2, 4], [1, CARDINAL, 9...","[[FAMILY MEDICINE, ORG, 2, 4], [HMO, ORG, 43, ...","[(ANNALS, OF, FAMILY, MEDICINE, , WWW.ANNFAMM..."


In [118]:
df.head()

Unnamed: 0,filename,rawContent,doc,NC,NER,NERF,docF,sents
0,1003.txt,ORIGINAL ARTICLE The Epidemiology of Panic Att...,"(ORIGINAL, ARTICLE, The, Epidemiology, of, Pan...","[[ORIGINAL ARTICLE, ARTICLE, ROOT, ARTICLE], [...","[[Panic Disorder, ORG, 8, 10], [Agoraphobia, G...","[[Panic Disorder, ORG, 8, 10], [the National C...",ORIGINAL ARTICLE The Epidemiology of Panic Att...,"[(ORIGINAL, ARTICLE, The, Epidemiology, of, Pa..."
1,1007.txt,ORIGINAL ARTICLE Lifetime Prevalence and Age-o...,"(ORIGINAL, ARTICLE, Lifetime, Prevalence, and,...","[[ORIGINAL ARTICLE Lifetime Prevalence, Preval...",[[the National Comorbidity Survey Replication ...,[[the National Comorbidity Survey Replication ...,ORIGINAL ARTICLE The Epidemiology of Panic Att...,"[(ORIGINAL, ARTICLE, Lifetime, Prevalence, and..."
2,1005.txt,MarketWatch Medicaid Cost Containment And Acce...,"(MarketWatch, Medicaid, Cost, Containment, And...","[[MarketWatch Medicaid Cost Containment, Conta...",[[MarketWatch Medicaid Cost Containment And Ac...,[[MarketWatch Medicaid Cost Containment And Ac...,ORIGINAL ARTICLE The Epidemiology of Panic Att...,"[(MarketWatch, Medicaid, Cost, Containment, An..."
3,1006.txt,Unmet Need for Personal Assistance Services: E...,"(Unmet, Need, for, Personal, Assistance, Servi...","[[Unmet, Unmet, nsubj, Need], [Personal Assist...","[[Hours, TIME, 11, 12], [Mitchell P. LaPlante,...",[[Charlene Harrington Disability Statistics Ce...,ORIGINAL ARTICLE The Epidemiology of Panic Att...,"[(Unmet, Need, for, Personal, Assistance, Serv..."
4,1001.txt,ANNALS OF FAMILY MEDICINE WWW.ANNFAMMED.ORG ...,"(ANNALS, OF, FAMILY, MEDICINE, , WWW.ANNFAMME...","[[FAMILY MEDICINE, MEDICINE, pobj, OF], [AUGUS...","[[FAMILY MEDICINE, ORG, 2, 4], [1, CARDINAL, 9...","[[FAMILY MEDICINE, ORG, 2, 4], [HMO, ORG, 43, ...",ORIGINAL ARTICLE The Epidemiology of Panic Att...,"[(ANNALS, OF, FAMILY, MEDICINE, , WWW.ANNFAMM..."


In [119]:
df.docF[0]

'ORIGINAL ARTICLE The Epidemiology of Panic Attacks , <NER> , and <NER> in <NER> , <NER> ; <NER> , AM ; <NER> , <NER> ; <NER> , <NER> ; <NER> , <NER> ; <NER> , <NER> : Only limited information exists about the epidemiology of <NER> panic attacks ( PAs ) and panic disorder ( <NER> ) . Objective : To present nationally representative data about the epidemiology of PAs and <NER> with or without agoraphobia ( <NER> ) on the basis of <NER> findings . Design and Setting : Nationally representative faceto - face household survey conducted using the fully structured <NER> . Participants : <NER> - speaking respondents ( N=9282 ) <NER> . Main Outcome Measures : RespondentswhometDSMIV lifetime criteria for PAs and <NER> with and without <NER> . Results : <NER> prevalence estimates are <NER> for isolated panic without <NER> only ) , <NER> for <NER> with <NER> without <NER> ) , <NER> for <NER> without <NER> ( <NER> only ) , and <NER> for <NER> ) . <NER> , lifetime number of attacks , and number of 

In [116]:
def NERtag(sents):
    tokens = []
    for i in doc:
    #     print (i.ent_iob_)
        if(i.ent_iob_ == 'O'):
            tokens.append(i.text)
        elif (i.ent_iob_ == 'B'):
            tokens.append('<NER>')
        else:
            pass
    return ' '.join(tokens)
    
        
    
# sents = df['sents'][0]
# for s in sents:
#     tokens = []
#     for i in doc:
#     #     print (i.ent_iob_)
#         if(i.ent_iob_ == 'O'):
#             tokens.append(i.text)
#         elif (i.ent_iob_ == 'B'):
#             tokens.append('<NER>')
#         else:
#             pass
#     ' '.join(tokens)


In [97]:
df['docF'] = [for s in df.doc]

In [100]:
df['NERF'][0]

[['Panic Disorder', 'ORG', 8, 10],
 ['the National Comorbidity Survey Replication Ronald C. Kessler',
  'ORG',
  14,
  22],
 ['MS Context', 'ORG', 51, 53],
 ['DSM-IV', 'ORG', 62, 65],
 ['PD', 'ORG', 74, 75],
 ['PD', 'ORG', 90, 91],
 ['AG', 'ORG', 96, 97],
 ['the US National Comorbidity Survey Replication', 'ORG', 102, 108],
 ['World Health Organization Composite International Diagnostic Interview',
  'ORG',
  126,
  133],
 ['PD', 'ORG', 158, 159],
 ['AG', 'ORG', 162, 163],
 ['Lifetime', 'ORG', 166, 167],
 ['AG (PA', 'ORG', 176, 179],
 ['PA', 'ORG', 185, 186],
 ['AG', 'ORG', 187, 188],
 ['PD (PA-AG', 'ORG', 189, 194],
 ['PD', 'ORG', 199, 200],
 ['AG', 'ORG', 201, 202],
 ['PD', 'ORG', 203, 204],
 ['PD with AG (PD-AG', 'ORG', 211, 218],
 ['Persistence', 'ORG', 220, 221],
 ['DSM-IV', 'ORG', 249, 252],
 ['PD-AG', 'ORG', 259, 262],
 ['PA', 'ORG', 266, 267],
 ['the Panic Disorder Severity Scale', 'ORG', 271, 276],
 ['PD-AG', 'ORG', 280, 283],
 ['PA', 'ORG', 293, 294],
 ['PD-AG', 'ORG', 324, 3

In [83]:
df['sents'] = [[s for s in i.sents] for i in df.doc]

In [84]:
df.sents

0    [(ORIGINAL, ARTICLE), (The, Epidemiology, of, ...
1    [(ORIGINAL, ARTICLE, Lifetime, Prevalence, and...
2    [(MarketWatch, Medicaid, Cost, Containment, An...
3    [(Unmet, Need, for, Personal, Assistance, Serv...
4    [(ANNALS, OF, FAMILY, MEDICINE,  , WWW.ANNFAMM...
Name: sents, dtype: object

In [53]:
df.head(1)

Unnamed: 0,filename,rawContent,doc,NC,NER
0,1003.txt,ORIGINAL ARTICLE The Epidemiology of Panic Att...,"(ORIGINAL, ARTICLE, The, Epidemiology, of, Pan...","[[ORIGINAL ARTICLE, ARTICLE, ROOT, ARTICLE], [...","[[Panic Disorder, ORG, 8, 10], [Agoraphobia, G..."


In [79]:
for idx, row in df.iterrows():
    print(len(row.NC), len(row.NER))
    allText = row.rawContent.split(sep=" ")
    for i in row.NER:
        print(allText[i[2]-1:i[3]-1], i[0])
#         allText[i[2]-1:i[3]-1] = ["<NER>"]
#         print(row['doc'][i[2]:i[3]])
#         break
    print(" ".join(allText))
#     print(row['doc'][i[2]:i[3]])
    break

2443 2209
8 10
['Panic', 'Disorder,'] Panic Disorder
12 13
['in'] Agoraphobia
14 22
['National', 'Comorbidity', 'Survey', 'Replication', 'Ronald', 'C.', 'Kessler,', 'PhD;'] the National Comorbidity Survey Replication Ronald C. Kessler
23 24
['Tat'] PhD
25 28
['AM;', 'Robert', 'Jin,'] Wai Tat Chiu
31 33
['Ruscio,', 'PhD;'] Robert Jin
34 35
['Shear,'] MA
36 39
['Ellen', 'E.', 'Walters,'] Ayelet Meron Ruscio
40 41
['Context:'] PhD
42 44
['limited', 'information'] Katherine Shear
45 46
['about'] MD
47 50
['epidemiology', 'of', 'DSM-IV'] Ellen E. Walters
51 53
['attacks', '(PAs)'] MS Context
62 65
['data', 'about', 'the'] DSM-IV
74 75
['(AG)'] PD
90 91
['representative'] PD
96 97
['the'] AG
102 108
['Composite', 'International', 'Diagnostic', 'Interview.', 'Participants:', 'English-speaking'] the US National Comorbidity Survey Replication
126 133
['without', 'AG.', 'Results:', 'Lifetime', 'prevalence', 'estimates', 'are'] World Health Organization Composite International Diagnostic Intervie