# Introduction

### CO-REFERENCING
#### When more parts of the text refer to the same entity, the multiple occurences are said to Co-reference each other

#### Simple Co-referencing types are Anapora , Catapora, Split antecedents and Coreferring noun phrases

 

### Anapora
#### When pronouns/noun-phrases follow their antecedent (nouns/noun-phrases) the referencing is called Anapora
#### E.g. ** Mathew ** is a brilliant student. ** He ** always comes first in class.
##### Here ''He'  trails 'Mathew'

### Catapora
#### When pronouns/noun-phrases lead their antecedent (nouns/noun-phrases) the referencing is called Catapora
#### E.g. If she does not study, Gloria will fail her tests.
##### Here 'she'  leads 'Gloria'

# Coreference resolution of a text

### Here for any piece of text we will follow the following algorithm after choosing which resolution to do(this can also be done via a function though)
####  Algorithm:
####     1.POS-TAGGING
####     2. Forming Chunks OF PRONOUNS , NOUN PHRASES, NOUNS using Names entity recognition and Regex
####     3. Attaching Appropriate NOUN, NOUN-PHRASE TO PRONOUN first by Grammer of Singularity/Plurality and word distance


In [1]:
#Imports
import nltk
from nltk import word_tokenize, sent_tokenize, pos_tag, ne_chunk
import pandas as pd
from nltk.tokenize import RegexpTokenizer
import re
# Global Variables
noun_pronoun_np_list=[]
ne_list=[]


#### Choosing appropriate texts

In [14]:
## Anapora sentences
# txt="The music was so loud that it couldn't be enjoyed"
# txt="Our neighbors dislike the music. If they are angry, the cops will show up soon"
txt = "Mathew  and his friends play together. He loves to play."
##Catapora Senternces
# txt="If they are angry about the music, the neighbors will call the cops"
# txt="If she does not study, Gloria will fail her tests"
##Additional sentence


# Simple POS tagging

In [15]:
def preprocess_pos_func(txt):
    tokenizer = RegexpTokenizer(r'(\w+)')
    words = tokenizer.tokenize(txt)
    pos_txt= nltk.pos_tag(words) 
    return pos_txt
preprocess_pos_func(txt)

[('Mathew', 'NNP'),
 ('and', 'CC'),
 ('his', 'PRP$'),
 ('friends', 'NNS'),
 ('play', 'VBP'),
 ('together', 'RB'),
 ('He', 'PRP'),
 ('loves', 'VBZ'),
 ('to', 'TO'),
 ('play', 'VB')]

In [16]:
def anapora_resolution(txt):
    grammar = "anapora: {<DT>?<PRP.>?<JJ.?>*<NNP?S?>+<.*>*<PRP>+}" 
    cp = nltk.RegexpParser(grammar) 
    result = cp.parse(preprocess_pos_func(txt)) 
    chunkLIST=[]
    for chunk in result:
                if hasattr(chunk, 'label'):
                            chunkLIST.append((' '.join(c[0] for c in chunk),' '.join(c[1] for c in chunk)))
    #Adding the anapora sentence
    a_df=pd.DataFrame(chunkLIST, columns=['ANAPORA', 'POS pattern'])
    a_df_list=[]
    for  index, row in a_df.iterrows():
        a_df_list.append(row['ANAPORA']) 
    return  a_df_list
anapora_resolution(txt)

['Mathew and his friends play together He']

#### For given text anapora resolution is better fit since it adds a lot of noun, pronouns and noun-phrases

In [17]:
def catapora_resolution(txt):
    grammar = "catapora: {<PRP.?>+<.*>*<DT>?<PRP.>?<NNP?S?>+}" 
    cp = nltk.RegexpParser(grammar) 
    result = cp.parse(preprocess_pos_func(txt)) 
    chunkLIST=[]
    for chunk in result:
                if hasattr(chunk, 'label'):
                            chunkLIST.append((' '.join(c[0] for c in chunk),' '.join(c[1] for c in chunk)))
    #Adding the catapora sentence
    c_df=pd.DataFrame(chunkLIST, columns=['CATAPORA', 'POS pattern'])
    c_df_list=[]
    for  index, row in c_df.iterrows():
        c_df_list.append(row['CATAPORA']) 
    return  c_df_list
catapora_resolution(txt)

['his friends']

#### for given text catapora returns a list with one string

### Example of Results:

##### txt="The music was so loud that it couldn't be enjoyed"
##### A="The music was so loud that it"
##### C= ""

##### txt="Our neighbors dislike the music. If they are angry, the cops will show up soon"
##### A="Our neighbors dislike the music If they"
##### C= "Our neighbors dislike the music If they are angry the cops"

##### txt="If she does not study, Gloria will fail her tests"
##### A=""
##### C= "she does not study Gloria will fail her tests"

##### txt="If they are angry about the music, the neighbors will call the cops"
##### A=""
##### C= "they are angry about the music the neighbors will call the cops"



# Calling POS tagging function

# Pronoun chunking and  identification of singular/plural pronouns

In [18]:
def pronoun_chunking(resolution_list):
    for txt in resolution_list:
        pos_txt=preprocess_pos_func(txt)
        ##Regex for pronouns
        grammar = "PRP: {<PRP>+}" 
        
        cp = nltk.RegexpParser(grammar) 
        result = cp.parse(pos_txt) 
        chunkLIST=[]
        for chunk in result:
                    if hasattr(chunk, 'label'):
                            #Is noun phrase singilar or plural
                            np=' '.join(c[0] for c in chunk)
                            if np.lower() in ['them','they']:
                                np_val='Plural'
                                chunkLIST.append((' '.join(c[0] for c in chunk),' '.join(c[1] for c in chunk), np_val))

                            else:
                                np_val='Singular'
                                chunkLIST.append((' '.join(c[0] for c in chunk),' '.join(c[1] for c in chunk), np_val))
        #Adding the pronoun to pronoun, noun-phrase list for co-referencing later
                            noun_pronoun_np_list.append((' '.join(c[0] for c in chunk)))
    p_df=pd.DataFrame(chunkLIST, columns=['N_Pronoun_NP', 'POS pattern', 'Number'])
    return p_df

p_df=pronoun_chunking(anapora_resolution(txt))
p_df

Unnamed: 0,N_Pronoun_NP,POS pattern,Number
0,He,PRP,Singular


# Noun-Phrase chunking and identifying singular/plural

In [19]:
def noun_phrase_chunking(resolution_list):
    for txt in resolution_list:
        pos_txt=preprocess_pos_func(txt)
        ##Regex for chunking noun phrase as addition of optional possessive pronoun,optional determinant, optional adjective and atleast 1 noun
        grammar = "NP: {<PRP.>?<DT>?<JJ.?>*<NNS?>+}" 
        cp = nltk.RegexpParser(grammar) 
        result = cp.parse(pos_txt) 
        chunkLIST=[]

        for chunk in result:
                    if hasattr(chunk, 'label'):
                            #Is noun phrase singilar or plural
                            np=' '.join(c[1] for c in chunk)
                            if re.match(r"(.*\s)*(NNS|NNPS)$",np):
                                np_val='Plural'
                            else:
                                np_val='Singular'

                            chunkLIST.append((' '.join(c[0] for c in chunk),' '.join(c[1] for c in chunk), np_val))

                            noun_pronoun_np_list.append((' '.join(c[0] for c in chunk)))
        np_df=pd.DataFrame(chunkLIST, columns=['N_Pronoun_NP', 'POS pattern', 'Number'])
    return np_df
np_df=noun_phrase_chunking(anapora_resolution(txt))
np_df

Unnamed: 0,N_Pronoun_NP,POS pattern,Number
0,his friends,PRP$ NNS,Plural


In [20]:
p_np_df=p_df.append(np_df)
p_np_df

Unnamed: 0,N_Pronoun_NP,POS pattern,Number
0,He,PRP,Singular
0,his friends,PRP$ NNS,Plural


# Get entity names and singular plural:

In [21]:
def ne_chunking(resolution_list):
    for txt in resolution_list:
        pos_txt=preprocess_pos_func(txt)
        chunkLIST=[]
        new_txt=txt
        pos_new_txt=preprocess_pos_func(new_txt)


        #Tokenizing into sentences
        
        for sent in nltk.sent_tokenize(new_txt):
            #Tokenizing into tokens/words and assigning chunks to named entities
                for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
                    if hasattr(chunk, 'label'):
                        if chunk[0][1].lower()=='nnps':
                            chunkLIST.append((' '.join(c[0] for c in chunk),  chunk[0][1],'Plural' ))
                            ne_list.append((' '.join(c[0] for c in chunk)))
                            noun_pronoun_np_list.append((' '.join(c[0] for c in chunk)))

                        else:
                            chunkLIST.append((' '.join(c[0] for c in chunk),  chunk[0][1],'Singular' ))
                            ne_list.append((' '.join(c[0] for c in chunk)))
                            noun_pronoun_np_list.append((' '.join(c[0] for c in chunk)))

        ne_df=pd.DataFrame(chunkLIST, columns=['N_Pronoun_NP', 'POS pattern', 'Number'])
    return ne_df
ne_df=ne_chunking(anapora_resolution(txt))
ne_df          

Unnamed: 0,N_Pronoun_NP,POS pattern,Number
0,Mathew,NNP,Singular


# Add to noun pronoun noun phrase dataframe

In [22]:
n_p_np_df=p_np_df.append(ne_df)
n_p_np_df

Unnamed: 0,N_Pronoun_NP,POS pattern,Number
0,He,PRP,Singular
0,his friends,PRP$ NNS,Plural
0,Mathew,NNP,Singular


# Process to assign index and chunk noun phrases as one index

# Remove punctuations from the primary sentence

In [23]:
def chunks_index_assign(txt,n_p_np_df):

    chars = [',', '.', '!','?','(',')','&', '-',':', ';']
    retxt=txt.translate({ord(k): None for k in chars})
    print('Sentence without punctuations.\n')
    print(retxt+'\n\n')

    for val in noun_pronoun_np_list:
        retxt = retxt.replace(val,(val.replace(" ","-")));
    print('Hyphenating noun-phrases as one unit\n')
    print(retxt+'\n\n')

    retxt_pos=retxt.split()
    print('Splitting the sentence in terms of chunks\n')
    print(retxt_pos)
    print('\n\n')

    #Replacing hyphen with " "
    for i in range (len(retxt_pos)):
        retxt_pos[i]=retxt_pos[i].replace("-"," ")
    print('De-hyphenating to obtain actual chunks\n')
    print(retxt_pos)
    print('\n\n')

    # Assigning positions in sentences to noun pronoun and np df

    indices=[]
    for index, row in n_p_np_df.iterrows():
        for i in range (len(retxt_pos)):
            if row['N_Pronoun_NP']==retxt_pos[i]:
                indices.append(i) 

    n_p_np_df['Position'] = indices

    n_p_np_df=n_p_np_df.reset_index(drop=True)
    return retxt_pos,n_p_np_df

retxt_pos,n_p_np_df_ind=chunks_index_assign(txt,n_p_np_df)
print(retxt_pos)
n_p_np_df_ind

Sentence without punctuations.

Mathew  and his friends play together He loves to play


Hyphenating noun-phrases as one unit

Mathew  and his-friends play together He loves to play


Splitting the sentence in terms of chunks

['Mathew', 'and', 'his-friends', 'play', 'together', 'He', 'loves', 'to', 'play']



De-hyphenating to obtain actual chunks

['Mathew', 'and', 'his friends', 'play', 'together', 'He', 'loves', 'to', 'play']



['Mathew', 'and', 'his friends', 'play', 'together', 'He', 'loves', 'to', 'play']


Unnamed: 0,N_Pronoun_NP,POS pattern,Number,Position
0,He,PRP,Singular,5
1,his friends,PRP$ NNS,Plural,2
2,Mathew,NNP,Singular,0


### Assigning nearest references to the pronoun/noun phrase

In [24]:
def find_nearest_reference(n_p_np_df,retxt_pos):
    Nearest_Reference=[]
    for index, row in n_p_np_df.iterrows():
        min_d=len(retxt_pos)
        min_d_ind=index
        for index2, row2 in n_p_np_df.iterrows():
            bool1=bool(re.match(r"(PRP)",row['POS pattern'])) and (bool(re.match(r"(PRP)",row2['POS pattern']))==False)
            bool2= bool(re.match(r"(PRP)",row2['POS pattern'])) and (bool(re.match(r"(PRP)",row['POS pattern']))==False)
            print(bool1|bool2)

            if (index!=index2) and (bool1 | bool2):
                    if ((abs(row['Position']-row2['Position']))<min_d) and (row['POS pattern']!= row2['POS pattern']) and (row['Number']==row2['Number']):
                        min_d=abs(row['Position']-row['Position']) 
                        min_d_ind=index2
        Nearest_Reference.append(n_p_np_df['N_Pronoun_NP'][min_d_ind])
    n_p_np_df['Nearest_Reference'] = Nearest_Reference
    return n_p_np_df

ref_n_p_np_df_ind=find_nearest_reference(n_p_np_df_ind,retxt_pos)
ref_n_p_np_df_ind

False
False
True
False
False
True
True
True
False


Unnamed: 0,N_Pronoun_NP,POS pattern,Number,Position,Nearest_Reference
0,He,PRP,Singular,5,Mathew
1,his friends,PRP$ NNS,Plural,2,his friends
2,Mathew,NNP,Singular,0,He


In [26]:
# Replace sentence with references
for i in range (len(retxt_pos)):
       for index, row in ref_n_p_np_df_ind.iterrows():
            if (re.match(r"(.?)*(PRP.?)+",row['POS pattern'])) and (retxt_pos[i]==row['N_Pronoun_NP']):
                retxt_pos[i]=row['Nearest_Reference']
        
sentence=' '.join(retxt_pos)
sentence

'Mathew and his friends play together Mathew loves to play'

In [133]:
#Nuances like our son, her difficulty are still difficult to do coreference
#Gender classification still an issue