# Begin the actual NLP work

In [384]:
import spacy
from spacy import displacy

import re
import pandas as pd
from textacy import extract

from collections import defaultdict 
from fuzzywuzzy import fuzz
import time
import uuid

import os
import json

from datetime import datetime

In [385]:
# Excel workbook of pre-processed disaster summaries (loaded with pd.read_excel further down).
# NOTE(review): absolute, machine-specific path — consider deriving it from a configurable data directory.
disaster_summary_preprocessed_file = "D://projects//_external_files//surveyor//rw_disaster_preprocessed//disaster_summaries_preprocessed_9fcc0753cbbf4fb7a37ea5a15f872a11.xlsx"
# CSV of location names and their p-codes (loaded into df_location further down).
pcode_file = "D://projects//_external_files//cod_files//combined_locations//locations.csv"

In [386]:
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Print the current local time as a simple run timestamp.
print(time.localtime())


time.struct_time(tm_year=2023, tm_mon=12, tm_mday=10, tm_hour=16, tm_min=23, tm_sec=9, tm_wday=6, tm_yday=344, tm_isdst=0)


## Load Location Services

In [387]:
# Location lookup table shared by every helper below. The helpers read the columns
# pcode_prefix, location_name, location_normalized, pcode, adm_lvl, lang_code and split_pcode.
df_location = pd.read_csv(pcode_file)

def get_pcode_from_location(loc, country_prefix='XX', lang_code='all', fuzzy_threshold=70):
    """Look up the p-code for a location name in ``df_location``.

    The lookup tries, in order:

    1. a case-insensitive exact match on ``location_name``;
    2. a match of the letters-only form of ``loc`` against ``location_normalized``;
    3. a fuzzy scan that only *prints* spelling suggestions (it never returns one).

    Parameters
    ----------
    loc : str
        Location name to resolve.
    country_prefix : str, default 'XX'
        When not 'XX', restrict the search to rows whose ``pcode_prefix``
        equals this value.
    lang_code : str, default 'all'
        When not 'all', restrict the search to rows with this ``lang_code``.
        Useful to drop duplicates when languages share a script.
    fuzzy_threshold : int, default 70
        ``fuzz.ratio`` score a name must exceed to be printed as a suggestion.

    Returns
    -------
    str, None or list
        The matching p-code. When several rows match (e.g. a city and a
        province sharing a name) the shortest p-code is returned, the first
        one on ties. ``None`` when nothing matches. An empty list when the
        exact match fails and ``loc`` contains no ASCII letters; that value is
        kept so callers testing ``len(result)`` keep working.
    """
    df_loc = df_location
    if country_prefix != 'XX':  # if the country prefix is set, limit search to that
        df_loc = df_loc[df_loc['pcode_prefix'] == country_prefix]
    if lang_code != 'all':  # secondary filter on language
        df_loc = df_loc[df_loc['lang_code'] == lang_code]

    name_hits = df_loc['location_name'].str.lower() == loc.lower()
    matches = df_loc.loc[name_hits, 'pcode'].tolist()

    # If the exact match fails, try again on the normalized name.
    if not matches:
        # Remove common variations in names that can cause misses.
        n_loc = re.sub(r'[^a-zA-Z]', '', loc)

        # Non-Latin names normalize to an empty string; nothing left to compare.
        if not n_loc:
            return []

        normalized_hits = df_loc['location_normalized'].str.lower() == n_loc.lower()
        matches = df_loc.loc[normalized_hits, 'pcode'].tolist()

    if matches:
        # min() returns the only element for a single match, and the shortest
        # p-code (first on ties) when there are several.
        return min(matches, key=len)

    # Couldn't find a match: report close spellings once, after the whole scan.
    candidate_names = df_loc['location_name'].dropna().unique()
    possible_matches = [
        name for name in candidate_names
        if isinstance(name, str) and fuzz.ratio(loc, name) > fuzzy_threshold
    ]
    if possible_matches:
        print (f"No exact match to '{loc}'. see if these alternative spellings are correct: {possible_matches}")
    return None

# Smoke test: the lookup is expected to resolve 'istanbul' to the p-code 'TUR034'.
assert get_pcode_from_location('istanbul') == 'TUR034'

def get_adm_lvl_from_pcode(pcode):
    """Return the distinct ``adm_lvl`` values stored for ``pcode`` (list, arbitrary order)."""
    has_pcode = df_location['pcode'] == pcode
    levels = df_location.loc[has_pcode, 'adm_lvl'].tolist()
    return list(set(levels))
    
def get_name_in_lang(pcode, lang='en'):
    """Return the distinct location names stored for ``pcode`` in language ``lang``.

    The result is a list in arbitrary order; it is empty when no row matches.
    """
    wanted = (df_location['pcode'] == pcode) & (df_location['lang_code'] == lang)
    names = df_location.loc[wanted, 'location_name'].tolist()
    return list(set(names))

def get_descendents_of(pcode, lang='en', include_self=True):
    """Return the ``df_location`` rows at or below ``pcode`` in language ``lang``.

    Parameters
    ----------
    pcode : str
        P-code of the ancestor location.
    lang : str, default 'en'
        Only rows with this ``lang_code`` are returned.
    include_self : bool, default True
        When false, rows whose p-code equals ``pcode`` are excluded.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df_location``.
    """
    pcodes = df_location['pcode']
    # Descendants are the p-codes that START with the ancestor's code. A literal
    # prefix test avoids matching the code in the middle of an unrelated p-code
    # and avoids treating it as a regular expression; na=False keeps rows with
    # a missing p-code out of the boolean mask.
    selected = pcodes.str.startswith(pcode, na=False) & (df_location['lang_code'] == lang)
    if not include_self:
        selected = selected & (pcodes != pcode)
    return df_location[selected]

def get_admin_chain(pcode, lang='en'):
    """Return the names of ``pcode`` and its ancestors, shortest p-code first.

    The row for ``pcode`` supplies a dot-separated ``split_pcode``. Joining its
    first 1, 2, ... components yields the p-code of each level in turn, and the
    ``lang`` name of each of those p-codes is collected.

    Raises ``IndexError`` when ``pcode`` is unknown or a level has no name in
    ``lang``.
    """
    dotted = df_location.loc[df_location['pcode'] == pcode, 'split_pcode'].tolist()[0]
    parts = dotted.split(".")

    in_lang = df_location['lang_code'] == lang
    chain = []
    # Rebuild the p-code one level at a time.
    for depth in range(1, len(parts) + 1):
        level_pcode = ''.join(parts[:depth])
        level_names = df_location.loc[(df_location['pcode'] == level_pcode) & in_lang, 'location_name']
        chain.append(level_names.tolist()[0])

    return chain

def get_all_locations(lang_code='all'):
    """Return every unique location name, optionally limited to one language.

    ``lang_code='all'`` (the default) applies no language filter. The result
    is a list in arbitrary order.
    """
    names = df_location['location_name']
    if lang_code != 'all':
        names = names[df_location['lang_code'] == lang_code]
    return list(set(names.to_list()))
    

In [388]:
nlp = spacy.load("en_core_web_sm")

# Create patterns and add to the entity ruler to better find locations

all_locs = get_all_locations(lang_code='en')
gpes = []

# Location names that collide with common English words and would over-match.
STOP_LOCS = ['of','can']
all_locs = [e for e in all_locs if isinstance(e, str) and e.lower() not in STOP_LOCS]

# create pattern rules for locations based on the COD files
for loc_name in all_locs:
    # Tokenize the name with the pipeline's own tokenizer so the pattern has one
    # entry per token, exactly as running text will be split. (str.split takes a
    # literal separator, not a regex, so splitting on '\s+' left multi-word
    # names as a single token that could never match.)
    token_sequence = [
        {"LOWER": token.lower_}
        for token in nlp.make_doc(loc_name)
        if not token.is_space
    ]
    if not token_sequence:
        continue

    # The lookup returns the p-code itself (a string), or None / [] when it
    # cannot resolve the name; store the whole code as the pattern id.
    pcode = get_pcode_from_location(loc_name, lang_code='en')
    if not pcode:
        continue

    gpes.append({'label':'COD_GPE', 'pattern': token_sequence, 'id': pcode})

# Run the ruler before the statistical NER so the COD_GPE spans are set first.
ruler = nlp.add_pipe('entity_ruler', before='ner')
ruler.add_patterns(gpes)

## Build the DataFrame

In [389]:


# Load the pre-processed disaster summaries and replace missing values with
# empty strings.
df = pd.read_excel(disaster_summary_preprocessed_file)
df = df.fillna('')

In [390]:
def expand_to_sentence_level(doc, min_chars=20):
    """Split a parsed paragraph into one freshly parsed Doc per sentence.

    Parameters
    ----------
    doc : spacy.tokens.Doc
        Parsed paragraph; its ``doc.sents`` are the candidate sentences.
    min_chars : int, default 20
        Sentences whose text is not longer than this many characters are dropped.

    Returns
    -------
    list of spacy.tokens.Doc
        One Doc per kept sentence, produced by running the sentence text
        through ``nlp`` again. When no sentence is kept, the list holds a
        single placeholder Doc for "No content to return." so the caller
        always gets at least one row per paragraph.
    """
    sentence_texts = [sent.text for sent in doc.sents if len(sent.text) > min_chars]
    if not sentence_texts:
        sentence_texts = ["No content to return."]
    # Re-parsing every sentence is costly; nlp.pipe at least processes them as a batch.
    return list(nlp.pipe(sentence_texts))

def generate_sent_id(group, new_column_name='idx_sent'):
    """Number the rows of ``group`` from 0 upward in the column ``new_column_name``.

    The column is written onto ``group`` itself and the same object is
    returned, which makes the function usable with ``groupby(...).apply``.
    """
    row_count = len(group)
    group[new_column_name] = range(row_count)
    return group

In [408]:
# List the columns available in the loaded summaries.
df.columns

Index(['record_type', 'status', 'source_url', 'glide_id', 'idx_para',
       'source_level_country', 'source_title', 'source_desc',
       'source_original_text', 'reference_url', 'text', 'authoring_org',
       'reported_date', 'references', 'reference_auth_org',
       'reference_date_str', 'reference_date_iso', 'para_id',
       'non_parenthetical_text'],
      dtype='object')

In [392]:
# Focus on rows whose status is 'ongoing' for now.
is_ongoing = df['status'] == 'ongoing'
df_sents = df[is_ongoing].copy()

# Parse every paragraph, split it into sentence-level Docs, then fan the list
# out to one row per sentence.
df_sents['spacy_para_no_paren'] = df_sents['non_parenthetical_text'].apply(lambda para_text: nlp(para_text))
df_sents['spacy_sent_no_paren'] = df_sents['spacy_para_no_paren'].apply(expand_to_sentence_level)
df_sents = df_sents.explode('spacy_sent_no_paren')

# Number the sentences within each (para_id, idx_para) group.
df_sents = (
    df_sents
    .groupby(['para_id','idx_para'])
    .apply(generate_sent_id)
    .reset_index(drop=True)
)

# Keep a fixed subset of columns, in this order.
sentence_columns = [
    'glide_id',
    'reference_auth_org',
    'para_id',
    'idx_para',
    'idx_sent',
    'source_level_country',
    'source_original_text',
    'spacy_sent_no_paren',
    'reference_date_iso',
]
df_sents = df_sents[sentence_columns]



## Data Structure Completed


In [393]:
#keyword_indicators
# Keyword lists per indicator. extract_numeric_value compares token lemmas
# against these lists.
indicators = {
    'i_people': ['people','person','child','man','woman','civilian','colleague','fatality','individual'],
    # think about how to incorporate 2 co-existing terms "648 people who lost their lives"
    'i_killed': ['dead','fatal','die','kill','deceased','fatality','fatality','death','deaths'],
    'i_injured': ['injure','wound','wounded','injured'],
    'i_damage': ['damage','destroy','collapse'],
    'i_health_infrastructure': ['hospital','surgery'],
    'i_education_infrastructure': ['school','university'],
    'i_cash_xfer': ['xx'],
    'i_wash': ['sanitation','water','sewer','drain','drainage'],
    'i_shelter': ['shelter','tent','camp','blanket'],
    'i_food': ['food','cook','stove','feed','feed','nutrient','meal'],
    'i_health': ['health','medical','medicine'],
    'i_gender_vuln': ['dignity','gender','pregnant','lactate','lactating'],
    'i_protection': ['trauma','mental'],
    'i_response_capacity': ['personnel'],
    'i_other_infrastructure': ['communicate','radio','internet','telecommunication','electric','line'],
    'i_money': ['grant','loan','finance','appeal','chf','fund'],
    'i_other': ['biometric'],
    'i_problem': ['challenge'],
    # note receive implies both supply and demand
    'i_demand_side': ['need','demand','gap','priority', 'receive'],
    # note receive implies both supply and demand
    'i_supply_side': ['response','contribute','provide','source','address','deploy','receive'],

    'i_assessments': ['assess','assessment'],
}

In [394]:
def translate_to_int(text):
    """Convert a number token such as ``4,500`` to an ``int``.

    Parameters
    ----------
    text : object with a ``.text`` attribute, str, or None
        Usually the token returned by ``extract_numeric_value``. A plain
        string is accepted as well.

    Returns
    -------
    int or None
        The integer value with thousands separators (commas) removed, or
        ``None`` when the input is ``None``, is not textual (e.g. a NaN left
        in the column by pandas), or does not spell an integer (e.g. "million").
    """
    if text is None:
        return None

    # Tokens carry their string in .text; fall back to the value itself for plain strings.
    raw = getattr(text, "text", text)
    if not isinstance(raw, str):
        return None

    try:
        return int(raw.replace(',', ''))
    except ValueError:
        # Not an integer literal; callers treat None as "no figure".
        return None

def extract_numeric_value(doc, indicator):
    """Return the first number token that co-occurs with an indicator keyword.

    Each sentence of ``doc`` is scanned token by token, remembering the most
    recent number token (``pos_ == 'NUM'`` and not part of a DATE or TIME
    entity) and the most recent token whose lemma is listed under
    ``indicators[indicator]``. As soon as a sentence has supplied both, that
    number token is returned. Later pairs are ignored: when more than one
    figure is present, the others are typically contextualizing numbers.

    ``indicator`` needs to be a key of ``indicators``; it is used here with
    'i_killed' and 'i_injured'.

    Returns ``None`` when no sentence yields such a pair.
    """
    for sentence in doc.sents:
        # Both trackers start empty for every sentence.
        number_tok = None
        keyword_tok = None

        for tok in sentence:
            if tok.pos_ == 'NUM' and tok.ent_type_ not in ['DATE','TIME']:
                number_tok = tok

            if tok.lemma_ in indicators[indicator]:
                keyword_tok = tok

            if number_tok is not None and keyword_tok is not None:
                return number_tok

    return None

In [395]:
# Pull the first casualty figure of each kind out of every sentence Doc.
sentence_docs = df_sents['spacy_sent_no_paren']
df_sents['num_killed'] = sentence_docs.apply(extract_numeric_value, indicator='i_killed')
df_sents['num_injured'] = sentence_docs.apply(extract_numeric_value, indicator='i_injured')


## Get Location Entities

In [405]:
def extract_gpe_entities(doc, adm_lvl='0'):
    """Return the admin-chain name at ``adm_lvl`` for the first resolvable COD_GPE entity.

    Parameters
    ----------
    doc : spacy.tokens.Doc
        Parsed text whose entities are inspected.
    adm_lvl : int or str, default '0'
        Position in the admin chain to return (0 is the first entry of the
        chain). Numeric strings are accepted and converted, because the chain
        is a list and must be indexed with an integer.

    Returns
    -------
    str or None
        The name at ``adm_lvl`` for the first ``COD_GPE`` entity, in document
        order, whose text resolves to a p-code with a chain that deep.
        ``None`` when there is no such entity.
    """
    level = int(adm_lvl)

    for ent in extract.entities(doc):
        if ent.label_ != 'COD_GPE':
            continue

        pcode = get_pcode_from_location(ent.text)
        if not pcode:  # the lookup reports a miss as None or []
            continue

        try:
            return get_admin_chain(pcode)[level]
        except IndexError:
            # Chain shorter than adm_lvl, or no name for a level in the chain's
            # language: try the next entity instead of failing the whole row.
            continue

    return None

# For each sentence, take entry 0 of the admin chain of a recognised COD_GPE location (None when there is none).
df_sents['identified_country'] = df_sents['spacy_sent_no_paren'].apply(lambda x: extract_gpe_entities(x, adm_lvl=0))

In [406]:
# Where no location was identified in the sentence, fall back to the source-level country.
df_sents['identified_country'] = df_sents['identified_country'].fillna(df_sents['source_level_country'])
# Convert the extracted number tokens to plain integers (None when not convertible).
df_sents['num_killed_int'] = df_sents['num_killed'].apply(translate_to_int)
df_sents['num_injured_int'] = df_sents['num_injured'].apply(translate_to_int)
#DataFrame.astype(dtype, copy=True, errors='raise')

In [407]:
# Export the sentence-level table without the index column.
# NOTE(review): hard-coded absolute output path — consider a configurable output directory.
df_sents.to_excel("c://temp//ongoing.xlsx", index=False)

In [399]:
# Export only the rows for GLIDE id EQ-2023-000015-TUR, for manual inspection.
# NOTE(review): hard-coded absolute output path and GLIDE id.
df_sents[df_sents['glide_id'] == 'EQ-2023-000015-TUR'].to_excel("c://temp//foo2.xlsx")


In [400]:
#df_sents.to_csv("c://temp//foo.csv", index=False, encoding='utf-8-sig')

In [401]:
# Display the sentence rows for GLIDE id EQ-2023-000015-TUR.
df_sents[df_sents['glide_id'] == 'EQ-2023-000015-TUR']

Unnamed: 0,glide_id,reference_auth_org,para_id,idx_para,idx_sent,source_level_country,source_original_text,spacy_sent_no_paren,reference_date_iso,num_killed,num_injured,identified_country,num_killed_int,num_injured_int
1746,EQ-2023-000015-TUR,OCHA,rwdisastersumm_eq-2023-000015-tur_0,0,0,Türkiye,"On 6 February, a 7.7 magnitude earthquake stru...","(On, 6, February, ,, a, 7.7, magnitude, earthq...",2023-02-06,,,Türkiye,,
1747,EQ-2023-000015-TUR,OCHA,rwdisastersumm_eq-2023-000015-tur_0,0,1,Türkiye,"On 6 February, a 7.7 magnitude earthquake stru...","(This, is, Türkiye, 's, most, powerful, earthq...",2023-02-06,,,Türkiye,,
1748,EQ-2023-000015-TUR,OCHA,rwdisastersumm_eq-2023-000015-tur_0,0,2,Türkiye,"On 6 February, a 7.7 magnitude earthquake stru...","(The, Government, of, Türkiye, has, since, iss...",2023-02-06,,,Türkiye,,
1749,EQ-2023-000015-TUR,OCHA,rwdisastersumm_eq-2023-000015-tur_0,0,3,Türkiye,"On 6 February, a 7.7 magnitude earthquake stru...","(The, earthquake, also, heavily, impacted, nor...",2023-02-06,,,[Syrian Arab Republic],,
1750,EQ-2023-000015-TUR,OCHA,rwdisastersumm_eq-2023-000015-tur_0,0,4,Türkiye,"On 6 February, a 7.7 magnitude earthquake stru...","(The, humanitarian, response, is, largely, ove...",2023-02-06,,,Türkiye,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1906,EQ-2023-000015-TUR,OCHA,rwdisastersumm_eq-2023-000015-tur_9,9,3,Türkiye,The UN and humanitarian partners are scaling u...,"(At, least, 3, million, people, affected, by, ...",2023-02-28,,,[Syrian Arab Republic],,
1907,EQ-2023-000015-TUR,OCHA,rwdisastersumm_eq-2023-000015-tur_9,9,4,Türkiye,The UN and humanitarian partners are scaling u...,"(More, than, 4,500, deaths, and, more, than, 8...",2023-02-28,4500,,[Syrian Arab Republic],4500.0,
1908,EQ-2023-000015-TUR,OCHA,rwdisastersumm_eq-2023-000015-tur_9,9,5,Türkiye,The UN and humanitarian partners are scaling u...,"(The, districts, with, the, highest, number, o...",2023-02-28,,,[Syrian Arab Republic],,
1909,EQ-2023-000015-TUR,OCHA,rwdisastersumm_eq-2023-000015-tur_9,9,6,Türkiye,The UN and humanitarian partners are scaling u...,"(As, of, 26, February, ,, more, than, 1,700, b...",2023-02-28,,,Türkiye,,


In [402]:
# Spot check: render the entities the pipeline finds in a single sentence.
x = """More than 4,500 deaths and more than 8,700 injuries due to the earthquakes have been reported in northwest Syria, as of 13 March, according to the Health Cluster."""
doc = nlp(x)

# displacy is already imported in the notebook's import cell; no re-import needed here.
displacy.render(doc, style='ent')

In [403]:
# Show the lookup-table rows whose name is 'syria' (case-insensitive).
df_location[df_location['location_name'].str.lower() == 'syria']

Unnamed: 0,country,pcode_prefix,location_name,pcode,adm_lvl,lang_code,location_normalized,lvl_pcode_len,split_pcode
10184,Syrian Arab Republic,SY,Syria,SY,0,en,syria,2,SY


In [None]:
# Check which p-code the lookup returns for 'Syria'.
print(get_pcode_from_location('Syria'))

In [None]:
        for e in entities:
            pcode = 
            if (pcode is not None):
                if (len(pcode) != 0):
                    #print(pcode)
                    admins.append(get_admin_chain(pcode)[adm_lvl])


def get_pcode_from_location(loc, country_prefix='XX', lang_code='all'):

    if country_prefix != 'XX': #if the country prefix is set, limit search to that
        df_loc = df_location[df_location['pcode_prefix'] == country_prefix]
    else:
        df_loc = df_location

    if lang_code != 'all': #secondary filter - especially important to remove dupes with diff langs share the same script
        df_loc = df_loc[df_loc['lang_code'] == lang_code]
        
    matches = df_loc['pcode'][df_loc['location_name'].str.lower() == 'syria'.lower()].tolist()