# Begin the actual NLP work

In [1]:
import spacy
from spacy import displacy

import re
import pandas as pd
from textacy import extract

from collections import defaultdict 
from fuzzywuzzy import fuzz
import time
import uuid

import os
import json

from datetime import datetime

In [2]:
sitrep_preprocessed_file = "D://projects//_external_files//surveyor//rw_sitrep_preprocessed//sitrep_preprocessed_b41b8e78f66d4e669917ea831f438b73.xlsx"
pcode_file = "D://projects//_external_files//cod_files//combined_locations//locations.csv"



In [3]:
pd.set_option('display.max_columns', None)
print(time.localtime())

time.struct_time(tm_year=2024, tm_mon=1, tm_mday=1, tm_hour=5, tm_min=40, tm_sec=2, tm_wday=0, tm_yday=1, tm_isdst=0)


## Load Location Services

In [4]:
df_location = pd.read_csv(pcode_file)

def get_pcode_from_location(loc, country_prefix='XX', lang_code='all'):

    if country_prefix != 'XX': #if the country prefix is set, limit search to that
        df_loc = df_location[df_location['pcode_prefix'] == country_prefix]
    else:
        df_loc = df_location

    if lang_code != 'all': #secondary filter - especially important to remove dupes with diff langs share the same script
        df_loc = df_loc[df_loc['lang_code'] == lang_code]
        
    matches = df_loc['pcode'][df_loc['location_name'].str.lower() == loc.lower()].tolist()

    #if the match fails, try again on the normalized name
    if len(matches) == 0:
        #remove common variations in names that can cause misses
        n_loc = re.sub(r'[^a-zA-Z]', '', loc)

        #this will cause problems for non-English.. so if then len is 0, exit
        if len(n_loc) == 0:
            return []
            
        matches = df_loc['pcode'][df_loc['location_normalized'].str.lower() == n_loc.lower()].tolist()
        

    #now check results
    if len(matches) > 1:
        #print(f"more than 1 matches... likely due to different granularity of entities with the same name (ie. Herat City in Herat Province) {matches}")
        #print(f"returning the lowest granularity match. {min(matches, key=len)}")
        #print("if the pcodes are all the same granularity.... you get the first element.")
        return min(matches, key=len)
            
        return matches[0]
    elif len(matches) == 1:
        return matches[0]

    else:
        #couldn't find a match, do a fuzzy search
        compare_list = list(set(df_loc['location_name'].tolist()))
        possible_matches=[]
        for i in compare_list:
            if fuzz.ratio(loc,i) > 70:
                possible_matches.append(i)
                print (f"No exact match to '{loc}'. see if these alternative spellings are correct: {possible_matches}")
        return None

    
    return None

assert get_pcode_from_location('istanbul') == 'TUR034'

def get_adm_lvl_from_pcode(pcode):
    return list(set(df_location['adm_lvl'][df_location['pcode'] == pcode].tolist()))
    
def get_name_in_lang(pcode, lang='en'):
    return list(set(df_location['location_name'][(df_location['pcode'] == pcode) & (df_location['lang_code'] == lang)].tolist()))

def get_descendents_of(pcode, lang='en', include_self=True):
    if include_self==True:
        return df_location[df_location['pcode'].str.contains(pcode) & (df_location['lang_code'] == lang)]
    else:
        return df_location[df_location['pcode'].str.contains(pcode) & (df_location['lang_code'] == lang)\
        & (df_location['pcode'] != pcode)]

def get_admin_chain(pcode, lang='en'):
    split_pcode = df_location['split_pcode'][df_location['pcode'] == pcode].tolist()[0]
    levels = split_pcode.split(".")
    pc =''
    admin_chain = []
    #rebuild the pcode one level at a time
    for i in levels:
        pc = pc + i
        admin_chain.append(df_location['location_name'][(df_location['pcode'] == pc) & (df_location['lang_code'] == lang)].tolist()[0])

    return admin_chain

def get_all_locations(lang_code='all'):

    #return all unique location names
    if lang_code == 'all':
        return list(set(df_location['location_name'].to_list()))
    else:
        return list(set(df_location['location_name'][df_location['lang_code'] == lang_code].to_list()))
    

In [5]:
nlp = spacy.load("en_core_web_sm")

# Create patterns and add to the entity ruler to better find locations

all_locs = get_all_locations(lang_code='en')
gpes = []

STOP_LOCS = ['of','can']
all_locs = [e for e in all_locs if e.lower() not in STOP_LOCS]

# create pattern rules for locations based on the COD files
for l in all_locs:
    token_sequence=[]
    for token in l.split('\s+'):
        token_sequence.append({"LOWER":token.lower()})
    x = {'label':'COD_GPE', 'pattern': token_sequence, 'id':get_pcode_from_location(l, lang_code='en')[0]}
    gpes.append(x)
    #print(get_pcode_from_location(l, lang_code='en'))

ruler = nlp.add_pipe('entity_ruler', before='ner')
ruler.add_patterns(gpes)

## Build the DF

In [6]:

df = pd.read_excel(sitrep_preprocessed_file)
df = df.fillna('')



In [7]:
# Narrow the scope for easier testing
df = df[(df['glide_id'] == 'EQ-2023-000015-TUR') | (df['glide_id'] == 'EQ-2023-000214-NPL')]
df = df[(df['glide_id'] == 'EQ-2023-000015-TUR')] # | (df['glide_id'] == 'EQ-2023-000214-NPL')]
set(df['glide_id'].tolist())


{'EQ-2023-000015-TUR'}

In [8]:
def expand_to_sentence_level(doc):
    sentences = []
    #print()
    #print(doc)
    for sent in doc.sents:
        #print(sent)
        #create new doc objects for each sentence and append to a list
        doc_from_span = spacy.tokens.Doc(doc.vocab, words=[token.text for token in sent])
        sentences.append(doc_from_span)

    return sentences


def expand_to_sentence_level(doc):
    sentences = []
    for sent in doc.sents:
        sent_text = sent.text
        if len(sent_text) > 20:
            sentences.append(nlp(sent_text)) # horrendously inefficient but...
    if len(sentences) == 0:
        sentences.append(nlp("No content to return."))
    return sentences

# Function to increment by one for each idx_parad
def generate_sent_id(group, new_column_name='idx_sent'):
    group[new_column_name] = range(0, len(group))
    return group

In [9]:
def generate_uuid(x):
    return uuid.uuid4().hex




In [10]:
#focus on ongoing for nowd
df_sents = df.copy()
df_sents['spacy_para_no_paren'] = df_sents['non_parenthetical_text'].apply(lambda x: nlp(x))
df_sents['spacy_sent_no_paren'] = df_sents['spacy_para_no_paren'].apply(expand_to_sentence_level)
df_sents = df_sents.explode('spacy_sent_no_paren')
#generate a guid by sentence
df_sents['guid_sent'] = df_sents['spacy_sent_no_paren'].apply(generate_uuid)

# Apply the function to the DataFrame using groupby on 'idx_para'
df_sents = df_sents.groupby(['para_id','idx_para']).apply(generate_sent_id).reset_index(drop=True)

#to limit the fields but this just seems to cause problems
#df_sents = df_sents[['glide_id','source_level_country','authoring_org','para_id','idx_para','idx_sent','source_original_text','spacy_sent_no_paren','reported_date']]


## Data Structure Completed

In [11]:
#keyword_indicators
indicators = {
    'i_people' : ['people','person','child','man','woman','civilian','colleague','fatality','individual']
    ,'i_killed' : ['dead','fatal','die','kill','deceased','fatality','fatality','death','deaths'] #think about how to incorporate 2 co-existing terms "648 people who lost their lives"
    ,'i_injured' : ['injure','wound','wounded','injured']
    ,'i_damage' : ['damage','destroy','collapse','damaged']
    ,'i_infrastructure' : ['hospital','school','university','dam','bridge','road','highway']
    ,'i_cva' : ['xx']
    ,'i_wash' : ['sanitation','water','sewer','drain','drainage']
    ,'i_shelter' : ['shelter','tent','camp','blanket']
    ,'i_food' : ['food','cook','stove','feed','feed','nutrient','meal']
    ,'i_logistic' : ['logistic','logistics','road']
    ,'i_health' : ['health','medical','medicine','surgery']
    ,'i_gender_pss' : ['dignity','gender','pregnant','lactate','lactating']
    ,'i_protection' : ['trauma','mental','disable','disability']
    #,'i_response_capacity' : ['personnel']
    ,'i_response' : ['personnel']
    ,'i_other_infrastructure' : ['communicate','radio','internet','telecommunication','electric','line']
    ,'i_money' : ['grant','loan','finance','appeal','chf','fund']
    ,'i_other' : ['biometric']
    ,'i_problem' : ['challenge','gap','need_to','lack']
    ,'i_demand_side' : ['need','demand','gap','priority', 'receive','shortage','lack'] # note receive implies both supply and demand
    ,'i_supply_side' : ['response','contribute','provide','source','address','deploy','receive'] # note receive implies both supply and demand
    ,'i_tense_future' : ['xx'] #will populate this from future-tense indicator function

    ,'i_assessments' : ['assess','assessment']
}
file = "D://projects//_external_files//surveyor//word_indicators.xlsx"

def augment_indicators(indicators, file):
    df = pd.read_excel(file)
    for c in df.columns:
        if c[0:2] == 'i_':
            w_list = df['word'][df[c] == 1].tolist()
            try:
                indicators[c].extend(w_list)
            except:
                indicators[c] = w_list

    return indicators

indicators = augment_indicators(indicators, file)


## Data Structure Created

In [12]:
def extract_gpe_entities(doc, adm_lvl='0'):
    #values for adm_lvl = 0,1,2,3,
    # -1 = self, -99 = chain
    entities = []
    admins = []
    #COD_GPE
    ents = list(extract.entities(doc))
    if len(ents) < 1:
        return None
    else:
        for e in ents:
            if e.label_ == 'COD_GPE':
                entities.append(e)

        # -1 means return the actual gpes
        if adm_lvl == -1:
            return entities

        for e in entities:
            pcode = get_pcode_from_location(e.text)
            if (pcode is not None):
                if (len(pcode) != 0):
                    #if the desired level is lower than the actual reference
                    #ignore
                    try:
                        if adm_lvl == -99:
                            admins.append(get_admin_chain(pcode))
                        else:
                            admins.append(get_admin_chain(pcode)[adm_lvl])
                    except:
                        pass

    if adm_lvl != -99:
        admins = list(set(admins))
    if len(admins) == 0:
        return None
    else:
        #changing to return the full list, then can explode later
        return admins #[0] 

df_sents['identified_gpes'] = df_sents['spacy_sent_no_paren'].apply(lambda x: extract_gpe_entities(x, adm_lvl=-1))
df_sents['identified_country'] = df_sents['spacy_sent_no_paren'].apply(lambda x: extract_gpe_entities(x, adm_lvl=0))
df_sents['identified_adm_01'] = df_sents['spacy_sent_no_paren'].apply(lambda x: extract_gpe_entities(x, adm_lvl=1))
df_sents['identified_adm_02'] = df_sents['spacy_sent_no_paren'].apply(lambda x: extract_gpe_entities(x, adm_lvl=2))
df_sents['identified_adm_chain'] = df_sents['spacy_sent_no_paren'].apply(lambda x: extract_gpe_entities(x, adm_lvl=-99))

In [13]:
def find_and_add_indicator(df, indicators):
    ind_counter = []
    for ind in indicators:
  
        df[ind] = df['lower_lemmas'].apply(lambda x: 1 if len([w for w in x if w in indicators[ind]])>0 else 0)
        ind_counter.append(ind)
        #print(ind_counter)
    df['i_count'] = df[ind_counter].sum(axis=1)

    return df

In [14]:
df_sents['lower_lemmas'] = df_sents['spacy_sent_no_paren'].apply(lambda x: [w.lemma_.lower() for w in x])
df_sents = find_and_add_indicator(df_sents, indicators)

In [15]:
df_sents.to_excel("c://temp//training.xlsx")

In [15]:
# separate the adm01 data and explode it for rejoining later
# because when I save a list in a df to excel it becomes a string
# that looks like a list
df_identified_locations = df_sents[['identified_adm_01','guid_sent']][df_sents['identified_adm_01'].isna() == False]
df_identified_locations = df_identified_locations.explode('identified_adm_01')
df_identified_locations['identified_adm_01'] = df_identified_locations['identified_adm_01'].apply(lambda x: x.title())
output_file = "D://projects//pythonProject//streamlit_surveyor//data//identified_locations.xlsx"
df_identified_locations.to_excel(output_file, index=False)

## Layer on additional interpretations

In [16]:
df_sents.glide_id.unique()

array(['EQ-2023-000015-TUR'], dtype=object)

In [17]:
  

def extract_verbs(doc):
    verbs=[]
    for t in doc:
        if t.pos_ == 'VERB':
            verbs.append(t)

    return verbs

df_sents['verbs'] = df_sents['spacy_sent_no_paren'].apply(extract_verbs)



In [18]:
def get_future_tense_verb(doc):
    def is_future_tense(token):
        #Check if a token is indicative of future tense.
        return (
            token.tag_ == "MD" and token.text.lower() == "will"
            or (token.dep_ == "aux" and token.head.lemma_ == "will")
            or (token.pos_ == 'VERB' and token.head.text == "going" and  "Inf" in token.morph.get("VerbForm"))
        )

    for t in doc:
        if is_future_tense(t):
            return f"{t.text} {t.head}"

    

def declare_primary_record_type(row):

    if row['i_count'] == 0:
        return 'background'
    elif row['i_supply_side']:
        return 'response_details'
    elif row['i_demand_side']:
        return 'demand_side'
    elif row[['i_damage','i_health_infrastructure','i_education_infrastructure']].sum() > 0:
        return 'damage_to_homes_and_infrastructure'
    else:
        return 'other'


df_sents['svot'] = df_sents['spacy_sent_no_paren'].apply(lambda doc: list(extract.subject_verb_object_triples(doc)))
df_sents['future_verbs'] = df_sents['spacy_sent_no_paren'].apply(get_future_tense_verb)
df_sents['i_tense_future'] = df_sents['future_verbs'].apply(lambda x: 1 if x is not None else 0)
#df_sents['collected_indicators'] = df_sents.apply(get_indicators)

In [19]:
### group all the expressed indicators
def get_indicator_columns(df):
    inds=[]
    for c in df.columns:
        if c[0:2] == 'i_':
            inds.append(c)

    return inds

indicator_columns = get_indicator_columns(df_sents)

def find_matching_columns(row):
    return row.index[row.eq(1)].tolist()

# Create a new column containing lists of matching column names for each row
df_sents['collected_indicators'] = df_sents[indicator_columns].apply(find_matching_columns, axis=1)
df_sents.sample(2)

Unnamed: 0,record_type,source_url,glide_id,idx_para,source_level_country,source_title,source_desc,source_original_text,reference_url,text,authoring_org,reported_date,para_id,non_parenthetical_text,spacy_para_no_paren,spacy_sent_no_paren,guid_sent,idx_sent,identified_gpes,identified_country,identified_adm_01,identified_adm_02,identified_adm_chain,lower_lemmas,i_people,i_killed,i_injured,i_damage,i_infrastructure,i_cva,i_wash,i_shelter,i_food,i_logistic,i_health,i_gender_pss,i_protection,i_response,i_other_infrastructure,i_money,i_other,i_problem,i_demand_side,i_supply_side,i_tense_future,i_assessments,i_commodity_market,i_displacement,i_authority,i_statement_certainty,i_severity,i_change_increase,i_change_decrease,i_change_steady,i_geography,i_violence,i_count,verbs,svot,future_verbs,collected_indicators
4968,situation report,https://api.reliefweb.int/v1/reports/3932844,EQ-2023-000015-TUR,5,Syria,Syria: 6th February Earthquake – Situation Rep...,food and nutrition; health; shelter and non-fo...,,https://reliefweb.int/attachments/de89e263-837...,,SI,2023-02-06,rwsitrep_https://reliefweb.int/attachments/de8...,,(),"(No, content, to, return, .)",a6ecc1957a4a4d24b37220d0de42f7b3,0,,,,,,"[no, content, to, return, .]",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,[return],[],,"[i_people, i_count]"
959,situation report,https://api.reliefweb.int/v1/reports/3932832,EQ-2023-000015-TUR,11,Syria,North-West Syria: Situation Report (11 Februar...,education; food and nutrition; health; logisti...,"Dr Ahmed Haji Hassan, a regional health manage...",https://reliefweb.int/attachments/2a854c2f-e1b...,"Dr Ahmed Haji Hassan, a regional health manage...",OCHA,2023-02-12,rwsitrep_https://reliefweb.int/attachments/2a8...,"Dr Ahmed Haji Hassan, a regional health manage...","(Dr, Ahmed, Haji, Hassan, ,, a, regional, heal...","(We, need, mobile, clinics, to, reach, the, in...",6b498fb73c1a4a9296417506c3491947,2,,,,,,"[we, need, mobile, clinic, to, reach, the, inj...",0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,5,"[need, reach]","[([We], [need], [clinics])]",,"[i_injured, i_logistic, i_response, i_demand_s..."


In [48]:
output_file = "D://projects//pythonProject//streamlit_surveyor//data//situation_reports.xlsx"
df_sents.to_excel(output_file, index=False)

# Restructure the dictionary to create rows from the lists
restructured_dict = {
    'indicator': [],
    'word': []
}

for category, items in indicators.items():
    for item in items:
        restructured_dict['indicator'].append(category)
        restructured_dict['word'].append(item)

# Convert the restructured dictionary to a DataFrame
df_inds = pd.DataFrame.from_dict(restructured_dict)



#df_inds = pd.DataFrame.from_dict(indicators, orient='index')
df_inds

output_file = "D://projects//pythonProject//streamlit_surveyor//data//indicator_words.xlsx"
df_inds.to_excel(output_file, index=False)


## Noun Chunks by Day

In [20]:
def stringify_terms(x):
    try:
        string_term = ' '.join([i.text.lower() for i in x]).strip()
    except:
        string_term = ''
        
    return string_term
    
df_sents['key_terms'] = df_sents['spacy_sent_no_paren'].apply(lambda x: list(extract.noun_chunks(x)))
df_sents.sample(2)
df_noun_chunks = df_sents[['reported_date','authoring_org','key_terms']].copy()


df_noun_chunks = df_noun_chunks.explode('key_terms')
df_noun_chunks['str_key_terms'] = df_noun_chunks['key_terms'].apply(stringify_terms)
df_noun_chunks

Unnamed: 0,reported_date,authoring_org,key_terms,str_key_terms
0,2023-05-11,UNICEF,(content),content
1,2023-05-11,UNICEF,"(Around, 2.4, million, people)",around 2.4 million people
1,2023-05-11,UNICEF,"(formal, sites)",formal sites
1,2023-05-11,UNICEF,"(informal, settlements)",informal settlements
2,2023-05-11,UNICEF,"(key, needs)",key needs
...,...,...,...,...
5611,2023-02-15,UNRWA,"(Recreational, activities)",recreational activities
5611,2023-02-15,UNRWA,"(65, children)",65 children
5611,2023-02-15,UNRWA,(Hama),hama
5612,2023-02-15,UNRWA,(content),content


In [22]:
grouped = df_noun_chunks.groupby(['reported_date','str_key_terms']).size().reset_index(name='term_count')
# Displaying the grouped DataFrame
grouped[(grouped['term_count']>1) & (grouped['term_count']<25)].sort_values(by='term_count', ascending=False).head(20)

output_file = "D://projects//pythonProject//streamlit_surveyor//data//key_terms_by_day.xlsx"
grouped.to_excel(output_file, index=False)

## Now do analysis


In [22]:
def enhance_noun_chunk_info(doc,guid_sent):

    df_temp = pd.DataFrame(columns=['guid_sent','spacy_sent_no_paren','nc','nc_lower_lemmas','contains_stop','contains_non_alpha','head_type','rights','lefts','svot'])

    #function to get SVOTs
    def extract_svots(doc):
        tmp_df = pd.DataFrame(columns=['sub_span_begin','sub_span_end','subject','verb','object','svot'])
        svot = list(extract.subject_verb_object_triples(doc))
        if len(svot) > 0:
            #print(svot)
            #for each specific svot, then get the index of the beginning and end
            for s in svot:
                subject = s[0]
                begin = subject[0].i
                end = subject[-1].i

                tmp_df.loc[len(tmp_df)] = [begin,end,subject,s[1],s[2],s]
        return tmp_df
                
    def check_for_svots(nc, svot_df):
        #get the begin and end indices
        nc_begin = nc[0].i
        nc_end = nc[-1].i

        res = svot_df[(svot_df['sub_span_begin'] >= nc_begin) &
            (svot_df['sub_span_end'] <= nc_end)]
        #print(f"RES::: {svot_df}")

        res_list = res['svot'].tolist()
        return res_list

    

    
    #doc = nlp(sent)
    df_svots = extract_svots(doc)
    #print(df_svots)
    
    #print(sent)
    #print(list(extract.noun_chunks(doc)))
    
    #svot = list(extract.subject_verb_object_triples(doc))
    
    x = list(extract.noun_chunks(doc))

    
    for nc in x:
        head_type = []
        rights = []
        lefts = []
        h = ''
        contains_stop = False
        contains_non_aplha = False
        lower_lemmas = []



        #see if there are any matches in the svot table
        svot_content = check_for_svots(nc,df_svots)
        
        for w in nc:
            #print(w.i)
            lower_lemmas.append(w.lemma_.lower())
            if w.is_stop == True:
                contains_stop = True
            if w.is_alpha == False:
                contains_non_aplha = True
            if w.head.text == 'are':
                head_type.append('TEXT:ARE')
                h = w.head
                rights.append(list(h.rights))
                #print(f"ARE -- {list(h.rights)} -- {nc}\n")
            if w.head.pos_ == 'AUX':
                head_type.append('POS:AUX')
                h = w.head
                rights.append(list(h.rights))
                #print(f"POS_AUX -- {list(h.rights)} -- {nc}\n")
        ll = ' '.join(lower_lemmas).strip()
            
            
            

        df_temp.loc[len(df_temp)] = [guid_sent,doc,nc,ll,contains_stop,contains_non_aplha,head_type,rights,lefts,svot_content]
    return df_temp

In [23]:
#build a noun_chunk df
df_ncs = pd.DataFrame(columns=['guid_sent','spacy_sent_no_paren','nc','nc_lower_lemmas','contains_stop','contains_non_alpha','head_type','rights','lefts','svot'])

#df_ncs[['guid_sent','spacy_sent_no_paren']] = df_sents[['guid_sent','spacy_sent_no_paren']].copy()

for index, row in df_sents[['guid_sent','spacy_sent_no_paren']].iterrows():
    df_temp_nc = enhance_noun_chunk_info(row['spacy_sent_no_paren'],row['guid_sent'])
    df_ncs = pd.concat([df_ncs,df_temp_nc])
    
#df_ncs = df_ncs[['guid_sent','spacy_sent_no_paren']].apply(lambda x: enhance_noun_chunk_info(x['spacy_sent_no_paren'],x['guid_sent']))



In [24]:
def extract_5ws(doc):
    init_i = 0
    f=0
    what=[]
    why=[]
    for t in doc:
        if (t.pos_ in ['CCONJ','SCONJ']) & (t.text.lower() in ['so']) |\
            (t.text.lower() in ['therefore']):
            # if the word is therefore or so, the causal factor is left of the word
            # and the assertion is right of the word
            what.append(doc[t.i:])
            why.append(doc[init_i:t.i])
            #print(f"      THE CAUSATION: {doc[init_i:t.i]}")
            #print(f"      THE ASSERTION: {doc[t.i:]}")
            
            init_i = t.i
            f=1
        elif (t.text.lower() in ['because']):
            # if the word is because, the causal factor is right of the word
            # and the assertion is left of the word
            # UNLESS because is at the beginning of the sentence. Use case
            # not addressed yet
            #print(t.text.upper())
            #print(f"      THE ASSERTION: {doc[init_i:t.i]}")
            #print(f"      THE CAUSATION: {doc[t.i:]}")
            what.append(doc[init_i:t.i])
            why.append(doc[t.i:])
            
            init_i = t.i
            f=1

            
    if f == 1:
        return pd.Series({'what':what, 'why':why})
    else:
        return pd.Series({'what':None, 'why':None})

df_sents[['what','why']] = df_sents['spacy_sent_no_paren'].apply(extract_5ws)

In [36]:
df_sents.columns

Index(['record_type', 'source_url', 'glide_id', 'idx_para',
       'source_level_country', 'source_title', 'source_desc',
       'source_original_text', 'reference_url', 'text', 'authoring_org',
       'reported_date', 'para_id', 'non_parenthetical_text',
       'spacy_para_no_paren', 'spacy_sent_no_paren', 'guid_sent', 'idx_sent',
       'identified_gpes', 'identified_country', 'identified_adm_01',
       'identified_adm_02', 'identified_adm_chain', 'lower_lemmas', 'i_people',
       'i_killed', 'i_injured', 'i_damage', 'i_infrastructure', 'i_cva',
       'i_wash', 'i_shelter', 'i_food', 'i_logistic', 'i_health',
       'i_gender_pss', 'i_protection', 'i_response', 'i_other_infrastructure',
       'i_money', 'i_other', 'i_problem', 'i_demand_side', 'i_supply_side',
       'i_tense_future', 'i_assessments', 'i_commodity_market',
       'i_displacement', 'i_authority', 'i_statement_certainty', 'i_severity',
       'i_change_increase', 'i_change_decrease', 'i_change_steady',
       'i_ge

In [46]:
df_sents[['glide_id','reported_date','source_title','authoring_org']].drop_duplicates().groupby(['glide_id','reported_date','authoring_org'])['source_title'].count()

glide_id            reported_date  authoring_org   
EQ-2023-000015-TUR  2023-02-06     ACU                 1
                                   DFS                 1
                                   Human Initiative    2
                                   IMC                 1
                                   OCHA                3
                                                      ..
                    2023-11-24     UNICEF              1
                    2023-11-29     UNHCR               1
                    2023-12-01     USAID               1
                    2023-12-04     OCHA                1
                    2023-12-07     SARD                1
Name: source_title, Length: 269, dtype: int64

In [24]:
df_sents.to_excel("c://temp//whatwhy.xlsx")

In [79]:
df_ncs.to_excel("c://temp//foo.xlsx")

In [31]:
df_ncs['spacy_sent_no_paren'][df_ncs['guid_sent'] == '4ce132bab4904dbf934d24f3dd47744f'].tolist()[0]

Some private hospitals refused to admit the injured unless paid, so it is necessary to contact private hospitals to cover the cost of treatment for the injured as they are equipped with better facilities and have greater amounts of medical equipment.

In [27]:
df_ncs['svot'][df_ncs['guid_sent'] == '4ce132bab4904dbf934d24f3dd47744f'].tolist()

[[SVOTriple(subject=[hospitals], verb=[refused], object=[to, admit, the, injured, unless, paid])],
 [],
 [],
 [],
 [],
 [SVOTriple(subject=[they], verb=[are, equipped], object=[amounts]),
  SVOTriple(subject=[they], verb=[have], object=[amounts])],
 [],
 [],
 []]

In [24]:
df_ncs[df_ncs['guid_sent'] == '4ce132bab4904dbf934d24f3dd47744f']

Unnamed: 0,guid_sent,spacy_sent_no_paren,nc,nc_lower_lemmas,contains_stop,contains_non_alpha,head_type,rights,lefts,svot
0,4ce132bab4904dbf934d24f3dd47744f,"(Some, private, hospitals, refused, to, admit,...","(private, hospitals)",private hospital,False,False,[],[],[],"[([hospitals], [refused], [to, admit, the, inj..."
1,4ce132bab4904dbf934d24f3dd47744f,"(Some, private, hospitals, refused, to, admit,...",(it),it,True,False,[POS:AUX],"[[necessary, contact, .]]",[],[]
2,4ce132bab4904dbf934d24f3dd47744f,"(Some, private, hospitals, refused, to, admit,...","(private, hospitals)",private hospital,False,False,[],[],[],[]
3,4ce132bab4904dbf934d24f3dd47744f,"(Some, private, hospitals, refused, to, admit,...",(cost),cost,False,False,[],[],[],[]
4,4ce132bab4904dbf934d24f3dd47744f,"(Some, private, hospitals, refused, to, admit,...",(treatment),treatment,False,False,[],[],[],[]
5,4ce132bab4904dbf934d24f3dd47744f,"(Some, private, hospitals, refused, to, admit,...",(they),they,True,False,[],[],[],"[([they], [are, equipped], [amounts]), ([they]..."
6,4ce132bab4904dbf934d24f3dd47744f,"(Some, private, hospitals, refused, to, admit,...","(better, facilities)",well facility,False,False,[],[],[],[]
7,4ce132bab4904dbf934d24f3dd47744f,"(Some, private, hospitals, refused, to, admit,...","(greater, amounts)",great amount,False,False,[],[],[],[]
8,4ce132bab4904dbf934d24f3dd47744f,"(Some, private, hospitals, refused, to, admit,...","(medical, equipment)",medical equipment,False,False,[],[],[],[]


In [49]:
df_ncs.to_excel("c://temp//ncses.xlsx")

In [22]:
#df_sents[['spacy_sent_no_paren','future_verbs','collected_indicators']][(df_sents['future_verbs'].isna() == False)]

In [23]:
df_sents[['spacy_sent_no_paren','future_verbs','collected_indicators']][(df_sents['i_displacement'] == 1) & (df_sents['future_verbs'].isna() == False)].

SyntaxError: invalid syntax (1764110125.py, line 1)

In [None]:
indicator = 'i_displacement'

#df_sents[['reported_date','source_original_text','spacy_sent_no_paren','collected_indicators']][(df_sents[indicator] == 1) & (df_sents['future_verbs'].isna() == False)].sort_values(by='reported_date').to_excel(f"c://temp//{indicator}_future.xlsx")
#df_sents[['reported_date','source_original_text','spacy_sent_no_paren','collected_indicators']][(df_sents[indicator] == 1)]
df_sents.to_excel("c://temp//all.xlsx")

In [None]:
def get_indicators(df):
    inds=[]
    for c in df.columns:
        if c[0:2] == 'i_':
            if df[c].tolist()[0] == 1:
                inds.append(c)

    return inds

def get_verb_tense_indicator(doc):
    for token in doc:
        print(f"{token.lemma_} -- {token.pos_} -- {token.morph}")


In [None]:
## test
df_focus = df_sents.sample(10)

for index, row in df_focus[['spacy_sent_no_paren','collected_indicators']].iterrows():
    print(row[0])
    print(row[1])
    print()    

In [30]:
df_sents.sample()

Unnamed: 0,record_type,source_url,glide_id,idx_para,source_level_country,source_title,source_desc,source_original_text,reference_url,text,authoring_org,reported_date,para_id,non_parenthetical_text,spacy_para_no_paren,spacy_sent_no_paren,idx_sent,identified_gpes,identified_country,identified_adm_01,identified_adm_02,identified_adm_chain,lower_lemmas,i_people,i_killed,i_injured,i_damage,i_infrastructure,i_cva,i_wash,i_shelter,i_food,i_logistic,i_health,i_gender_pss,i_protection,i_response,i_other_infrastructure,i_money,i_other,i_problem,i_demand_side,i_supply_side,i_tense_future,i_assessments,i_commodity_market,i_displacement,i_authority,i_statement_certainty,i_severity,i_change_increase,i_change_decrease,i_change_steady,i_geography,i_violence,i_count,svot,future_verbs,collected_indicators,key_terms,guid_sent
5087,situation report,https://api.reliefweb.int/v1/reports/3934055,EQ-2023-000015-TUR,2,Türkiye,"Welthungerhilfe – Earthquake Situation Report,...",coordination; education; food and nutrition; h...,People in north-west Syria continue to be trap...,https://reliefweb.int/attachments/e3d8289e-b18...,People in northwest Syria continue to be trapp...,WHH,2023-02-15,rwsitrep_https://reliefweb.int/attachments/e3d...,People in northwest Syria continue to be trapp...,"(People, in, northwest, Syria, continue, to, b...","(Only, 5, of, affected, areas, could, be, cove...",4,[],,,,,"[only, 5, of, affect, area, could, be, cover, ...",0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,"[([5], [could, be, covered], [rescue, teams])]",,"[i_response, i_count]","[(affected, areas), (rescue, teams)]",057f83150f11402f9e47e35369218cea


In [None]:
df_focus  = df_sents.sample(1)

s = df_focus['spacy_sent_no_paren'].tolist()[0]
idx = df_focus['spacy_sent_no_paren'].index
print(idx)
print()
print(re.sub("\n", " ", s.text))
print(get_indicators(df_focus))
print()

#get_verb_tense_indicator(s)


In [None]:
# causal factors

"due to" -- "ADP ADP"

In [None]:
text = 'The greatest increases in population densities were in Mersin , Niğde , and Adana '
doc = nlp(text)
from spacy import displacy
displacy.render(doc, style='ent')

In [None]:
def get_clean_date(date):
    date_object = datetime.strptime(date, "%d %b %Y")
    iso_date = date_object.date().isoformat()

In [None]:
#indicators

In [None]:
# Restructure the dictionary to create rows from the lists
restructured_dict = {
    'indicator': [],
    'word': []
}

for category, items in indicators.items():
    for item in items:
        restructured_dict['indicator'].append(category)
        restructured_dict['word'].append(item)

# Convert the restructured dictionary to a DataFrame
df_inds = pd.DataFrame.from_dict(restructured_dict)



#df_inds = pd.DataFrame.from_dict(indicators, orient='index')
df_inds


In [None]:
output_file = "D://projects//pythonProject//streamlit_surveyor//data//indicator_words.xlsx"
df_inds.to_excel(output_file, index=False)



In [23]:
df_sents[df_sents

Unnamed: 0,record_type,source_url,glide_id,idx_para,source_level_country,source_title,source_desc,source_original_text,reference_url,text,authoring_org,reported_date,para_id,non_parenthetical_text,spacy_para_no_paren,spacy_sent_no_paren,guid_sent,idx_sent,identified_gpes,identified_country,identified_adm_01,identified_adm_02,identified_adm_chain,lower_lemmas,i_people,i_killed,i_injured,i_damage,i_infrastructure,i_cva,i_wash,i_shelter,i_food,i_logistic,i_health,i_gender_pss,i_protection,i_response,i_other_infrastructure,i_money,i_other,i_problem,i_demand_side,i_supply_side,i_tense_future,i_assessments,i_commodity_market,i_displacement,i_authority,i_statement_certainty,i_severity,i_change_increase,i_change_decrease,i_change_steady,i_geography,i_violence,i_count,verbs,svot,future_verbs,collected_indicators,key_terms
0,situation report,https://api.reliefweb.int/v1/reports/3961053,EQ-2023-000015-TUR,0,Türkiye,UNICEF Türkiye Humanitarian Situation Report N...,contributions; coordination; education; food a...,Highlights,https://reliefweb.int/attachments/00f4be8b-507...,Highlights,UNICEF,2023-05-11,rwsitrep_https://reliefweb.int/attachments/00f...,Highlights,(Highlights),"(No, content, to, return, .)",ad51c20264724e79bd475b30f6677fca,0,,,,,,"[no, content, to, return, .]",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,[return],[],,"[i_people, i_count]",[(content)]
1,situation report,https://api.reliefweb.int/v1/reports/3961053,EQ-2023-000015-TUR,1,Türkiye,UNICEF Türkiye Humanitarian Situation Report N...,contributions; coordination; education; food a...,- Around 2.4 million people are living in form...,https://reliefweb.int/attachments/00f4be8b-507...,Around 2.4 million people are living in formal...,UNICEF,2023-05-11,rwsitrep_https://reliefweb.int/attachments/00f...,Around 2.4 million people are living in formal...,"(Around, 2.4, million, people, are, living, in...","(Around, 2.4, million, people, are, living, in...",6a055beb46d54e9dad0d931c42b38601,0,[],,,,,"[around, 2.4, million, people, be, live, in, f...",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,[living],[],,"[i_people, i_count]","[(Around, 2.4, million, people), (formal, site..."
2,situation report,https://api.reliefweb.int/v1/reports/3961053,EQ-2023-000015-TUR,1,Türkiye,UNICEF Türkiye Humanitarian Situation Report N...,contributions; coordination; education; food a...,- Around 2.4 million people are living in form...,https://reliefweb.int/attachments/00f4be8b-507...,Around 2.4 million people are living in formal...,UNICEF,2023-05-11,rwsitrep_https://reliefweb.int/attachments/00f...,Around 2.4 million people are living in formal...,"(Around, 2.4, million, people, are, living, in...","(The, key, needs, in, informal, sites, continu...",802f35514d074ed39e95f3d348aabd39,1,,,,,,"[the, key, need, in, informal, site, continue,...",1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,6,[continue],"[([needs], [continue], [to, be, access, to, ad...",,"[i_people, i_wash, i_gender_pss, i_demand_side...","[(key, needs), (informal, sites), (access), (a..."
3,situation report,https://api.reliefweb.int/v1/reports/3961053,EQ-2023-000015-TUR,1,Türkiye,UNICEF Türkiye Humanitarian Situation Report N...,contributions; coordination; education; food a...,- Around 2.4 million people are living in form...,https://reliefweb.int/attachments/00f4be8b-507...,Around 2.4 million people are living in formal...,UNICEF,2023-05-11,rwsitrep_https://reliefweb.int/attachments/00f...,Around 2.4 million people are living in formal...,"(Around, 2.4, million, people, are, living, in...","(Limited, support, for, people, with, disabili...",b994c33d727e45299a824ecab5a2d4b2,2,,,,,,"[limited, support, for, people, with, disabili...",1,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,[],[],,"[i_people, i_logistic, i_protection, i_response]","[(Limited, support), (people), (disabilities),..."
4,situation report,https://api.reliefweb.int/v1/reports/3961053,EQ-2023-000015-TUR,10,Türkiye,UNICEF Türkiye Humanitarian Situation Report N...,contributions; coordination; education; food a...,Under the UNICEF Türkiye Earthquake Response H...,https://reliefweb.int/attachments/00f4be8b-507...,Under the UNICEF Türkiye Earthquake Response H...,UNICEF,2023-05-11,rwsitrep_https://reliefweb.int/attachments/00f...,Under the UNICEF Türkiye Earthquake Response H...,"(Under, the, UNICEF, Türkiye, Earthquake, Resp...","(Under, the, UNICEF, Türkiye, Earthquake, Resp...",ab13379106614af888e6ddbe7c570272,0,[],,,,,"[under, the, unicef, türkiye, earthquake, resp...",1,0,0,0,0,0,0,0,0,1,1,1,0,1,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,10,"[including, enabled, affected]","[([States, Bureau, Central, Emergency, Respons...",,"[i_people, i_logistic, i_health, i_gender_pss,...","[(UNICEF, Türkiye, Earthquake, Response, Human..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5609,situation report,https://api.reliefweb.int/v1/reports/3934356,EQ-2023-000015-TUR,6,Syria,UNRWA Syria Field Office Earthquake status upd...,food and nutrition; health; shelter and non-fo...,UNRWA continues to provide essential NFIs and ...,https://reliefweb.int/attachments/ff6e84d6-c23...,UNRWA continues to provide essential NFIs and ...,UNRWA,2023-02-15,rwsitrep_https://reliefweb.int/attachments/ff6...,UNRWA continues to provide essential NFIs and ...,"(UNRWA, continues, to, provide, essential, NFI...","(About, 286, Palestine, refugee, families, are...",ec828bb76e8945f995686a169f143ac8,2,[(Aleppo)],[Syrian Arab Republic],[Aleppo],,"[[Syrian Arab Republic, Aleppo]]","[about, 286, palestine, refugee, family, be, r...",1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,4,"[reported, displaced, staying]","[([Palestine, refugee, families], [are, report...",,"[i_people, i_shelter, i_displacement, i_change...","[(About, 286, Palestine, refugee, families), (..."
5610,situation report,https://api.reliefweb.int/v1/reports/3934356,EQ-2023-000015-TUR,7,Syria,UNRWA Syria Field Office Earthquake status upd...,food and nutrition; health; shelter and non-fo...,Students attending UNRWA schools in the north ...,https://reliefweb.int/attachments/ff6e84d6-c23...,Students attending UNRWA schools in the north ...,UNRWA,2023-02-15,rwsitrep_https://reliefweb.int/attachments/ff6...,Students attending UNRWA schools in the north ...,"(Students, attending, UNRWA, schools, in, the,...","(Students, attending, UNRWA, schools, in, the,...",05f1e0b434d4477e9adad1c6bce3482b,0,[(Hama)],[Syrian Arab Republic],[Hama],,"[[Syrian Arab Republic, Hama]]","[student, attend, unrwa, school, in, the, nort...",0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,5,"[attending, receiving, participating]","[([Students], [are, receiving], [group, counse...",,"[i_infrastructure, i_demand_side, i_supply_sid...","[(Students), (UNRWA, schools), (north), (group..."
5611,situation report,https://api.reliefweb.int/v1/reports/3934356,EQ-2023-000015-TUR,7,Syria,UNRWA Syria Field Office Earthquake status upd...,food and nutrition; health; shelter and non-fo...,Students attending UNRWA schools in the north ...,https://reliefweb.int/attachments/ff6e84d6-c23...,Students attending UNRWA schools in the north ...,UNRWA,2023-02-15,rwsitrep_https://reliefweb.int/attachments/ff6...,Students attending UNRWA schools in the north ...,"(Students, attending, UNRWA, schools, in, the,...","(Recreational, activities, for, 65, children, ...",1c60d4934fff446ea0b781cec72b2339,1,[(Hama)],[Syrian Arab Republic],[Hama],,"[[Syrian Arab Republic, Hama]]","[recreational, activity, for, 65, child, be, a...",1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,5,[provided],[],,"[i_people, i_gender_pss, i_response, i_supply_...","[(Recreational, activities), (65, children), (..."
5612,situation report,https://api.reliefweb.int/v1/reports/3934356,EQ-2023-000015-TUR,8,Syria,UNRWA Syria Field Office Earthquake status upd...,food and nutrition; health; shelter and non-fo...,Needs update,https://reliefweb.int/attachments/ff6e84d6-c23...,Needs update,UNRWA,2023-02-15,rwsitrep_https://reliefweb.int/attachments/ff6...,Needs update,"(Needs, update)","(No, content, to, return, .)",8756cee7b4bb44c69c1a1ffcffb51671,0,,,,,,"[no, content, to, return, .]",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,[return],[],,"[i_people, i_count]",[(content)]
