In [61]:
import requests
import pandas as pd
from bs4 import BeautifulSoup, NavigableString
import re

In [62]:
# ReliefWeb disaster pages to scrape, one URL per disaster.
relief_web_urls = [
    'https://reliefweb.int/disaster/eq-2023-000015-tur',
    'https://reliefweb.int/disaster/dr-2021-000022-afg',
    'https://reliefweb.int/disaster/ff-2023-000133-afg',
    'https://reliefweb.int/disaster/eq-2023-000184-afg',
    'https://reliefweb.int/disaster/fl-2023-000040-tur',
]


In [63]:
#res = requests.get(relief_web_urls[4])
#html = res.text
#soup = BeautifulSoup(html)


<meta property="og:title" content="Afghanistan: Earthquakes - Oct 2023" />\n
<meta property="og:description" content="Humanitarian situation reports, response plans, news, analyses, evaluations, assessments, maps, infographics and more on Afghanistan: Earthquakes - Oct 2023" />


In [64]:
# Lookup table of page fields: field name -> [tag name, attribute filter].
# Each entry is used as the (tag, attributes) pair for a BeautifulSoup search.
relief_web_fields = {
    'Disaster Description': [
        'h2',
        {'class': 'cd-block-title rw-entity-text__title', 'id': 'overview-title'},
    ],
    'glide': [
        'dd',
        {'class': 'rw-entity-meta__tag-value rw-entity-meta__tag-value--glide rw-entity-meta__tag-value--simple rw-entity-meta__tag-value--last'},
    ],
}

In [65]:
def get_discrete_tag_text(soup, tag, attributes={}):
    """Return the stripped text of every element matching `tag` and `attributes`.

    Parameters:
        soup: object exposing ``find_all(tag, attributes)`` (a parsed page).
        tag: tag name to search for.
        attributes: attribute filter passed straight through to ``find_all``.

    Returns:
        A list with one whitespace-stripped string per matching element, in
        the order ``find_all`` returned them (empty when nothing matches).
    """
    return [match.text.strip() for match in soup.find_all(tag, attributes)]

 
# Scratch check of the GLIDE lookup against a single disaster page.
# `soup` has to exist before this cell (and the scratch cells below) can run,
# so fetch the sample page here instead of relying on a value left over from
# an earlier kernel session.
res = requests.get(relief_web_urls[4])
soup = BeautifulSoup(res.text)

tag = 'dd'
attributes = {'class':'rw-entity-meta__tag-value rw-entity-meta__tag-value--glide rw-entity-meta__tag-value--simple rw-entity-meta__tag-value--last'}

# The lookup itself goes through the shared field table rather than the
# hand-written `tag` / `attributes` values above.
get_discrete_tag_text(soup, relief_web_fields['glide'][0], attributes = relief_web_fields['glide'][1])




#<dd class="rw-entity-meta__tag-value--status--ongoing rw-entity-meta__tag-value rw-entity-meta__tag-value--status rw-entity-meta__tag-value--simple">

['FL-2023-000040-TUR']

In [66]:
# Collect every hyperlink (<a> element) found anywhere in the parsed page.
links = soup.find_all('a')

# Locate the <h2> element with id 'overview-title' (original note: "this gets the desc").
target_tag = soup.find(
    'h2',
    attrs={'class': 'cd-block-title rw-entity-text__title', 'id': 'overview-title'},
)

def extract_status(soup):
    """Read the disaster status from the page's disaster <article> element(s).

    Looks at every <article> carrying the full disaster class string and takes
    its 'data-disaster-status' attribute. When several articles carry a
    non-empty status the last one wins; when none do, 'unknown' is returned.
    """
    article_class = 'taxonomy-term taxonomy-term--disaster taxonomy-term--full taxonomy-term--disaster--full rw-page--sectioned rw-article rw-article--disaster'
    statuses = [
        article.get('data-disaster-status')
        for article in soup.find_all('article', attrs={'class': article_class})
    ]

    # Walk backwards so the last non-empty status is the one returned.
    for value in reversed(statuses):
        if value:
            return value
    return 'unknown'


def extract_metadata(soup):
    """Return the page's Open Graph title and description.

    Scans every <meta> element and reads the 'content' attribute of those
    whose 'property' is 'og:title' or 'og:description'. If a property occurs
    more than once, the last occurrence wins.

    Returns:
        (title, description) tuple; either element is None when the
        corresponding <meta> tag is not present on the page.
    """
    # Start from None so a page lacking either tag yields None instead of
    # raising UnboundLocalError at the return statement.
    title = None
    description = None

    for tag in soup.find_all('meta'):
        prop = tag.get('property')
        if prop == 'og:title':
            title = tag.get('content')
        elif prop == 'og:description':
            description = tag.get('content')

    return title, description
    
# Try the metadata extractor on the page currently held in `soup`.
title, description = extract_metadata(soup=soup)

def extract_affected_countries(soup):
    """Return the names of the countries listed in the page's 'countries' section.

    Finds the <section id="countries"> element and, inside it, every <h3>
    with class 'rw-river-article__title'; each heading's stripped text is one
    country name.

    Returns:
        List of country names in page order. Empty when the page has no
        'countries' section or the section contains no matching headings.
    """
    countries_section = soup.find('section', id='countries')

    # find() returns None when the section is absent; report "no countries"
    # rather than failing with AttributeError on the lookup below.
    if countries_section is None:
        return []

    country_titles = countries_section.find_all('h3', class_='rw-river-article__title')
    return [heading.text.strip() for heading in country_titles]

# Try the country extractor on the page currently held in `soup` and show the result.
x = extract_affected_countries(soup=soup)
print(x)

def extract_content(soup):
    """Return the overview text of a disaster page, one entry per paragraph.

    Looks up the <div id="overview-content"> element (class
    'rw-entity-text__content') and walks its <p> children.

    Returns:
        List of ``[paragraph_text, links]`` pairs, where ``paragraph_text`` is
        the stripped paragraph text and ``links`` is the list of href values
        of the <a> elements inside that paragraph, or None when the paragraph
        has no usable link. Empty list when the page has no overview block.
    """
    target_tag = soup.find('div', {'class': 'rw-entity-text__content', 'id': 'overview-content'})

    # Without an overview block there is nothing to extract; return an empty
    # list instead of hitting UnboundLocalError on `content`.
    if target_tag is None:
        return []

    content = []
    # Within the text content, take it paragraph by paragraph.
    for p in target_tag.find_all('p'):
        # Anchors without an href are skipped so the list never holds None entries.
        hrefs = [href for href in (link.get('href') for link in p.find_all('a')) if href]

        # If there are no urls, store None so fillna/bfill can deal with it later.
        content.append([p.text.strip(), hrefs or None])
    return content


#x = extract_content(soup)
#x[1]

['Türkiye']


In [67]:
#load text to df
df_reliefweb_disaster_summary = pd.DataFrame(columns = ['record_type','source_url','glide_id','source_level_country','source_title','source_desc','source_original_text','reference_url','disaster_status'])
for url in relief_web_urls:
    print (url)
    res = requests.get(url)
    soup = BeautifulSoup(res.text)

    glide_id = get_discrete_tag_text(soup, relief_web_fields['glide'][0], attributes = relief_web_fields['glide'][1])[0]

    status = extract_status(soup)
    title, description = extract_metadata(soup)
    countries_affected = extract_affected_countries(soup)
    content = extract_content(soup)

    for c in content:
        row = ['disaster summary',url,glide_id,countries_affected,title,description,]
        row.extend(c)
        row.append(status)
        df_reliefweb_disaster_summary.loc[len(df_reliefweb_disaster_summary)] = row


    
    
    

https://reliefweb.int/disaster/eq-2023-000015-tur
https://reliefweb.int/disaster/dr-2021-000022-afg
https://reliefweb.int/disaster/ff-2023-000133-afg
https://reliefweb.int/disaster/eq-2023-000184-afg
https://reliefweb.int/disaster/fl-2023-000040-tur


In [68]:

def extract_reliefweb_summary_reference(text):
    """Split a trailing "(source, date)" citation off a summary paragraph.

    ReliefWeb disaster summary text ends with a parenthetical reference to a
    detailed source, e.g. ``... in the coming days. (OCHA, 16 Feb 2023)``.
    This function finds that reference and separates it from the body text.

    Parameters:
        text: paragraph text. Non-string values (e.g. None/NaN) are passed
            through unchanged with no citation.

    Returns:
        pandas Series with keys:
            'text'          - the paragraph without the citation (stripped),
                              or the input unchanged when no citation is found
            'source'        - the cited organisation(s), or None
            'reported_date' - the cited date string, e.g. '16 Feb 2023', or None
    """
    source = None
    reported_date = None

    if isinstance(text, str):
        # Match "(<source>, <day> <month> <year>)" at the very end of the text,
        # tolerating trailing whitespace.
        #   group 1: the source - anything except parentheses, so names such as
        #            "Gov't of Türkiye", "UN-Habitat" or "WHO/UNICEF" are accepted
        #   group 2: the date, e.g. '16 Feb 2023'
        source_and_date = re.search(r'\(([^()]+),\s*(\d+ \w+ \d{4})\)\s*$', text)

        if source_and_date:
            source = source_and_date.group(1).strip()
            reported_date = source_and_date.group(2)

            # Now that we have the metadata in hand, remove it from the text.
            text = text[:source_and_date.start()].strip()

    return pd.Series({'text':text, 'source':source, 'reported_date':reported_date})


# Worked example: the trailing "(source, date)" citation is split off the text.
extract_reliefweb_summary_reference(
    text='is expected to increase in the coming days/weeks. (OCHA asdf, 16 Feb 2023)'
)

text             is expected to increase in the coming days/weeks.
source                                                   OCHA asdf
reported_date                                          16 Feb 2023
dtype: object

In [69]:
# Split every paragraph into its body text and its trailing "(source, date)" citation.
# The parsed 'source' value lands in the 'authoring_org' column (columns are
# matched by position).
df_reliefweb_disaster_summary[['text','authoring_org','reported_date']] = df_reliefweb_disaster_summary['source_original_text'].apply(extract_reliefweb_summary_reference)

# Paragraphs without their own link/citation take the values of the next
# paragraph that has them (presumably the citation closes the run of
# paragraphs it covers - confirm). The back-fill is done per source page so
# that values never bleed from one disaster page into the trailing
# paragraphs of the previous one.
reference_columns = ['reference_url','authoring_org','reported_date']
df_reliefweb_disaster_summary[reference_columns] = (
    df_reliefweb_disaster_summary
    .groupby('source_url')[reference_columns]
    .bfill()
)


In [70]:
# Write the summary table to CSV without the index column.
# 'utf-8-sig' is UTF-8 with a leading byte-order mark.
# NOTE(review): the destination is a hard-coded local Windows path.
df_reliefweb_disaster_summary.to_csv(
    "c://temp//foo.csv",
    index=False,
    encoding='utf-8-sig',
)

In [72]:
# Distinct disaster_status values present in the table.
set(df_reliefweb_disaster_summary['disaster_status'])

{'ongoing', 'past'}

In [73]:
# Display the assembled summary table (the rendered output elides the middle rows).
df_reliefweb_disaster_summary

Unnamed: 0,record_type,source_url,glide_id,source_level_country,source_title,source_desc,source_original_text,reference_url,disaster_status,text,authoring_org,reported_date
0,disaster summary,https://reliefweb.int/disaster/eq-2023-000015-tur,EQ-2023-000015-TUR,"[Syrian Arab Republic, Türkiye]",Türkiye/Syria: Earthquakes - Feb 2023,"Humanitarian situation reports, response plans...","On 6 February, a 7.7 magnitude earthquake stru...",[https://reliefweb.int/node/3930732/],ongoing,"On 6 February, a 7.7 magnitude earthquake stru...",OCHA,6 Feb 2023
1,disaster summary,https://reliefweb.int/disaster/eq-2023-000015-tur,EQ-2023-000015-TUR,"[Syrian Arab Republic, Türkiye]",Türkiye/Syria: Earthquakes - Feb 2023,"Humanitarian situation reports, response plans...",UN and partners are preparing the first cross-...,[https://reliefweb.int/node/3930732/],ongoing,UN and partners are preparing the first cross-...,OCHA,8 Feb 2023
2,disaster summary,https://reliefweb.int/disaster/eq-2023-000015-tur,EQ-2023-000015-TUR,"[Syrian Arab Republic, Türkiye]",Türkiye/Syria: Earthquakes - Feb 2023,"Humanitarian situation reports, response plans...","More than 4,400 deaths and 8,100 injuries have...",[https://reliefweb.int/node/3934681/],ongoing,"More than 4,400 deaths and 8,100 injuries have...",OCHA,17 Feb 2023
3,disaster summary,https://reliefweb.int/disaster/eq-2023-000015-tur,EQ-2023-000015-TUR,"[Syrian Arab Republic, Türkiye]",Türkiye/Syria: Earthquakes - Feb 2023,"Humanitarian situation reports, response plans...",The UN and humanitarian partners launched a Fl...,[https://reliefweb.int/node/3933829/],ongoing,The UN and humanitarian partners launched a Fl...,OCHA,15 Feb 2023
4,disaster summary,https://reliefweb.int/disaster/eq-2023-000015-tur,EQ-2023-000015-TUR,"[Syrian Arab Republic, Türkiye]",Türkiye/Syria: Earthquakes - Feb 2023,"Humanitarian situation reports, response plans...","In Türkiye, over 36,100 people were killed and...",[https://reliefweb.int/node/3934516/],ongoing,"In Türkiye, over 36,100 people were killed and...",OCHA,16 Feb 2023
...,...,...,...,...,...,...,...,...,...,...,...,...
83,disaster summary,https://reliefweb.int/disaster/fl-2023-000040-tur,FL-2023-000040-TUR,[Türkiye],Türkiye: Floods - Mar 2023,"Humanitarian situation reports, response plans...",The damage of the flooding and what is left be...,[https://reliefweb.int/node/3950212/],past,The damage of the flooding and what is left be...,Turkish Red Crescent Society,04 Apr 2023
84,disaster summary,https://reliefweb.int/disaster/fl-2023-000040-tur,FL-2023-000040-TUR,[Türkiye],Türkiye: Floods - Mar 2023,"Humanitarian situation reports, response plans...",Heavy rainfall has been affecting northern Tür...,[https://reliefweb.int/node/3968566/],past,Heavy rainfall has been affecting northern Tür...,ECHO,06 June 2023
85,disaster summary,https://reliefweb.int/disaster/fl-2023-000040-tur,FL-2023-000040-TUR,[Türkiye],Türkiye: Floods - Mar 2023,"Humanitarian situation reports, response plans...",The Disaster and Emergency Management Presiden...,[https://reliefweb.int/node/3968566/],past,The Disaster and Emergency Management Presiden...,ECHO,06 June 2023
86,disaster summary,https://reliefweb.int/disaster/fl-2023-000040-tur,FL-2023-000040-TUR,[Türkiye],Türkiye: Floods - Mar 2023,"Humanitarian situation reports, response plans...",Heavy rainfall has been affecting the Black Se...,[https://reliefweb.int/node/3979656/],past,Heavy rainfall has been affecting the Black Se...,ECHO,13 July 2023
