## New 2/26/2024

## This serves as the core logic for extracting content from reliefweb

### Once this is finished, it replaces reliefweb_situation_reports

## Process Steps
1) get the latest 500 situation reports from reliefweb
2) process the json and build a dataframe structured such that:
   1) each paragraph has its own row
   2) each identified reference url in that paragraph is parsed out along with other metadata
3) write resulting dataframe to the configured output (Excel file or postgres db) - note that for postgres this completely replaces the previously-existing table

## Cautions
Occasionally the reliefweb api call will return an error - to the effect that some chunk of the response is bad.
I've left that unhandled. There is conflicting evidence as to whether it's an intermittent issue, maybe caused by rate limiting,
 or an issue related to a specific summary's format. Both seem to have been the case at various times.
 

In [7]:
import requests
import pandas as pd
import uuid
import json
import os
import uuid
import re
import importlib

from datetime import datetime

import sys
sys.path.append('utilities')
import basic_utilities as utils

In [8]:
# Key configuration values

# ReliefWeb reports endpoint (the appname query parameter is embedded in the URL).
api_endpoint = 'https://api.reliefweb.int/v1/reports?appname=amcross'

# Where the resulting dataframe is written: 'excel' or 'postgres'.
output_format = 'excel'

# Name passed to whichever persistence routine is selected.
output_name = 'rw_disaster_situation_reports'

# Connection settings; only needed if you plan to connect to a database.
db_conf = dict(
    host="xxx",
    port='5432',
    database="postgres",
    user="postgres",
    password="xxx",
)



In [9]:
def get_rw_situation_reports(limit=500, timeout=60):
    """Fetch the latest ReliefWeb reports whose format is 'Situation Report'.

    Args:
        limit: Maximum number of reports to request. The default is kept
            high in case the job doesn't run for a long time.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without it a stalled connection would block the job forever.

    Returns:
        The value stored under the response's ``'data'`` key when the API
        answers with HTTP 200 (the rest of the response envelope is
        discarded). On any other status the status code and body are
        printed and ``None`` is returned.

    Raises:
        requests.exceptions.RequestException: Connection problems and
            timeouts are not handled here and propagate to the caller.
    """
    params = {
        'appname': 'amcross',
        'profile': 'full',
        'preset': 'latest',
        'limit': limit,
        'query[fields][]': 'format.name',
        'query[value]': 'Situation Report',
        # ,'filter[field]': "id",'filter[value][from]':max_id
    }

    # Make the API request; the timeout keeps a dead connection from hanging.
    response = requests.get(api_endpoint, params=params, timeout=timeout)

    # Anything other than 200 is reported and signalled to the caller as None.
    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return None

    # Parse the JSON payload and hand back only its 'data' entry.
    return response.json()['data']



         

In [10]:
def extract_themes(j):
    """Return the theme names of a report as one lower-cased string.

    ``j`` must already be levelled up to the report's ``'fields'`` mapping.
    Each entry of ``j['theme']`` contributes its ``'name'``, lower-cased;
    the names are joined with ``'; '``. Returns ``None`` when there is no
    ``'theme'`` entry.
    """
    theme_entries = j.get('theme')
    if theme_entries is None:
        return None

    return '; '.join(entry['name'].lower() for entry in theme_entries)

def parse_json(j):
    """Append one row per paragraph of a report to the module-level dataframe.

    ``j`` is a single report item from the API response: it must provide
    ``'href'`` and a ``'fields'`` mapping. The report body is split on blank
    lines, fragments of three characters or fewer are dropped, and every
    remaining paragraph is appended as a row to
    ``df_reliefweb_situation_report`` with a running paragraph index.

    A report that lacks any of the fields read below is skipped as a whole
    (no rows are written for it) and a one-line message naming the report
    and the problem is printed. Only lookup/shape errors are treated this
    way; anything else, including KeyboardInterrupt, propagates.
    """
    try:
        reference_url = j['href']
        fields = j['fields']

        # Only the first linked disaster's GLIDE id is kept, if there is one.
        glide_id = None
        disaster = fields.get('disaster')
        if disaster:
            glide_id = disaster[0]['glide']

        rec_id = fields['id']
        title = fields['title']
        original_text = fields['body']
        # NOTE(review): link_to_doc and primary_country_iso3 are not written
        # to the output. The lookups are kept so that reports lacking these
        # fields continue to be skipped exactly as before.
        link_to_doc = fields['url_alias']
        file_url = fields['file'][0]['url']
        primary_country_iso3 = fields['primary_country']['iso3']
        primary_country = fields['primary_country']['shortname']
        author_org = fields['source'][0]['shortname']
        report_date = fields['date']['original']
        themes = extract_themes(fields)

        # 2 newlines will not necessarily reliably break out every paragraph
        # but it's better than the alternative where some sentences will get
        # cut in half
        paragraphs = [p for p in original_text.split("\n\n") if len(p) > 3]
    except (KeyError, IndexError, TypeError, AttributeError) as err:
        # Malformed or incomplete record: report it instead of hiding it.
        report_ref = j.get('href', '<unknown>') if isinstance(j, dict) else '<unknown>'
        print(f"Skipping report {report_ref}: {type(err).__name__}: {err}")
        return

    for idx_para, paragraph in enumerate(paragraphs):
        # The paragraph text appears twice on purpose: it fills both the
        # 9th and the 11th column of the receiving dataframe.
        row = ['situation report', reference_url, glide_id, rec_id, idx_para,
               primary_country, title, themes, paragraph, file_url, paragraph,
               author_org, report_date]
        df_reliefweb_situation_report.loc[len(df_reliefweb_situation_report)] = row



In [11]:
# Prep the receiving dataframe; parse_json appends its rows to this name.
df_reliefweb_situation_report = pd.DataFrame(columns = ['record_type','source_url','glide_id','doc_id','idx_para','source_level_country','source_title','source_desc',
                                                        'source_original_text','reference_url','text','authoring_org','reported_date'])

situation_reports = get_rw_situation_reports(limit=500)

# get_rw_situation_reports returns None when the API does not answer with
# HTTP 200. Stop here with a clear message instead of failing on iteration,
# and before anything gets persisted.
if situation_reports is None:
    raise RuntimeError("ReliefWeb request failed; no situation reports to process")

for situation_report in situation_reports:
    parse_json(situation_report)



## Data Persistence

In [12]:
# Persist the dataframe according to the configured output format.
# NOTE(review): persist_to_postgres is called unqualified, but no definition
# or import of that name is visible in this file - confirm where it comes from.
if output_format == 'postgres':
    persist_to_postgres(db_conf, output_name, df_reliefweb_situation_report)
elif output_format == 'excel':
    outfile = utils.write_to_excel(df_reliefweb_situation_report, filename=output_name)
    print(f"df written to: {outfile}")
else:
    # Neither supported target was configured; report it and write nothing.
    print(f"unknown persistence type: {output_format}")

df written to: stored_data\rw_disaster_situation_reports_8a4af19ed04345f2b41d8de4adb98d4e.xlsx
