In [1]:
import pandas as pd
import numpy as np

## 1) Merge, preprocess files:


In [2]:
organizations = pd.read_csv('data_secondary_tags/reliability/organization_organization.csv')
organizations.head(2)

Unnamed: 0,id,title,short_name,long_name,logo_id,organization_type_id,url,created_at,created_by_id,modified_at,modified_by_id,verified,parent_id,relief_web_id
0,6123,ncdc,ncdc,ncdc,,,,2020-03-02 08:17:34.823758+00,,2020-03-02 08:17:34.823785+00,,False,,
1,4,Terre Solidali Onlus,Terre Solidali,Terre Solidali Onlus,,,http://www.terresolidali.org/,2019-04-29 17:58:56.821129+00,,2019-04-29 17:58:57.044937+00,,True,,


In [3]:
data_source = pd.read_csv('data_secondary_tags/reliability/leads.csv')
#sorted(data_source.source_id.unique())

In [4]:
data_source[data_source.source_id==2]

Unnamed: 0,id,created_at,modified_at,title,source_raw,confidentiality,status,published_on,text,url,...,modified_by_id,project_id,attachment_id,source_type,lead_group_id,client_id,author_raw,author_id,source_id,priority
9758,45614,2020-12-15 18:58:42.861631+00,2021-04-15 20:28:20.112782+00,COVID-19 and Disruptions to Vulnerable Rural L...,,unprotected,validated,2020-08-01,,https://www.cgap.org/sites/default/files/publi...,...,3009,2333,,website,,,,,2.0,100


In [5]:
pd.merge(
    organizations[['id','short_name', 'long_name']].rename(columns={'id':'source_id'}),
    data_source,
    how='outer',
    on='source_id',
).to_csv('data_secondary_tags/reliability/data_verification.csv')

In [6]:

data_source = pd.read_csv('data_secondary_tags/reliability/data_verification.csv',index_col=0)\
                    .drop(columns=['created_at',
                                   'modified_at',
                                   'confidentiality',
                                   'status',
                                   'published_on',
                                   'text',
                                   'created_by_id',
                                   'modified_by_id',
                                   'priority',
                                   'client_id',
                                   'author_id',
                                   'attachment_id',
                                   'project_id',
                                   'url'])

data_reliability = pd.read_csv('data_secondary_tags/reliability/reliabillity_tags.csv')\
                    .drop(columns=['Unnamed: 0',
                                    'Unnamed: 0.1',
                                   'analysis_framework_id',
                                   'project_id',
                                   'title',
                                   'exportable_id'])


In [7]:
merged_reliability = pd.merge(
                        data_reliability,
                        data_source,
                        how="outer",
                        left_on='lead_id',
                        right_on='id').drop(columns=['id',
                                                     'title',
                                                     'excerpt'])

In [8]:
merged_reliability.shape

(148890, 11)

In [9]:
merged_reliability.head(2)

Unnamed: 0,lead_id,entry_id,tag_value,source_id,short_name,long_name,source_raw,website,source_type,lead_group_id,author_raw
0,10607.0,25639.0,Usually reliable,,,,reliefweb,reliefweb.int,website,,
1,10607.0,25691.0,Usually reliable,,,,reliefweb,reliefweb.int,website,,


* Organisations have long and short name, sometimes one only
* Create one column `name` to merge short and long names
* Create one columns `postprocessed_name` to postprocess `name` column: when `name` column only has long or short name, add the missing name

In [10]:
def get_full_name (row):
    """
    Get full name, using short name, long name of organizations df
    """
    short_str = row.short_name
    long_str = row.long_name
    
    bool_short_name = pd.isna(short_str)
    bool_long_name = pd.isna(long_str)
    
    if bool_short_name and bool_long_name:
        return np.nan
    if bool_long_name:
        return short_str
    if bool_short_name:
        return long_str
    if short_str.upper() in long_str.upper():
        return long_str
    return row.short_name + ' / ' + row.long_name


In [11]:
merged_reliability['name'] = merged_reliability.apply(lambda x: get_full_name(x), axis=1)

In [12]:
bool_two_items = merged_reliability['name'].apply(lambda x: len(str(x).split('/'))>1 )
list_two_items = merged_reliability[bool_two_items].name.unique()


In [13]:
def post_process_names (row, list_tow_items=list_two_items):
    """
    if there is only short name or long name -> get full name (long and short names)
    """
    if pd.isna(row) :
        return row
    
    if len(str(row).split('/'))<2:
        for nam in list_tow_items:
            if str(row).upper() in ''.join(nam.split(' ')).upper():
                return nam
    return row

In [14]:
merged_reliability['postprocessed_name'] = merged_reliability.name.apply(lambda x: post_process_names(x))

In [15]:
merged_reliability = merged_reliability.drop(columns=['short_name', 'long_name'])

In [16]:
merged_reliability = merged_reliability[~merged_reliability.tag_value.isna()]\
                            .drop(columns=['lead_id', 'source_id', 'name'])

In [17]:
merged_reliability.head(2)

Unnamed: 0,entry_id,tag_value,source_raw,website,source_type,lead_group_id,author_raw,postprocessed_name
0,25639.0,Usually reliable,reliefweb,reliefweb.int,website,,,
1,25691.0,Usually reliable,reliefweb,reliefweb.int,website,,,


In [18]:
htps_data = merged_reliability[merged_reliability.website.str.contains('/'.upper(), na=False)]

## 2) Website data:
* I chose to omit the website `reliefweb` because it contains many different sources, it is not a source itself

In [19]:
def omit_https(url:str)->str:
    return url.split('/')[-1]

def extract_core_name (name:str)->str:
    """
    Extract core name from website url
    """
    split_list = name.split('.')
    
    if len(split_list)<=2:
        return omit_https(split_list[0])
    if split_list[1]=='com':
        return omit_https(split_list[0])
    else:
        return split_list[1]

In [20]:
website_data = merged_reliability[merged_reliability.source_type=='website']\
                    .drop(columns=['source_type', 'lead_group_id'])\
                    
website_data = website_data[(~website_data.website.isna()) & \
                            (website_data.website.apply(lambda x: 'reliefweb' not in x))]

website_data['core_name'] = website_data.website.apply(lambda x: extract_core_name(x))
website_data.head()

Unnamed: 0,entry_id,tag_value,source_raw,website,author_raw,postprocessed_name,core_name
21,102051.0,Usually reliable,,https://www.dailysabah.com/turkey/turkey-repor...,,DS / Daily Sabah,dailysabah
22,102050.0,Usually reliable,,https://www.dailysabah.com/turkey/turkey-repor...,,DS / Daily Sabah,dailysabah
23,102052.0,Usually reliable,,https://www.dailysabah.com/turkey/turkey-repor...,,DS / Daily Sabah,dailysabah
24,102053.0,Usually reliable,,https://www.dailysabah.com/turkey/turkey-repor...,,DS / Daily Sabah,dailysabah
272,96087.0,Usually reliable,,displacement.iom.int,,IOM / International Organization for Migration,iom


In [21]:
tmp_df = website_data[['core_name', 'postprocessed_name']].dropna()\
        .groupby(['core_name'])['postprocessed_name'].agg(pd.Series.mode).to_frame().reset_index()

In [22]:
website_to_name_dict = dict(zip(tmp_df.core_name, tmp_df.postprocessed_name))

In [23]:
website_to_name_dict

{'1lockers': '1lockers',
 '2001': '2001 Live',
 '237actu': '237actu',
 '7sur7': '7sur7',
 'El espectador': 'El Espectador',
 'FUPAD': 'FUPAD / Fundación Panamericana para el Desarrollo -Colombia',
 'UNFPA': 'UNFPA / United Nations Population Fund',
 'WHO': 'WHO / World Health Organization',
 'aa': 'AA / Andalou Agency',
 'aawsat': 'Asharq Al-Awsat',
 'abc': 'ABC / Australian Broadcasting Corporation',
 'acaps': 'ACAPS / Assessment Capacities Project',
 'acento': 'acento',
 'acleddata': 'acleddata',
 'acnur': 'UNHCR / United Nations High Commissioner for Refugees',
 'acpcongo': 'acpcongo',
 'actalliance': 'DIAL / Digital Impact Alliance',
 'acted': 'ACTED / Agency for Technical Cooperation and Development',
 'actionagainsthunger': 'ACF / Action Against Hunger',
 'actu24': 'actu24',
 'actu30': 'actu30',
 'actualite': 'Actualite.cd',
 'acturdc': array(['Actu RDC', 'acturdc'], dtype=object),
 'acu-sy': 'ACU / \u200eAssistance Coordination Unit',
 'adiac-congo': 'adiac-congo',
 'afdb': 'AfD

## 2) NON website data
* `source_raw`: data imported from taggers
* `postprocessed_name`: data imported and preprocessed from `organisations` dataframe

* Merge these two columns to create one column that is best for 

In [24]:
non_website_data = merged_reliability[merged_reliability.source_type!='website']\
                    .drop(columns=['lead_group_id', 'source_type'])\
        

In [25]:
non_website_data.head()

Unnamed: 0,entry_id,tag_value,source_raw,website,author_raw,postprocessed_name
255,33155.0,Usually reliable,Philippines Red Cross,,,
256,33156.0,Usually reliable,Philippines Red Cross,,,
257,33150.0,Usually reliable,Philippines Red Cross,,,
258,33141.0,Usually reliable,Philippines Red Cross,,,
259,33153.0,Usually reliable,Philippines Red Cross,,,


In [26]:
def strip_string(x:str)->str:
    return ''.join(x.split(' ')).upper()

In [27]:
def get_names(x:pd.Series)->str:
    """
    merge postprocessed_name and source_raw
    """
    post_name = x.postprocessed_name
    raw_name = x.source_raw
    if pd.isna(raw_name) and pd.isna(post_name):
        return 'UNKNOWN'
    
    if pd.isna(raw_name):
        return post_name
    
    if pd.isna(post_name):
        return raw_name
    
    return post_name 

In [28]:
non_website_data['postprocessed_name'] =\
    non_website_data.apply(lambda x: get_names(x),axis=1)

In [29]:
non_website_data.head(2)

Unnamed: 0,entry_id,tag_value,source_raw,website,author_raw,postprocessed_name
255,33155.0,Usually reliable,Philippines Red Cross,,,Philippines Red Cross
256,33156.0,Usually reliable,Philippines Red Cross,,,Philippines Red Cross


**Remaining work**
* Cleaning on `postprocessed_name` for `website_data` and `non_website_data`
* concatenate `website_data` and `non_website_data` on the `postprocessed_name`
* Generating dict $\{postprocessed_-name: reliability\}$

In [30]:
whole_df = pd.concat([website_data[['postprocessed_name', 'tag_value']],
                      non_website_data[['postprocessed_name', 'tag_value']]])

In [31]:
name_to_reliability_dict = dict(zip(whole_df.postprocessed_name, whole_df.tag_value))


In [32]:
len(name_to_reliability_dict)

999