In [21]:
import pandas as pd
import numpy as np
import os
from ast import literal_eval
import re

## 1) Merge, preprocess files:


In [2]:
organizations = pd.read_csv('reliability/organizations.csv').rename(
    columns={'id':'source_id', 'title':'organization_title'}
)[['source_id', 'organization_title']]
organizations.head()

Unnamed: 0,source_id,organization_title
0,6123,ncdc
1,4,Terre Solidali Onlus
2,6204,vecer
3,6369,levidia
4,5228,newsbrief


In [3]:
data_source = pd.read_csv('reliability/leads.csv')[[
    'id', 'title', 'source_raw',
       'source_type', 'lead_group_id', 'author_raw', 'author_id',
       'source_id'
]].rename(columns={'id':'lead_id'})
data_source.columns

Index(['lead_id', 'title', 'source_raw', 'source_type', 'lead_group_id',
       'author_raw', 'author_id', 'source_id'],
      dtype='object')

In [4]:
DATA_PATH = os.path.join(
    '..', '..', '..', '..', "data", "frameworks_data", 'data_v0.7.1'
)
full_data = pd.read_csv(os.path.join(DATA_PATH, 'full_dataset_with_translations.csv'))


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [5]:
full_data.columns

Index(['entry_id', 'excerpt', 'analysis_framework_id', 'lead_id', 'project_id',
       'verified', 'sectors', 'subpillars_2d', 'subpillars_1d', 'geo_location',
       'specific_needs_groups', 'severity', 'info_date', 'reliability',
       'affected_groups_level_0', 'affected_groups_level_1',
       'affected_groups_level_2', 'affected_groups_level_3', 'age', 'gender',
       'source_type', 'url', 'website', 'lang', 'translation_en',
       'translation_fr', 'translation_es'],
      dtype='object')

In [6]:
reliability_data = full_data[~full_data.reliability.isna()][[
    'analysis_framework_id', 'lead_id', 'project_id','reliability','verified',
    'source_type', 'url', 'website'
]]
reliability_data['reliability'] = reliability_data.reliability.apply(literal_eval)
reliability_data = reliability_data[reliability_data.reliability.apply(
    lambda x: len(x) == 1
)]
reliability_data

Unnamed: 0,analysis_framework_id,lead_id,project_id,reliability,verified,source_type,url,website
0,137.0,6334.0,322.0,[Usually reliable],False,website,https://reliefweb.int/sites/reliefweb.int/file...,reliefweb.int
1,1306.0,67488.0,2225.0,[Usually reliable],False,website,https://blogs.worldbank.org/education/free-pri...,blogs.worldbank.org
2,829.0,41125.0,1898.0,[Usually reliable],False,website,https://www.acaps.org/sites/acaps/files/key-do...,https://www.acaps.org
3,829.0,41081.0,1184.0,[Usually reliable],False,website,https://reliefweb.int/sites/reliefweb.int/file...,reliefweb.int
4,1306.0,67488.0,2225.0,[Usually reliable],False,website,https://blogs.worldbank.org/education/free-pri...,blogs.worldbank.org
...,...,...,...,...,...,...,...,...
157943,829.0,41882.0,1187.0,[Usually reliable],,website,https://redhum.org/documento/3680634,redhum.org
157944,829.0,41882.0,1187.0,[Usually reliable],,website,https://redhum.org/documento/3680634,redhum.org
157945,829.0,60143.0,2074.0,[Usually reliable],,website,https://reliefweb.int/sites/reliefweb.int/file...,reliefweb.int
157946,829.0,60058.0,1232.0,[Usually reliable],,website,https://reliefweb.int/sites/reliefweb.int/file...,reliefweb.int


In [7]:
reliability_to_score = {
    'Usually reliable': 0.75,
    'Fairly Reliable': 0.5,
    'Completely Reliable': 1,
    'Unreliable': 0,
    'Not Usually Reliable': 0.25
}

In [8]:
reliability_data['reliability_score'] = reliability_data.reliability.apply(
    lambda x: reliability_to_score[x[0]]
)

In [9]:
data_source[data_source.source_id==2]

Unnamed: 0,lead_id,title,source_raw,source_type,lead_group_id,author_raw,author_id,source_id
9758,45614,COVID-19 and Disruptions to Vulnerable Rural L...,,website,,,,2.0


In [10]:
organizations_source_merged = pd.merge(
    organizations,
    data_source,
    how='outer',
    on='source_id',
)

In [11]:
organizations_source_merged.columns

Index(['source_id', 'organization_title', 'lead_id', 'title', 'source_raw',
       'source_type', 'lead_group_id', 'author_raw', 'author_id'],
      dtype='object')

In [12]:
merged_reliability = pd.merge(
                        reliability_data,
                        organizations_source_merged,
                        how="outer",
                        on='lead_id')
merged_reliability.columns

Index(['analysis_framework_id', 'lead_id', 'project_id', 'reliability',
       'verified', 'source_type_x', 'url', 'website', 'reliability_score',
       'source_id', 'organization_title', 'title', 'source_raw',
       'source_type_y', 'lead_group_id', 'author_raw', 'author_id'],
      dtype='object')

In [13]:
merged_reliability.shape

(166799, 17)

In [14]:
merged_reliability.head(2)

Unnamed: 0,analysis_framework_id,lead_id,project_id,reliability,verified,source_type_x,url,website,reliability_score,source_id,organization_title,title,source_raw,source_type_y,lead_group_id,author_raw,author_id
0,137.0,6334.0,322.0,[Usually reliable],False,website,https://reliefweb.int/sites/reliefweb.int/file...,reliefweb.int,0.75,,,,,,,,
1,137.0,6334.0,322.0,[Usually reliable],False,website,https://reliefweb.int/sites/reliefweb.int/file...,reliefweb.int,0.75,,,,,,,,


In [15]:
clean_reliability = merged_reliability[((~merged_reliability.organization_title.isna())
                                      & (~merged_reliability.reliability_score.isna())
                                       )
                                      ].drop(
    columns=['reliability', 'url', 'title', 'lead_group_id', 'website',
             'verified', 'project_id', 'analysis_framework_id', 'source_id', 'source_type_x']
)
clean_reliability

Unnamed: 0,lead_id,reliability_score,organization_title,source_raw,source_type_y,author_raw,author_id
26,41125.0,0.75,Assessment Capacities Project,,website,,
27,41125.0,0.75,Assessment Capacities Project,,website,,
28,41125.0,0.75,Assessment Capacities Project,,website,,
29,41125.0,0.75,Assessment Capacities Project,,website,,
30,41125.0,0.75,Assessment Capacities Project,,website,,
...,...,...,...,...,...,...,...
156680,22801.0,0.75,Redhum,,website,,3310.0
156689,32843.0,0.75,Redhum,,website,,3006.0
156690,39282.0,0.75,ReliefWeb,,website,,6910.0
156773,49659.0,0.75,ReliefWeb,,website,,


In [16]:
print('nb of different organizations:', clean_reliability.organization_title.unique().shape[0])

nb of different organizations: 950


In [34]:
organisation_names_df = pd.read_csv('organisations_classified.csv', index_col=0)
organisation_names_df['org_rank'] = organisation_names_df.index
organisation_names_df = organisation_names_df[(organisation_names_df['Publish Articles']) | organisation_names_df['Collect Articles']]
publish_organisations = organisation_names_df[organisation_names_df['Publish Articles']]
publish_organisations

Unnamed: 0,Organizations Names,Collect Articles,Publish Articles,Remarks,org_rank
,,,,,
5,dhakatribune,False,True,,5
7,International Organization for Migration,False,True,,7
8,impact-repository,False,True,,8
9,United Nations Office for the Coordination of ...,False,True,"9,17 are repetitive.",9
10,UNHCR,False,True,"10,13,26,66 are repetitive.",10
...,...,...,...,...,...
610,La Cuarta,False,True,,610
611,ecupunto,False,True,,611
612,UNHCR Innovation,False,True,,612


In [35]:
organisation_names_df

Unnamed: 0,Organizations Names,Collect Articles,Publish Articles,Remarks,org_rank
,,,,,
1,ReliefWeb,True,False,"1,6,414 are repetitive.",1
2,humanitarianresponse,True,False,"2,4 are repetitive.",2
3,Redhum,True,False,"3,80 are repetitive.",3
4,Humanitarian Response,True,False,,4
5,dhakatribune,False,True,,5
...,...,...,...,...,...
610,La Cuarta,False,True,,610
611,ecupunto,False,True,,611
612,UNHCR Innovation,False,True,,612


In [36]:
with_remarks_publish_organisations = organisation_names_df[organisation_names_df.Remarks.apply(lambda x: str(x)!='nan')]
list_remarks_links = with_remarks_publish_organisations.Remarks.tolist()
list_remarks_links = [s.replace(',', ', ') for s in list_remarks_links if 'https' not in s]
print(np.unique(list_remarks_links))
linked_numbers = ' '.join(with_remarks_publish_organisations.Remarks).replace(',', ', ')
list_repetitive = re.findall(
    "[-+]?[.]?[\d]+(?:,\d\d\d)*[\.]?\d*(?:[eE][-+]?\d+)?", 
    linked_numbers
    )
list_repetitive = [int(s) for s in list_repetitive]

['1, 6, 414 are repetitive.' '10, 13, 26, 66 are repetitive.'
 '102, 123 are same.' '119 and 179 are same.' '137, 192 are repetitive.'
 '144,  214 are same.' '16,  75 are repetitive.' '172, 256 are same.'
 '19, 90 are repetitive.' '199,  213 are same.' '2, 4 are repetitive.'
 '27,  95,  96 are repetitive.' '3, 80 are repetitive.'
 '41, 43,  160 are repetitive.' '42, 134 are repetitive.'
 '67, 86 are repetitive.' '9, 17 are repetitive.' 'same as 1, 6'
 'same as 101,  237' 'same as 310' 'same as 310,  346' 'same as 315'
 'same as 331' 'same as 443']


In [25]:
0.75-0.125

0.625

In [18]:
treated_df = clean_reliability.groupby('organization_title', as_index=False).agg({
    'reliability_score': 'mean',
    'source_type_y': 'count'
}).rename(columns={'source_type_y': 'number of occurences'}
).sort_values(by='number of occurences', ascending=False)
treated_df = treated_df[treated_df['number of occurences']>5]
treated_df.head(10).reset_index(drop=True, inplace=False)

Unnamed: 0,organization_title,reliability_score,number of occurences
0,ReliefWeb,0.748482,39868
1,humanitarianresponse,0.747556,14118
2,Redhum,0.748878,4234
3,Humanitarian Response,0.744434,3054
4,dhakatribune,0.75,2729
5,Reliefweb,0.749253,2342
6,International Organization for Migration,0.760995,2160
7,impact-repository,0.75,1886
8,United Nations Office for the Coordination of ...,0.752005,1870
9,UNHCR,0.751917,1565


In [43]:
final_reliability_df = pd.merge(
    right=organisation_names_df,
    left=treated_df,
    left_on='organization_title',
    right_on='Organizations Names'
).drop(['Organizations Names'], axis=1, inplace=False)
final_reliability_df.head()

Unnamed: 0,organization_title,reliability_score,number of occurences,Collect Articles,Publish Articles,Remarks,org_rank
0,ReliefWeb,0.748482,39868,True,False,"1,6,414 are repetitive.",1
1,humanitarianresponse,0.747556,14118,True,False,"2,4 are repetitive.",2
2,Redhum,0.748878,4234,True,False,"3,80 are repetitive.",3
3,Humanitarian Response,0.744434,3054,True,False,,4
4,dhakatribune,0.75,2729,False,True,,5


In [44]:
final_reliability_df[final_reliability_df.Remarks.apply(lambda x: '1791' in str(x))]

Unnamed: 0,organization_title,reliability_score,number of occurences,Collect Articles,Publish Articles,Remarks,org_rank


In [45]:
def get_list_repetitive(row):
    remarks_tmp = row.Remarks
    if str(remarks_tmp) == "nan" or "http" in str(remarks_tmp):
        return [row['org_rank']]
    else:
        number_repetitive = re.findall(
            "[-+]?[.]?[\d]+(?:,\d\d\d)*[\.]?\d*(?:[eE][-+]?\d+)?",
            remarks_tmp.replace(",", ", "),
        )
        number_repetitive = [int(s) for s in number_repetitive]
        if row['org_rank'] not in number_repetitive:
            number_repetitive += [row['org_rank']] 
        return number_repetitive

final_reliability_df['numbers'] = final_reliability_df.apply(lambda x: get_list_repetitive(x), axis=1)
final_reliability_df


Unnamed: 0,organization_title,reliability_score,number of occurences,Collect Articles,Publish Articles,Remarks,org_rank,numbers
0,ReliefWeb,0.748482,39868,True,False,"1,6,414 are repetitive.",1,"[1, 6, 414]"
1,humanitarianresponse,0.747556,14118,True,False,"2,4 are repetitive.",2,"[2, 4]"
2,Redhum,0.748878,4234,True,False,"3,80 are repetitive.",3,"[3, 80]"
3,Humanitarian Response,0.744434,3054,True,False,,4,[4]
4,dhakatribune,0.750000,2729,False,True,,5,[5]
...,...,...,...,...,...,...,...,...
586,La Cuarta,0.750000,6,False,True,,610,[610]
587,ecupunto,0.750000,6,False,True,,611,[611]
588,UNHCR Innovation,0.750000,6,False,True,,612,[612]
589,la-croix,0.750000,6,False,True,https://www.lacroixwater.com/,613,[613]


In [46]:
def merge(lsts):
    sets = [set(lst) for lst in lsts if lst]
    merged = True
    while merged:
        merged = False
        results = []
        while sets:
            common, rest = sets[0], sets[1:]
            sets = []
            for x in rest:
                if x.isdisjoint(common):
                    sets.append(x)
                else:
                    merged = True
                    common |= x
            results.append(common)
        sets = results
    return sets

merged_list = merge(final_reliability_df.numbers.tolist())
to_be_changed_rows = [list(l) for l in merged_list if len(l)>1]
for similar_list in to_be_changed_rows:
    final_score = 0
    total_occurences = 0
    for i in similar_list:
        row = final_reliability_df[final_reliability_df['org_rank']==i]
        nb_occurences = row['number of occurences'].values[0]
        reliability = row['reliability_score'].values
        final_score += reliability * nb_occurences
        total_occurences += nb_occurences
    final_score = final_score / total_occurences
    for i in similar_list:
        final_reliability_df.loc[final_reliability_df['org_rank']==i, 'reliability_score'] = final_score
final_reliability_df[final_reliability_df['org_rank'].isin([19, 90 ])]

Unnamed: 0,organization_title,reliability_score,number of occurences,Collect Articles,Publish Articles,Remarks,org_rank,numbers
18,REACH Initiative,0.747711,865,False,True,"19,90 are repetitive.",19,"[19, 90]"
86,REACH,0.747711,118,False,True,,90,[90]


In [51]:
def score_to_reliability(score):
    if score >0.875:
        return 'Completely reliable'
    elif score>0.625:
        return 'Usually reliable'
    elif score>0.375:
        return 'Fairly Reliable'
    elif score>0.125:
        return 'Not Usually Reliable'
    else:
        return 'Unreliable'

In [56]:
final_reliability_df['reliability'] = final_reliability_df.reliability_score.apply(score_to_reliability)
final_reliability_df.index = final_reliability_df['organization_title']
final_reliability_df

Unnamed: 0_level_0,organization_title,reliability_score,number of occurences,Collect Articles,Publish Articles,Remarks,org_rank,numbers,reliability
organization_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ReliefWeb,ReliefWeb,0.748597,39868,True,False,"1,6,414 are repetitive.",1,"[1, 6, 414]",Usually reliable
humanitarianresponse,humanitarianresponse,0.747001,14118,True,False,"2,4 are repetitive.",2,"[2, 4]",Usually reliable
Redhum,Redhum,0.748913,4234,True,False,"3,80 are repetitive.",3,"[3, 80]",Usually reliable
Humanitarian Response,Humanitarian Response,0.747001,3054,True,False,,4,[4],Usually reliable
dhakatribune,dhakatribune,0.750000,2729,False,True,,5,[5],Usually reliable
...,...,...,...,...,...,...,...,...,...
La Cuarta,La Cuarta,0.750000,6,False,True,,610,[610],Usually reliable
ecupunto,ecupunto,0.750000,6,False,True,,611,[611],Usually reliable
UNHCR Innovation,UNHCR Innovation,0.750000,6,False,True,,612,[612],Usually reliable
la-croix,la-croix,0.750000,6,False,True,https://www.lacroixwater.com/,613,[613],Usually reliable


In [59]:
authoring_df = final_reliability_df[final_reliability_df['Collect Articles']][
    ['reliability']
    ]
publishing_df = final_reliability_df[final_reliability_df['Publish Articles']][
    ['reliability']
    ]
publishing_df

Unnamed: 0_level_0,reliability
organization_title,Unnamed: 1_level_1
dhakatribune,Usually reliable
International Organization for Migration,Usually reliable
impact-repository,Usually reliable
United Nations Office for the Coordination of Humanitarian Affairs,Usually reliable
UNHCR,Usually reliable
...,...
La Cuarta,Usually reliable
ecupunto,Usually reliable
UNHCR Innovation,Usually reliable
la-croix,Usually reliable


In [62]:
authoring_org_dict = authoring_df.to_dict()
publishing_org_dict = publishing_df.to_dict()


In [63]:
reliability_dict = {
    'Authoring Organizations': authoring_org_dict['reliability'],
    'Publishing Organizations': publishing_org_dict['reliability']
    }


In [70]:
import codecs
import json

with codecs.open('reliability_dict_v2.txt', 'w', encoding='utf-8') as f:
    json.dump(reliability_dict, f, ensure_ascii=False)


In [69]:
import json

with open('reliability_dict.json', 'w') as fp:
    json.dump(reliability_dict, fp)

In [90]:
final_reliability_df[['organization_title', 'reliability_score', 'number of occurences']].to_csv('organisations_reliability_scores.csv', index=None)

In [69]:
final_reliability_df[final_reliability_df['rank'].isin([19, 90 ])]

Unnamed: 0,organization_title,reliability_score,number of occurences,Remarks,rank,numbers
13,REACH Initiative,0.747399,865,"19,90 are repetitive.",19,"[19, 90, 19]"
80,REACH,0.75,118,,90,[90]


In [26]:
treated_df = treated_df[treated_df['number of occurences']>5]
mylist = treated_df.organization_title.tolist()
myfile = "organisations.txt"
with open(myfile, 'w') as f:
    f.write("\n".join(mylist))

Unnamed: 0,author_id,reliability_score,counts
300,4558.0,0.752018,1115
144,3310.0,0.750996,1004
388,6949.0,0.752786,718
283,4307.0,0.750000,704
149,3400.0,0.772005,409
...,...,...,...
32,1168.0,0.750000,11
328,5014.0,0.750000,11
317,4945.0,0.750000,11
224,3619.0,0.750000,11
