# Project Data Sample Review 

In [1]:
import pandas as pd
import numpy as np
import os
from collections import Counter
%matplotlib inline
# Show up to 300 rows when displaying frames/series.
# Use the fully-qualified option key: a bare 'max_rows' is matched as a pattern
# against every registered option and becomes ambiguous (OptionError) on pandas
# versions that also register 'styler.render.max_rows'.
pd.set_option('display.max_rows', 300)

In [2]:
# Load the EWS project listing export and keep only rows that carry an EWS ID.
projects = (
    pd.read_csv('../Data/EWS_Published Project_Listing_DD.csv', encoding='ISO-8859-1')
    .dropna(subset=['EWS ID'])
)

In [3]:
projects.shape

(6847, 62)

In [78]:
projects.head()

Unnamed: 0,EWS ID,ProjectNumber,Published,Bank Risk Rating,Project Status,EWS URL,Detailed Analysis URL,Project Name,City,Country Count,...,Sector 7,Last Edited,Date Scraped,Date Disclosed,Board Date,Source URL,Project Cost,Investment Amount,Project Description,Contact Information
0,29164,AFDB-P-TN-BB0-007,Published,U,Proposed,https://ews.rightsindevelopment.org/projects/p...,,TUNISIA FERTILIZER PROJECT,,1.0,...,,9/4/17,8/15/17,12/13/01,12/13/01,http://www.afdb.org/en/projects-and-operations...,,,,ACCOUNTABILITY MECHANISM OF AfDB\r\r\r\rThe In...
1,29166,AFDB-P-SZ-HAA-001,Published,U,Approved,https://ews.rightsindevelopment.org/projects/p...,,LINE OF CREDIT TO SWAZILAND DEVELOPMENT FINANC...,,1.0,...,,9/4/17,8/15/17,12/13/01,5/12/17,http://www.afdb.org/en/projects-and-operations...,4.76,1.36,,MACHARIA Lilian Wanjiru - PIFD1\r\r\r\rACCOUNT...
2,29931,IADB-UR-T1100,Pending,C,Approved,https://ews.rightsindevelopment.org/projects/u...,,Supporting INEFOP in Improving Labor Training ...,,1.0,...,,,10/3/17,12/31/99,7/16/13,http://www.iadb.org/en/projects/project-descri...,0.44,0.44,,
3,30104,IADB-BR-T1279,Pending,C,Approved,https://ews.rightsindevelopment.org/projects/b...,,"Racial Equality and Social, Economic, Politica...",,1.0,...,,,10/3/17,12/31/99,6/4/13,http://www.iadb.org/en/projects/project-descri...,0.97,0.82,,
4,30322,IADB-PE-T1297,Pending,C,Approved,https://ews.rightsindevelopment.org/projects/p...,,Adaptation to Climate Change of the Fishery Se...,,1.0,...,,,10/3/17,12/31/99,12/4/13,http://www.iadb.org/en/projects/project-descri...,1.5,1.5,,


**Null Check**

In [5]:
projects.count()/len(projects.index)

EWS ID                   1.000000
ProjectNumber            0.998832
Published                0.998832
Bank Risk Rating         0.998832
Project Status           0.936469
EWS URL                  0.998832
Detailed Analysis URL    0.000000
Project Name             0.998686
City                     0.169563
Country Count            0.998832
Country 1                0.908719
Country 2                0.035782
Country 3                0.019863
Country 4                0.011830
Country 5                0.007741
Country 6                0.005112
Country 7                0.003359
Country 8                0.001899
Country 9                0.001607
Country 10               0.001460
Country 11               0.001022
Country 12               0.000584
Borrower or Client       0.760917
Private Actor Count      0.997955
Private Actor 1          0.077114
Private Actor 2          0.017964
Private Actor 3          0.006572
Private Actor 4          0.003067
Private Actor 5          0.002191
Private Actor 

## Project Description Column Will Likely Be Most Useful for Matching

**Notes**

* Some descriptions are pretty short - not sure how easy it will be to match to those
* Some of the other fields will likely be useful (Country, Borrower or Client, etc.)

In [3]:
# for i in projects.sample(15)['Project Description']:
#     print(i)
#     print('*****\n')

## Looking at the Sector Data 

This dataset could help tag the news articles with sector information.

In [4]:
def get_category_cols(category, additional_removes=None, columns=None):
    """Return the column names that contain ``category``, minus bookkeeping columns.

    Parameters
    ----------
    category : str
        Substring to look for in each column name (e.g. ``'Sector'``).
    additional_removes : iterable of str, optional
        Extra column names to exclude from the result, in addition to
        ``category + ' Count'`` which is always excluded.
    columns : iterable of str, optional
        Column names to search. Defaults to ``projects.columns`` so existing
        calls keep working; pass it explicitly to use another frame.

    Returns
    -------
    list of str
        Matching column names in their original order.

    Raises
    ------
    ValueError
        If ``category + ' Count'`` or any name in ``additional_removes`` is not
        among the matching columns.
    """
    if columns is None:
        # Fall back to the notebook-level frame, as the original helper did.
        columns = projects.columns

    cols = [name for name in columns if category in name]

    excluded = [category + ' Count']
    if additional_removes:
        excluded.extend(additional_removes)

    # Plain loop instead of a list comprehension used only for its side effects.
    for name in excluded:
        if name not in cols:
            raise ValueError(
                '{!r} is not one of the columns matching {!r}'.format(name, category)
            )
        cols.remove(name)
    return cols

# Collect every value from the sector columns into one flat 1-D array.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same ndarray on both old and current versions.
sector_cols = get_category_cols('Sector')
all_sectors = projects[sector_cols].values.flatten()


In [5]:
# Print each distinct sector value once (the missing-value marker included).
for sector_name in Counter(all_sectors).keys():
    print(sector_name)

Agriculture and Forestry
nan
Finance
Industry and Trade
Education and Health
Law and Government
Technical Cooperation
Water and Sanitation
Communications
Transport
Construction
Infrastructure
Hydropower
Climate and Environment
Energy
Mining
Humanitarian Response


# NOTE 

Might be able to use this to also classify the Bank and Country 

**Countries**

In [9]:
# Count how often each country appears across all country columns.
# DataFrame.as_matrix() was removed in pandas 1.0; .values is the portable
# equivalent. Missing values are not filtered here, so NaN is counted too.
country_cols = get_category_cols('Country')
all_countries = projects[country_cols].values.flatten()
country_counter = Counter(all_countries)
print(len(country_counter))
country_counter

180


Counter({'Afghanistan': 48,
         'Albania': 27,
         'Algeria': 3,
         'Angola': 8,
         'Argentina': 163,
         'Armenia': 50,
         'Austria': 24,
         'Azerbaijan': 31,
         'Bahamas': 30,
         'Bangladesh': 140,
         'Barbados': 14,
         'Belarus': 17,
         'Belgium': 25,
         'Belize': 17,
         'Benin': 13,
         'Bhutan': 36,
         'Bolivia': 106,
         'Bosnia and Herzegovina': 34,
         'Botswana': 5,
         'Brazil': 213,
         'Bulgaria': 22,
         'Burkina Faso': 23,
         'Burundi': 9,
         'Cambodia': 72,
         'Cameroon': 26,
         'Canada': 1,
         'Cape Verde': 6,
         'Central African Republic': 16,
         'Chad': 17,
         'Chile': 82,
         'China': 255,
         'Colombia': 168,
         'Comoros': 2,
         'Congo, Democratic Republic of': 26,
         'Congo, Republic of': 9,
         'Cook Islands': 1,
         'Costa Rica': 49,
         'Croatia': 37,
      

**Banks**

In [10]:
# Count how often each bank appears across the bank columns; 'Bank Risk Rating'
# contains 'Bank' in its name but is not a bank column, so it is excluded.
# DataFrame.as_matrix() was removed in pandas 1.0; .values is the portable
# equivalent.
banks_cols = get_category_cols('Bank', ['Bank Risk Rating'])
all_banks = projects[banks_cols].values.flatten()
# Drop missing values before counting (the original author noted odd null
# behaviour in these columns).
all_banks = [bank for bank in all_banks if pd.notnull(bank)]
banks_counter = Counter(all_banks)
print(len(banks_counter))
banks_counter

13


Counter({'African Development Bank (AFDB)': 86,
         'Asian Development Bank (ADB)': 1159,
         'Asian Infrastructure Investment Bank (AIIB)': 62,
         'European Bank for Reconstruction and Development (EBRD)': 496,
         'European Investment Bank (EIB)': 1029,
         'Green Climate Fund (GCF)': 72,
         'Inter-American Development Bank (IADB)': 1585,
         'Inter-American Investment Corporation (IIC)': 135,
         'International Finance Corporation (IFC)': 1057,
         'Multilateral Investment Guarantee Agency (MIGA)': 88,
         'Netherlands Development Finance Company (FMO)': 249,
         'New Development Bank (NDB)': 13,
         'World Bank (WB)': 888})

In [11]:
# List the distinct bank names, one per line.
for bank_name in banks_counter.keys():
    print(bank_name)

New Development Bank (NDB)
European Bank for Reconstruction and Development (EBRD)
Asian Infrastructure Investment Bank (AIIB)
European Investment Bank (EIB)
Inter-American Investment Corporation (IIC)
Multilateral Investment Guarantee Agency (MIGA)
Inter-American Development Bank (IADB)
Green Climate Fund (GCF)
International Finance Corporation (IFC)
Netherlands Development Finance Company (FMO)
World Bank (WB)
Asian Development Bank (ADB)
African Development Bank (AFDB)


------------

# Compare to the Labeled Data 

In [6]:
# Load the Feedly frame from a pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load this file if
# it was produced by a trusted step of this project.
feedly = pd.read_pickle('../Data/Feedly_Processed_DF.pkl')

In [7]:
# Load the labeled-data workbook.
# NOTE(review): the saved run of this cell raised FileNotFoundError, yet later
# cells use `labels` — they appear to rely on a value left in the kernel by an
# earlier run. Make sure the workbook exists before Restart & Run All.
labels = pd.read_excel('../Data/Feedly_Labeled_Data.xlsx')

FileNotFoundError: [Errno 2] No such file or directory: '../Data/Feedly_Labeled_Data.xlsx'

In [20]:
# Keep only labeled rows whose article_id also appears in the Feedly frame.
in_feedly = labels.article_id.isin(feedly.article_id)
labels = labels[in_feedly]
print(labels.shape)

(464, 16)


-----------

## Projects 

**Valid EWS HyperLink**

We have some repeat projects 

In [21]:
# Most frequently referenced EWS links among rows that have an '//ews' hyperlink.
hyperlinks = labels['EWS hyperlink']
has_ews_link = hyperlinks.notnull() & hyperlinks.str.contains('//ews')
labels[has_ews_link]['EWS hyperlink'].value_counts().head()

https://ewsdata.rightsindevelopment.org/projects/20150676-tanap-trans-anatolian-natural-gas-pipeline/                                                                                                                                                                                                 16
https://ewsdata.rightsindevelopment.org/projects/50410-001-asean-distributed-power-project-initial-poverty-a/                                                                                                                                                                                          8
https://ewsdata.rightsindevelopment.org/projects/00057-bangladesh-bhola-ipp/                                                                                                                                                                                                                           6
https://ewsdata.rightsindevelopment.org/projects/000015-bangladesh-natural-gas-infrastructure-and-efficie/; h

Number of unique projects that were matched to articles:

In [22]:
# Number of distinct EWS links among rows that have an '//ews' hyperlink.
hyperlinks = labels['EWS hyperlink']
has_ews_link = hyperlinks.notnull() & hyperlinks.str.contains('//ews')
labels[has_ews_link]['EWS hyperlink'].nunique()

41

In [23]:
# Independent copy of the labels, restricted to rows where 'Matched' is filled in.
project_labeled_data = labels.copy(deep=True).dropna(subset=['Matched'])
print(project_labeled_data.shape)

(115, 16)


### Clean the ID Columns for unmatched Projects 

In [24]:
# Rows flagged Matched == 0 must not carry any project reference: blank out
# the ID, name and hyperlink columns for them.
is_unmatched = project_labeled_data.Matched == 0
for id_column in ('Projects(EWSProjectID)', 'EWS Project Name', 'EWS hyperlink'):
    project_labeled_data.loc[is_unmatched, id_column] = np.nan



**Process to Check How Many We Can Match**

In [25]:
# Keep matched rows whose hyperlink points at an EWS page.
project_labels = project_labeled_data[project_labeled_data.Matched == 1]
# na=False: a matched row with a missing hyperlink would otherwise produce NaN
# in the mask, and boolean indexing with a NaN-containing mask raises.
has_ews_url = project_labels['EWS hyperlink'].str.contains('https://ews', na=False)
project_labels = project_labels[has_ews_url]

In [26]:
# Each cell may hold several comma-separated project IDs; flatten them into a
# single list of whitespace-trimmed IDs.
project_ids = [
    token.strip()
    for id_field in project_labels['Projects(EWSProjectID)']
    for token in id_field.split(',')
]

In [27]:
# Collapse duplicate IDs and report how many distinct projects were labeled.
unique_ids = set(project_ids)
project_ids = list(unique_ids)
print(len(project_ids))

46


In [28]:
# How many labeled project IDs exist as a ProjectNumber in the EWS listing.
known_numbers = set(projects.ProjectNumber.unique())
matched_ids = set(project_ids).intersection(known_numbers)
print(len(matched_ids), 'are matched')

46 are matched


-----------

# Sector 

```
Communications
Construction
Hydropower
Infrastructure
Education and Health
Law and Government
Energy
Agriculture and Forestry
Finance
Climate and Environment
Humanitarian Response
Water and Sanitation
Industry and Trade
Mining
Technical Cooperation
Transport
```

In [29]:
def clean(x):
    """Normalize a label value to lower-case text without surrounding whitespace.

    Parameters
    ----------
    x : object
        A raw cell value. Strings are normalized; values without string
        methods (NaN, None, numbers) are treated as missing.

    Returns
    -------
    str
        ``x.lower().strip()`` for string-like input, otherwise the sentinel
        ``'NA'``.
    """
    try:
        return x.lower().strip()
    except AttributeError:
        # Non-string cells have no .lower(); a bare `except:` here would also
        # swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
        return 'NA'

In [30]:
# Rows that have a sector label, with a normalized copy of that label.
has_sector = labels.Sectors.notnull()
sector_data = labels[has_sector].copy(deep=True)
sector_data.loc[:, 'cl_Sector'] = sector_data.Sectors.apply(clean)

# Fold the "no real sector" values into a single MISC bucket.
sector_dict = {0: 'MISC', 'not a project': 'MISC', 'NA': 'MISC'}
for raw_value, bucket in sector_dict.items():
    is_raw_value = sector_data.cl_Sector == raw_value
    sector_data.loc[is_raw_value, 'cl_Sector'] = bucket


In [31]:
sector_data[['Sectors','cl_Sector']].head()

Unnamed: 0,Sectors,cl_Sector
0,Infrastructure,infrastructure
1,Energy,energy
3,Transport,transport
4,"Construction, Finance","construction, finance"
5,"Construction, Finance","construction, finance"


In [32]:
sector_data.cl_Sector.value_counts(dropna=False)

energy                                                91
finance                                               45
infrastructure                                        41
water and sanitation                                  29
transport                                             29
MISC                                                  12
education and health                                  12
industry and trade                                    10
communications                                         9
construction, finance                                  8
hydropower                                             5
infrastructure, transport                              3
agriculture and forestry, industry and trade           3
water and sanitation, climate                          3
finance, energy                                        3
agriculture and forestry                               2
transport, industry and trade                          2
humanitarian response          

In [33]:
# The first entry of a comma-separated sector list is taken as the top sector.
sector_data['top_sector'] = sector_data.cl_Sector.str.split(',').str[0]

In [34]:
sector_data.top_sector.value_counts()

energy                      95
finance                     50
infrastructure              46
transport                   33
water and sanitation        32
MISC                        12
education and health        12
industry and trade          11
communications               9
construction                 8
agriculture and forestry     5
hydropower                   5
humanitarian response        1
Name: top_sector, dtype: int64

----------

## Banks 

```
Asian Infrastructure Investment Bank (AIIB)
African Development Bank (AFDB)
Asian Development Bank (ADB)
Green Climate Fund (GCF)
New Development Bank (NDB)
European Investment Bank (EIB)
World Bank (WB)
International Finance Corporation (IFC)
Inter-American Investment Corporation (IIC)
Netherlands Development Finance Company (FMO)
Inter-American Development Bank (IADB)
Multilateral Investment Guarantee Agency (MIGA)
European Bank for Reconstruction and Development (EBRD)
```

In [35]:
# Take an explicit copy of the rows that have a primary bank so the cleaned
# columns are written to an independent frame rather than to a slice of
# `labels` (which triggers SettingWithCopyWarning and may not persist).
banks_data = labels[labels.Bank1.notnull()].copy()
# clean() lower-cases and strips strings, and maps non-strings (such as a
# missing Bank2) to 'NA'.
banks_data['Bank1'] = banks_data.Bank1.apply(clean)
banks_data['Bank2'] = banks_data.Bank2.apply(clean)

In [37]:
Counter(banks_data.Bank1.tolist() + banks_data.Bank2.tolist())

Counter({'NA': 452,
         'adb': 228,
         'afdb': 91,
         'aiib': 51,
         'ebrd': 70,
         'eib': 15,
         'gcf': 1,
         'idb': 5,
         'ifc': 2,
         'indonesia': 1,
         'jica': 3,
         'pakistan': 1,
         'wb': 8})

---------

## Country 

In [39]:
# Take an explicit copy of the rows that have a primary country. This replaces
# the `is_copy = False` workaround, which relied on a deprecated private
# attribute that current pandas no longer honours.
country_data = labels[labels.Country1.notnull()].copy()
# clean() lower-cases and strips strings, and maps non-strings (such as a
# missing Country2) to 'NA'.
country_data['Country1'] = country_data.Country1.apply(clean)
country_data['Country2'] = country_data.Country2.apply(clean)

In [40]:
country_data.Country1.value_counts()

india                                                                                                       74
philippines                                                                                                 28
pakistan                                                                                                    23
turkey                                                                                                      20
bangladesh                                                                                                  19
nigeria                                                                                                     18
mongolia                                                                                                    10
thailand                                                                                                    10
sri lanka                                                                                                   10
e

In [41]:
# Discard rows whose primary country was normalized to the 'NA' sentinel.
has_country = country_data.Country1 != 'NA'
country_data = country_data[has_country]
print(country_data.shape)

(371, 16)


In [42]:
# Only the primary country is kept for the country labels.
# errors='ignore' keeps this cell idempotent: re-running it after 'Country2'
# has already been removed no longer raises KeyError.
country_data = country_data.drop('Country2', axis=1, errors='ignore')

## Clean files 

In [47]:
# Columns removed from every labeled frame before it is stored in output_files.
drop_list = ['DK_ADMIN_TRANSFERRED_CSV','HyperLink']

In [55]:
# For each label task, the columns that belong to the *other* tasks and are
# therefore removed from that task's output frame.
task_specific_drops = {
    'banks': ['Sectors', 'Country1', 'Country2', 'Matched', 'Projects(EWSProjectID)',
              'EWS Project Name', 'EWS hyperlink'],
    'sectors': ['Bank1', 'Bank2', 'Country1', 'Country2', 'Matched', 'Projects(EWSProjectID)',
                'EWS Project Name', 'EWS hyperlink'],
    # 'Country2' is not listed here: it was already dropped from country_data.
    'countries': ['Bank1', 'Bank2', 'Sectors', 'Matched', 'Projects(EWSProjectID)',
                  'EWS Project Name', 'EWS hyperlink'],
    'projects': ['Bank1', 'Bank2', 'Sectors', 'Country2', 'Country1'],
}

labeled_frames = [
    ('banks', banks_data),
    ('sectors', sector_data),
    ('countries', country_data),
    ('projects', project_labeled_data),
]

output_files = {}
for task_name, task_frame in labeled_frames:
    # Drop the admin columns first, then the columns of the other tasks.
    out_df = task_frame.drop(drop_list, axis=1).copy()
    out_df = out_df.drop(task_specific_drops[task_name], axis=1)
    output_files[task_name] = out_df

In [57]:
output_files['banks'].head()

Unnamed: 0,article_id,published,title,url,feed_label,Bank1,Bank2
0,10f9ed2,2018-01-11,ADB Provides Support for Three Infrastructure ...,http://moderndiplomacy.eu/2018/01/11/adb-provi...,NEWS ADB - All Streams,adb,
1,c0eece9b,2018-05-13,ADB Helps Inaugurate New Power Distribution Ne...,http://feedproxy.google.com/~r/adb_news/~3/2My...,NEWS ADB - All Streams,adb,
3,d1d79dd8,2018-02-20,ADB Provides $360 Million for Rolling Stock to...,http://feedproxy.google.com/~r/adb_news/~3/v9s...,NEWS ADB - All Streams,adb,
4,f0d65e5,2018-02-25,ADB provides financing to Thailand's B.Grimm P...,https://www.dealstreetasia.com/stories/adb-b-g...,NEWS ADB - All Streams,adb,
5,4a557358,2018-02-26,ADB's $235m loan to support B.Grimm Power expa...,https://www.power-technology.com/news/adbs-235...,NEWS ADB - All Streams,adb,


In [58]:
output_files['sectors'].head()

Unnamed: 0,article_id,published,title,url,feed_label,Sectors,cl_Sector,top_sector
0,10f9ed2,2018-01-11,ADB Provides Support for Three Infrastructure ...,http://moderndiplomacy.eu/2018/01/11/adb-provi...,NEWS ADB - All Streams,Infrastructure,infrastructure,infrastructure
1,c0eece9b,2018-05-13,ADB Helps Inaugurate New Power Distribution Ne...,http://feedproxy.google.com/~r/adb_news/~3/2My...,NEWS ADB - All Streams,Energy,energy,energy
3,d1d79dd8,2018-02-20,ADB Provides $360 Million for Rolling Stock to...,http://feedproxy.google.com/~r/adb_news/~3/v9s...,NEWS ADB - All Streams,Transport,transport,transport
4,f0d65e5,2018-02-25,ADB provides financing to Thailand's B.Grimm P...,https://www.dealstreetasia.com/stories/adb-b-g...,NEWS ADB - All Streams,"Construction, Finance","construction, finance",construction
5,4a557358,2018-02-26,ADB's $235m loan to support B.Grimm Power expa...,https://www.power-technology.com/news/adbs-235...,NEWS ADB - All Streams,"Construction, Finance","construction, finance",construction


In [59]:
output_files['countries'].head()

Unnamed: 0,article_id,published,title,url,feed_label,Country1
0,10f9ed2,2018-01-11,ADB Provides Support for Three Infrastructure ...,http://moderndiplomacy.eu/2018/01/11/adb-provi...,NEWS ADB - All Streams,cambodia
1,c0eece9b,2018-05-13,ADB Helps Inaugurate New Power Distribution Ne...,http://feedproxy.google.com/~r/adb_news/~3/2My...,NEWS ADB - All Streams,afghanistan
3,d1d79dd8,2018-02-20,ADB Provides $360 Million for Rolling Stock to...,http://feedproxy.google.com/~r/adb_news/~3/v9s...,NEWS ADB - All Streams,bangladesh
4,f0d65e5,2018-02-25,ADB provides financing to Thailand's B.Grimm P...,https://www.dealstreetasia.com/stories/adb-b-g...,NEWS ADB - All Streams,thailand
5,4a557358,2018-02-26,ADB's $235m loan to support B.Grimm Power expa...,https://www.power-technology.com/news/adbs-235...,NEWS ADB - All Streams,thailand


In [60]:
output_files['projects'].head()

Unnamed: 0,article_id,published,title,url,feed_label,Projects(EWSProjectID),EWS Project Name,EWS hyperlink,Matched
0,10f9ed2,2018-01-11,ADB Provides Support for Three Infrastructure ...,http://moderndiplomacy.eu/2018/01/11/adb-provi...,NEWS ADB - All Streams,"ADB-41123-015, ADB-48158-001, ADB-41435-053",Road Network Improvement Project (formerly Sec...,https://ewsdata.rightsindevelopment.org/projec...,1.0
1,c0eece9b,2018-05-13,ADB Helps Inaugurate New Power Distribution Ne...,http://feedproxy.google.com/~r/adb_news/~3/2My...,NEWS ADB - All Streams,ADB-47282-001,Energy Supply Improvement Investment Program (...,https://ewsdata.rightsindevelopment.org/projec...,1.0
3,d1d79dd8,2018-02-20,ADB Provides $360 Million for Rolling Stock to...,http://feedproxy.google.com/~r/adb_news/~3/v9s...,NEWS ADB - All Streams,ADB-50312-003,Railway Rolling Stock Operations Improvement P...,https://ewsdata.rightsindevelopment.org/projec...,1.0
4,f0d65e5,2018-02-25,ADB provides financing to Thailand's B.Grimm P...,https://www.dealstreetasia.com/stories/adb-b-g...,NEWS ADB - All Streams,ADB-50410-001,ASEAN Distributed Power Project: Initial Pover...,https://ewsdata.rightsindevelopment.org/projec...,1.0
5,4a557358,2018-02-26,ADB's $235m loan to support B.Grimm Power expa...,https://www.power-technology.com/news/adbs-235...,NEWS ADB - All Streams,ADB-50410-001,ASEAN Distributed Power Project: Initial Pover...,https://ewsdata.rightsindevelopment.org/projec...,1.0


In [66]:
# Write one CSV per label task, named after the task, without the index column.
for task_name, task_frame in output_files.items():
    task_frame.to_csv('../Data/Labeled_Data/' + task_name + '.csv', index=False)

In [79]:
feedly.count()/len(feedly.index)

article_id          1.000000
title               1.000000
url                 1.000000
keep                1.000000
feed_label          1.000000
content             0.059644
published           1.000000
summary             1.000000
scraped_content     1.000000
article_text        1.000000
article_keywords    1.000000
lang                0.921552
article_text_len    1.000000
top_lang            0.921552
dtype: float64

In [76]:
# index = 4000
# print(feedly.iloc[index])
# print(feedly.iloc[index].url)
# print('****')
# print(feedly.iloc[index].article_text)

In [87]:
output_files['projects']

Unnamed: 0,article_id,published,title,url,feed_label,Projects(EWSProjectID),EWS Project Name,EWS hyperlink,Matched
0,10f9ed2,2018-01-11,ADB Provides Support for Three Infrastructure ...,http://moderndiplomacy.eu/2018/01/11/adb-provi...,NEWS ADB - All Streams,"ADB-41123-015, ADB-48158-001, ADB-41435-053",Road Network Improvement Project (formerly Sec...,https://ewsdata.rightsindevelopment.org/projec...,1.0
1,c0eece9b,2018-05-13,ADB Helps Inaugurate New Power Distribution Ne...,http://feedproxy.google.com/~r/adb_news/~3/2My...,NEWS ADB - All Streams,ADB-47282-001,Energy Supply Improvement Investment Program (...,https://ewsdata.rightsindevelopment.org/projec...,1.0
3,d1d79dd8,2018-02-20,ADB Provides $360 Million for Rolling Stock to...,http://feedproxy.google.com/~r/adb_news/~3/v9s...,NEWS ADB - All Streams,ADB-50312-003,Railway Rolling Stock Operations Improvement P...,https://ewsdata.rightsindevelopment.org/projec...,1.0
4,f0d65e5,2018-02-25,ADB provides financing to Thailand's B.Grimm P...,https://www.dealstreetasia.com/stories/adb-b-g...,NEWS ADB - All Streams,ADB-50410-001,ASEAN Distributed Power Project: Initial Pover...,https://ewsdata.rightsindevelopment.org/projec...,1.0
5,4a557358,2018-02-26,ADB's $235m loan to support B.Grimm Power expa...,https://www.power-technology.com/news/adbs-235...,NEWS ADB - All Streams,ADB-50410-001,ASEAN Distributed Power Project: Initial Pover...,https://ewsdata.rightsindevelopment.org/projec...,1.0
6,153bd809,2018-02-23,B.Grimm Power secures ADB loan to support rene...,https://www.theasset.com/capital-markets/34144...,NEWS ADB - All Streams,ADB-50410-001,ASEAN Distributed Power Project: Initial Pover...,https://ewsdata.rightsindevelopment.org/projec...,1.0
7,817a9fee,2018-02-23,B.Grimm Power secures ADB loan to support rene...,https://esg.theasset.com/ESG/34144/bgrimm-powe...,NEWS ADB - All Streams,ADB-50410-001,ASEAN Distributed Power Project: Initial Pover...,https://ewsdata.rightsindevelopment.org/projec...,1.0
8,ed32373b,2018-02-27,ADB offers $235m loan to B.Grimm Power for ASE...,http://solar.cleantechnology-business-review.c...,NEWS ADB - All Streams,ADB-50410-001,ASEAN Distributed Power Project: Initial Pover...,https://ewsdata.rightsindevelopment.org/projec...,1.0
9,1d18e02b,2018-02-27,B.Grimm Power secures ADB loan for ASEAN renew...,http://solar.energy-business-review.com/news/b...,NEWS ADB - All Streams,ADB-50410-001,ASEAN Distributed Power Project: Initial Pover...,https://ewsdata.rightsindevelopment.org/projec...,1.0
10,5ae8d202,2018-02-22,"ADB, B.Grimm Power Expand Support for Renewabl...",http://feedproxy.google.com/~r/adb_news/~3/CHQ...,NEWS ADB - All Streams,ADB-50410-001,ASEAN Distributed Power Project: Initial Pover...,https://ewsdata.rightsindevelopment.org/projec...,1.0


--------------