# Project Data Sample Review 

In [1]:
import pandas as pd
import numpy as np
import os
from collections import Counter
%matplotlib inline
# Show up to 300 rows when displaying frames/series. The fully-qualified option
# name is used because the bare 'max_rows' pattern matches more than one option
# in newer pandas releases and is rejected as ambiguous.
pd.set_option('display.max_rows', 300)

In [2]:
# Load the EWS project listing and keep only the rows that carry an EWS ID.
projects = (
    pd.read_csv('../Data/EWS_Project_Listing_01JUN17-31MAY18.csv')
      .dropna(subset=['EWS ID'])
)

In [3]:
# (rows, columns) of the listing after dropping rows without an EWS ID.
projects.shape

(2954, 77)

**Example Row**

In [4]:
# First project record, shown field by field.
projects.iloc[0]

EWS ID                                                               32549
ProjectNumber                                                    FMO-53557
Published                                                        Published
Bank Risk Rating                                                         B
Project Status                                                    Approved
EWS URL                  https://ews.rightsindevelopment.org/projects/5...
Detailed Analysis URL                                                  NaN
Project Name             SUNFARMING EURASIA ASSET ENERJI YATIRIMLARI VE...
City                                                                   NaN
Country Count                                                            1
Country 1                                                           Turkey
Country 2                                                              NaN
Country 3                                                              NaN
Country 4                

**Null Check**

In [5]:
# Fraction of non-null values in each column (1.0 means fully populated).
projects.notnull().mean()

EWS ID                   1.000000
ProjectNumber            1.000000
Published                1.000000
Bank Risk Rating         1.000000
Project Status           0.957684
EWS URL                  1.000000
Detailed Analysis URL    0.000000
Project Name             1.000000
City                     0.193297
Country Count            1.000000
Country 1                0.899797
Country 2                0.040961
Country 3                0.020650
Country 4                0.013541
Country 5                0.010156
Country 6                0.005755
Country 7                0.003047
Country 8                0.001693
Country 9                0.001693
Country 10               0.001693
Country 11               0.001354
Country 12               0.001016
Borrower or Client       0.851388
Private Actor Count      1.000000
Private Actor 1          0.152674
Private Actor 2          0.030467
Private Actor 3          0.008802
Private Actor 4          0.005416
Private Actor 5          0.003724
Private Actor 

# Compare to the Labeled Data 

In [12]:
# Load the labelled rows; later cells read its 'EWS hyperlink' and
# 'Projects(EWSProjectID)' columns.
labels = pd.read_csv('../Temp_Output/june11_temp_labeled.csv')

In [34]:
# Keep only the labelled rows whose 'EWS hyperlink' points at the EWS site.
# na=False drops missing / non-text cells in the same pass (no separate
# notnull() filter needed), and regex=False makes the match a literal
# substring test rather than a regular expression.
project_labels = labels[
    labels['EWS hyperlink'].str.contains('https://ews', na=False, regex=False)
]

In [41]:
# Collect every project number referenced by the labelled rows. A single cell
# may hold several comma-separated IDs, so split each cell and trim whitespace.
# Missing cells are skipped (NaN has no .split), and so are the empty fragments
# produced by stray or trailing commas.
project_ids = [
    project_id.strip()
    for cell in project_labels['Projects(EWSProjectID)'].dropna()
    for project_id in cell.split(',')
    if project_id.strip()
]

In [47]:
# De-duplicate the collected IDs. Sorting gives the list a reproducible order;
# the iteration order of a set of strings can differ between kernel sessions.
project_ids = sorted(set(project_ids))
print(len(project_ids))

45


In [54]:
# Labelled project IDs with no matching ProjectNumber in the project listing.
set(project_ids).difference(projects['ProjectNumber'])

{'ADB-41435-053',
 'ADB-48226-002',
 'ADB-50410-001',
 'AIIB-000079',
 'EIB-20140596',
 'EIB-20140645',
 'EIB-20150676',
 'EIB-20160816',
 'Figeac Aero Regional',
 'IADB-EC-L1111',
 'IIC-12063-02',
 'Tranche 2 in EWS',
 'WB-P146330',
 'WB-P148775',
 'WB-P156241',
 'WB-P160383',
 'WB-P160408',
 'WB-P161234',
 'WB-P162422',
 'missing Tranche 3?: https://www.adb.org/projects/36330-043/main#project-overview'}

## Project Description Column Will Likely Be Most Useful for Matching

**Notes**

* Some descriptions are pretty short, so it may be hard to match against them.
* Some of the other fields will likely be useful as well (Country, Borrower or Client, etc.).

In [6]:
# Print a random sample of project descriptions for eyeballing.
# random_state pins the sample so the cell shows the same rows on every run,
# and the sample size is capped so a listing with fewer than 15 rows still works.
sampled_projects = projects.sample(min(15, len(projects)), random_state=42)
for description in sampled_projects['Project Description']:
    print(description)
    print('*****\n')

The proposed project involves IFC investing an INR equivalent of approximately US$50 million in a combination of instruments in one or more Special Purpose Vehicles ("SPV") promoted by Mahindra Lifespace Developers Limited ("MLDL" or the "Company" or the "Sponsor") set up for the development of three industrial clusters (ICs) around established industrial areas in Rajasthan, Gujarat, and Maharashtra (the "Project").
*****

The objective of the project is to adapt and scale a loan product to finance water and sanitation solutions to poor and vulnerable population in Brazil.
*****

The proposed financing consists of two separate long-term loans to be granted directly to Durlicouros Industria e Comercio de Couros, Exportacao e Importacao Ltda. in Brazil and to Durli Leather S.A. and Veneza Inversiones S.A. in Paraguay. The Borrowers will use the proceeds of the loans to support the construction, operation, and maintenance of two (2) new leather manufacturing plants (one (1) in Brazil and 

## Looking at the Sector Data 

This dataset could help tag the news articles with sector information.

In [7]:
def get_category_cols(category, additional_removes=None, frame=None):
    """Return the value columns that belong to one column group.

    A column belongs to the group when its name contains ``category``
    (e.g. 'Sector' matches 'Sector 1', 'Sector 2', ...). The companion
    '<category> Count' column and every name in ``additional_removes``
    are left out of the result.

    Parameters
    ----------
    category : str
        Substring identifying the column group.
    additional_removes : iterable of str, optional
        Extra column names to exclude (e.g. 'Bank Risk Rating' for 'Bank').
    frame : object with a ``columns`` attribute, optional
        Frame whose columns are searched. Defaults to the notebook-level
        ``projects`` frame, which keeps existing two-argument calls working.

    Returns
    -------
    list
        Matching column names, in the order they appear in the frame.
    """
    if frame is None:
        frame = projects
    # Exclusions are held in a set so that a name which is not actually a
    # column is simply ignored, instead of raising ValueError from list.remove().
    excluded = {category + ' Count'}
    excluded.update(additional_removes or ())
    return [
        col for col in frame.columns
        if category in col and col not in excluded
    ]

# Flatten every sector value column into one 1-D array of sector labels.
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
sector_cols = get_category_cols('Sector')
all_sectors = projects[sector_cols].to_numpy().flatten()


In [8]:
# List each distinct sector value once, one per line.
for sector in Counter(all_sectors).keys():
    print(sector)

nan
Transport
Hydropower
Infrastructure
Climate and Environment
Finance
Industry and Trade
Humanitarian Response
Construction
Education and Health
Communications
Mining
Law and Government
Energy
Technical Cooperation
Agriculture and Forestry
Water and Sanitation


# NOTE 

We might also be able to use this data to classify the Bank and Country.

**Countries**

In [9]:
country_cols = get_category_cols('Country')
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
all_countries = projects[country_cols].to_numpy().flatten()
# Occurrences of each value across all country columns.
# NOTE(review): empty cells are not filtered out here, so NaN may be counted
# as a key and inflate the distinct-country total — confirm.
country_counter = Counter(all_countries)
print(len(country_counter))
country_counter

170


Counter({'Afghanistan': 25,
         'Albania': 10,
         'Angola': 6,
         'Argentina': 57,
         'Armenia': 26,
         'Austria': 11,
         'Azerbaijan': 15,
         'Bahamas': 8,
         'Bangladesh': 64,
         'Barbados': 3,
         'Belarus': 7,
         'Belgium': 10,
         'Belize': 9,
         'Benin': 6,
         'Bhutan': 17,
         'Bolivia': 30,
         'Bosnia and Herzegovina': 15,
         'Brazil': 79,
         'Bulgaria': 9,
         'Burkina Faso': 7,
         'Burundi': 1,
         'Cambodia': 42,
         'Cameroon': 11,
         'Cape Verde': 3,
         'Central African Republic': 7,
         'Chad': 10,
         'Chile': 22,
         'China': 128,
         'Colombia': 53,
         'Congo, Democratic Republic of': 7,
         'Congo, Republic of': 3,
         'Cook Islands': 1,
         'Costa Rica': 23,
         'Croatia': 13,
         'Cyprus': 1,
         'Czech Republic': 11,
         'Denmark': 9,
         'Djibouti': 3,
         'Do

**Banks**

In [10]:
# 'Bank Risk Rating' contains 'Bank' but is not a bank-name column, so exclude it.
banks_cols = get_category_cols('Bank', ['Bank Risk Rating'])
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
all_banks = projects[banks_cols].to_numpy().flatten()
# Nulls behaved oddly when counted for this column group, so drop them first.
all_banks = [bank for bank in all_banks if pd.notnull(bank)]
banks_counter = Counter(all_banks)
print(len(banks_counter))
banks_counter

13


Counter({'African Development Bank (AFDB)': 45,
         'Asian Development Bank (ADB)': 644,
         'Asian Infrastructure Investment Bank (AIIB)': 51,
         'European Bank for Reconstruction and Development (EBRD)': 179,
         'European Investment Bank (EIB)': 462,
         'Green Climate Fund (GCF)': 60,
         'Inter-American Development Bank (IADB)': 574,
         'Inter-American Investment Corporation (IIC)': 67,
         'International Finance Corporation (IFC)': 332,
         'Multilateral Investment Guarantee Agency (MIGA)': 39,
         'Netherlands Development Finance Company (FMO)': 206,
         'New Development Bank (NDB)': 12,
         'World Bank (WB)': 337})

In [19]:
# One bank name per line.
for bank_name in banks_counter.keys():
    print(bank_name)

Netherlands Development Finance Company (FMO)
Asian Development Bank (ADB)
Inter-American Investment Corporation (IIC)
New Development Bank (NDB)
Asian Infrastructure Investment Bank (AIIB)
World Bank (WB)
Inter-American Development Bank (IADB)
African Development Bank (AFDB)
European Bank for Reconstruction and Development (EBRD)
European Investment Bank (EIB)
International Finance Corporation (IFC)
Multilateral Investment Guarantee Agency (MIGA)
Green Climate Fund (GCF)
