# Project Data Sample Review 

In [40]:
import pandas as pd
import numpy as np
import os
from collections import Counter
%matplotlib inline
# Use the fully qualified option key: the short form 'max_rows' is matched as a
# pattern and becomes ambiguous when more than one option name contains it.
pd.set_option('display.max_rows', 300)

In [17]:
# Read the EWS project listing and keep only the rows that have an EWS ID.
projects = (
    pd.read_csv('../Data/EWS_Project_Listing_01DEC17-31MAY18.csv')
    .loc[lambda frame: frame['EWS ID'].notnull()]
)

In [18]:
# (rows, columns) of the listing after dropping rows without an EWS ID.
projects.shape

(742, 62)

**Example Row**

In [28]:
# Show the first row as a Series so every column is listed with a sample value.
projects.iloc[0]

EWS ID                                                               32549
ProjectNumber                                                    FMO-53557
Published                                                        Published
Bank Risk Rating                                                         B
Project Status                                                    Approved
EWS URL                  https://ews.rightsindevelopment.org/projects/5...
Detailed Analysis URL                                                  NaN
Project Name             SUNFARMING EURASIA ASSET ENERJI YATIRIMLARI VE...
City                                                                   NaN
Country Count                                                            1
Country 1                                                           Turkey
Country 2                                                              NaN
Country 3                                                              NaN
Country 4                

**Null Check** (fraction of non-null values per column)

In [21]:
# Share of non-null values in each column (1.0 means the column is fully populated).
projects.count() / projects.shape[0]

EWS ID                   1.000000
ProjectNumber            1.000000
Published                1.000000
Bank Risk Rating         1.000000
Project Status           0.960916
EWS URL                  1.000000
Detailed Analysis URL    0.000000
Project Name             1.000000
City                     0.241240
Country Count            1.000000
Country 1                0.928571
Country 2                0.071429
Country 3                0.035040
Country 4                0.024259
Country 5                0.016173
Country 6                0.006739
Country 7                0.002695
Country 8                0.002695
Country 9                0.002695
Country 10               0.002695
Country 11               0.001348
Country 12               0.001348
Borrower or Client       0.900270
Private Actor Count      1.000000
Private Actor 1          0.435310
Private Actor 2          0.086253
Private Actor 3          0.022911
Private Actor 4          0.014825
Private Actor 5          0.009434
Private Actor 

## Project Description Column Will Likely Be Most Useful for Matching

**Notes**

* Some descriptions are quite short, so matching against them may be difficult.
* Other fields (Country, Borrower or Client, etc.) will likely be useful for matching as well.

In [26]:
# Print a random sample of project descriptions, each followed by a separator.
# A fixed random_state keeps the sample identical across Restart & Run All.
for description in projects.sample(15, random_state=0)['Project Description']:
    print(description)
    print('*****\n')

The project's objective is to provide the necessary conditions for the growth and competitiveness of businesses in Paraguay by supporting a network of impact oriented companies.
*****

This project establishes a co-investment facility with Intrum Justitia AB to support the acquisition and resolution of consumer unsecured and small business non-performing loans. The aim of this project is to support Greek banks, provide liquidity for new lending in the Grek economy, and mobilize funding from the private sector.
*****

From the IDB: "The objective of this Advisory Services Project (or Technical Cooperation), is to consolidate prior commitments and/or pre-approved activities so that the IDB Invest can fulfill its contractual obligations, as well as other verbal commitments made prior to 2018, and to meet any expectations from IDB Invest clients and stakeholders."
*****

This project provides financing for the development of nearly zero-energy residential buildings in Sweden.
*****

This p

## Looking at the Sector Data 

This dataset could help tag the news articles with sector information.

In [60]:
def get_category_cols(category, additional_removes=None, df=None):
    """Return the value columns belonging to a repeated category.

    A column is selected when its name contains ``category`` as a substring
    (e.g. 'Sector' selects 'Sector 1', 'Sector 2', ...). The companion
    ``'<category> Count'`` column is always excluded, together with any names
    listed in ``additional_removes``.

    Parameters
    ----------
    category : str
        Substring that identifies the column family.
    additional_removes : iterable of str, optional
        Further column names to exclude, for unrelated columns that happen to
        contain ``category`` in their name.
    df : pandas.DataFrame, optional
        Frame whose columns are inspected. Defaults to the notebook-level
        ``projects`` frame so existing calls keep working.

    Returns
    -------
    list of str
        Matching column names, in the frame's column order.
    """
    if df is None:
        df = projects  # fall back to the notebook-level frame

    excluded = {category + ' Count'}
    if additional_removes:
        excluded.update(additional_removes)

    # Filter rather than list.remove(): no side-effect comprehension, and a
    # missing column in the exclusion set no longer raises ValueError.
    return [col for col in df.columns if category in col and col not in excluded]

# Collect every value from the 'Sector N' columns into one flat array.
sector_cols = get_category_cols('Sector')
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same ndarray.
all_sectors = projects[sector_cols].values.flatten()


In [61]:
# Tally how often each sector value occurs; missing cells show up under the nan key.
Counter(all_sectors)

Counter({'Agriculture and Forestry': 95,
         'Climate and Environment': 50,
         'Communications': 25,
         'Construction': 141,
         'Education and Health': 68,
         'Energy': 116,
         'Finance': 197,
         'Humanitarian Response': 6,
         'Hydropower': 18,
         'Industry and Trade': 107,
         'Infrastructure': 44,
         'Law and Government': 34,
         'Mining': 5,
         'Technical Cooperation': 96,
         'Transport': 83,
         'Water and Sanitation': 70,
         nan: 4039})

# NOTE 

We might also be able to use this dataset to classify articles by bank and country.

**Countries**

In [56]:
# Count how often each country appears across all 'Country N' columns.
country_cols = get_category_cols('Country')
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same ndarray.
all_countries = projects[country_cols].values.flatten()
country_counter = Counter(all_countries)
# Number of distinct keys in the counter (missing values are not filtered out here).
print(len(country_counter))
country_counter

138


Counter({'Afghanistan': 3,
         'Albania': 7,
         'Angola': 3,
         'Argentina': 19,
         'Armenia': 9,
         'Austria': 2,
         'Azerbaijan': 1,
         'Bahamas': 3,
         'Bangladesh': 19,
         'Barbados': 1,
         'Belarus': 3,
         'Belgium': 6,
         'Belize': 1,
         'Benin': 5,
         'Bhutan': 1,
         'Bolivia': 5,
         'Bosnia and Herzegovina': 6,
         'Brazil': 29,
         'Bulgaria': 3,
         'Burkina Faso': 3,
         'Cambodia': 7,
         'Cameroon': 5,
         'Cape Verde': 1,
         'Central African Republic': 2,
         'Chad': 5,
         'Chile': 2,
         'China': 11,
         'Colombia': 21,
         'Congo, Democratic Republic of': 4,
         'Costa Rica': 2,
         'Croatia': 9,
         'Czech Republic': 1,
         'Denmark': 5,
         'Dominica': 1,
         'Dominican Republic': 4,
         'Ecuador': 12,
         'Egypt': 16,
         'El Salvador': 3,
         'Ethiopia': 5,
     

**Banks**

In [76]:
# 'Bank' also matches the unrelated 'Bank Risk Rating' column, so exclude it explicitly.
banks_cols = get_category_cols('Bank', ['Bank Risk Rating'])
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same ndarray.
all_banks = projects[banks_cols].values.flatten()
# Drop missing entries before counting. NaN never compares equal to NaN, so
# distinct NaN objects can end up as separate Counter keys -- presumably the
# "weird" null behaviour originally observed in this cell.
all_banks = [bank for bank in all_banks if pd.notnull(bank)]
banks_counter = Counter(all_banks)
# Number of distinct banks.
print(len(banks_counter))
banks_counter

13


Counter({'African Development Bank (AFDB)': 24,
         'Asian Development Bank (ADB)': 24,
         'Asian Infrastructure Investment Bank (AIIB)': 16,
         'European Bank for Reconstruction and Development (EBRD)': 66,
         'European Investment Bank (EIB)': 172,
         'Green Climate Fund (GCF)': 12,
         'Inter-American Development Bank (IADB)': 96,
         'Inter-American Investment Corporation (IIC)': 19,
         'International Finance Corporation (IFC)': 152,
         'Multilateral Investment Guarantee Agency (MIGA)': 15,
         'Netherlands Development Finance Company (FMO)': 24,
         'New Development Bank (NDB)': 3,
         'World Bank (WB)': 134})