In [4]:
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
import pandas as pd
import re as re
import os
import copy

In [7]:
# Input tables are read from the ./Data folder. Paths are assembled with
# os.path.join so the notebook also runs outside Windows (the previous
# hard-coded '\\' separators only resolve on Windows).
conflict = pd.read_csv(os.path.join('Data', 'conflict_data', 'DRC_2.csv'), header = 0, index_col = 0)

aid = pd.read_csv(os.path.join('Data', 'aid_data', 'data', 'level_1a.csv'))
aid = aid.dropna(subset=['latitude', 'longitude']) # drop those entries that don't have coordinates to them

worker_deaths = pd.read_csv(os.path.join('Data', 'security_incidents.csv'), encoding = "ISO-8859-1")

# presumably the GADM 3.6 level-1 (province) boundaries — verify against the data source
provinces = gpd.read_file(os.path.join('Data', 'gadm36_COD_shp', 'gadm36_COD_1.shp'))

In [11]:
# For each table: turn its lon/lat columns into a point layer, then inner-join it
# spatially with `provinces` so only records matching a province row are kept.
# NOTE(review): the {'init': 'epsg:4326'} CRS form is reported as deprecated by newer
# pyproj/geopandas releases — confirm against the installed versions before changing it.

# conflict events
geometry = list(map(Point, zip(conflict.longitude, conflict.latitude)))
gconflict = gpd.sjoin(
    gpd.GeoDataFrame(conflict, crs = {'init': 'epsg:4326'}, geometry = geometry),
    provinces,
    how="inner",
)

# aid projects
geometry = list(map(Point, zip(aid.longitude, aid.latitude)))
gaid = gpd.sjoin(
    gpd.GeoDataFrame(aid, crs = {'init': 'epsg:4326'}, geometry = geometry),
    provinces,
    how="inner",
)

# security incidents (g_w_d = gworker_deaths)
geometry = list(map(Point, zip(worker_deaths.Longitude, worker_deaths.Latitude)))
g_w_d = gpd.sjoin(
    gpd.GeoDataFrame(worker_deaths, crs = {'init': 'epsg:4326'}, geometry = geometry),
    provinces,
    how="inner",
)


In [4]:
def calc_multiples(df, column_name):
    """Count the rows whose ``column_name`` holds more than one "|"-separated entry.

    We use this for donors and aid focus.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect (also works as the per-group frame passed by
        ``groupby(...).apply``).
    column_name : str
        Column whose values may list several entries separated by "|".

    Returns
    -------
    int
        Number of rows with at least two entries. Cells that are not strings
        (e.g. NaN / None) are counted as single-entry rows instead of raising.
    """
    # A value splits into more than one piece exactly when it contains the separator.
    return sum(
        1
        for value in df[column_name]
        if isinstance(value, str) and "|" in value
    )
def split_rows(data, column_name):
    """Split rows that have several "|"-separated entries in ``column_name``.

    Every such row is replaced by one row per entry; all other columns are
    repeated unchanged and the original index label is kept on each new row.
    Rows with a single entry are left as they are.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table. It is not modified.
    column_name : str
        Column whose values may list several entries separated by "|".

    Returns
    -------
    pandas.DataFrame
        A new frame: first the untouched rows in their original order, then
        the expanded rows. Cells that are not strings (e.g. NaN) are treated
        as single-entry rows.
    """
    # Work with row *positions* rather than index labels: labels can be
    # duplicated (e.g. after a previous split_rows call), and label-based
    # dropping would then remove or miss the wrong rows.
    keep_positions = []
    repeat_positions = []
    repeat_entries = []
    for position, value in enumerate(data[column_name]):
        entries = value.split("|") if isinstance(value, str) else [value]
        if len(entries) > 1:
            repeat_positions.extend([position] * len(entries))
            repeat_entries.extend(entries)
        else:
            keep_positions.append(position)

    if not repeat_positions:
        return data.copy()

    # Build all expanded rows in one go; this keeps the column dtypes and avoids
    # DataFrame.append (removed in pandas 2.0) and its quadratic row-by-row growth.
    expanded = data.iloc[repeat_positions].copy()
    expanded[column_name] = repeat_entries

    if not keep_positions:
        return expanded
    return pd.concat([data.iloc[keep_positions], expanded])

# Split the rows (projects) that list more than one donor, then the rows that list
# more than one sector, so each value in 'donors' / 'ad_sector_names' is a single entry.
# NOTE(review): after both splits a project with k donors and m sectors occupies k*m rows,
# so row counts taken on gaid from here on are per (project, donor, sector) combination,
# not per project — confirm this is what the downstream measures intend.
gaid = split_rows(gaid, 'donors')
gaid = split_rows(gaid, 'ad_sector_names')

In [148]:
# Folder that receives the derived measures. File paths are built with os.path.join:
# a hard-coded '\\' separator only works on Windows; elsewhere it would create a file
# whose name contains a backslash in the working directory instead of in this folder.
out_dir = 'measures_indices'
os.makedirs(out_dir, exist_ok=True)

# how many projects there are per donor and their share in the total, save in a csv file
total = gaid['donors'].value_counts() # total N of projects per donor
share = total / gaid.shape[0] # share of the projects per donor in total N of projects
pd.concat([total, share], axis = 1).to_csv(os.path.join(out_dir, 'N_projects_per_donor.csv'))

# what share of projects per donor have multiple focus sectors (e.g. General environmental protection|Transport and storage)
# NOTE(review): if gaid has already been passed through split_rows(gaid, 'ad_sector_names'),
# no value contains "|" any more and this share is 0 for every donor — compute it on the
# un-split frame if the real share is wanted.
multiple_focus_share = (gaid.groupby('donors').apply(calc_multiples, 'ad_sector_names') / total)
multiple_focus_share.to_csv(os.path.join(out_dir, 'project_multiple_sectors.csv'))

# share of each project focus in the total number of projects per donor. We use this to calculate variability of project focus for a donor
focus_share = (gaid.groupby(['donors', 'ad_sector_names']).size() / gaid.groupby(['donors']).size())
focus_share.to_csv(os.path.join(out_dir, 'donor_sector_share.csv'))

In [63]:
def calc_unalikeability(data, first_group, topic_name):
    """Coefficient of unalikeability (as defined by Kader 2007) of ``topic_name``
    within every value of ``first_group``.

    For each distinct value of ``first_group`` the coefficient is
    ``1 - SUM_i(p_i ** 2)``, where ``p_i`` is the share of that group's rows
    that carry the i-th distinct ``topic_name`` value.

    E.g. unalikeability of project focus for each aid donor ->
    ``calc_unalikeability(gaid, 'donors', 'ad_sector_names')``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding both columns.
    first_group : str
        Column defining the groups (one coefficient per distinct value).
    topic_name : str
        Column whose variability inside each group is measured.

    Returns
    -------
    pandas.Series
        Coefficient per ``first_group`` value, indexed by that value.
    """
    # Rows per (group, topic) pair and rows per group. Both are taken from `data`;
    # the denominator previously read the notebook-global `gaid`, which gave wrong
    # shares (or a NameError) for any other input frame.
    pair_counts = data.groupby([first_group, topic_name]).size()
    group_sizes = data.groupby(first_group).size()

    # Line the group totals up with the pair index explicitly, then divide to get p_i.
    group_of_pair = pair_counts.index.get_level_values(0)
    shares = pair_counts / group_sizes.reindex(group_of_pair).values

    # 1 - SUM_i(p_i^2), summed over the topics of each group.
    return 1 - (shares ** 2).groupby(level=0).sum()


def calc_var(data, first_group, topic_name):
    """Variability of ``topic_name`` (e.g. total commitments of money) for every
    member of ``first_group``.

    Inside each group the values are min-max scaled to [0, 1] and their
    population variance (ddof=0) is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding both columns.
    first_group : str
        Column defining the groups.
    topic_name : str
        Numeric column whose variability is measured.

    Returns
    -------
    pandas.Series
        One value per group. Groups where the variance is undefined (a single
        row, constant values, or no non-missing values) get 0.
    """
    def _scaled_variance(values):
        # Series.min / .max skip missing values; the builtin min()/max() are
        # order-dependent when NaN is present and could turn the whole group into NaN.
        lowest, highest = values.min(), values.max()
        spread = highest - lowest
        # `not spread > 0` is also true when spread is NaN (all values missing).
        if not spread > 0:
            return 0.0
        scaled = (values - lowest) / spread
        return float(scaled.var(ddof=0))

    return data.groupby(first_group)[topic_name].apply(_scaled_variance)

In [66]:
# Per-donor adaptability measures.
# Unalikeability of the three categorical attributes, evaluated in this order:
# sector focus, NAME_1 (presumably the province name from the spatial join), start year.
adaptability_by_focus, adaptability_by_location, adaptability_by_start_year = (
    calc_unalikeability(gaid, 'donors', topic)
    for topic in ('ad_sector_names', 'NAME_1', 'transactions_start_year')
)
# Min-max-scaled variance of the committed amounts.
adaptability_by_commitment = calc_var(gaid, 'donors', 'total_commitments')

#pd.DataFrame.from_dict(adaptability_by_focus, orient = 'index').to_csv('measures_indices\\adaptability_by_focus.csv')
#pd.DataFrame.from_dict(adaptability_by_location, orient = 'index').to_csv('measures_indices\\adaptability_by_location.csv')
#pd.DataFrame.from_dict(adaptability_by_start_year, orient = 'index').to_csv('measures_indices\\adaptability_by_start_year.csv')
#adaptability_by_commitment.to_csv('measures_indices\\adaptability_by_commitments.csv')
#(adaptability_by_commitment + adaptability_by_start_year + adaptability_by_location + adaptability_by_focus).to_csv('measures_indices\\composite_adaptability.csv')

In [32]:
# Sum of 'Total affected' in g_w_d for every (NAME_1, Year) pair.
g_w_d.groupby(['NAME_1', 'Year'])['Total affected'].sum()

NAME_1         Year
Bas-Uélé       2014     1
Haut-Katanga   2009     1
Haut-Uélé      2016     1
Ituri          2000     2
               2001     6
               2004     2
               2005     2
               2006     3
               2008     3
               2009     9
               2012     1
               2013     1
               2014     1
               2018     4
Kasaï          2015     2
Kinshasa       2010     1
               2015     1
Kongo-Central  2001     1
Maniema        2002     3
               2004     1
               2016     3
Nord-Kivu      2004     1
               2006     3
               2008     5
               2009     6
               2010     5
               2011    10
               2013    10
               2014     7
               2015    25
               2016    10
               2017     2
               2018     6
Sankuru        1997    10
               1998     1
               2000     2
               2002     1
               200

In [24]:
# Number of rows in gconflict per value of 'year', most frequent first.
gconflict['year'].value_counts()

2009    547
1996    205
1999    160
2011    155
2013    149
2012    147
1997    134
2002    121
2007    117
1998    117
2014    112
2015    111
2000    109
2008    106
2016    103
2010     90
2006     80
2003     67
2001     57
2004     33
2005     25
1993     20
1995     15
1994     14
1992     10
1991      8
1990      4
1989      3
Name: year, dtype: int64

In [None]:
# TODO LIST
# 1. Divide donors into 2 groups: highly adaptable and not highly adaptable
# 2. Divide areas into two groups: high aid worker casualties and low aid worker casualties. 
# + consider the time. So, select (year, area) tuples of high/low casualties. 

# 3. Select projects based on the 4 subcategories derived above.
# Projects with: donor is in one of the first classification groups.
# Start year and area is in one of the second classification groups.

#                        High adaptability              Low adaptability
#
# High receptivity           n                                 n2
#
# Low receptivity            n3                                n4



# 4. THINK ABOUT INFERENCE PROBLEM, MATCHING PROBLEM, CREATING CONTROL AND TREATMENT GROUPS.