In [15]:
import sys, os, importlib, math, multiprocessing, boto3, pickle
import rasterio, geojson

import pandas as pd
import geopandas as gpd
import numpy as np

from h3 import h3
from tqdm import tqdm
from shapely.geometry import Polygon, Point

sys.path.insert(0, "/home/wb411133/Code/gostrocks/src")
import GOSTrocks.rasterMisc as rMisc
import GOSTrocks.ntlMisc as ntl
import GOSTrocks.mapMisc as mapMisc
from GOSTrocks.misc import tPrint

sys.path.append("../src")
import h3_helper

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Path to the ACLED export (CSV) that is loaded below, and the H3 resolution
# passed to the h3_helper lookup.
input_acled_file = "/home/public/Data/GLOBAL/ACLED/2020-01-01-2023-01-13.csv"
h3_level = 6

# S3 destination settings: bucket name, key prefix, dataset label and AWS region.
bucket = 'wbg-geography01' 
prefix = 'Space2Stats/h3_stats_data'
attribute = "ACLED"
region = 'us-east-1'
# boto3 S3 client bound to `region`.
s3client = boto3.client('s3', region_name=region)

In [6]:
# Load the ACLED event table from `input_acled_file`.
inA = gpd.read_file(input_acled_file)
# Frames of H3 cells at `h3_level`, requested as GeoDataFrames (return_gdf=True).
# Later cells iterate this with .items(), so it is presumably a dict keyed by
# level-0 H3 index -- TODO confirm against h3_helper.generate_lvl0_lists.
h3_0_list = h3_helper.generate_lvl0_lists(h3_level, return_gdf=True, buffer0=False)

Loading pickle file h0_dictionary_of_h6_geodata_frames.pickle: it exists True


In [66]:
# Attach point geometries (EPSG:4326) to the ACLED table, built from its
# longitude/latitude columns. The columns are cast to float before use, and
# gpd.points_from_xy creates all points in one vectorised call instead of
# instantiating one shapely Point per row in a Python loop.
inA_geom = gpd.points_from_xy(
    inA["longitude"].astype(float),
    inA["latitude"].astype(float),
)
inA = gpd.GeoDataFrame(inA, geometry=inA_geom, crs=4326)

In [80]:
# Count ACLED events per H3 cell for every (year, event_type) combination.
#
# Result: `output_results` maps each key of `h3_0_list` to a copy of its frame
# with one extra column per combination, named "<year>_<event_type>", holding
# the number of events that spatially join to each cell (0.0 where none do).
output_results = {}

# The (year, event_type) split does not depend on the level-0 cell, so build it
# once instead of re-grouping the full ACLED table on every outer iteration.
# str() keeps the label join working whether `year` was read as text or as a number.
event_groups = [
    ("_".join(str(part) for part in lbl), curA)
    for lbl, curA in inA.groupby(["year", "event_type"])
]

for h0_idx, inD in h3_0_list.items():
    # Add the count columns to a copy so the frames held in `h3_0_list` are not
    # mutated; re-running this cell then always starts from the same inputs.
    h0_counts = inD.copy()
    for col_name, curA in event_groups:
        curD = gpd.sjoin(inD, curA, how="inner")
        if curD.shape[0] > 0:
            # One row per (cell, event) match -> number of events per shape_id.
            count_d = curD.groupby("shape_id")["event_type"].count()
            # Look the counts up through the shape_id column (not the frame
            # index) so the result is correct regardless of how inD is indexed;
            # cells without any event get 0.
            h0_counts[col_name] = inD["shape_id"].map(count_d).fillna(0)
        else:
            # No event of this group falls in this level-0 cell. The previous
            # code reused the counts left over from an earlier iteration here
            # (or failed with a NameError if none existed yet).
            h0_counts[col_name] = 0.0
    output_results[h0_idx] = h0_counts
    tPrint(f'Processed {h0_idx}')

10:08:48	Processed 8005fffffffffff
10:08:56	Processed 8063fffffffffff
10:09:04	Processed 806dfffffffffff
10:09:11	Processed 801dfffffffffff
10:09:19	Processed 80a7fffffffffff
10:09:27	Processed 80affffffffffff
10:09:35	Processed 809dfffffffffff
10:09:44	Processed 8083fffffffffff
10:09:53	Processed 803ffffffffffff
10:10:01	Processed 802ffffffffffff
10:10:10	Processed 805ffffffffffff
10:10:17	Processed 804ffffffffffff
10:10:24	Processed 80b1fffffffffff
10:10:33	Processed 8055fffffffffff
10:10:41	Processed 8051fffffffffff
10:10:51	Processed 8043fffffffffff
10:10:59	Processed 8027fffffffffff
10:11:08	Processed 80a3fffffffffff
10:11:15	Processed 8099fffffffffff
10:11:23	Processed 8031fffffffffff
10:11:32	Processed 8071fffffffffff
10:11:43	Processed 8033fffffffffff
10:11:53	Processed 801ffffffffffff
10:12:00	Processed 8085fffffffffff
10:12:08	Processed 8009fffffffffff
10:12:16	Processed 80cdfffffffffff
10:12:24	Processed 80d3fffffffffff
10:12:32	Processed 80f1fffffffffff
10:12:40	Processed 8

In [82]:
# concatenate all the results datasets
all_ds = [x for y, x in output_results.items()]
combo_res = pd.concat(all_ds)

In [85]:
# Show the column labels of the combined results frame.
combo_res.columns

Index(['geometry', 'shape_id', 'Battles', 'Explosions/Remote violence',
       'Protests', 'Riots', 'Strategic developments',
       'Violence against civilians', '2020_Battles',
       '2020_Explosions/Remote violence', '2020_Protests', '2020_Riots',
       '2020_Strategic developments', '2020_Violence against civilians',
       '2021_Battles', '2021_Explosions/Remote violence', '2021_Protests',
       '2021_Riots', '2021_Strategic developments',
       '2021_Violence against civilians', '2022_Battles',
       '2022_Explosions/Remote violence', '2022_Protests', '2022_Riots',
       '2022_Strategic developments', '2022_Violence against civilians',
       '2023_Battles', '2023_Explosions/Remote violence', '2023_Protests',
       '2023_Riots', '2023_Strategic developments',
       '2023_Violence against civilians'],
      dtype='object')

In [106]:
# Write one parquet file per year to S3. Each file holds that year's
# "<year>_<event_type>" count columns from `combo_res`, renamed to the bare
# event type.
# NOTE(review): only the count columns are selected -- neither `shape_id` nor
# `geometry` is written as a column, so the H3 id is only recoverable if the
# frame index carries it. Confirm before relying on these files.
tPrint("START")
for yr in range(2020, 2024):
    yr_prefix = f"{yr}_"
    # Match the exact "<year>_" prefix rather than testing for the year as a
    # substring anywhere in the column name.
    sel_cols = [x for x in combo_res.columns if str(x).startswith(yr_prefix)]
    if not sel_cols:
        # Nothing was computed for this year; do not upload an empty table.
        tPrint(f"No columns found for {yr}; skipping")
        continue
    # Strip only the leading prefix (str.replace would remove every occurrence).
    col_names = [x[len(yr_prefix):] for x in sel_cols]
    sel_data = combo_res.loc[:, sel_cols]
    sel_data.columns = col_names
    # `attribute` comes from the configuration cell ("ACLED"), so the folder
    # name is defined in one place.
    out_s3_file = f"s3://{bucket}/{prefix}/{attribute}/{yr}/ACLED_breakdown_event_type.parquet"
    sel_data.to_parquet(out_s3_file)
    tPrint(out_s3_file)
    

16:10:00	START



This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  sel_data.to_parquet(out_s3_file)


16:10:04	s3://wbg-geography01/Space2Stats/h3_stats_data/ACLED/2020/ACLED_breakdown_event_type.parquet
16:10:07	s3://wbg-geography01/Space2Stats/h3_stats_data/ACLED/2021/ACLED_breakdown_event_type.parquet
16:10:11	s3://wbg-geography01/Space2Stats/h3_stats_data/ACLED/2022/ACLED_breakdown_event_type.parquet
16:10:15	s3://wbg-geography01/Space2Stats/h3_stats_data/ACLED/2023/ACLED_breakdown_event_type.parquet


In [104]:
# Display the most recently assigned S3 output path.
out_s3_file

's3://wbg-geography01/Space2Stats/h3_stats_data/ACLED/2020/ACLED_breakdown_event_type.csv'