In [0]:
!pip install xarray
!pip install geopy

In [0]:
import numpy as np
import pandas as pd
import datetime as dt
import xarray as xr
import pickle

### Create "y" from survey data

In [0]:
# Load the survey file (Stata format) into a pandas DataFrame.
survey_path = '/dbfs/FileStore/Myanmar_Survey_ML/data/survey/Assets_household_level.dta'
df = pd.read_stata(survey_path)

In [0]:
# Reference grid: LandScan (population data), stored as a pickled xarray object.
# NOTE: pickle.load can execute arbitrary code from the file — only load trusted files.
landscan_path = '/dbfs/FileStore/Myanmar_Survey_ML/data/geo/landscan/landscan2017_xarray.pickle'
with open(landscan_path, 'rb') as fh:
    da = pickle.load(fh)

In [0]:
# Map each household's surveyed coordinates to the closest lat/lon in LandScan.
# Column 0 (s0q22) is used as latitude and column 1 (s0q23) as longitude.
locations = df[['s0q22', 's0q23']].to_numpy()

# Wrapping the indexers in DataArrays that share one dimension makes xarray do
# point-wise (vectorized) selection: exactly one grid cell per household.
# Passing plain arrays instead selects the full N x N lat/lon outer product,
# which wastes memory and time; the `locs.lat` / `locs.lon` values are the same.
household_lat = xr.DataArray(locations[:, 0], dims='household')
household_lon = xr.DataArray(locations[:, 1], dims='household')
locs = da.sel(lon=household_lon, lat=household_lat, method='nearest')

In [0]:
# Put the snapped grid coordinates back into the survey data.
# `assign` places the arrays positionally (row i receives value i), so it does
# not rely on df having a default 0..N-1 index, and re-running the cell
# overwrites 'lon'/'lat' instead of appending duplicate columns as pd.concat did.
df = df.assign(lon=locs.lon.values, lat=locs.lat.values)

###### Configurable parameters: poverty line, FGT exponent, and the weight/expenditure column names

In [0]:
# --- Survey column names ---
# CHECK that these are the correct weight and expenditure columns before running.
expenditure = "r_totex_pad_v3"
weight = "hhweight"

# --- Poverty measure settings ---
# Threshold the expenditure column is compared against (same units as that column).
poverty_line = 1302.951
# Exponent applied to the poverty gap ratio for the 'ya' measure; change as needed.
fgt_a = 0.5

In [0]:
# Poverty gap ratio per household: (poverty_line - expenditure) / poverty_line.
# Households over the poverty line give a negative gap, which clip() turns into 0;
# households under the poverty line keep a positive float.
# `assign` overwrites an existing 'y', so re-running this cell is safe.
df = df.assign(y=(poverty_line - df[expenditure]).clip(lower=0) / poverty_line)

# Build the working frame as a fresh DataFrame rather than a slice of df, so
# later column additions cannot raise SettingWithCopyWarning. The survey month
# is converted here instead of inside df: overwriting df['s0q20_mm'] in place
# made a second run of this cell fail on int(<date>).
# NOTE(review): the year 2017 is hard-coded — confirm all interviews were in 2017.
# NOTE(review): 'time' holds datetime.date objects, whereas the ACLED 'month'
# column in this notebook is built with datetime.datetime — confirm the two are
# reconciled before any join on them.
d = pd.DataFrame({
    'time': df['s0q20_mm'].apply(lambda month: dt.date(2017, int(month), 1)),
    'lat': df['lat'],
    'lon': df['lon'],
    'weight': df[weight],
    'y': df['y'],
})

# Households with a missing expenditure or weight cannot contribute to a
# weighted average. Previously a NaN gap was truthy in np.where, so such
# households were counted as poor in y0 and turned the group's ya into NaN.
d = d.dropna(subset=['y', 'weight'])

# y0: poverty indicator (1.0 if under the line, else 0.0).
# ya: gap ratio raised to fgt_a for poor households, 0.0 otherwise. The explicit
# mask keeps non-poor households at exactly 0 even when fgt_a == 0
# (0 ** 0 would otherwise evaluate to 1).
is_poor = d['y'] > 0
d = d.assign(
    y0=is_poor.astype(float),
    ya=np.where(is_poor, d['y'] ** fgt_a, 0.0),
)

# Weighted average by lat/lon location and month. Selecting the value columns
# explicitly keeps the grouping keys out of the frame handed to the lambda.
ds = (
    d.groupby(['time', 'lat', 'lon'])[['y0', 'ya', 'weight']]
    .apply(
        lambda grp: pd.Series(
            [
                np.average(grp['y0'], weights=grp['weight']),
                np.average(grp['ya'], weights=grp['weight']),
            ],
            index=('y0', 'ya'),
        )
    )
    .reset_index()
)

In [0]:
# check: display the aggregated frame (one row per time/lat/lon group with its y0 and ya)
ds

In [0]:
# Save the aggregated survey target (`ds`) as a pandas pickle.
# The previous code dumped `da` (the LandScan reference grid) by mistake, so
# y_panda.pickle never contained the y0/ya values computed for the survey.
with open('/dbfs/FileStore/Myanmar_Survey_ML/data/survey/y_panda.pickle', 'wb') as handle:
    pickle.dump(ds, handle, protocol=pickle.HIGHEST_PROTOCOL)

### ACLED feature engineering

In [0]:
# Reference grid for the ACLED features: LandScan (population data), pickled xarray object.
# NOTE: pickle.load can execute arbitrary code from the file — only load trusted files.
landscan_path = '/dbfs/FileStore/Myanmar_Survey_ML/data/geo/landscan/landscan2017_xarray.pickle'
with open(landscan_path, 'rb') as fh:
    da = pickle.load(fh)

In [0]:
# Read the ACLED table from the SQL Server warehouse through Spark.
from pyspark.sql import SparkSession
from pyspark.dbutils import DBUtils

spark = SparkSession.builder.getOrCreate()
dbutils = DBUtils(spark)

# Connection details come from the secret scope, so nothing sensitive is hard-coded here.
database_host, database_port, user, password = (
    dbutils.secrets.get(scope='warehouse_scope', key=secret_key)
    for secret_key in ('database_host', 'database_port', 'user', 'password')
)

# Target database/table and the JDBC connection string built from them.
table = "dbo.CRD_ACLED"
database_name = "UNDP_DW_CRD"
url = f"jdbc:sqlserver://{database_host}:{database_port};databaseName={database_name};"

# Configure the reader step by step, then load the whole table as a Spark DataFrame.
acled_reader = spark.read.format("com.microsoft.sqlserver.jdbc.spark")
acled_reader = acled_reader.option("url", url).option("dbtable", table)
acled_reader = acled_reader.option("user", user).option("password", password)
df_all = acled_reader.load()

In [0]:
# Keep only rows with ACLED_Year 2017 and CountryFK 187, then preview ten of them.
# NOTE(review): 187 is presumably the warehouse key for Myanmar — confirm.
in_year = df_all["ACLED_Year"] == 2017
in_country = df_all["CountryFK"] == 187
df = df_all.filter(in_year & in_country)
display(df.limit(10))

In [0]:
# Bring the filtered Spark frame into pandas, keeping only the columns used below.
acled_columns = [
    'TimeFK_Event_Date',
    'ACLED_Event_Type',
    'ACLED_Latitude',
    'ACLED_Longitude',
    'ACLED_Geo_Precision',
    'ACLED_Fatalities',
]
acled = df.toPandas()[acled_columns]

# 'month' = first day of the event's month. The date key is read through its
# string form: characters 1-4 are taken as the year and 5-6 as the month.
acled = acled.assign(
    month=acled['TimeFK_Event_Date'].apply(
        lambda key: dt.datetime(int(str(key)[:4]), int(str(key)[4:6]), 1)
    )
)

In [0]:
# Flatten the LandScan grid into a pandas table for easy looping,
# keeping only the rows whose 'landscan' value is not NaN.
lscn = da.to_dataframe().reset_index()
has_value = lscn['landscan'].notna()
lscn = lscn[has_value]
lscn.head()

In [0]:
from geopy.distance import geodesic
from geopy.distance import distance

# Max distance of influence (km) by ACLED geo-precision code;
# any code other than 1 or 2 falls back to 50 km.
radius_km_by_precision = {1: 5, 2: 20}

col = []
for _, event in acled.iterrows():
    radius_km = radius_km_by_precision.get(event['ACLED_Geo_Precision'], 50)

    # Bounding box 5% larger than the radius: travel that far from the event
    # due north/south (to get latitude limits) and due east/west (longitude limits).
    origin = (event['ACLED_Latitude'], event['ACLED_Longitude'])
    reach = distance(kilometers=radius_km*1.05)
    north_lat = reach.destination(point=origin, bearing=0)[0]
    east_lon = reach.destination(point=origin, bearing=90)[1]
    south_lat = reach.destination(point=origin, bearing=180)[0]
    west_lon = reach.destination(point=origin, bearing=-90)[1]

    # Cheap pre-filter on the box so the exact distance runs on few grid cells.
    in_box = (
        (lscn['lat'] > south_lat) & (lscn['lat'] < north_lat)
        & (lscn['lon'] > west_lon) & (lscn['lon'] < east_lon)
    )
    candidates = lscn.loc[in_box, :]

    # Exact geodesic distance: keep the (lat, lon) of every cell within the radius.
    near = [
        (cell['lat'], cell['lon'])
        for _, cell in candidates.iterrows()
        if geodesic(origin, (cell['lat'], cell['lon'])).km <= radius_km
    ]
    col.append(near)

# One list of nearby grid cells per ACLED event, stored as a column.
acled['lscn'] = col

In [0]:
# One row per (event, nearby grid cell): every element of an 'lscn' list gets its own row.
a = acled.explode(column='lscn')

In [0]:
# number of events, normalized by geo precision
def event_count(sub):
    """Count the events in `sub`, down-weighting less precise locations.

    An event counts as 1 when ACLED_Geo_Precision is 1, as 1/2 when it is 2 and
    as 1/3 when it is 3. Rows with any other precision value are not counted.

    Parameters: sub -- DataFrame with an 'ACLED_Geo_Precision' column.
    Returns: the weighted count as a float.
    """
    precision = sub['ACLED_Geo_Precision']
    # Same summation order as far + med + near (precision 3, then 2, then 1).
    far, med, near = (int((precision == level).sum()) / level for level in (3, 2, 1))
    return far + med + near

# number of fatalities, normalized by geo precision
def fatal_count(sub):
    """Sum the fatalities in `sub`, down-weighting less precise locations.

    Fatalities are summed per ACLED_Geo_Precision value and the totals are
    divided by that value (1, 2 or 3) before being added together. Rows with
    any other precision value are ignored.

    Parameters: sub -- DataFrame with 'ACLED_Geo_Precision' and 'ACLED_Fatalities'.
    Returns: the weighted fatality total.
    """
    precision = sub['ACLED_Geo_Precision']
    fatalities = sub['ACLED_Fatalities']
    # Precision 3, 2, 1 in that order, matching far + med + near.
    weighted = [fatalities[precision == level].sum() / level for level in (3, 2, 1)]
    return weighted[0] + weighted[1] + weighted[2]

In [0]:
# 'Normalized' event count for every (grid cell, month) pair.
# The unnamed result column (0) produced by reset_index is renamed to 'event_count'.
ev = (
    a.groupby(['lscn', 'month'])
    .apply(event_count)
    .reset_index()
    .rename(columns={0: 'event_count'})
)

In [0]:
# groupby lat/lon coordinates and month - get 'normalized' fatality counts
ft = a.groupby(['lscn','month']).apply(lambda sub: fatal_count(sub)).reset_index()
# Rename the result column of `ft` itself. The previous code renamed `ev` here,
# which threw away the fatality totals and left no 'fatal_count' column to merge.
ft = ft.rename(columns={0:'fatal_count'})

In [0]:
# Combine event and fatality counts; pd.merge joins on the columns both frames share.
merged = pd.merge(ev, ft)
# 'lscn' holds (lat, lon) pairs: split them into two columns and keep the final layout.
mrg = merged.assign(
    lat=[cell[0] for cell in merged['lscn']],
    lon=[cell[1] for cell in merged['lscn']],
)[['lat', 'lon', 'month', 'event_count', 'fatal_count']]
mrg

In [0]:
# Persist the merged ACLED features (`mrg`) as a pandas pickle.
acled_out_path = '/dbfs/FileStore/Myanmar_Survey_ML/data/survey/acled_panda.pickle'
with open(acled_out_path, 'wb') as out_file:
    pickle.dump(mrg, out_file, protocol=pickle.HIGHEST_PROTOCOL)