# Step 1: Migration Detection

This script was designed for use in Project Artemis. The objective was to identify instances of migration in the SafeGraph data, which could then be correlated with past famines so that relationships between the two could be identified.

In this script, we work through SafeGraph record summaries to try to define a 'home range' for each user. Large movements in this home range indicate migration. This process generates the data layer, which can then be passed to the visualization script for finalization.

Library imports

In [7]:
import pandas as pd
import numpy as np 
import shapely 
import geopandas as gpd
import sys, os, time
from shapely.geometry import Point
from shapely.geometry import MultiPolygon

Settings

In [8]:
# Root working folder for the project; the output file is written beneath it.
basepath = r'C:\Users\charl\Documents\GOST\SafeGraph'

# Folder that holds the raw SafeGraph input files (a sub-folder of basepath).
datapath = basepath + '\\SafeGraph'

# CRS definition for unprojected longitude / latitude coordinates (EPSG:4326).
WGS84 = dict(init='epsg:4326')

Generate a list of files by walking through the datapath

In [9]:
# List the contents of the top-level data folder.
#
# The original bottom-up walk (topdown=False) yields the top directory last, so
# after the loop `root`, `dirs` and `files` always described `datapath` itself.
# Taking the first entry of a top-down walk gives the same three values without
# traversing every sub-folder. The fallback tuple keeps the three names defined
# (with empty listings) when `datapath` does not exist, instead of leaving them
# unbound and failing later with a NameError.
root, dirs, files = next(os.walk(datapath), (datapath, [], []))

Optimised list flattening function

In [10]:
def flatten(l, ltypes=(list, tuple)):
    """Flatten arbitrarily nested sequences into a single flat sequence.

    Any element that is an instance of ``ltypes`` is expanded in place
    (recursively, depth first); every other element is kept as-is. Empty
    nested containers vanish. The result is rebuilt with the type of the
    input, so a list gives a list and a tuple gives a tuple.
    """
    container = type(l)
    flat = []

    # Stack of iterators: the top one is the container currently being read.
    pending = [iter(l)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, ltypes):
                # Descend into the nested container; resume the parent later.
                pending.append(iter(item))
                break
            flat.append(item)
        else:
            # Current container exhausted - go back to its parent.
            pending.pop()

    return container(flat)

Main function to be run against input dataframes. See in-line comments for details

In [13]:
def _utm_epsg(longitude, latitude):
    """Return the EPSG code of the WGS84 / UTM zone containing a lon/lat point."""
    # UTM zones are 6 degrees wide and numbered 1-60 eastwards from 180W.
    zone = int((longitude + 180) // 6) + 1
    zone = min(max(zone, 1), 60)  # longitude == 180 would otherwise give zone 61
    # EPSG 326xx are the northern-hemisphere zones, 327xx the southern ones.
    return (32600 if latitude >= 0 else 32700) + zone


def _status_frame(fil, i, status):
    """Build the one-row result returned when no migration is detected."""
    return pd.DataFrame({'person': 'person_%s' % i,
                         'file': fil,
                         'status': status,
                         }, index=[1])


def Main(fil, i, hr_daythresh, hr_distthresh, migr_dist_thresh):
    """Detect migration for one user file by looking for distant home ranges.

    The file is summarised to one row per (UTC) calendar day. Runs of days
    that are consecutive and whose daily centroid moved less than
    ``hr_distthresh`` form a home-range block; the convex hull of all points in
    a block is the home range. Migration is declared when the centroids of two
    successive home ranges are more than ``migr_dist_thresh`` apart.

    Relies on the module-level names ``root`` (folder containing ``fil``),
    ``WGS84`` (lon/lat CRS definition) and ``flatten``.

    Parameters
    ----------
    fil : str
        File name of the input .csv, relative to ``root``. It must contain the
        columns latitude, longitude, horizontal_accuracy and utc_timestamp
        (UNIX seconds).
    i : int
        Running number used to build the ``person_<i>`` identifier.
    hr_daythresh : int
        Number of consecutive days needed for a home range.
    hr_distthresh : float
        Maximum day-to-day centroid displacement inside a home range, in the
        units of the UTM projection (metres).
    migr_dist_thresh : float
        Minimum distance between successive home-range centroids (metres) for
        a migration to be declared.

    Returns
    -------
    GeoDataFrame
        When migration is detected: one row per migrating home range with the
        columns geometry, end_loc, start_loc, hr_cent_epsg, prev_hr_cent_epsg,
        distance, status, file and person.
    DataFrame
        Otherwise: a single row with the columns person, file and status.
    """
    # read input .csv and keep only the columns we need
    df = pd.read_csv(os.path.join(root, fil))
    df = df[['latitude', 'longitude', 'horizontal_accuracy', 'utc_timestamp']]

    # keep only observations whose horizontal accuracy is better than 100m
    df = df.loc[df.horizontal_accuracy < 100].copy()

    # nothing usable left - no home range can be established
    if df.empty:
        return _status_frame(fil, i, 'unable to establish homerange')

    # calendar day of each observation. A real date (not a 'dd.mm' string) is
    # used so that grouping sorts chronologically across months and years.
    df['date'] = pd.to_datetime(df['utc_timestamp'], unit='s').dt.normalize()

    # generate geometry objects from longitude and latitude columns
    df['Point'] = df.apply(lambda x: Point(x.longitude, x.latitude), axis=1)

    # grouping function: bounding box and list of points for one day
    def grouper(x):
        y = pd.DataFrame()
        y['minx'] = [x.longitude.min()]
        y['maxx'] = [x.longitude.max()]
        y['miny'] = [x.latitude.min()]
        y['maxy'] = [x.latitude.max()]
        y['Pointbag'] = [list(x.Point)]
        return y

    # one row per day, sorted by date
    daydf = df.groupby('date').apply(grouper)
    daydf['ID'] = 'person_%s' % i
    daydf = daydf.reset_index()

    # running day number; ordinals stay consecutive across month boundaries
    daydf['day'] = daydf['date'].apply(lambda d: d.toordinal())

    # rough centroid: centre of the day's bounding box
    daydf['centroid'] = daydf.apply(
        lambda x: Point(((x['minx'] + x['maxx']) / 2, (x['miny'] + x['maxy']) / 2)),
        axis=1)

    # note - not yet projected - dummy column which is reprojected below
    daydf['centroid_utm'] = daydf['centroid']

    ### Calculate daily displacement of centroid in metres
    # UTM zone of the median location: the zone number comes from the
    # longitude and the hemisphere from the latitude.
    utm_code = _utm_epsg(df.longitude.median(), df.latitude.median())

    # generate geodataframe and project it to UTM
    gdaydf = gpd.GeoDataFrame(daydf, geometry='centroid_utm', crs=WGS84)
    gdaydf = gdaydf.to_crs(epsg=utm_code)

    # daily change in centroid ('Disp') - (today - yesterday); the first day is
    # compared with itself and therefore has zero displacement
    gdaydf['prev_centroid_utm'] = (
        gdaydf['centroid_utm'].shift(periods=1).fillna(gdaydf['centroid_utm']))
    gdaydf['Disp'] = [
        today.distance(yesterday)
        for today, yesterday in zip(gdaydf['centroid_utm'], gdaydf['prev_centroid_utm'])
    ]

    # Here we get to the meat of defining a home range.
    # A day closes a consecutive block when it and the previous
    # (hr_daythresh - 1) rows span exactly hr_daythresh calendar days. Rows are
    # unique, sorted days, so this is an exact test (unlike checking whether the
    # summed day numbers are divisible by the window size).
    span = gdaydf['day'] - gdaydf['day'].shift(hr_daythresh - 1)
    # NOTE(review): the first (hr_daythresh - 1) rows have no full window; they
    # are treated as consecutive, which matches the original behaviour
    # (fillna(0) made them pass the divisibility test). Confirm this is wanted.
    no_full_window = span.isna()
    gdaydf['consec_block'] = (no_full_window | (span == hr_daythresh - 1)).astype(int)

    # 1 if displacement is less than the daily movement threshold, else 0
    gdaydf['Dispmask'] = (gdaydf['Disp'] < hr_distthresh).astype(int)

    # a day is eligible only when both flags are set
    gdaydf['eligible'] = ((gdaydf['Dispmask'] + gdaydf['consec_block']) == 2).astype(int)

    # number the runs of eligible days 1..k; non-eligible days get flag 0
    eligible = gdaydf['eligible'].values
    run_starts = np.diff(np.insert(eligible, 0, 0)) == 1
    gdaydf['flag'] = run_starts.cumsum() * eligible

    # number of home-range blocks. Taken from the highest flag so that a series
    # in which every day is eligible (no flag 0 at all) still counts its block.
    n_blocks = int(gdaydf['flag'].max())

    if n_blocks == 0:
        return _status_frame(fil, i, 'unable to establish homerange')

    # convex hull of all points belonging to each home-range block
    homeranges = [
        gpd.GeoSeries(
            flatten(list(gdaydf.loc[gdaydf['flag'] == j, 'Pointbag']))
        ).unary_union.convex_hull
        for j in range(1, n_blocks + 1)
    ]

    # If only one home range, then by definition no migration
    if len(homeranges) == 1:
        return _status_frame(fil, i, '1 homerange established, no migration')

    # At least 2 home ranges - exciting! Check whether successive home ranges
    # are distant by measuring the distance between their centroids in UTM.
    homeranges = pd.DataFrame({'geometry': homeranges})
    homeranges = gpd.GeoDataFrame(homeranges, crs=WGS84, geometry='geometry')
    # lon/lat centroids are kept for reporting (start_loc / end_loc)
    homeranges['hr_cent'] = homeranges.centroid
    homeranges['prev_hr_cent'] = (
        homeranges['hr_cent'].shift(periods=1).fillna(homeranges['hr_cent']))
    homeranges = homeranges.to_crs(epsg=utm_code)
    homeranges['hr_cent_epsg'] = homeranges.centroid
    homeranges['prev_hr_cent_epsg'] = (
        homeranges['hr_cent_epsg'].shift(periods=1).fillna(homeranges['hr_cent_epsg']))
    homeranges['Disp'] = [
        cur.distance(prev)
        for cur, prev in zip(homeranges['hr_cent_epsg'], homeranges['prev_hr_cent_epsg'])
    ]

    # at least 2 homeranges, but they aren't far enough apart for migration to
    # be declared with certainty
    if homeranges.Disp.max() <= migr_dist_thresh:
        return _status_frame(fil, i, 'homerange(s) established, no migration')

    # migration detected - the distance between homerange centroids exceeds our
    # threshold distance for declaring a migration
    migr_homeranges = homeranges.loc[homeranges.Disp > migr_dist_thresh].copy()
    migr_homeranges['status'] = 'migration detected'
    migr_homeranges['file'] = fil
    migr_homeranges['person'] = 'person_%s' % i
    migr_homeranges = migr_homeranges.rename(
        columns={'hr_cent': 'end_loc', 'prev_hr_cent': 'start_loc', 'Disp': 'distance'})
    return migr_homeranges

Having defined this fairly complex process for identifying migration via material shifts in homeranges, we apply this to each file in the files list object generated at the start of the script:

In [14]:
# Model parameters
hr_daythresh = 5          # minimum length of time (days) for a homerange to be declared
hr_distthresh = 1000      # maximum daily centroid displacement within a homerange
migr_dist_thresh = 50000  # migration threshold distance - 50km here

# Run the detection on every file found in the data folder. The position of the
# file in the list doubles as the person number; results are collected in 'outs'.
outs = []
for i, fil in enumerate(files):
    outs.append(Main(fil, i, hr_daythresh, hr_distthresh, migr_dist_thresh))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  del sys.path[0]


We prepare to output this datafile to disk, for visualization in the next script

In [None]:
# concatenate the outs list into a dataFrame, confusingly also called 'out'
# columns written to disk, in output order
out_columns = ['person', 'status', 'start_loc', 'end_loc', 'distance', 'file']

# Concatenate the per-file results in 'outs' into one dataframe, 'out'.
# sort=False keeps the column order and avoids the pandas sort FutureWarning.
if outs:
    out = pd.concat(outs, sort=False).reset_index(drop=True)
else:
    # no input files were processed - still write a header-only file
    out = pd.DataFrame(columns=out_columns)

# Choose the subset of columns to output. reindex (rather than out[[...]]) is
# used because start_loc / end_loc / distance only exist when at least one file
# produced a migration; missing columns are written as empty values.
out = out.reindex(columns=out_columns)

# make sure the target folder exists, then send to file
outdir = os.path.join(basepath, 'Output')
os.makedirs(outdir, exist_ok=True)
out.to_csv(os.path.join(outdir, 'output.csv'))