In [1]:
%load_ext autoreload
%autoreload 2
%cd C:\MAD4AG
%matplotlib inline

C:\MAD4AG


In [2]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from scipy.spatial import distance
import pyproj as proj


import warnings

warnings.filterwarnings('ignore')

  shapely_geos_version, geos_capi_version_string


## Filter people by
- participation in a work activity
- county
- urban/rural
- commute distance (whether it is above or below the median in the survey)
- average trip distance

In [3]:
# Load the stop clusters and keep rows with holiday_s != 1 and weekday_s == 1
# (presumably non-holiday weekdays), then keep one row per (uid, cluster).
file_name = './dbs/intermediate/stops_1_new.parquet'

df_clusters = (
    pd.read_parquet(file_name)
    .loc[lambda stops: (stops.holiday_s != 1) & (stops.weekday_s == 1)]
    .drop(columns=['holiday_s', 'weekday_s'])
    .drop_duplicates(subset=['uid', 'cluster'], keep='first')
)

In [4]:
# Read the inferred home clusters, keep the first row per uid, and flag every
# remaining row as a home cluster (home_potential = 1).
df_h = (
    pd.read_parquet('./dbs/intermediate/home_inference.parquet')
    .drop_duplicates(subset='uid', keep='first')
    .assign(home_potential=1)
)

In [5]:
# Read the inferred work clusters and keep the first row per uid.
# Note: no filtering on home locations happens in this cell.
df_w = (
    pd.read_parquet('./dbs/intermediate/work_inference.parquet')
    .drop_duplicates(subset='uid', keep='first')
)

In [6]:
# Keep only the id/coordinate columns, and only individuals that appear in the
# home-inference table.
has_home = df_clusters.uid.isin(df_h.uid.unique())
df_clusters = df_clusters.loc[has_home, ['uid', 'cluster', 'cluster_lat', 'cluster_lng']]

In [7]:
# Attach the home flag to the matching (uid, cluster) rows and turn it into the
# activity-type label: 'home' where the flag is 1, missing everywhere else.
home_flags = df_h[['uid', 'cluster', 'home_potential']]

df_clusters = (
    df_clusters
    .merge(home_flags, on=['uid', 'cluster'], how='left')
    .rename(columns={'home_potential': 'act_type'})
)

df_clusters['act_type'] = df_clusters['act_type'].replace(1, 'home')

In [8]:
# Add work information to the clusters.
# Left-join the work flag on (uid, cluster); rows without a match get NaN.
df_clusters = pd.merge(
    df_clusters,
    df_w[['uid', 'cluster', 'work_potential']],
    on=['uid', 'cluster'],
    how='left',
)

# Label work clusters with a single .loc assignment. The previous chained form
# (df['act_type'][mask] = ...) writes through an intermediate object, which is
# unreliable and does nothing at all under pandas copy-on-write.
# NOTE(review): a cluster that is both the home and the work cluster ends up
# labelled 'work' here, leaving that uid without a 'home' row - confirm intended.
df_clusters.loc[df_clusters.work_potential == 1, 'act_type'] = 'work'

df_clusters = df_clusters.drop(columns=['work_potential'])

# Everything that is neither home nor work is an 'other' activity cluster.
df_clusters['act_type'] = df_clusters['act_type'].fillna('other')


## Calculate the number of 'other' clusters and the typical (median) distance between the home cluster and all other activity clusters.

In [9]:
# Set up the coordinate reference systems.
# Source: WGS84 geographic coordinates. Target: EPSG:3006 (SWEREF99 TM), a
# projected CRS, so that Euclidean distances on X/Y are meaningful.
# (The name crs_bng is kept only so any later reference to it still resolves.)
crs_wgs = proj.CRS.from_epsg(4326)
crs_bng = proj.CRS.from_epsg(3006)

# Proj(init=...) and the module-level pyproj.transform() are deprecated; a
# Transformer is the supported replacement. always_xy=True keeps the
# (lng, lat) -> (X, Y) argument/result order that the legacy call used.
wgs_to_projected = proj.Transformer.from_crs(crs_wgs, crs_bng, always_xy=True)

# Project every cluster centroid; X and Y are added as new columns.
df_clusters['X'], df_clusters['Y'] = wgs_to_projected.transform(
    df_clusters.cluster_lng.values,
    df_clusters.cluster_lat.values,
)

In [10]:
# Persist the labelled clusters (including the projected X/Y columns).
df_clusters.to_parquet(path='./dbs/intermediate/df_selected_clusters.parquet')

In [11]:
# Count, for every individual, how many of their clusters are labelled 'other'.
is_other = df_clusters['act_type'].eq('other')

df_clusters_other_count = (
    is_other
    .groupby(df_clusters['uid'])
    .sum()
    .reset_index(name='other_count')
)

In [12]:
def avg_dist(data):
    """Median Euclidean distance from the home cluster to all non-home clusters.

    Intended to be applied to the rows of a single individual (one ``uid``
    group). Despite the name, the statistic is the median, not the mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``X`` and ``Y`` (coordinates in which
        Euclidean distance is meaningful) and ``act_type``.

    Returns
    -------
    pandas.Series
        One entry, ``avg_dist`` (float, same unit as ``X``/``Y``). It is NaN
        when the group has no ``'home'`` row or no non-home rows.
    """
    home_xy = data.loc[data.act_type == 'home', ['X', 'Y']]
    other_xy = data.loc[data.act_type != 'home', ['X', 'Y']]

    # Without a home or without any other cluster there is nothing to measure.
    # Returning NaN (instead of raising on the empty result) keeps a groupby
    # apply over many individuals running; NaN rows can be dropped afterwards.
    if home_xy.empty or other_xy.empty:
        return pd.Series(dict(avg_dist=np.nan))

    pairwise = distance.cdist(home_xy, other_xy, metric='euclidean')

    # Reduce over all home-other pairs to a true scalar. With a single home
    # row this equals the per-row median, and it avoids converting a
    # 1-element array with float(), which NumPy has deprecated.
    return pd.Series(dict(avg_dist=float(np.median(pairwise))))

In [13]:
# Register progress_apply on pandas objects, then evaluate avg_dist once per
# individual while showing a progress bar.
tqdm.pandas()
df_clusters_avg_dist = (
    df_clusters
    .groupby('uid')
    .progress_apply(avg_dist)
    .reset_index()
)

  0%|          | 0/280996 [00:00<?, ?it/s]

## Calculate the commuting distance between the home cluster and the work cluster.

In [14]:
def com_dist(data):
    """Mean Euclidean distance from the home cluster to the work cluster(s).

    Intended to be applied to the rows of a single individual (one ``uid``
    group).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``X`` and ``Y`` (coordinates in which
        Euclidean distance is meaningful) and ``act_type``.

    Returns
    -------
    pandas.Series
        One entry, ``com_dist`` (float, same unit as ``X``/``Y``). It is NaN
        when the group has no ``'home'`` row or no ``'work'`` row.
    """
    home_xy = data.loc[data.act_type == 'home', ['X', 'Y']]
    work_xy = data.loc[data.act_type == 'work', ['X', 'Y']]

    # No home or no work cluster means no commute can be measured. Returning
    # NaN (instead of raising on the empty result) keeps a groupby apply over
    # many individuals running.
    if home_xy.empty or work_xy.empty:
        return pd.Series(dict(com_dist=np.nan))

    pairwise = distance.cdist(home_xy, work_xy, metric='euclidean')

    # Reduce over all home-work pairs to a true scalar. With one home and one
    # work row this is simply their distance, and it avoids converting a
    # 1-element array with float(), which NumPy has deprecated.
    return pd.Series(dict(com_dist=float(np.mean(pairwise))))

In [15]:
# Register progress_apply on pandas objects, then evaluate com_dist once per
# individual while showing a progress bar.
tqdm.pandas()
df_clusters_com_dist = (
    df_clusters
    .groupby('uid')
    .progress_apply(com_dist)
    .reset_index()
)

  0%|          | 0/280996 [00:00<?, ?it/s]

In [24]:
# Load the individual weights table.
# Alternative input (disabled): ./dbs/intermediate/indi_weights_no_trim.parquet
# Disabled clean-up step, kept for reference:
#   df_act = df_act.drop(columns=['avg_dist', 'com_dist'])
df_act = pd.read_parquet('./dbs/intermediate/indi_weights.parquet')

# Derive two codes from fixed positions of the 'Deso' string: the first two
# characters and the fifth character (presumably county and an urban/rural
# class - verify against the Deso code specification).
deso_code = df_act['Deso'].str
df_act = df_act.assign(
    county=deso_code.slice(None, 2),
    urban_density=deso_code.get(4),
)

In [25]:
# Attach the three per-individual measures. The default inner joins keep only
# uids that are present in every table.
df_act = (
    df_act
    .merge(df_clusters_other_count, on='uid')
    .merge(df_clusters_avg_dist, on='uid')
    .merge(df_clusters_com_dist, on='uid')
)

In [26]:
# Discard individuals for whom no home-to-cluster distance could be computed.
df_act = df_act.dropna(subset=['avg_dist'])

In [27]:
# Select the output columns and expose work_potential under the name 'commute'.
output_columns = [
    'uid', 'work_potential', 'Deso', 'county', 'urban_density',
    'other_count', 'avg_dist', 'com_dist', 'wt', 'wt_p',
]
df_act = df_act[output_columns].rename(columns={'work_potential': "commute"})

In [28]:

# Write the final per-individual table.
# Alternative output path (disabled): ./dbs/intermediate/indi_weights_no_trim.parquet
df_act.to_parquet(path='./dbs/intermediate/indi_weights_distance.parquet')
