In [1]:
import pandas as pd
import numpy as np
import datareader.cleaner.lemas as cl
import simnet.similarity
import simnet.variety

from diffusion.simple_diff import get_diffusion_diff_dynamic

from tqdm import tqdm

In [2]:
## Get the attribute data for agencies
INDEX_COL = "ORI9"
# Percentile of the per-column variety scores used as the cut-off below.
# NOTE(review): the dependent names say "p75" / "_75" but the value used has
# been 50 (the median) -- confirm which one is intended before changing it.
VARIETY_PERCENTILE = 50

df, cat_cols, con_cols = cl.lemas_df_cols("Imputed_Lemas.csv")
# Drop rows whose identifier is a single blank character.
df = df[df[INDEX_COL] != ' ']

all_nodes = df[INDEX_COL]

## Define the similarity metric
cosine_sim_8 = simnet.similarity.CosineSimilarity(0.8)
cosine_sim_7 = simnet.similarity.CosineSimilarity(0.7)

## Get the entropy-filtered columns
# Score every categorical column; the walrus filter keeps only columns whose
# score is truthy (i.e. drops scores of 0 / None).
cat_col_variety_full = {
    col: v
    for col in cat_cols
    if (v := simnet.variety.get_discrete_variety_score(df[col]))
}
variety_p75 = np.percentile(list(cat_col_variety_full.values()), VARIETY_PERCENTILE)
# Keep the columns scoring strictly above the cut-off, then append the two
# continuous columns that are always included.
cat_col_variety_75 = [col for col, v in cat_col_variety_full.items() if v > variety_p75]
cols_to_include = cat_col_variety_75 + ["log_pop", "log_budget"]


In [3]:
# Preview only: the re-indexed frame is displayed but not assigned, so `df`
# itself still carries ORI9 as a regular column after this cell.
df.set_index(keys="ORI9")

Unnamed: 0_level_0,Unnamed: 0,LEAR_ID,AGENCYNAME,CITY,ZIPCODE,STATE,COUNTY,FIPS,POPSERVED,POPGROUP,...,ISSU_ADDR_SCH,ISSU_ADDR_SWAT,ISSU_ADDR_TERROR,ISSU_ADDR_VIC,FINALWGT,NEW_TOT_HIRES,NEW_TOT_SEP,FINALWGT_NTH_NTS,log_pop,log_budget
ORI9,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CA0570100,0,635592,DAVIS POLICE DEPARTMENT,DAVIS,95618,CA,YOLO,6113,68111,5,...,3,2,3,2,7.185185,5.703704,3.629630,7.533981,0.660713,0.783248
NJ0091200,1,645110,WEST NEW YORK POLICE,WEST NEW YORK,7093,NJ,HUDSON,34017,53343,5,...,3,2,2,3,1.131206,31.629433,26.578014,1.183673,0.546056,0.674355
WV0310400,2,631270,WESTOVER POLICE DEPARTMENT,WESTOVER,26501,WV,MONONGALIA,54061,4243,8,...,5,5,5,5,7.304746,1.027363,0.634328,7.607534,-0.641567,-0.694089
WI0570200,3,631316,BARABOO POLICE DEPARTMENT,BARABOO,53913,WI,SAUK,55111,12173,7,...,1,2,3,3,6.913793,3.047414,2.344828,7.225225,-0.147114,-0.001664
WI0137400,4,631684,OREGON POLICE DEPARTMENT,OREGON,53575,WI,DANE,55025,3334,8,...,2,5,5,3,6.978166,1.711790,1.235808,7.347126,-0.754676,-0.207605
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NV0020100,2779,645361,LAS VEGAS METRO POLICE DEPARTMENT,LAS VEGAS,89106,NV,CLARK,32003,1592178,1,...,4,1,1,2,1.131206,354.000000,216.000000,1.183673,2.139320,2.334349
CA0190000,2780,635749,LOS ANGELES COUNTY SHERIFF'S DEPARTMENT,MONTEREY PARK,91754,CA,LOS ANGELES,6037,10137915,1,...,2,1,1,2,1.321033,460.000000,1694.000000,1.415020,3.007789,3.131008
ILCPD0000,2781,638583,CHICAGO POLICE DEPARTMENT,CHICAGO,60653,IL,COOK,17031,2704958,1,...,2,1,1,2,1.131206,595.000000,506.000000,1.183673,2.387959,2.763773
CA0194200,2782,635751,LOS ANGELES POLICE DEPARTMENT,LOS ANGELES,90012,CA,,6037,3976322,1,...,2,1,1,2,1.131206,760.000000,716.000000,1.183673,2.568706,2.782484


In [4]:
## Get the State dynamic
nibrs_df = pd.read_parquet("DATA/map_plot.pq")

## Filter those that are within the attribute data
# Result: one row per ORI, one column per year; a cell is True when that
# ORI/year has "NUMBER OF MONTHS REPORTED" > 0 and False otherwise (including
# ORI/year pairs that have no row at all).
dynamic = (
    nibrs_df[nibrs_df["ORI"].isin(all_nodes)]
    # Derive the flag from the filtered rows themselves instead of relying on
    # index alignment with the unfiltered frame.
    .assign(y=lambda reports: reports["NUMBER OF MONTHS REPORTED"] > 0)
    .pivot_table(index="ORI", columns="year", values="y", aggfunc="first")
    # Missing ORI/year pairs come out of the pivot as NaN. Routing through the
    # nullable "boolean" dtype lets them be filled without the deprecated
    # silent object -> bool downcast that a bare .fillna(False) triggers.
    .astype("boolean")
    .fillna(False)
    .astype(bool)
)
dynamic

  .pivot_table(index="ORI", columns="year", values="y", aggfunc='first').fillna(False)


year,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
ORI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AK0010100,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
AK0010300,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
AK0011600,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
AK0015600,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
AL0010000,True,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WY0150100,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,True
WY0190000,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
WY0210000,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,True
WY0210100,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,True


In [5]:
# Split the yearly panel around 2015. Label slices are inclusive on both ends,
# so the 2015 column is present in BOTH windows.
pre_fund, post_fund = (
    dynamic.loc[:, slice(None, 2015)],
    dynamic.loc[:, slice(2015, None)],
)

In [6]:
# Align the attribute table with the yearly panel: keep only agencies present
# in both, indexed by ORI9 and sorted by identifier.
#
# Safe to re-run: on the first run ORI9 is still a column and is promoted to
# the index; on later runs it already is the index, so the frame is reused
# as-is instead of raising KeyError on the missing column.
agency_attrs = df.set_index("ORI9") if "ORI9" in df.columns else df
common_agencies = sorted(set(dynamic.index) & set(agency_attrs.index))
df = agency_attrs.loc[common_agencies]

In [7]:
# Sanity check: the attribute rows and the panel rows must list the same
# agencies in the same order.
df.index.tolist() == dynamic.index.tolist()

True

In [8]:
# Plan for the diffusion comparison in the following cells:
#   * pre_2015:  get_diffusion_diff_dynamic (realization Markov chain)
#   * post_2015: prediction Markov chain
#
# Build the similarity network over the agency attribute table, matching on
# the selected columns with the 0.8 cosine-similarity measure.
sm = simnet.similarity.SimilarityNetwork(
    match_columns=cols_to_include,
    similarity_measure=cosine_sim_8,
    df=df,
)


In [10]:

# W is whatever sm.fit_transform() returns; it is reused by the later
# prediction-diffusion call as well.
W = sm.fit_transform()

# np.float_ was removed in NumPy 2.0; np.float64 is the same dtype and works
# on both NumPy 1.x and 2.x.
# NOTE(review): the plan comment earlier mentions the pre-2015 window, but the
# full `dynamic` panel is passed here -- confirm this is intended.
# NOTE(review): the meaning of the 0.5 argument is defined by
# get_diffusion_diff_dynamic and is not visible in this notebook.
get_diffusion_diff_dynamic(W, dynamic.to_numpy(dtype=np.float64), 0.5)

array([ 47., 147.,  33., 148.,  38.,  79.,  69.,  89.,  70., 120.,  82.,
        95., 114.,  82.,  65.,  70.,  86.,  93.,  60.,  65.,  78.,  74.,
        63.,  73.,  78., 123., 165., 183., 274., 358.])

In [13]:
from diffusion.simple_diff import prediction_markov_diffusion_diff_dynamic

# Run the prediction variant on the 2015-onwards window.
# np.float_ was removed in NumPy 2.0; np.float64 is the same dtype and works
# on both NumPy 1.x and 2.x.
# NOTE(review): the meaning of the 0.5 argument is defined by the imported
# function and is not visible in this notebook.
prediction_markov_diffusion_diff_dynamic(W, post_fund.to_numpy(dtype=np.float64), 0.5)

array([   0.,  980., 1022., 1164., 1300., 1553., 1761.])