# k-anonymity test

## Import packages

In [34]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr
import matplotlib.pyplot as plt
import seaborn as sns

## Zip dataset

In [2]:
santa_clara_zip_data_file = "/home/yj/privacy/COVID-19_cases_by_zip_code_of_residence.csv"
santa_clara_df = pd.read_csv(santa_clara_zip_data_file)

In [3]:
santa_clara_df.head()

Unnamed: 0,zipcode,Cases,Population,Rate
0,94022,1307,19378,6745
1,94024,1475,23961,6156
2,94040,3272,35845,9128
3,94041,1352,14394,9393
4,94043,2690,31488,8543


In [5]:
santa_clara_df.sort_values(by="Cases", ascending=False)

Unnamed: 0,zipcode,Cases,Population,Rate
43,95127,14065,65686,21412
38,95122,13923,57780,24097
29,95111,13822,62392,22153
17,95020,13553,63852,21226
32,95116,12446,56481,22036
39,95123,10995,67186,16365
30,95112,10624,61060,17399
20,95035,8962,77562,11555
21,95037,8199,51652,15874
41,95125,7702,53574,14376


In [8]:
covariance = np.cov(santa_clara_df["Population"], santa_clara_df["Cases"])
covariance

array([[4.06505936e+08, 6.38740088e+07],
       [6.38740088e+07, 1.50080138e+07]])

In [30]:
# significant, correlated
correlation_pop_case, p_value_pop_case = pearsonr(santa_clara_df["Population"], santa_clara_df["Cases"])
correlation_pop_case, p_value_pop_case

(0.8177661290076034, 8.280425334471683e-15)

In [31]:
# not significant, not correlated
correlation_pop_rate, p_value_pop_rate = pearsonr(santa_clara_df["Population"], santa_clara_df["Rate"])
correlation_pop_rate, p_value_pop_rate

(0.10665739624813109, 0.4297251563826881)

In [13]:
santa_clara_zip_over_time_data_file = "/home/yj/privacy/COVID-19_cases__tests_and_positivity_rate_over_time_by_zip_code.csv"
santa_clara_over_time_df = pd.read_csv(santa_clara_zip_over_time_data_file)

In [14]:
santa_clara_over_time_df.head()

Unnamed: 0,zcta,time_period,population,Start date,end date,case count,test count,positive tests,case rate,test rate,positivity rate
0,95125,50,53155,10/24/2021,11/13/2021,125.0,8285.0,157.0,11.2,742.2,1.9
1,94086,38,50477,02/14/2021,03/06/2021,74.0,5457.0,87.0,7.0,514.8,1.6
2,95120,53,38122,12/26/2021,01/15/2022,1352.0,11487.0,1491.0,168.9,1434.9,13.0
3,95037,46,51994,08/01/2021,08/21/2021,364.0,6657.0,410.0,33.3,609.7,6.2
4,95008,49,46352,10/03/2021,10/23/2021,60.0,6873.0,69.0,6.2,706.1,1.0


In [17]:
# data cleaning
santa_clara_over_time_df = santa_clara_over_time_df.fillna(0)

In [18]:
correlation_case_test, _ = pearsonr(santa_clara_over_time_df["case count"], santa_clara_over_time_df["test count"])
correlation_case_test

0.6131917644022398

In [22]:
len(set(santa_clara_df["zipcode"])), len(santa_clara_df["zipcode"])

(57, 57)

In [24]:
# merge two data
combined_santa_clara_df = santa_clara_over_time_df.merge(santa_clara_df, how="inner", left_on="zcta", right_on="zipcode")

In [25]:
combined_santa_clara_df.head()

Unnamed: 0,zcta,time_period,population,Start date,end date,case count,test count,positive tests,case rate,test rate,positivity rate,zipcode,Cases,Population,Rate
0,95125,50,53155,10/24/2021,11/13/2021,125.0,8285.0,157.0,11.2,742.2,1.9,95125,7702,53574,14376
1,95125,48,53155,09/12/2021,10/02/2021,119.0,8929.0,154.0,10.7,799.9,1.7,95125,7702,53574,14376
2,95125,41,53155,04/18/2021,05/08/2021,32.0,6563.0,48.0,2.9,587.9,0.7,95125,7702,53574,14376
3,95125,37,53155,01/24/2021,02/13/2021,266.0,7780.0,323.0,23.8,697.0,4.2,95125,7702,53574,14376
4,95125,42,53155,05/09/2021,05/29/2021,17.0,5262.0,32.0,1.5,471.4,0.6,95125,7702,53574,14376


In [None]:
# why population not matched

In [27]:
# Case count over time from Nov 2020 while aggregated case from July 2020
aggregated_santa_clara_df = santa_clara_over_time_df[["zcta", "case count"]].groupby("zcta").agg({"case count": "sum"})
aggregated_combined_santa_clara_df = aggregated_santa_clara_df.merge(santa_clara_df, how="inner", left_on="zcta", right_on="zipcode")

In [28]:
aggregated_combined_santa_clara_df.head()

Unnamed: 0,case count,zipcode,Cases,Population,Rate
0,1243.0,94022,1307,19378,6745
1,1398.0,94024,1475,23961,6156
2,3228.0,94040,3272,35845,9128
3,1270.0,94041,1352,14394,9393
4,2656.0,94043,2690,31488,8543


In [29]:
# data collected seaparately?
aggregated_combined_santa_clara_df[aggregated_combined_santa_clara_df["case count"] >= aggregated_combined_santa_clara_df["Cases"]]

Unnamed: 0,case count,zipcode,Cases,Population,Rate
15,0.0,95013,0,77,0
56,7716.0,95148,0,48854,0


In [33]:
# not significant, not correlated
correlation_tc_rate, p_value_tc_rate = pearsonr(combined_santa_clara_df["test count"], combined_santa_clara_df["Rate"])
correlation_tc_rate, p_value_tc_rate

(3.3044560629477227e-05, 0.9988207410807168)

In [38]:
# significant, not correlated
correlation_zip_case, p_value_zip_case = spearmanr(combined_santa_clara_df["zipcode"], combined_santa_clara_df["Cases"], axis=0, nan_policy='propagate', alternative='two-sided')
correlation_zip_case, p_value_zip_case

(0.18823926190119483, 1.9721701379949348e-17)

In [39]:
# significant, not correlated
correlation_zip_caser, p_value_zip_caser = spearmanr(combined_santa_clara_df["zipcode"], combined_santa_clara_df["case rate"], axis=0, nan_policy='propagate', alternative='two-sided')
correlation_zip_caser, p_value_zip_caser

(0.08780836589749255, 8.319570398944456e-05)

In [40]:
# not significant, not correlated
correlation_zip_testc, p_value_zip_testc = spearmanr(combined_santa_clara_df["zipcode"], combined_santa_clara_df["test count"], axis=0, nan_policy='propagate', alternative='two-sided')
correlation_zip_testc, p_value_zip_testc

(-0.027755899904777552, 0.2143561444204144)

In [41]:
# significant, correlated
correlation_case_testc, p_valu_case_testc = spearmanr(combined_santa_clara_df["Cases"], combined_santa_clara_df["test count"], axis=0, nan_policy='propagate', alternative='two-sided')
correlation_case_testc, p_valu_case_testc

(0.44940124575144974, 3.738023606329694e-100)

In [42]:
# significant, correlated
correlation_caser_testr, p_valu_caser_testr = spearmanr(combined_santa_clara_df["case rate"], combined_santa_clara_df["test rate"], axis=0, nan_policy='propagate', alternative='two-sided')
correlation_caser_testr, p_valu_caser_testr

(0.6744557429588066, 5.942960671915649e-266)

In [None]:
# area sqm^2 for population

## SF characteristics dataset

There's no ratio as to the total population. Difficult to know the absolute number indicates better or worse.

In [7]:
sf_data_file = "/home/yj/privacy/COVID-19_Deaths_by_Population_Characteristics_Over_Time.csv"
sf_df = pd.read_csv(sf_data_file)

quasi_identifiers = ["Characteristic Group"]

### k = 2

In [32]:
k = 2

sample_df = sf_df.groupby(quasi_identifiers).apply(lambda x: x.sample(k, replace=False)).reset_index(drop=True)

In [16]:
set(sample_df["Characteristic Type"])

{'Age Group',
 'Comorbidities',
 'Gender',
 'Homelessness',
 'Race/Ethnicity',
 'Sexual Orientation',
 'Single Room Occupancy Tenancy',
 'Skilled Nursing Facility Occupancy',
 'Transmission Type'}

In [34]:
sample_df[sample_df["Characteristic Type"]=="Age Group"]

Unnamed: 0,Date of Death,Characteristic Type,Characteristic Group,Characteristic Group Sort Order,New Deaths,Cumulative Deaths,Data As Of,Data Loaded At
0,05/27/2021 12:00:00 AM,Age Group,0-4,1,0,0,01/30/2022 04:30:02 AM,01/30/2022 07:01:11 AM
1,11/21/2020 12:00:00 AM,Age Group,0-4,1,0,0,01/30/2022 04:30:02 AM,01/30/2022 07:01:11 AM
2,09/26/2021 12:00:00 AM,Age Group,12-17,3,0,0,01/30/2022 04:30:02 AM,01/30/2022 07:00:53 AM
3,04/22/2020 12:00:00 AM,Age Group,12-17,3,0,0,01/30/2022 04:30:02 AM,01/30/2022 07:00:51 AM
4,06/10/2020 12:00:00 AM,Age Group,18-20,4,0,0,01/30/2022 04:30:02 AM,01/30/2022 07:00:51 AM
5,11/30/2020 12:00:00 AM,Age Group,18-20,4,0,0,01/30/2022 04:30:02 AM,01/30/2022 07:00:52 AM
6,05/28/2020 12:00:00 AM,Age Group,21-24,5,0,0,01/30/2022 04:30:02 AM,01/30/2022 07:00:53 AM
7,09/08/2020 12:00:00 AM,Age Group,21-24,5,0,2,01/30/2022 04:30:02 AM,01/30/2022 07:00:53 AM
8,04/16/2021 12:00:00 AM,Age Group,25-29,6,0,0,01/30/2022 04:30:02 AM,01/30/2022 07:00:54 AM
9,12/27/2021 12:00:00 AM,Age Group,25-29,6,0,0,01/30/2022 04:30:02 AM,01/30/2022 07:00:55 AM


In [35]:
sample_df[sample_df["Characteristic Type"]=="Comorbidities"]

Unnamed: 0,Date of Death,Characteristic Type,Characteristic Group,Characteristic Group Sort Order,New Deaths,Cumulative Deaths,Data As Of,Data Loaded At
54,01/14/2021 12:00:00 AM,Comorbidities,,1,0,10,01/30/2022 04:30:02 AM,01/30/2022 07:01:00 AM
55,07/31/2021 12:00:00 AM,Comorbidities,,1,0,19,01/30/2022 04:30:02 AM,01/30/2022 07:01:01 AM
62,04/22/2021 12:00:00 AM,Comorbidities,One or More,2,0,353,01/30/2022 04:30:02 AM,01/30/2022 07:01:21 AM
63,08/08/2021 12:00:00 AM,Comorbidities,One or More,2,0,372,01/30/2022 04:30:02 AM,01/30/2022 07:01:21 AM
74,08/31/2021 12:00:00 AM,Comorbidities,Unknown,3,0,205,01/30/2022 04:30:02 AM,01/30/2022 07:00:59 AM


In [36]:
sample_df[sample_df["Characteristic Type"]=="Gender"]

Unnamed: 0,Date of Death,Characteristic Type,Characteristic Group,Characteristic Group Sort Order,New Deaths,Cumulative Deaths,Data As Of,Data Loaded At
34,08/08/2020 12:00:00 AM,Gender,Female,1,0,26,01/30/2022 04:30:02 AM,01/30/2022 07:01:02 AM
35,02/23/2021 12:00:00 AM,Gender,Female,1,0,202,01/30/2022 04:30:02 AM,01/30/2022 07:01:02 AM
46,05/13/2020 12:00:00 AM,Gender,Male,2,0,29,01/30/2022 04:30:02 AM,01/30/2022 07:01:00 AM
47,12/18/2021 12:00:00 AM,Gender,Male,2,0,412,01/30/2022 04:30:02 AM,01/30/2022 07:01:01 AM
64,10/21/2020 12:00:00 AM,Gender,Other,5,0,0,01/30/2022 04:30:02 AM,01/30/2022 07:01:02 AM
70,07/21/2021 12:00:00 AM,Gender,Trans Female,3,0,0,01/30/2022 04:30:02 AM,01/30/2022 07:01:04 AM
71,07/01/2021 12:00:00 AM,Gender,Trans Female,3,0,0,01/30/2022 04:30:02 AM,01/30/2022 07:01:04 AM
72,08/30/2021 12:00:00 AM,Gender,Trans Male,4,0,0,01/30/2022 04:30:02 AM,01/30/2022 07:01:22 AM
73,09/02/2021 12:00:00 AM,Gender,Trans Male,4,0,0,01/30/2022 04:30:02 AM,01/30/2022 07:01:22 AM


In [37]:
sample_df[sample_df["Characteristic Type"]=="Homelessness"]

Unnamed: 0,Date of Death,Characteristic Type,Characteristic Group,Characteristic Group Sort Order,New Deaths,Cumulative Deaths,Data As Of,Data Loaded At
44,01/31/2021 12:00:00 AM,Homelessness,Homeless,1,0,2,01/30/2022 04:30:02 AM,01/30/2022 07:01:06 AM
45,01/17/2021 12:00:00 AM,Homelessness,Homeless,1,0,2,01/30/2022 04:30:02 AM,01/30/2022 07:01:06 AM
56,10/13/2021 12:00:00 AM,Homelessness,Not Homeless,2,0,654,01/30/2022 04:30:02 AM,01/30/2022 07:01:24 AM
57,12/13/2021 12:00:00 AM,Homelessness,Not Homeless,2,1,679,01/30/2022 04:30:02 AM,01/30/2022 07:01:24 AM


In [38]:
sample_df[sample_df["Characteristic Type"]=="Race/Ethnicity"]

Unnamed: 0,Date of Death,Characteristic Type,Characteristic Group,Characteristic Group Sort Order,New Deaths,Cumulative Deaths,Data As Of,Data Loaded At
24,05/27/2020 12:00:00 AM,Race/Ethnicity,Asian,1,0,24,01/30/2022 04:30:02 AM,01/30/2022 07:01:07 AM
25,01/15/2021 12:00:00 AM,Race/Ethnicity,Asian,1,1,125,01/30/2022 04:30:02 AM,01/30/2022 07:01:08 AM
28,07/01/2020 12:00:00 AM,Race/Ethnicity,Black or African American,2,0,5,01/30/2022 04:30:02 AM,01/30/2022 07:01:09 AM
29,04/19/2021 12:00:00 AM,Race/Ethnicity,Black or African American,2,0,46,01/30/2022 04:30:02 AM,01/30/2022 07:01:09 AM
42,11/04/2020 12:00:00 AM,Race/Ethnicity,"Hispanic or Latino/a, all races",3,0,38,01/30/2022 04:30:02 AM,01/30/2022 07:01:12 AM
43,07/07/2021 12:00:00 AM,Race/Ethnicity,"Hispanic or Latino/a, all races",3,0,120,01/30/2022 04:30:02 AM,01/30/2022 07:01:12 AM
48,03/15/2021 12:00:00 AM,Race/Ethnicity,Multi-racial,7,0,3,01/30/2022 04:30:02 AM,01/30/2022 07:01:13 AM
49,02/19/2021 12:00:00 AM,Race/Ethnicity,Multi-racial,7,0,3,01/30/2022 04:30:02 AM,01/30/2022 07:01:13 AM
50,01/22/2021 12:00:00 AM,Race/Ethnicity,Native American,4,0,0,01/30/2022 04:30:02 AM,01/30/2022 07:01:14 AM
51,12/03/2021 12:00:00 AM,Race/Ethnicity,Native American,4,0,0,01/30/2022 04:30:02 AM,01/30/2022 07:01:14 AM


In [39]:
sample_df[sample_df["Characteristic Type"]=="Sexual Orientation"]

Unnamed: 0,Date of Death,Characteristic Type,Characteristic Group,Characteristic Group Sort Order,New Deaths,Cumulative Deaths,Data As Of,Data Loaded At
26,05/01/2021 12:00:00 AM,Sexual Orientation,Bisexual,1,0,0,01/30/2022 04:30:02 AM,01/30/2022 07:01:26 AM
27,12/21/2020 12:00:00 AM,Sexual Orientation,Bisexual,1,0,0,01/30/2022 04:30:02 AM,01/30/2022 07:01:26 AM
32,11/09/2021 12:00:00 AM,Sexual Orientation,Declined,4,0,5,01/30/2022 04:30:02 AM,01/30/2022 07:01:27 AM
33,08/27/2021 12:00:00 AM,Sexual Orientation,Declined,4,0,5,01/30/2022 04:30:02 AM,01/30/2022 07:01:27 AM
38,11/11/2020 12:00:00 AM,Sexual Orientation,Gay or Lesbian,2,0,1,01/30/2022 04:30:02 AM,01/30/2022 07:01:04 AM
39,01/05/2021 12:00:00 AM,Sexual Orientation,Gay or Lesbian,2,0,1,01/30/2022 04:30:02 AM,01/30/2022 07:01:05 AM
40,01/22/2022 12:00:00 AM,Sexual Orientation,Heterosexual,3,0,107,01/30/2022 04:30:02 AM,01/30/2022 07:01:07 AM
41,12/20/2021 12:00:00 AM,Sexual Orientation,Heterosexual,3,0,107,01/30/2022 04:30:02 AM,01/30/2022 07:01:07 AM
76,09/21/2021 12:00:00 AM,Sexual Orientation,Unsure,7,0,0,01/30/2022 04:30:02 AM,01/30/2022 07:01:16 AM
77,07/01/2021 12:00:00 AM,Sexual Orientation,Unsure,7,0,0,01/30/2022 04:30:02 AM,01/30/2022 07:01:16 AM


In [40]:
sample_df[sample_df["Characteristic Type"]=="Single Room Occupancy Tenancy"]

Unnamed: 0,Date of Death,Characteristic Type,Characteristic Group,Characteristic Group Sort Order,New Deaths,Cumulative Deaths,Data As Of,Data Loaded At
58,07/29/2021 12:00:00 AM,Single Room Occupancy Tenancy,Not a Single Room Occupancy Tenant,2,0,550,01/30/2022 04:30:02 AM,01/30/2022 07:01:09 AM
59,04/24/2020 12:00:00 AM,Single Room Occupancy Tenancy,Not a Single Room Occupancy Tenant,2,0,26,01/30/2022 04:30:02 AM,01/30/2022 07:01:07 AM
66,08/25/2020 12:00:00 AM,Single Room Occupancy Tenancy,Single Room Occupancy Tenant,1,0,4,01/30/2022 04:30:02 AM,01/30/2022 07:01:17 AM
67,11/29/2021 12:00:00 AM,Single Room Occupancy Tenancy,Single Room Occupancy Tenant,1,0,42,01/30/2022 04:30:02 AM,01/30/2022 07:01:17 AM


In [41]:
sample_df[sample_df["Characteristic Type"]=="Skilled Nursing Facility Occupancy"]

Unnamed: 0,Date of Death,Characteristic Type,Characteristic Group,Characteristic Group Sort Order,New Deaths,Cumulative Deaths,Data As Of,Data Loaded At
60,06/02/2020 12:00:00 AM,Skilled Nursing Facility Occupancy,Not a Skilled Nursing Facility Occupant,2,0,31,01/30/2022 04:30:02 AM,01/30/2022 07:01:29 AM
61,09/04/2021 12:00:00 AM,Skilled Nursing Facility Occupancy,Not a Skilled Nursing Facility Occupant,2,2,490,01/30/2022 04:30:02 AM,01/30/2022 07:01:30 AM
68,07/09/2020 12:00:00 AM,Skilled Nursing Facility Occupancy,Skilled Nursing Facility Occupant,1,0,18,01/30/2022 04:30:02 AM,01/30/2022 07:01:18 AM
69,11/10/2021 12:00:00 AM,Skilled Nursing Facility Occupancy,Skilled Nursing Facility Occupant,1,0,130,01/30/2022 04:30:02 AM,01/30/2022 07:01:18 AM


In [42]:
sample_df[sample_df["Characteristic Type"]=="Transmission Type"]

Unnamed: 0,Date of Death,Characteristic Type,Characteristic Group,Characteristic Group Sort Order,New Deaths,Cumulative Deaths,Data As Of,Data Loaded At
30,11/16/2020 12:00:00 AM,Transmission Type,Community,1,0,91,01/30/2022 04:30:02 AM,01/30/2022 07:01:19 AM
31,04/22/2020 12:00:00 AM,Transmission Type,Community,1,1,17,01/30/2022 04:30:02 AM,01/30/2022 07:01:18 AM
36,10/23/2021 12:00:00 AM,Transmission Type,From Contact,2,0,123,01/30/2022 04:30:02 AM,01/30/2022 07:01:10 AM
37,01/17/2022 12:00:00 AM,Transmission Type,From Contact,2,0,127,01/30/2022 04:30:02 AM,01/30/2022 07:01:11 AM


In [43]:
sample_df.groupby(["Characteristic Type", "Characteristic Group"]).agg({'Cumulative Deaths': ['mean']})

Unnamed: 0_level_0,Unnamed: 1_level_0,Cumulative Deaths
Unnamed: 0_level_1,Unnamed: 1_level_1,mean
Characteristic Type,Characteristic Group,Unnamed: 2_level_2
Age Group,0-4,0.0
Age Group,12-17,0.0
Age Group,18-20,0.0
Age Group,21-24,1.0
Age Group,25-29,0.0
Age Group,30-39,6.5
Age Group,40-49,5.5
Age Group,5-11,0.0
Age Group,50-59,33.0
Age Group,60-69,47.0


### k = 10

In [26]:
k = 10

sample_df = sf_df.groupby(quasi_identifiers).apply(lambda x: x.sample(k, replace=False)).reset_index(drop=True)

In [27]:
sample_df[sample_df["Characteristic Type"]=="Gender"]

Unnamed: 0,Date of Death,Characteristic Type,Characteristic Group,Characteristic Group Sort Order,New Deaths,Cumulative Deaths,Data As Of,Data Loaded At
170,05/23/2021 12:00:00 AM,Gender,Female,1,0,218,01/30/2022 04:30:02 AM,01/30/2022 07:01:02 AM
171,08/12/2020 12:00:00 AM,Gender,Female,1,0,26,01/30/2022 04:30:02 AM,01/30/2022 07:01:02 AM
172,11/04/2021 12:00:00 AM,Gender,Female,1,0,264,01/30/2022 04:30:02 AM,01/30/2022 07:01:03 AM
173,01/01/2022 12:00:00 AM,Gender,Female,1,1,275,01/30/2022 04:30:02 AM,01/30/2022 07:01:03 AM
174,10/28/2020 12:00:00 AM,Gender,Female,1,0,52,01/30/2022 04:30:02 AM,01/30/2022 07:01:02 AM
175,09/11/2021 12:00:00 AM,Gender,Female,1,3,257,01/30/2022 04:30:02 AM,01/30/2022 07:01:03 AM
176,05/27/2020 12:00:00 AM,Gender,Female,1,0,14,01/30/2022 04:30:02 AM,01/30/2022 07:01:02 AM
177,02/16/2021 12:00:00 AM,Gender,Female,1,2,196,01/30/2022 04:30:02 AM,01/30/2022 07:01:02 AM
178,05/03/2020 12:00:00 AM,Gender,Female,1,0,13,01/30/2022 04:30:02 AM,01/30/2022 07:01:02 AM
179,07/11/2021 12:00:00 AM,Gender,Female,1,0,223,01/30/2022 04:30:02 AM,01/30/2022 07:01:02 AM


In [31]:
sample_df.groupby(["Characteristic Type", "Characteristic Group"]).agg({'Cumulative Deaths': ['mean']})

Unnamed: 0_level_0,Unnamed: 1_level_0,Cumulative Deaths
Unnamed: 0_level_1,Unnamed: 1_level_1,mean
Characteristic Type,Characteristic Group,Unnamed: 2_level_2
Age Group,0-4,0.0
Age Group,12-17,0.0
Age Group,18-20,0.0
Age Group,21-24,1.8
Age Group,25-29,0.0
Age Group,30-39,6.5
Age Group,40-49,15.5
Age Group,5-11,0.0
Age Group,50-59,17.6
Age Group,60-69,34.9


END