In [None]:
# Load (or reload) IPython's autoreload extension and switch it to mode 2, so
# that edits saved to data_utils are picked up by the functions this notebook
# imports from it.
%reload_ext autoreload
%autoreload 2

In [None]:
from data_utils import read_csv_rows
from data_utils import columnar

# Relative location of the project-characteristics CSV.
DATA_DIRECTORY = "../../data"
DATA_FILE_PATH1 = f"{DATA_DIRECTORY}/radxup_project_characteristics.csv"

# Read the project characteristics, then rearrange them into a mapping from
# column name to that column's values (shapes as given by the annotations).
data_rows: list[dict[str, str]] = read_csv_rows(DATA_FILE_PATH1)
data_cols: dict[str, list[str]] = columnar(data_rows)

# Quick inspection: show the available column names, then the site numbers.
# NOTE(review): "\ufeff" is the Unicode byte-order mark. Its presence in this
# key suggests the CSV is read without BOM handling (e.g. "utf-8-sig") --
# TODO confirm in data_utils.read_csv_rows.
print(data_cols.keys())
print(data_cols["\ufeffsite_number"])

In [None]:
# Each name below is bound to one column of data_cols. data_cols is declared as
# dict[str, list[str]], so every column is a list of strings.

# Define demographic attributes of projects.
amer_indian: list[str] = data_cols["amer_indian"]
pacific_islander: list[str] = data_cols["pacific_islander"]
black: list[str] = data_cols["black"]
hisp_latinx: list[str] = data_cols["hisp_latinx"]
asian: list[str] = data_cols["asian"]
imms_refs: list[str] = data_cols["imms_refs"]
incarcerated: list[str] = data_cols["incarcerated"]
drug_users: list[str] = data_cols["drug_users"]
pregnant: list[str] = data_cols["pregnant"]
youth: list[str] = data_cols["youth"]
rural: list[str] = data_cols["rural"]
low_ses: list[str] = data_cols["low_ses"]
older: list[str] = data_cols["older"]
disabled: list[str] = data_cols["disabled"]
lgbtq: list[str] = data_cols["lgbtq"]

# Derived variable defined as the sum of "low_ses", "incarcerated", and "drug_users".
sum_marginalized: list[str] = data_cols["sum_marginalized"]
# Derived variable defined as the sum of "older" and "disabled".
sum_accessibility: list[str] = data_cols["sum_accessibility"]


# Define study design attributes of projects.
experimental: list[str] = data_cols["experimental"]
observational: list[str] = data_cols["observational"]
hybrid: list[str] = data_cols["hybrid"]
repository: list[str] = data_cols["repository"]


# Define temporal attributes of projects.
cross_sectional: list[str] = data_cols["cross_sectional"]
# NOTE(review): the key is spelled "longitudal" while the variable is
# "longitudinal" -- presumably this mirrors the CSV header; verify against the file.
longitudinal: list[str] = data_cols["longitudal"]
multi_temporal: list[str] = data_cols["multi_temporal"]


# Define geographic area attributes of projects.
northeast: list[str] = data_cols["northeast"]
midwest: list[str] = data_cols["midwest"]
south: list[str] = data_cols["south"]
west: list[str] = data_cols["west"]
territory: list[str] = data_cols["territory"]
multistate: list[str] = data_cols["multistate"]


# Define target numbers for project attributes.
t1: int = 1
t2: int = 2
t3: int = 3
t4: int = 4

In [1]:
# Count ALL possible sample sets: "69 choose 9" (order does not matter),
# evaluated as (69 * 68 * ... * 61) / 9!.
numerator = 1
for factor in range(61, 70):  # the nine factors 61..69
    numerator *= factor

denominator = 1
for factor in range(1, 10):  # 9! = 1 * 2 * ... * 9
    denominator *= factor

n = numerator / denominator  # true division, so n is a float
print(n)

56672074888.0


In [None]:
from data_utils import sample_initializer
from data_utils import sample_adder

# Set project attributes and targets for project count to construct possible sample sets.
demographs = [lgbtq, rural, experimental, pregnant, youth, amer_indian, imms_refs, sum_marginalized, disabled]
targets = [t1, t1, t1, t1, t1, t1, t1, t1, t1]

# Initialize list of possible sample sets.
sample_of_nine = sample_initializer(demographs[0])

# Complete list of possible sample sets.
for i in range(1,9):
    sample_of_nine = sample_adder(sample_of_nine, demographs[i])

len(sample_of_nine)

In [None]:
from data_utils import sample_checker

# Sequential filters used to narrow the sample sets. Each entry pairs a display
# name, the attribute column, and the target passed to sample_checker, so an
# attribute and its target cannot drift out of alignment.
filters: list[tuple[str, list[str], int]] = [
    ("black", black, t4),
    ("hisp_latinx", hisp_latinx, t4),
    ("pacific_islander", pacific_islander, t1),
    ("older", older, t2),
    ("repository", repository, t1),
    ("low_ses", low_ses, t2),
    ("drug_users", drug_users, t1),
    ("incarcerated", incarcerated, t1),
    ("longitudinal", longitudinal, t2),
    ("cross_sectional", cross_sectional, t2),
    ("south", south, t1),
    ("hybrid", hybrid, t1),
    ("amer_indian", amer_indian, t2),
    ("multi_temporal", multi_temporal, t2),
    ("youth", youth, t2),
    ("midwest", midwest, t1),
    ("west", west, t1),
    ("northeast", northeast, t1),
    ("multistate", multistate, t1),
    ("rural", rural, t2),
    ("south", south, t2),
    ("experimental", experimental, t2),
    ("observational", observational, t2),
    ("hybrid", hybrid, t2),
    ("disabled", disabled, t2),
]

# Parallel views of the table above, kept under their original names.
char: list[list[str]] = [attr_column for _, attr_column, _ in filters]
target: list[int] = [attr_target for _, _, attr_target in filters]

# Narrow sample sets in listed order and print intermediate sample size after
# each step. Stop as soon as at most one sample set remains, or when the
# filters run out (the unbounded loop would otherwise raise IndexError).
i: int = 0
while len(sample_of_nine) > 1 and i < len(filters):
    attr_name, attr_column, attr_target = filters[i]
    sample_of_nine = sample_checker(sample_of_nine, attr_column, attr_target)
    # Report the attribute's name; interpolating the column itself would dump
    # every value of that column into each progress line.
    print(f"Filter{i} ({attr_name} >= {attr_target}): {len(sample_of_nine)}")
    i += 1

if len(sample_of_nine) > 1:
    print(f"All {len(filters)} filters applied; {len(sample_of_nine)} sample sets remain.")

In [None]:
# Return project numbers of final sample set. Each entry of a sample set is
# used as a position into the "project_number" column; the numbers for every
# remaining sample set are collected into one flat list, in order.
# The values come from data_cols (dict[str, list[str]]), so they are strings.
true_list: list[str] = [
    data_cols["project_number"][row_index]
    for sample_set in sample_of_nine
    for row_index in sample_set
]

print(true_list)