# Create baseline master list of UIDs

In [41]:
import pandas as pd
import string
import itertools
import numpy as np

## Generate all UIDs
- Absolute ID will be a 5-character string of lowercase letters and digits -> 60466176 IDs
- Leaving out the letters [l, b, o, g, q] due to their similarity to [1, 6, 0, 9, 9] -> 28629151 IDs
- Reserve ID spaces for technology providers
  - 10 _ _ _ -> 29791 IDs for 10X Genomics
  - vg _ _ _ -> 29791 IDs for Vizgen
  - ns _ _ _ -> 29791 IDs for Nanostring
  - xx _ _ _ -> 29791 IDs for miscellaneous, such as publication

In [86]:
def generate_all_possible_ids(
    size: int = 5,
    chars: str = string.ascii_lowercase + string.digits,
    exclude: str = "lbogq",
    prefix: str = "",
) -> list:
    """Generate every ID of ``size`` characters drawn from the allowed alphabet.

    The alphabet is ``chars`` with every character listed in ``exclude``
    removed. The result is the Cartesian product of that alphabet with itself
    ``size`` times (a character may appear more than once within an ID), not
    the set of permutations. IDs are returned in ``itertools.product`` order,
    i.e. the rightmost position varies fastest.

    Args:
        size: Number of generated characters per ID (the prefix is extra).
        chars: Candidate characters. Repeated characters are ignored after
            their first occurrence so that no ID is produced twice.
        exclude: Characters to drop from ``chars``.
        prefix: String prepended verbatim to every generated ID.

    Returns:
        A list of ``len(alphabet) ** size`` unique ID strings.
    """
    excluded_chars = set(exclude)
    # dict.fromkeys de-duplicates while keeping first-seen order; without it a
    # repeated character in ``chars`` would yield the same ID more than once.
    alphabet = [c for c in dict.fromkeys(chars) if c not in excluded_chars]

    return [prefix + "".join(combo) for combo in itertools.product(alphabet, repeat=size)]

# Reserved two-character prefixes paired with the source label stored for them.
prefix_source_map = [
    ("10", "10x"),
    ("vg", "vizgen"),
    ("ns", "nanostring"),
    ("xx", "misc"),
]

# One frame per reserved block: the fixed prefix followed by 3 generated characters.
uids_for_10x = pd.DataFrame(
    {"uid": generate_all_possible_ids(size=3, prefix="10")}
)
uids_for_vizgen = pd.DataFrame(
    {"uid": generate_all_possible_ids(size=3, prefix="vg")}
)
uids_for_nanostring = pd.DataFrame(
    {"uid": generate_all_possible_ids(size=3, prefix="ns")}
)
uids_for_misc = pd.DataFrame(
    {"uid": generate_all_possible_ids(size=3, prefix="xx")}
)

# Stack the blocks into one master table with a fresh 0..n-1 index.
reserved_blocks = [uids_for_10x, uids_for_vizgen, uids_for_nanostring, uids_for_misc]
uids = pd.concat(reserved_blocks, ignore_index=True)

# Label each row with its source based on the UID prefix; rows matching no
# known prefix are labelled "unknown".
conditions = tuple(
    uids["uid"].str.startswith(prefix) for prefix, _ in prefix_source_map
)
choices = tuple(source for _, source in prefix_source_map)
uids["source"] = np.select(conditions, choices, default="unknown")

# Empty placeholder column; no value is assigned in this cell.
uids["id"] = None

# uids.to_csv("../data/uid_master.csv", index=False)
uids

Unnamed: 0,uid,source,id
0,10aaa,10x,
1,10aac,10x,
2,10aad,10x,
3,10aae,10x,
4,10aaf,10x,
...,...,...,...
119159,xx995,misc,
119160,xx996,misc,
119161,xx997,misc,
119162,xx998,misc,
