In [163]:
import pandas as pd
import random
import string
from itertools import combinations

pd.set_option('display.max_rows', None)  
pd.set_option('display.max_columns', None)  
pd.set_option('display.max_colwidth', None) 

In [223]:
random.seed(42)
def generate_primary_id(prefix):
    number = ''.join(random.choices(string.digits, k=6))
    letter = random.choice(['C', 'N', 'E', 'S'])
    return prefix + number + letter

def generate_unique_primary_ids(count):
    unique_ids = [generate_primary_id('A') for _ in range(count//2)] + [generate_primary_id('E') for _ in range(count//2)]
    random.shuffle(unique_ids) 
    return unique_ids

def generate_unique_pairs(unique_ids, pair_count):
    all_possible_pairs = list(combinations(unique_ids, 2))
    random.shuffle(all_possible_pairs) 
    return all_possible_pairs[:pair_count]

def generate_npi(num_npi, total):
    npis = [''.join(random.choices(string.digits, k=10)) for _ in range(num_npi)]
    npis.extend([''] * (total - num_npi)) 
    random.shuffle(npis) 
    return npis
    
def generate_name():
    person_names = ['John Doe', 'Jane Smith', 'Emily Jones', 'Michael Brown', 'Jessica Davis', 'Daniel Garcia']
    company_names = ['Acme Corp', 'Globex LLC', 'Soylent Co', 'Initech', 'Umbrella Inc', 'Vandelay Industries']
    return random.choice(person_names + company_names)

judicial_districts = ["CAC", "NYE", "TXS", "FLM", "ILN", "PAE", "TXW", "CAS", "NYN", "FLS"]

addresses = [
    '123 Main St, Anytown, 48392', '456 Elm St, Smallville, 76128', '789 Maple Ave, Bigcity, 52486',
    '101 Oak St, Littletown, 93765', '202 Pine St, Yourtown, 21058'
]

facility_type = ['Skilled Nursing Facility', 'Short-term Care', 'Long-term Care', 'Home Health Agencies', 'Hospice']

num_observations = 8 #always an even number
num_npi = num_facility_type = 3

unique_ids = generate_unique_primary_ids(num_observations)

unique_pairs = generate_unique_pairs(unique_ids, 8)

nodes_data = {
    'primary_id': unique_ids,
    'entity_type': [random.randint(1, 4) if id.startswith('E') else '' for id in unique_ids],
    'npi': generate_npi(num_npi, num_observations),
    'name': [generate_name() for _ in range(num_observations)],
    'jd': [random.choice(judicial_districts) for _ in range(num_observations)],
    'address': [random.choice(addresses) for _ in range(num_observations)],
    'facility_type': [random.choice(facility_type) for _ in range(num_observations)]
}

nodes_df = pd.DataFrame(nodes_data)

display(nodes_df)

def create_links_df(a_prefixed_ids, e_prefixed_ids, num_rows):
    # Initialize a set to keep track of unique pairs
    unique_pairs = set()
    
    # Initialize the dictionary to hold link data
    links_data = {
        'entity_id_1': [],
        'entity_id_2': [],
        'role_type': []
    }
    
    # Generate unique pairs until we have the desired number of rows or we exhaust all possible unique pairs
    while len(links_data['entity_id_1']) < num_rows and len(unique_pairs) < len(a_prefixed_ids) * len(e_prefixed_ids):
        # Randomly pick one 'A' prefixed and one 'E' prefixed ID
        a_id = random.choice(a_prefixed_ids)
        e_id = random.choice(e_prefixed_ids)
        
        # Check if this is a new unique pair
        if (a_id, e_id) not in unique_pairs:
            # Add the new unique pair to the tracking set and to the links data
            unique_pairs.add((a_id, e_id))
            links_data['entity_id_1'].append(a_id)
            links_data['entity_id_2'].append(e_id)
            links_data['role_type'].append(random.randint(1, 10))
    
    # Create the DataFrame from the links data
    links_df = pd.DataFrame(links_data)
    
    # Shuffle the DataFrame
    links_df = links_df.sample(frac=1).reset_index(drop=True)
    
    return links_df

num_rows_desired = 16
links_df = create_links_df(a_prefixed_ids, e_prefixed_ids, num_rows_desired)

display(links_df)


Unnamed: 0,primary_id,entity_type,npi,name,jd,address,facility_type
0,E798834E,4.0,,Globex LLC,FLS,"101 Oak St, Littletown, 93765",Hospice
1,A602276C,,,John Doe,NYE,"202 Pine St, Yourtown, 21058",Short-term Care
2,E687593E,3.0,,Jane Smith,NYE,"456 Elm St, Smallville, 76128",Long-term Care
3,A258086E,,5001674039.0,Daniel Garcia,CAS,"789 Maple Ave, Bigcity, 52486",Home Health Agencies
4,E868570N,2.0,,Jessica Davis,NYE,"202 Pine St, Yourtown, 21058",Long-term Care
5,E136760N,2.0,4498251982.0,Michael Brown,NYN,"202 Pine St, Yourtown, 21058",Home Health Agencies
6,A500265S,,5980765261.0,John Doe,TXS,"101 Oak St, Littletown, 93765",Hospice
7,A227133E,,,Michael Brown,TXS,"456 Elm St, Smallville, 76128",Home Health Agencies


Unnamed: 0,entity_id_1,entity_id_2,role_type
0,A500265S,E687593E,1
1,A602276C,E687593E,9
2,A500265S,E227133E,1
3,A602276C,E227133E,7
4,A500265S,E868570N,4
5,A602276C,E868570N,2
6,A258086E,E868570N,1
7,A258086E,E687593E,4
8,A258086E,E227133E,4


In [43]:
#FROM DATA PROCESSING 
#primary_id :a
#entity_type: b
#npi:c
#name:d
#judicial_district:f
#address:g
#n_owners_2nd_degree:m
#ccn_links:n
#ccn_type_cd:o

#ENTITY_TYPE
#1: Associate
#2: Part A Provider
#3: DME Supplier
#4: Part B Supplier



In [218]:
num_rows_links_df = 10
links_df = create_links_df(a_prefixed_ids, e_prefixed_ids, num_rows_links_df)
display(links_df)

first_degree_df = links_df.groupby('entity_id_1')['entity_id_2'].unique().reset_index()
first_degree_df.columns = ['entity_id_1', 'first_degree']
first_degree_df['first_degree'] = first_degree_df['first_degree'].apply(list)
display(first_degree_df)


entity_to_associate_df = links_df.groupby('entity_id_2')['entity_id_1'].apply(list).reset_index()
entity_to_associate_df.rename(columns={'entity_id_2': 'entities','entity_id_1': 'agg_associates'}, inplace=True)
display(entity_to_associate_df)

Unnamed: 0,entity_id_1,entity_id_2,role_type
0,A602276C,E687593E,8
1,A258086E,E687593E,1
2,A500265S,E227133E,2
3,A500265S,E868570N,2
4,A258086E,E227133E,4
5,A258086E,E868570N,10
6,A602276C,E227133E,4
7,A602276C,E868570N,2
8,A500265S,E687593E,3


Unnamed: 0,entity_id_1,first_degree
0,A258086E,"[E687593E, E227133E, E868570N]"
1,A500265S,"[E227133E, E868570N, E687593E]"
2,A602276C,"[E687593E, E227133E, E868570N]"


Unnamed: 0,entities,agg_associates
0,E227133E,"[A500265S, A258086E, A602276C]"
1,E687593E,"[A602276C, A258086E, A500265S]"
2,E868570N,"[A500265S, A258086E, A602276C]"
