In [163]:
import pandas as pd
import random
import string
from itertools import combinations

pd.set_option('display.max_rows', None)  
pd.set_option('display.max_columns', None)  
pd.set_option('display.max_colwidth', None) 

In [226]:
random.seed(42)
def generate_primary_id(prefix):
    number = ''.join(random.choices(string.digits, k=6))
    letter = random.choice(['C', 'N', 'E', 'S'])
    return prefix + number + letter

def generate_unique_primary_ids(count):
    unique_ids = [generate_primary_id('A') for _ in range(count//2)] + [generate_primary_id('E') for _ in range(count//2)]
    random.shuffle(unique_ids) 
    return unique_ids

def generate_unique_pairs(unique_ids, pair_count):
    all_possible_pairs = list(combinations(unique_ids, 2))
    random.shuffle(all_possible_pairs) 
    return all_possible_pairs[:pair_count]

def generate_npi(num_npi, total):
    npis = [''.join(random.choices(string.digits, k=10)) for _ in range(num_npi)]
    npis.extend([''] * (total - num_npi)) 
    random.shuffle(npis) 
    return npis
    
def generate_name():
    person_names = ['John Doe', 'Jane Smith', 'Emily Jones', 'Michael Brown', 'Jessica Davis', 'Daniel Garcia']
    company_names = ['Acme Corp', 'Globex LLC', 'Soylent Co', 'Initech', 'Umbrella Inc', 'Vandelay Industries']
    return random.choice(person_names + company_names)

judicial_districts = ["CAC", "NYE", "TXS", "FLM", "ILN", "PAE", "TXW", "CAS", "NYN", "FLS"]

addresses = [
    '123 Main St, Anytown, 48392', '456 Elm St, Smallville, 76128', '789 Maple Ave, Bigcity, 52486',
    '101 Oak St, Littletown, 93765', '202 Pine St, Yourtown, 21058'
]

facility_type = ['Skilled Nursing Facility', 'Short-term Care', 'Long-term Care', 'Home Health Agencies', 'Hospice']

num_observations = 8 #always an even number
num_npi = num_facility_type = 3

unique_ids = generate_unique_primary_ids(num_observations)

unique_pairs = generate_unique_pairs(unique_ids, 8)

nodes_data = {
    'primary_id': unique_ids,
    'entity_type': [random.randint(1, 4) if id.startswith('E') else '' for id in unique_ids],
    'npi': generate_npi(num_npi, num_observations),
    'name': [generate_name() for _ in range(num_observations)],
    'jd': [random.choice(judicial_districts) for _ in range(num_observations)],
    'address': [random.choice(addresses) for _ in range(num_observations)],
    'facility_type': [random.choice(facility_type) for _ in range(num_observations)]
}

nodes_df = pd.DataFrame(nodes_data)

display(nodes_df)


Unnamed: 0,primary_id,entity_type,npi,name,jd,address,facility_type
0,E798834E,4.0,,Globex LLC,FLS,"101 Oak St, Littletown, 93765",Hospice
1,A602276C,,,John Doe,NYE,"202 Pine St, Yourtown, 21058",Short-term Care
2,E687593E,3.0,,Jane Smith,NYE,"456 Elm St, Smallville, 76128",Long-term Care
3,A258086E,,5001674039.0,Daniel Garcia,CAS,"789 Maple Ave, Bigcity, 52486",Home Health Agencies
4,E868570N,2.0,,Jessica Davis,NYE,"202 Pine St, Yourtown, 21058",Long-term Care
5,E136760N,2.0,4498251982.0,Michael Brown,NYN,"202 Pine St, Yourtown, 21058",Home Health Agencies
6,A500265S,,5980765261.0,John Doe,TXS,"101 Oak St, Littletown, 93765",Hospice
7,A227133E,,,Michael Brown,TXS,"456 Elm St, Smallville, 76128",Home Health Agencies


In [232]:
def create_links_df(a_prefixed_ids, e_prefixed_ids, num_rows):
    unique_pairs = set()

    links_data = {
        'entity_id_1': [],
        'entity_id_2': [],
        'role_type': []
    }

    while len(links_data['entity_id_1']) < num_rows and len(unique_pairs) < len(a_prefixed_ids) * len(e_prefixed_ids):
        a_id = random.choice(a_prefixed_ids)
        e_id = random.choice(e_prefixed_ids)

        if (a_id, e_id) not in unique_pairs:
            unique_pairs.add((a_id, e_id))
            links_data['entity_id_1'].append(a_id)
            links_data['entity_id_2'].append(e_id)
            links_data['role_type'].append(random.randint(1, 10))
    
    links_df = pd.DataFrame(links_data)

    links_df = links_df.sample(frac=1).reset_index(drop=True)
    
    return links_df

num_rows_desired = 5
links_df = create_links_df(a_prefixed_ids, e_prefixed_ids, num_rows_desired)

display(links_df)

Unnamed: 0,entity_id_1,entity_id_2,role_type
0,A258086E,E868570N,5
1,A602276C,E687593E,2
2,A500265S,E227133E,2
3,A258086E,E687593E,9
4,A500265S,E687593E,9


In [43]:
#FROM DATA PROCESSING 
#primary_id :a
#entity_type: b
#npi:c
#name:d
#judicial_district:f
#address:g
#n_owners_2nd_degree:m
#ccn_links:n
#ccn_type_cd:o

#ENTITY_TYPE
#1: Associate
#2: Part A Provider
#3: DME Supplier
#4: Part B Supplier



In [233]:
display(links_df)

first_degree_df = links_df.groupby('entity_id_1')['entity_id_2'].unique().reset_index()
first_degree_df.columns = ['entity_id_1', 'first_degree']
first_degree_df['first_degree'] = first_degree_df['first_degree'].apply(list)
display(first_degree_df)


entity_to_associate_df = links_df.groupby('entity_id_2')['entity_id_1'].apply(list).reset_index()
entity_to_associate_df.rename(columns={'entity_id_2': 'entities','entity_id_1': 'agg_associates'}, inplace=True)
display(entity_to_associate_df)

Unnamed: 0,entity_id_1,entity_id_2,role_type
0,A258086E,E868570N,5
1,A602276C,E687593E,2
2,A500265S,E227133E,2
3,A258086E,E687593E,9
4,A500265S,E687593E,9


Unnamed: 0,entity_id_1,first_degree
0,A258086E,"[E868570N, E687593E]"
1,A500265S,"[E227133E, E687593E]"
2,A602276C,[E687593E]


Unnamed: 0,entities,agg_associates
0,E227133E,[A500265S]
1,E687593E,"[A602276C, A258086E, A500265S]"
2,E868570N,[A258086E]


In [242]:
def create_second_degree_df(first_degree_df, entity_associate_df):
    # Create an empty list to store the new DataFrame rows
    second_degree_data = []

    # Iterate over each row in first_degree_df to calculate second_degree associations
    for index, row in first_degree_df.iterrows():
        # Initialize an empty set for second-degree associations
        second_degree_set = set()

        # For each first-degree entity, get their corresponding agg_associates and add to the set
        for first_degree_entity in row['first_degree']:
            # Retrieve agg_associates for the current first_degree_entity
            agg_associates_series = entity_to_associate_df.loc[entity_to_associate_df['entities'] == first_degree_entity, 'agg_associates']

            # If agg_associates are found, add them to the second_degree_set
            if not agg_associates_series.empty:
                second_degree_set.update(agg_associates_series.iloc[0])

        # Remove the first_degree entities and the original entity_id_1 from the second-degree set
        second_degree_set.difference_update(row['first_degree'], [row['entity_id_1']])

        # Add the calculated second-degree associations to the row data
        second_degree_data.append({
            'entity_id_1': row['entity_id_1'],
            'first_degree': row['first_degree'],
            'second_degree': list(second_degree_set)
        })

    # Create a DataFrame from the second_degree_data list
    second_degree_df = pd.DataFrame(second_degree_data)

    return second_degree_df

# Assuming first_degree_df and entity_associate_df are already defined and loaded as shown in the screenshot
# Call the function to create second_degree_df
second_degree_df = create_second_degree_df(first_degree_df, entity_associate_df)

# Display the second_degree_df DataFrame
display(links_df)
display(entity_to_associate_df)
display(second_degree_df)


Unnamed: 0,entity_id_1,entity_id_2,role_type
0,A258086E,E868570N,5
1,A602276C,E687593E,2
2,A500265S,E227133E,2
3,A258086E,E687593E,9
4,A500265S,E687593E,9


Unnamed: 0,entities,agg_associates
0,E227133E,[A500265S]
1,E687593E,"[A602276C, A258086E, A500265S]"
2,E868570N,[A258086E]


Unnamed: 0,entity_id_1,first_degree,second_degree
0,A258086E,"[E868570N, E687593E]","[A500265S, A602276C]"
1,A500265S,"[E227133E, E687593E]","[A258086E, A602276C]"
2,A602276C,[E687593E],"[A500265S, A258086E]"
