In [18]:
# List the contents of the CCAEMDCR root; the output below shows one folder per release, named "<year> v<NN>".
!ls /home/jupyter/20000360102458359xu/MarketScan/CCAEMDCR

'2014 v30'  '2017 v30'	'2020 v20'  '2023 v04'	 nohup.out
'2015 v40'  '2018 v30'	'2021 v10'  '2023 v10'
'2016 v30'  '2019 v50'	'2022 v10'  '2024 v01'


### Naming Pattern of the parquet files

In [19]:
# List the parquet files of one CCAEMDCR release (2014 v30, Set A) to see how the file names are built.
!ls /home/jupyter/20000360102458359xu/MarketScan/CCAEMDCR/2014\ v30/Set\ A/PARQUET

ccaea143_a_0_0_0.snappy.parquet   ccaeo143_a_0_4_29.snappy.parquet
ccaea143_a_0_0_1.snappy.parquet   ccaeo143_a_0_4_3.snappy.parquet
ccaea143_a_0_0_2.snappy.parquet   ccaeo143_a_0_4_30.snappy.parquet
ccaea143_a_0_1_0.snappy.parquet   ccaeo143_a_0_4_31.snappy.parquet
ccaea143_a_0_1_1.snappy.parquet   ccaeo143_a_0_4_32.snappy.parquet
ccaea143_a_0_2_0.snappy.parquet   ccaeo143_a_0_4_33.snappy.parquet
ccaea143_a_0_2_1.snappy.parquet   ccaeo143_a_0_4_34.snappy.parquet
ccaea143_a_0_3_0.snappy.parquet   ccaeo143_a_0_4_35.snappy.parquet
ccaea143_a_0_3_1.snappy.parquet   ccaeo143_a_0_4_36.snappy.parquet
ccaea143_a_0_3_2.snappy.parquet   ccaeo143_a_0_4_4.snappy.parquet
ccaea143_a_0_4_0.snappy.parquet   ccaeo143_a_0_4_5.snappy.parquet
ccaea143_a_0_4_1.snappy.parquet   ccaeo143_a_0_4_6.snappy.parquet
ccaea143_a_0_4_2.snappy.parquet   ccaeo143_a_0_4_7.snappy.parquet
ccaea143_a_0_5_0.snappy.parquet   ccaeo143_a_0_4_8.snappy.parquet
ccaea143_a_0_5_1.snappy.parquet   ccaeo143_a_0_4_9.snappy.parquet
cc

In [20]:
# Same listing for the LAB tree (2014 v30, Set A); the output shows both ccae- and mdcr-prefixed files.
!ls /home/jupyter/20000360102458359xu/MarketScan/LAB/2014\ v30/Set\ A/PARQUET

ccaer143_a_0_0_0.snappy.parquet  ccaer143_a_0_6_0.snappy.parquet
ccaer143_a_0_0_1.snappy.parquet  ccaer143_a_0_6_1.snappy.parquet
ccaer143_a_0_1_0.snappy.parquet  ccaer143_a_0_7_0.snappy.parquet
ccaer143_a_0_1_1.snappy.parquet  ccaer143_a_0_7_1.snappy.parquet
ccaer143_a_0_2_0.snappy.parquet  mdcrr143_a_0_0_0.snappy.parquet
ccaer143_a_0_2_1.snappy.parquet  mdcrr143_a_0_1_0.snappy.parquet
ccaer143_a_0_3_0.snappy.parquet  mdcrr143_a_0_2_0.snappy.parquet
ccaer143_a_0_3_1.snappy.parquet  mdcrr143_a_0_3_0.snappy.parquet
ccaer143_a_0_4_0.snappy.parquet  mdcrr143_a_0_4_0.snappy.parquet
ccaer143_a_0_4_1.snappy.parquet  mdcrr143_a_0_5_0.snappy.parquet
ccaer143_a_0_5_0.snappy.parquet  mdcrr143_a_0_6_0.snappy.parquet
ccaer143_a_0_5_1.snappy.parquet  mdcrr143_a_0_7_0.snappy.parquet


In [21]:
import sys
import os
import re
import json
import pandas as pd
from collections import defaultdict



In [22]:

def parse_parquet_files(data_path: str, year: str, prefix: str) -> dict:
    """
    Bucket the snappy-parquet files of one folder by table code.

    Only files whose name starts with ``prefix`` and ends with
    ``.snappy.parquet`` are considered. The table code is the single
    character that follows the prefix; when that character is an
    underscore, the character after the underscore is used instead.

    Args:
        data_path: Folder that holds the parquet files of one release.
        year: Year label appended to every group key.
        prefix: Database prefix to look for (e.g., 'ccae', 'mdcr').

    Returns:
        A plain dict ``{f"{prefix}_{table_code}_{year}": [file paths]}``.
        Paths are ``data_path`` joined with the file name, and each list is
        sorted as plain strings (so ``..._4_29`` comes before ``..._4_3``).
        The dict is empty when no file matches.
    """
    extension = '.snappy.parquet'
    code_start = len(prefix)
    buckets = {}

    for name in os.listdir(data_path):
        # Ignore anything that is not a parquet file of the requested database
        if not name.endswith(extension) or not name.startswith(prefix):
            continue

        # One-character table code right after the prefix ...
        code = name[code_start:code_start + 1]
        if code == "_":
            # ... or one position later when prefix and code are separated by "_"
            code = name[code_start + 1:code_start + 2]

        key = f"{prefix}_{code}_{year}"
        buckets.setdefault(key, []).append(os.path.join(data_path, name))

    # Return each group's paths in sorted order
    return {key: sorted(paths) for key, paths in buckets.items()}

### claims data

In [23]:

# Root of the claims tree; the listing at the top of the notebook shows one
# sub-folder per release, named "<year> v<NN>" (e.g. "2014 v30").
claims_root = "/home/jupyter/20000360102458359xu/MarketScan/CCAEMDCR"
path_template = claims_root + "/{year_folder}/Set A/PARQUET"

# List the release folders in pure Python instead of shelling out to `ls`, so
# the cell does not depend on IPython and a missing root raises
# FileNotFoundError right here. Sorting keeps the folders in name order.
years = sorted(os.listdir(claims_root))

# Keep only entries whose name starts with a 20xx year (drops e.g. nohup.out)
valid_years = [y for y in years if re.match(r"^20\d{2}", y)]

# NOTE(review): the directory listing shows two releases for 2023 ('2023 v04'
# and '2023 v10'). Both map to year "2023", so their files end up merged under
# the same mdcr_<table>_2023 keys — confirm this is intended and not a
# duplication of the same data.

# Group key -> list of parquet paths, accumulated over all release folders
mdcr_paths = defaultdict(list)

for year_folder in valid_years:
    # Prefer a standalone 20xx token. The filter above only guarantees that
    # the name *starts* with 20xx, so fall back to those four characters
    # rather than calling .group() on None for names such as "2014_v30".
    year_match = re.search(r"\b20\d{2}\b", year_folder)
    year = year_match.group() if year_match else year_folder[:4]

    result = parse_parquet_files(
        data_path=path_template.format(year_folder=year_folder),
        year=year,
        prefix='mdcr',
    )

    # Merge this folder's groups into the running mapping
    for key, file_list in result.items():
        mdcr_paths[key].extend(file_list)





### lab data

In [24]:
# Root of the lab tree; laid out like the claims tree ("<year> v<NN>/Set A/PARQUET")
lab_root = "/home/jupyter/20000360102458359xu/MarketScan/LAB"
path_template_lab = lab_root + "/{year_folder}/Set A/PARQUET"

# List the release folders in pure Python instead of shelling out to `ls`, so
# the cell does not depend on IPython and a missing root raises
# FileNotFoundError right here. Sorting keeps the folders in name order.
years_lab = sorted(os.listdir(lab_root))

# Keep only entries whose name starts with a 20xx year
valid_years_lab = [y for y in years_lab if re.match(r"^20\d{2}", y)]

# Add the lab files to `mdcr_paths`, the mapping built by the claims cell
for year_folder in valid_years_lab:
    # Prefer a standalone 20xx token. The filter above only guarantees that
    # the name *starts* with 20xx, so fall back to those four characters
    # rather than calling .group() on None for names such as "2014_v30".
    year_match = re.search(r"\b20\d{2}\b", year_folder)
    year = year_match.group() if year_match else year_folder[:4]

    result = parse_parquet_files(
        data_path=path_template_lab.format(year_folder=year_folder),
        year=year,
        prefix='mdcr',
    )

    for key, file_list in result.items():
        # Only append paths that are not listed yet: this cell mutates a
        # mapping created in an earlier cell, so without this guard every
        # re-run would append the same lab files again.
        already_listed = set(mdcr_paths[key])
        mdcr_paths[key].extend(p for p in file_list if p not in already_listed)

# Plain-dict snapshot used by the sanity check and the JSON export below
mdcr_data_paths = dict(mdcr_paths)

### sanity check for table and year count 

In [25]:
# Sanity check: split every group key into its table code and year
sanity_check = pd.DataFrame({"keys": list(mdcr_data_paths)}).assign(
    table=lambda frame: frame["keys"].str.extract(r'mdcr_(.)_\d{4}$', expand=False),
    year=lambda frame: frame["keys"].str.extract(r'(\d{4})$', expand=False),  # trailing 4 digits
)

# One row per table code with the number of keys found for it; keys are
# unique per (table, year), so this is the number of years per table
(
    sanity_check
    .groupby("table")
    .size()
    .reset_index(name="year_count")
)


Unnamed: 0,table,year_count
0,a,11
1,d,11
2,f,11
3,i,11
4,o,11
5,r,10
6,s,11
7,t,11


### Write the data into a JSON file

In [26]:
# Persist mdcr_data_paths ({group key: [parquet paths]}) as a JSON file
with open("mdcr_data_paths_dict.json", "w") as json_out:
    json_out.write(json.dumps(mdcr_data_paths))
