In [23]:
import os
import zipfile
import io
import csv
import pandas as pd
from gdc.utils import data_path, ExtendedNamespace

In [24]:
# Location of the CMS data, resolved through the project's data_path helper.
# NOTE(review): the recorded output of this cell is a directory path and later
# cells pass this value to os.listdir -- confirm how 'cms_data.txt' maps to it.
CMS_DATA_PATH = data_path('cms_data.txt')
CMS_DATA_PATH

'/home/sylvain/Dropbox/Econ/papiers/gameTheory/generous_dynamic_contracting/datasets/us'

In [25]:
def get_zip_files():
    """
    Collect the ZIP archives located directly inside ``CMS_DATA_PATH``.

    Returns
    -------
    ExtendedNamespace
        One attribute per archive. The attribute name is the archive's file
        name without its extension, and the value is the full path to the
        archive. Files are matched case-insensitively on the ``.zip``
        extension and processed in sorted order so the result is deterministic.
    """
    zip_files = sorted(
        os.path.join(CMS_DATA_PATH, f)
        for f in os.listdir(CMS_DATA_PATH)
        if f.lower().endswith(".zip")
    )
    # splitext removes exactly one extension (whatever its case). str.rstrip
    # would instead strip any trailing run of the characters '.', 'z', 'i', 'p',
    # corrupting stems such as "sample_zip" or "backup_pi".
    return ExtendedNamespace(**{
        os.path.splitext(os.path.basename(z))[0]: z for z in zip_files
    })
    
    
# Namespace of the ZIP archives found in CMS_DATA_PATH; later cells access
# individual archive paths as attributes of ZF.
ZF = get_zip_files()

In [26]:
def get_zipfile_metrics(zip_path):
    """
    Summarise the first CSV member found inside a ZIP archive.

    The CSV is decoded as UTF-8 and streamed through ``csv.reader``, so the
    rows are counted one at a time rather than being loaded into memory.

    Parameters
    ----------
    zip_path : str
        Path to the ZIP archive.

    Returns
    -------
    ExtendedNamespace
        ``columns`` (list of header fields), ``num_cols`` (length of the
        header) and ``num_rows`` (number of records after the header).
        An empty CSV yields ``columns=[]``, ``num_cols=0``, ``num_rows=0``.

    Raises
    ------
    ValueError
        If the archive contains no member ending in ``.csv``.
    """
    with zipfile.ZipFile(zip_path, "r") as z:
        csv_name = next(
            (n for n in z.namelist() if n.lower().endswith(".csv")), None
        )
        if csv_name is None:
            raise ValueError(f"No CSV file found in archive {zip_path!r}")

        with z.open(csv_name, "r") as f:
            text_stream = io.TextIOWrapper(f, encoding="utf-8", newline="")
            reader = csv.reader(text_stream)

            # An empty file has no header; fall back to an empty column list so
            # the return type is the same namespace in every case.
            header = next(reader, [])

            # Count the remaining records without materialising them.
            num_rows = sum(1 for _ in reader)

    return ExtendedNamespace(
        columns=header, num_cols=len(header), num_rows=num_rows
    )

# Display the column names, column count and row count for one archive.
get_zipfile_metrics(ZF.DE1_0_2008_Beneficiary_Summary_File_Sample_1)

namespace(columns=['DESYNPUF_ID',
                   'BENE_BIRTH_DT',
                   'BENE_DEATH_DT',
                   'BENE_SEX_IDENT_CD',
                   'BENE_RACE_CD',
                   'BENE_ESRD_IND',
                   'SP_STATE_CODE',
                   'BENE_COUNTY_CD',
                   'BENE_HI_CVRAGE_TOT_MONS',
                   'BENE_SMI_CVRAGE_TOT_MONS',
                   'BENE_HMO_CVRAGE_TOT_MONS',
                   'PLAN_CVRG_MOS_NUM',
                   'SP_ALZHDMTA',
                   'SP_CHF',
                   'SP_CHRNKIDN',
                   'SP_CNCR',
                   'SP_COPD',
                   'SP_DEPRESSN',
                   'SP_DIABETES',
                   'SP_ISCHMCHT',
                   'SP_OSTEOPRS',
                   'SP_RA_OA',
                   'SP_STRKETIA',
                   'MEDREIMB_IP',
                   'BENRES_IP',
                   'PPPYMT_IP',
                   'MEDREIMB_OP',
                   'BENRES_OP',
        

In [27]:
def zip_chunk_generator(zip_path, batch_size, max_batches=None, usecols=None):
    """
    Lazily yield DataFrame chunks from the first CSV member of a ZIP archive.

    Parameters
    ----------
    zip_path : str
        Archive path. It is joined onto ``CMS_DATA_PATH``; ``os.path.join``
        leaves an absolute ``zip_path`` unchanged.
    batch_size : int
        Number of rows per chunk (passed to ``pandas.read_csv`` as
        ``chunksize``).
    max_batches : int or None, optional
        Maximum number of chunks to yield. ``None`` yields every chunk; zero
        or a negative value yields nothing.
    usecols : optional
        Forwarded unchanged to ``pandas.read_csv``.

    Yields
    ------
    pandas.DataFrame
        Consecutive chunks of at most ``batch_size`` rows.

    Raises
    ------
    ValueError
        If the archive contains no member ending in ``.csv``.
    """
    # Checking the limit only after the first yield would hand out one chunk
    # even when the caller asked for none.
    if max_batches is not None and max_batches <= 0:
        return

    zip_path = os.path.join(CMS_DATA_PATH, zip_path)
    with zipfile.ZipFile(zip_path) as z:
        # Detect the inner CSV file. A default is supplied because a bare
        # StopIteration inside a generator surfaces as an opaque RuntimeError.
        csv_name = next(
            (n for n in z.namelist() if n.lower().endswith(".csv")), None
        )
        if csv_name is None:
            raise ValueError(f"No CSV file found in archive {zip_path!r}")

        with z.open(csv_name) as f:
            # Wrap the raw byte stream so pandas receives decoded text.
            text_stream = io.TextIOWrapper(f, encoding="utf-8", newline="")

            # Streamed chunk reader.
            reader = pd.read_csv(
                text_stream,
                chunksize=batch_size,
                low_memory=False,
                usecols=usecols,
            )

            for batch_number, chunk in enumerate(reader, start=1):
                yield chunk

                # Stop before parsing another chunk once the limit is reached.
                if max_batches is not None and batch_number >= max_batches:
                    break

# Preview generator: at most one batch of 10 rows from the selected archive.
chunks = zip_chunk_generator(
    ZF.DE1_0_2008_Beneficiary_Summary_File_Sample_1,
    batch_size=10,
    max_batches=1,
)

In [28]:
# Pull the first chunk from the generator so the notebook renders it.
next(chunks)

Unnamed: 0,DESYNPUF_ID,BENE_BIRTH_DT,BENE_DEATH_DT,BENE_SEX_IDENT_CD,BENE_RACE_CD,BENE_ESRD_IND,SP_STATE_CODE,BENE_COUNTY_CD,BENE_HI_CVRAGE_TOT_MONS,BENE_SMI_CVRAGE_TOT_MONS,...,SP_STRKETIA,MEDREIMB_IP,BENRES_IP,PPPYMT_IP,MEDREIMB_OP,BENRES_OP,PPPYMT_OP,MEDREIMB_CAR,BENRES_CAR,PPPYMT_CAR
0,00013D2EFD8E45D1,19230501,,1,1,0,26,950,12,12,...,2,0.0,0.0,0.0,50.0,10.0,0.0,0.0,0.0,0.0
1,00016F745862898F,19430101,,1,1,0,39,230,12,12,...,2,0.0,0.0,0.0,0.0,0.0,0.0,700.0,240.0,0.0
2,0001FDD721E223DC,19360901,,2,1,0,39,280,12,12,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00021CA6FF03E670,19410601,,1,5,0,6,290,0,0,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00024B3D2352D2D0,19360801,,1,1,0,52,590,12,12,...,2,0.0,0.0,0.0,30.0,40.0,0.0,220.0,80.0,0.0
5,0002DAE1C81CC70D,19431001,,1,2,0,33,400,0,0,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0002F28CE057345B,19220701,,1,1,0,39,270,12,12,...,2,0.0,0.0,0.0,1010.0,270.0,0.0,3330.0,940.0,0.0
7,000308435E3E5B76,19350901,,1,1,0,24,680,10,10,...,2,0.0,0.0,0.0,150.0,160.0,0.0,870.0,340.0,80.0
8,000345A39D4157C9,19760901,,2,1,0,23,810,0,0,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,00036A21B65B0206,19381001,,2,2,0,1,570,12,12,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
