In [1]:
import pandas as pd
import numpy as np
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.orm import relationship, sessionmaker

In [37]:
from pathlib import Path
import os
import sys
import math
import random
import traceback
import pickle
import gc

In [3]:
import db

In [4]:
from tqdm.notebook import tqdm

In [5]:
os.environ['DB_CONNECTION_STRING'] = "sqlite:///../humana.sqlite"
db_engine = db.get_db_engine()
Session = sessionmaker(db_engine)

Global db engine has not been initialized. Initializing...
Initialized database engine


In [6]:
print("Getting member ids")
with Session() as sess:
    member_ids = [r[0] for r in sess.query(db.Member.member_id).all()]
print(f"{len(member_ids)=}")

Getting member ids
len(member_ids)=1909880


In [7]:
BATCH = 100

# One-to-many CSVs

## Quality Data

In [8]:
interestcols = ['member_id', 'measure_type', 'compliant_cnt', 'eligible_cnt']

In [69]:
with Session() as sess:
    q = sess.query(*[getattr(db.RawQualityData, c) for c in interestcols])
    rows = []
    try:
        for irow, row in tqdm(enumerate(q.yield_per(BATCH))):
            rows.append(row)
            #if irow >= 10000:
            #    break
    except:
        sess.rollback()
        traceback.print_exc()

0it [00:00, ?it/s]

In [14]:
rows[0]

(40240, <EMeasureType.PatientSafety: 'Patient Safety'>, 1.0, 1)

In [70]:
df = pd.DataFrame(rows, columns=interestcols)

In [71]:
df

Unnamed: 0,member_id,measure_type,compliant_cnt,eligible_cnt
0,40240,EMeasureType.PatientSafety,1.0,1
1,40240,EMeasureType.PatientSafety,1.0,1
2,193657,EMeasureType.HEDIS,0.0,1
3,1685072,EMeasureType.PatientSafety,0.0,1
4,1685072,EMeasureType.PatientSafety,1.0,1
...,...,...,...,...
41950222,1062632,EMeasureType.PatientExperience,1.0,1
41950223,1425434,EMeasureType.PatientExperience,0.0,1
41950224,1104218,EMeasureType.PatientExperience,1.0,1
41950225,669409,EMeasureType.PatientExperience,1.0,1


In [72]:
dummies = pd.get_dummies(df['measure_type']).astype(int)
display(dummies)
#df[[f'measure_type_{i}' for i in range(len(dummies.columns))]] = dummies

Unnamed: 0,EMeasureType.PatientSafety,EMeasureType.HEDIS,EMeasureType.PatientExperience
0,1,0,0
1,1,0,0
2,0,1,0
3,1,0,0
4,1,0,0
...,...,...,...
41950222,0,0,1
41950223,0,0,1
41950224,0,0,1
41950225,0,0,1


In [73]:
# Run this code ONLY ONCE after deriving df
def get_vectors(df, e):
    vecs = df[df['measure_type'] == e].groupby('member_id')[['compliant_cnt', 'eligible_cnt']].sum()
    vecs['compliant_to_eligible_ratio'] = vecs['compliant_cnt'] / vecs['eligible_cnt'].where(vecs['eligible_cnt'] >= 1, 1.0)
    print(f"{vecs['compliant_cnt'].max()=} {vecs['compliant_cnt'].mean()=}")
    print(f"{vecs['eligible_cnt'].max()=} {vecs['eligible_cnt'].mean()=}")
    return vecs

newcols_all = []
for ie, e in enumerate(df['measure_type'].unique()):
    vecs  = get_vectors(df, e)
    newcols = [f'measure_type_{ie}_{c}' for c in vecs.columns]
    vecs.columns = newcols
    df = df.merge(vecs[newcols], how='left', left_on='member_id', right_on='member_id')
    newcols_all.extend(newcols)
display(df)

vecs['compliant_cnt'].max()=np.float64(19.0) vecs['compliant_cnt'].mean()=np.float64(3.697508101744354)
vecs['eligible_cnt'].max()=np.int64(20) vecs['eligible_cnt'].mean()=np.float64(4.216287497500236)
vecs['compliant_cnt'].max()=np.float64(45.0) vecs['compliant_cnt'].mean()=np.float64(4.19332877233415)
vecs['eligible_cnt'].max()=np.int64(58) vecs['eligible_cnt'].mean()=np.float64(6.3062775976446295)
vecs['compliant_cnt'].max()=np.float64(947.88) vecs['compliant_cnt'].mean()=np.float64(27.41920372687088)
vecs['eligible_cnt'].max()=np.int64(954) vecs['eligible_cnt'].mean()=np.float64(33.19336240483254)


Unnamed: 0,member_id,measure_type,compliant_cnt,eligible_cnt,measure_type_0_compliant_cnt,measure_type_0_eligible_cnt,measure_type_0_compliant_to_eligible_ratio,measure_type_1_compliant_cnt,measure_type_1_eligible_cnt,measure_type_1_compliant_to_eligible_ratio,measure_type_2_compliant_cnt,measure_type_2_eligible_cnt,measure_type_2_compliant_to_eligible_ratio
0,40240,EMeasureType.PatientSafety,1.0,1,6.0,6.0,1.000000,,,,99.00,114.0,0.868421
1,40240,EMeasureType.PatientSafety,1.0,1,6.0,6.0,1.000000,,,,99.00,114.0,0.868421
2,193657,EMeasureType.HEDIS,0.0,1,,,,2.0,4.0,0.500000,,,
3,1685072,EMeasureType.PatientSafety,0.0,1,5.0,6.0,0.833333,,,,37.32,38.0,0.982105
4,1685072,EMeasureType.PatientSafety,1.0,1,5.0,6.0,0.833333,,,,37.32,38.0,0.982105
...,...,...,...,...,...,...,...,...,...,...,...,...,...
41950222,1062632,EMeasureType.PatientExperience,1.0,1,6.0,6.0,1.000000,5.0,11.0,0.454545,16.00,16.0,1.000000
41950223,1425434,EMeasureType.PatientExperience,0.0,1,,,,1.0,1.0,1.000000,14.00,20.0,0.700000
41950224,1104218,EMeasureType.PatientExperience,1.0,1,11.0,12.0,0.916667,17.0,21.0,0.809524,61.32,62.0,0.989032
41950225,669409,EMeasureType.PatientExperience,1.0,1,,,,3.0,3.0,1.000000,51.99,69.0,0.753478


In [74]:
for c in newcols_all:
    print(c)
    print("!na ratio:", len(df[c].dropna()) / len(df))
    df.fillna({c: 0}, inplace=True)

measure_type_0_compliant_cnt
!na ratio: 0.8019972812066071
measure_type_0_eligible_cnt
!na ratio: 0.8019972812066071
measure_type_0_compliant_to_eligible_ratio
!na ratio: 0.8019972812066071
measure_type_1_compliant_cnt
!na ratio: 0.9043026155734509
measure_type_1_eligible_cnt
!na ratio: 0.9043026155734509
measure_type_1_compliant_to_eligible_ratio
!na ratio: 0.9043026155734509
measure_type_2_compliant_cnt
!na ratio: 0.8408719456988873
measure_type_2_eligible_cnt
!na ratio: 0.8408719456988873
measure_type_2_compliant_to_eligible_ratio
!na ratio: 0.8408719456988873


In [75]:
df

Unnamed: 0,member_id,measure_type,compliant_cnt,eligible_cnt,measure_type_0_compliant_cnt,measure_type_0_eligible_cnt,measure_type_0_compliant_to_eligible_ratio,measure_type_1_compliant_cnt,measure_type_1_eligible_cnt,measure_type_1_compliant_to_eligible_ratio,measure_type_2_compliant_cnt,measure_type_2_eligible_cnt,measure_type_2_compliant_to_eligible_ratio
0,40240,EMeasureType.PatientSafety,1.0,1,6.0,6.0,1.000000,0.0,0.0,0.000000,99.00,114.0,0.868421
1,40240,EMeasureType.PatientSafety,1.0,1,6.0,6.0,1.000000,0.0,0.0,0.000000,99.00,114.0,0.868421
2,193657,EMeasureType.HEDIS,0.0,1,0.0,0.0,0.000000,2.0,4.0,0.500000,0.00,0.0,0.000000
3,1685072,EMeasureType.PatientSafety,0.0,1,5.0,6.0,0.833333,0.0,0.0,0.000000,37.32,38.0,0.982105
4,1685072,EMeasureType.PatientSafety,1.0,1,5.0,6.0,0.833333,0.0,0.0,0.000000,37.32,38.0,0.982105
...,...,...,...,...,...,...,...,...,...,...,...,...,...
41950222,1062632,EMeasureType.PatientExperience,1.0,1,6.0,6.0,1.000000,5.0,11.0,0.454545,16.00,16.0,1.000000
41950223,1425434,EMeasureType.PatientExperience,0.0,1,0.0,0.0,0.000000,1.0,1.0,1.000000,14.00,20.0,0.700000
41950224,1104218,EMeasureType.PatientExperience,1.0,1,11.0,12.0,0.916667,17.0,21.0,0.809524,61.32,62.0,0.989032
41950225,669409,EMeasureType.PatientExperience,1.0,1,0.0,0.0,0.000000,3.0,3.0,1.000000,51.99,69.0,0.753478


In [76]:
df = df.drop_duplicates(subset='member_id', keep='first')
df

Unnamed: 0,member_id,measure_type,compliant_cnt,eligible_cnt,measure_type_0_compliant_cnt,measure_type_0_eligible_cnt,measure_type_0_compliant_to_eligible_ratio,measure_type_1_compliant_cnt,measure_type_1_eligible_cnt,measure_type_1_compliant_to_eligible_ratio,measure_type_2_compliant_cnt,measure_type_2_eligible_cnt,measure_type_2_compliant_to_eligible_ratio
0,40240,EMeasureType.PatientSafety,1.0,1,6.0,6.0,1.000000,0.0,0.0,0.000000,99.00,114.0,0.868421
2,193657,EMeasureType.HEDIS,0.0,1,0.0,0.0,0.000000,2.0,4.0,0.500000,0.00,0.0,0.000000
3,1685072,EMeasureType.PatientSafety,0.0,1,5.0,6.0,0.833333,0.0,0.0,0.000000,37.32,38.0,0.982105
5,939385,EMeasureType.HEDIS,0.0,1,3.0,3.0,1.000000,2.0,6.0,0.333333,0.00,0.0,0.000000
8,1631297,EMeasureType.PatientSafety,1.0,1,2.0,4.0,0.500000,1.0,1.0,1.000000,0.00,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
38612151,1118494,EMeasureType.PatientExperience,1.0,1,0.0,0.0,0.000000,0.0,0.0,0.000000,2.00,10.0,0.200000
39181522,831662,EMeasureType.PatientExperience,1.0,1,0.0,0.0,0.000000,0.0,0.0,0.000000,2.00,6.0,0.333333
40477478,1726589,EMeasureType.PatientExperience,0.0,1,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00,3.0,0.000000
40761524,1826208,EMeasureType.PatientExperience,1.0,1,0.0,0.0,0.000000,0.0,0.0,0.000000,6.00,6.0,1.000000


In [77]:
df_quality_data = df[['member_id'] + [c for c in df.columns if c.startswith('measure_type_')]]
df_quality_data

Unnamed: 0,member_id,measure_type_0_compliant_cnt,measure_type_0_eligible_cnt,measure_type_0_compliant_to_eligible_ratio,measure_type_1_compliant_cnt,measure_type_1_eligible_cnt,measure_type_1_compliant_to_eligible_ratio,measure_type_2_compliant_cnt,measure_type_2_eligible_cnt,measure_type_2_compliant_to_eligible_ratio
0,40240,6.0,6.0,1.000000,0.0,0.0,0.000000,99.00,114.0,0.868421
2,193657,0.0,0.0,0.000000,2.0,4.0,0.500000,0.00,0.0,0.000000
3,1685072,5.0,6.0,0.833333,0.0,0.0,0.000000,37.32,38.0,0.982105
5,939385,3.0,3.0,1.000000,2.0,6.0,0.333333,0.00,0.0,0.000000
8,1631297,2.0,4.0,0.500000,1.0,1.0,1.000000,0.00,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...
38612151,1118494,0.0,0.0,0.000000,0.0,0.0,0.000000,2.00,10.0,0.200000
39181522,831662,0.0,0.0,0.000000,0.0,0.0,0.000000,2.00,6.0,0.333333
40477478,1726589,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00,3.0,0.000000
40761524,1826208,0.0,0.0,0.000000,0.0,0.0,0.000000,6.00,6.0,1.000000


In [79]:
with open("out/compressed_quality_data_df.pkl", 'wb') as f:
    pickle.dump(df_quality_data, f)

## Member Claim

In [8]:
with Session() as sess:
    boolean_cols = [c.name for c in db.RawMemberClaims.__table__.columns if isinstance(c.type, sqlalchemy.Boolean)]
    print(boolean_cols)
    q = sess.query(*[getattr(db.RawMemberClaims, c) for c in ['member_id', *boolean_cols]])
    rows = []
    try:
        for irow, row in tqdm(enumerate(q.yield_per(BATCH))):
            rows.append(row)
            # if irow >= 10000:
            #    break
    except:
        sess.rollback()
        traceback.print_exc()

['pcp_visit', 'annual_wellness', 'humana_paf', 'preventative_visit', 'comp_physical_exam', 'ihwa', 'fqhc_visit', 'telehealth', 'endocrinologist_visit', 'oncolologist_visit', 'radiologist_visit', 'podiatrist_visit', 'ophthalmologist_visit', 'optometrist_visit', 'physical_therapist_visit', 'cardiologist_visit', 'gastroenterologist_visit', 'orthopedist_visit', 'obgyn_visit', 'nephroloogist_visit', 'pulmonologist_visit', 'urgent_care_visit', 'er_visit']


0it [00:00, ?it/s]

In [9]:
df = pd.DataFrame(rows, columns=['member_id', *boolean_cols])
display(df)
df = df.fillna(0)

Unnamed: 0,member_id,pcp_visit,annual_wellness,humana_paf,preventative_visit,comp_physical_exam,ihwa,fqhc_visit,telehealth,endocrinologist_visit,...,optometrist_visit,physical_therapist_visit,cardiologist_visit,gastroenterologist_visit,orthopedist_visit,obgyn_visit,nephroloogist_visit,pulmonologist_visit,urgent_care_visit,er_visit
0,733133,,,,,,,,,,...,,,,,,,,,,
1,1296887,,,,,,,,,,...,,,,,,,,,,
2,1331806,,,,,,,,,,...,,,,,,,,,,
3,1548280,,,,,,,,,,...,,,,,,,,,,
4,689644,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24314067,295810,,,,,,,,,,...,,,,,,,,,,
24314068,1825873,,,,,,,,,,...,,,,,,,,,,
24314069,957758,,,,,,,,,,...,,,,,,,,,,
24314070,1379188,,,,,,,,,,...,,,,,,,,,,


  df = df.fillna(0)


In [10]:
df_member_claim = df.groupby('member_id').sum().reset_index()
df_member_claim

Unnamed: 0,member_id,pcp_visit,annual_wellness,humana_paf,preventative_visit,comp_physical_exam,ihwa,fqhc_visit,telehealth,endocrinologist_visit,...,optometrist_visit,physical_therapist_visit,cardiologist_visit,gastroenterologist_visit,orthopedist_visit,obgyn_visit,nephroloogist_visit,pulmonologist_visit,urgent_care_visit,er_visit
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1661459,1999996,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1661460,1999997,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1661461,1999998,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1661462,1999999,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
with open("out/compressed_member_claim_df.pkl", 'wb') as f:
    pickle.dump(df_member_claim, f)

## Member Condition

In [13]:
interestcols = ['member_id', 'hcc_model_type', 'cms_model_vers_cd']

In [14]:
with Session() as sess:
    q = sess.query(*[getattr(db.RawMemberCondition, c) for c in interestcols])
    rows = []
    try:
        for irow, row in tqdm(enumerate(q.yield_per(BATCH))):
            rows.append(row)
            #if irow >= 10000:
            #    break
    except:
        sess.rollback()
        traceback.print_exc()

0it [00:00, ?it/s]

In [15]:
df = pd.DataFrame(rows, columns=interestcols)
df

Unnamed: 0,member_id,hcc_model_type,cms_model_vers_cd
0,993642,EHccModelType.Medical,ECMSModelVers.V28
1,993642,EHccModelType.Medical,ECMSModelVers.V28
2,993642,EHccModelType.Medical,ECMSModelVers.V28
3,803124,EHccModelType.Medical,ECMSModelVers.V28
4,803124,EHccModelType.Medical,ECMSModelVers.V24
...,...,...,...
5011109,1631128,EHccModelType.Medical,ECMSModelVers.V28
5011110,787802,EHccModelType.Medical,ECMSModelVers.V28
5011111,787802,EHccModelType.Medical,ECMSModelVers.V28
5011112,365181,EHccModelType.Medical,ECMSModelVers.V28


In [21]:
df[df['hcc_model_type'] == db.EHccModelType.Medical]

Unnamed: 0,member_id,hcc_model_type,cms_model_vers_cd
0,993642,EHccModelType.Medical,ECMSModelVers.V28
1,993642,EHccModelType.Medical,ECMSModelVers.V28
2,993642,EHccModelType.Medical,ECMSModelVers.V28
3,803124,EHccModelType.Medical,ECMSModelVers.V28
4,803124,EHccModelType.Medical,ECMSModelVers.V24
...,...,...,...
5011109,1631128,EHccModelType.Medical,ECMSModelVers.V28
5011110,787802,EHccModelType.Medical,ECMSModelVers.V28
5011111,787802,EHccModelType.Medical,ECMSModelVers.V28
5011112,365181,EHccModelType.Medical,ECMSModelVers.V28


In [29]:
len(df[df['hcc_model_type'] == db.EHccModelType.ESRD]) / len(df[df['hcc_model_type'] == db.EHccModelType.Medical])

0.025990275119247414

In [23]:
df.dropna()

Unnamed: 0,member_id,hcc_model_type,cms_model_vers_cd
0,993642,EHccModelType.Medical,ECMSModelVers.V28
1,993642,EHccModelType.Medical,ECMSModelVers.V28
2,993642,EHccModelType.Medical,ECMSModelVers.V28
3,803124,EHccModelType.Medical,ECMSModelVers.V28
4,803124,EHccModelType.Medical,ECMSModelVers.V24
...,...,...,...
5011109,1631128,EHccModelType.Medical,ECMSModelVers.V28
5011110,787802,EHccModelType.Medical,ECMSModelVers.V28
5011111,787802,EHccModelType.Medical,ECMSModelVers.V28
5011112,365181,EHccModelType.Medical,ECMSModelVers.V28


In [24]:
df.groupby('member_id').count()

Unnamed: 0_level_0,hcc_model_type,cms_model_vers_cd
member_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,7,7
2,5,5
3,2,2
4,1,1
5,3,3
...,...,...
1999994,2,2
1999995,2,2
1999997,5,5
1999998,5,5


In [25]:
df[df['cms_model_vers_cd'] == db.ECMSModelVers.V28]

Unnamed: 0,member_id,hcc_model_type,cms_model_vers_cd
0,993642,EHccModelType.Medical,ECMSModelVers.V28
1,993642,EHccModelType.Medical,ECMSModelVers.V28
2,993642,EHccModelType.Medical,ECMSModelVers.V28
3,803124,EHccModelType.Medical,ECMSModelVers.V28
5,842597,EHccModelType.Medical,ECMSModelVers.V28
...,...,...,...
5011106,664230,EHccModelType.Medical,ECMSModelVers.V28
5011109,1631128,EHccModelType.Medical,ECMSModelVers.V28
5011110,787802,EHccModelType.Medical,ECMSModelVers.V28
5011111,787802,EHccModelType.Medical,ECMSModelVers.V28


In [28]:
len(df[df['cms_model_vers_cd'] == db.ECMSModelVers.V24]) / len(df[df['cms_model_vers_cd'] == db.ECMSModelVers.V28])

0.30914094510981277

In [33]:
dummies = pd.get_dummies(df['cms_model_vers_cd']).astype(int)
df[[f'cms_model_vers_cd_{i}' for i in range(2)]] = dummies
dummies = pd.get_dummies(df['hcc_model_type']).astype(int)
df[[f'hcc_model_type_{i}' for i in range(2)]] = dummies
df

Unnamed: 0,member_id,hcc_model_type,cms_model_vers_cd,cms_model_vers_cd_0,cms_model_vers_cd_1,hcc_model_type_0,hcc_model_type_1
0,993642,EHccModelType.Medical,ECMSModelVers.V28,1,0,1,0
1,993642,EHccModelType.Medical,ECMSModelVers.V28,1,0,1,0
2,993642,EHccModelType.Medical,ECMSModelVers.V28,1,0,1,0
3,803124,EHccModelType.Medical,ECMSModelVers.V28,1,0,1,0
4,803124,EHccModelType.Medical,ECMSModelVers.V24,0,1,1,0
...,...,...,...,...,...,...,...
5011109,1631128,EHccModelType.Medical,ECMSModelVers.V28,1,0,1,0
5011110,787802,EHccModelType.Medical,ECMSModelVers.V28,1,0,1,0
5011111,787802,EHccModelType.Medical,ECMSModelVers.V28,1,0,1,0
5011112,365181,EHccModelType.Medical,ECMSModelVers.V28,1,0,1,0


In [35]:
df_member_condition = df[['member_id'] + [f'cms_model_vers_cd_{i}' for i in range(2)] + [f'hcc_model_type_{i}' for i in range(2)]]
df_member_condition = df_member_condition.groupby('member_id').mean().reset_index()
df_member_condition

Unnamed: 0,member_id,cms_model_vers_cd_0,cms_model_vers_cd_1,hcc_model_type_0,hcc_model_type_1
0,1,0.857143,0.142857,1.0,0.0
1,2,0.600000,0.400000,1.0,0.0
2,3,0.500000,0.500000,1.0,0.0
3,4,1.000000,0.000000,1.0,0.0
4,5,1.000000,0.000000,1.0,0.0
...,...,...,...,...,...
1440321,1999994,1.000000,0.000000,1.0,0.0
1440322,1999995,1.000000,0.000000,1.0,0.0
1440323,1999997,0.800000,0.200000,1.0,0.0
1440324,1999998,0.800000,0.200000,1.0,0.0


In [36]:
with open("out/compressed_member_condition_df.pkl", 'wb') as f:
    pickle.dump(df_member_condition, f)

# Merge CSVs

In [39]:
with open("out/compressed_quality_data_df.pkl", 'rb') as f:
    df_quality_data = pickle.load(f)
with open("out/compressed_member_claim_df.pkl", 'rb') as f:
    df_member_claim = pickle.load(f)
with open("out/compressed_member_condition_df.pkl", 'rb') as f:
    df_member_condition = pickle.load(f)
gc.collect()

6991

In [43]:
merged_newdata = (df_quality_data
     .merge(df_member_claim, how='left', on='member_id')
     .merge(df_member_condition, how='left', on='member_id'))
merged_newdata

Unnamed: 0,member_id,measure_type_0_compliant_cnt,measure_type_0_eligible_cnt,measure_type_0_compliant_to_eligible_ratio,measure_type_1_compliant_cnt,measure_type_1_eligible_cnt,measure_type_1_compliant_to_eligible_ratio,measure_type_2_compliant_cnt,measure_type_2_eligible_cnt,measure_type_2_compliant_to_eligible_ratio,...,orthopedist_visit,obgyn_visit,nephroloogist_visit,pulmonologist_visit,urgent_care_visit,er_visit,cms_model_vers_cd_0,cms_model_vers_cd_1,hcc_model_type_0,hcc_model_type_1
0,40240,6.0,6.0,1.000000,0.0,0.0,0.000000,99.00,114.0,0.868421,...,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.000000,1.0,0.0
1,193657,0.0,0.0,0.000000,2.0,4.0,0.500000,0.00,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,
2,1685072,5.0,6.0,0.833333,0.0,0.0,0.000000,37.32,38.0,0.982105,...,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.333333,1.0,0.0
3,939385,3.0,3.0,1.000000,2.0,6.0,0.333333,0.00,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.500000,0.500000,1.0,0.0
4,1631297,2.0,4.0,0.500000,1.0,1.0,1.000000,0.00,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.500000,0.500000,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1687944,1118494,0.0,0.0,0.000000,0.0,0.0,0.000000,2.00,10.0,0.200000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.000000,1.0,0.0
1687945,831662,0.0,0.0,0.000000,0.0,0.0,0.000000,2.00,6.0,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,
1687946,1726589,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00,3.0,0.000000,...,,,,,,,1.000000,0.000000,1.0,0.0
1687947,1826208,0.0,0.0,0.000000,0.0,0.0,0.000000,6.00,6.0,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,


In [44]:
merged_newdata.columns

Index(['member_id', 'measure_type_0_compliant_cnt',
       'measure_type_0_eligible_cnt',
       'measure_type_0_compliant_to_eligible_ratio',
       'measure_type_1_compliant_cnt', 'measure_type_1_eligible_cnt',
       'measure_type_1_compliant_to_eligible_ratio',
       'measure_type_2_compliant_cnt', 'measure_type_2_eligible_cnt',
       'measure_type_2_compliant_to_eligible_ratio', 'pcp_visit',
       'annual_wellness', 'humana_paf', 'preventative_visit',
       'comp_physical_exam', 'ihwa', 'fqhc_visit', 'telehealth',
       'endocrinologist_visit', 'oncolologist_visit', 'radiologist_visit',
       'podiatrist_visit', 'ophthalmologist_visit', 'optometrist_visit',
       'physical_therapist_visit', 'cardiologist_visit',
       'gastroenterologist_visit', 'orthopedist_visit', 'obgyn_visit',
       'nephroloogist_visit', 'pulmonologist_visit', 'urgent_care_visit',
       'er_visit', 'cms_model_vers_cd_0', 'cms_model_vers_cd_1',
       'hcc_model_type_0', 'hcc_model_type_1'],
      dt

In [45]:
dump_df = pd.read_csv("out/dump20241004.csv")

  dump_df = pd.read_csv("out/dump20241004.csv")


In [46]:
dump_df

Unnamed: 0,id,data_type,calendar_year,product_type,plan_category,preventive_visit_gap_ind,cci_score,dcsi_score,fci_score,cms_tot_partd_payment_amt,...,generic_grouper,unattributed_provider,sex_cd,age,veteran_ind,mco_contract_nbr,plan_benefit_package_id,state_of_residence,county_of_residence,race
0,380667,EType.Training,2023,EProductType.LPPO,EPlanCategory.MedicareAdvantage,False,3.0,0.0,4.0,96.02,...,,,ESex.F,66,,EMcoContractNbr.H5216,58,EState.NH,CARROLL,ERace.White
1,378860,EType.Training,2023,EProductType.LPPO,EPlanCategory.MedicareAdvantage,False,2.0,0.0,1.0,76.15,...,,,ESex.F,68,,EMcoContractNbr.H5216,1,EState.WI,WAUKESHA,
2,1370287,EType.Training,2023,EProductType.LPPO,EPlanCategory.MedicareAdvantage,True,2.0,0.0,0.0,143.53,...,,,ESex.F,67,,EMcoContractNbr.H5216,317,EState.KY,LOGAN,ERace.White
3,184540,EType.Training,2023,EProductType.LPPO,EPlanCategory.MedicareAdvantage,False,12.0,3.0,6.0,99.28,...,,,ESex.F,81,,EMcoContractNbr.H5216,805,EState.NC,GUILFORD,
4,1628837,EType.Training,2023,EProductType.LPPO,EPlanCategory.MedicareAdvantage,False,3.0,0.0,6.0,138.20,...,,,ESex.F,79,,EMcoContractNbr.H9070,6,EState.OK,OKLAHOMA,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1909875,1691098,EType.Holdout,2023,EProductType.LPPO,EPlanCategory.MedicareAdvantage,,6.0,2.0,4.0,146.24,...,,,ESex.M,92,,EMcoContractNbr.H5525,51,EState.PA,BUCKS,
1909876,63411,EType.Holdout,2023,EProductType.LPPO,EPlanCategory.MedicareAdvantage,,5.0,1.0,1.0,75.81,...,,,ESex.M,78,,EMcoContractNbr.H5216,311,EState.FL,ST. LUCIE,ERace.Other
1909877,1464366,EType.Holdout,2023,EProductType.LPPO,EPlanCategory.MedicareAdvantage,,2.0,1.0,1.0,271.70,...,,,ESex.M,70,,EMcoContractNbr.H5970,24,EState.NY,NEW YORK,ERace.Other
1909878,715653,EType.Holdout,2023,EProductType.LPPO,EPlanCategory.MedicareAdvantage,,3.0,0.0,2.0,,...,,,ESex.F,63,,EMcoContractNbr.H5216,318,EState.KS,RILEY,


In [48]:
dump_merged = dump_df.merge(merged_newdata.rename({'member_id': 'id'}, axis='columns'), how='left', on='id')
dump_merged

Unnamed: 0,id,data_type,calendar_year,product_type,plan_category,preventive_visit_gap_ind,cci_score,dcsi_score,fci_score,cms_tot_partd_payment_amt,...,orthopedist_visit,obgyn_visit,nephroloogist_visit,pulmonologist_visit,urgent_care_visit,er_visit,cms_model_vers_cd_0,cms_model_vers_cd_1,hcc_model_type_0,hcc_model_type_1
0,380667,EType.Training,2023,EProductType.LPPO,EPlanCategory.MedicareAdvantage,False,3.0,0.0,4.0,96.02,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,
1,378860,EType.Training,2023,EProductType.LPPO,EPlanCategory.MedicareAdvantage,False,2.0,0.0,1.0,76.15,...,,,,,,,,,,
2,1370287,EType.Training,2023,EProductType.LPPO,EPlanCategory.MedicareAdvantage,True,2.0,0.0,0.0,143.53,...,,,,,,,,,,
3,184540,EType.Training,2023,EProductType.LPPO,EPlanCategory.MedicareAdvantage,False,12.0,3.0,6.0,99.28,...,0.0,0.0,0.0,0.0,0.0,0.0,0.888889,0.111111,1.0,0.0
4,1628837,EType.Training,2023,EProductType.LPPO,EPlanCategory.MedicareAdvantage,False,3.0,0.0,6.0,138.20,...,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.000000,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1909875,1691098,EType.Holdout,2023,EProductType.LPPO,EPlanCategory.MedicareAdvantage,,6.0,2.0,4.0,146.24,...,0.0,0.0,0.0,0.0,0.0,0.0,0.500000,0.500000,1.0,0.0
1909876,63411,EType.Holdout,2023,EProductType.LPPO,EPlanCategory.MedicareAdvantage,,5.0,1.0,1.0,75.81,...,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.000000,1.0,0.0
1909877,1464366,EType.Holdout,2023,EProductType.LPPO,EPlanCategory.MedicareAdvantage,,2.0,1.0,1.0,271.70,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,
1909878,715653,EType.Holdout,2023,EProductType.LPPO,EPlanCategory.MedicareAdvantage,,3.0,0.0,2.0,,...,,,,,,,,,,


In [49]:
dump_merged.to_csv("out/dump20241011.csv")