## Boundless dataset Summary V2

In [35]:
import pandas as pd
import numpy as np
import seaborn as sns 
import warnings
warnings.filterwarnings('ignore')

### Load the dataset

In [3]:
# List the sheet names in the workbook.
# The workbook is opened inside a `with` block so its file handle is released
# as soon as the names have been read (`xls` stays bound, but closed).
with pd.ExcelFile('OSU Data Sets.xlsx') as xls:
    sheet_names = xls.sheet_names
sheet_names

['Social Determinants Hotspotting',
 'High-Risk Client Prediction',
 ' Care Coordination Opportun (2)',
 ' Care Coordination Opportunity ',
 'Preventive Visit Gaps (2)',
 'Preventive Visit Gaps',
 'Primary care access equity',
 'Chronic Disease Follow-up Optim']

In [4]:
# Build the {sheet name: DataFrame} dictionary, then bind one variable per sheet.
# The sheet names are spelled exactly as they appear in the workbook,
# including their leading/trailing spaces.
sheet_list = [
    'Social Determinants Hotspotting',
    'High-Risk Client Prediction',
    ' Care Coordination Opportun (2)',
    ' Care Coordination Opportunity ',
    'Preventive Visit Gaps (2)',
    'Preventive Visit Gaps',
    'Primary care access equity',
    'Chronic Disease Follow-up Optim',
]
dfs = pd.read_excel('OSU Data Sets.xlsx', sheet_name=sheet_list)

# One DataFrame per sheet, unpacked in the same order as sheet_list
(
    social_hotspot,
    high_risk_prediction,
    care_op2,
    care_op,
    visit_gap2,
    visit_gap,
    primary_equity,
    chronic_follow_up,
) = (dfs[sheet] for sheet in sheet_list)

### 1. Social Determinants Hotspotting

In [5]:
# Row and column counts of the Social Determinants Hotspotting sheet
n_rows, n_cols = social_hotspot.shape
print(f'This dataset consists of {n_rows} rows and {n_cols} columns')

This dataset consists of 20590 rows and 4 columns


In [6]:
# Column names of the Social Determinants Hotspotting sheet, under a heading line
print('The Column list:', list(social_hotspot.columns), sep='\n')

The Column list:
['Client ID', 'State', 'Housing Stability', 'Transportation Access']


In [18]:
# Per-column dtype and non-null count, plus memory usage, for the
# Social Determinants Hotspotting sheet
social_hotspot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20590 entries, 0 to 20589
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Client ID              20590 non-null  object
 1   State                  20524 non-null  object
 2   Housing Stability      20590 non-null  object
 3   Transportation Access  20590 non-null  object
dtypes: object(4)
memory usage: 643.6+ KB


In [7]:
# Every column in Social Determinants Hotspotting is non-numerical (object dtype),
# so request the object summary explicitly (count / unique / top / freq) instead
# of relying on describe()'s fallback when no numeric column exists. This matches
# how the non-numerical summaries of the other sheets are produced.
social_hotspot.describe(include = 'object')

Unnamed: 0,Client ID,State,Housing Stability,Transportation Access
count,20590,20524,20590,20590
unique,13494,20,2,2
top,5671C10B45254A2C3801483EB42B1582,OH,Y,Y
freq,3,20477,20532,20587


In [8]:
# Number of rows whose Client ID repeats an earlier row.
# duplicated() flags every occurrence after the first, so this counts the
# extra rows, not the number of distinct IDs that are repeated.
social_hotspot["Client ID"].duplicated().sum()

np.int64(7096)

**Out of 20590 rows in the Social Determinants Hotspotting tab, 7096 are repeat occurrences of a client ID that already appeared (13494 unique client IDs)**

In [10]:
# Client IDs that occur more than once, with their occurrence counts
id_counts = social_hotspot['Client ID'].value_counts()
duplicates_only = id_counts[id_counts > 1]
print(duplicates_only)

Client ID
5671C10B45254A2C3801483EB42B1582    3
6739941CBF29485E7E1EB7011AE7F32A    3
3ABA9A30D126D02FC7AB57AFA228AAD0    3
4C4A8149322709CCAA77C10DF0DD0284    3
85092126D549E35FA6CC2C2AEC685348    3
                                   ..
4F6B340E78FE8033EFB61052BD7EF33C    2
65D1C6CF21D1C4B374AE4403660A494E    2
3C89307AA754E35CF08F7E4DA123DA27    2
9B680B0A1890DDBD0AE752B59B8F0CC1    2
FEB44A77DFF4A68535F77CA9967C11E2    2
Name: count, Length: 7090, dtype: int64


In [12]:
# Missing values per column; show only the columns that have at least one null
c_null = social_hotspot.isna().sum()
c_null.loc[c_null.gt(0)]

State    66
dtype: int64

***In the State column, 66 null values were found***

### 2. High-Risk Client Prediction

In [40]:
# Row and column counts of the High-Risk Client Prediction sheet
n_rows, n_cols = high_risk_prediction.shape
print(f'This dataset consists of {n_rows} rows and {n_cols} columns')

This dataset consists of 2128 rows and 5 columns


In [43]:
# Column names of the High-Risk Client Prediction sheet, under a heading line
print('The Column list:', list(high_risk_prediction.columns), sep='\n')

The Column list:
['Client ID', 'Age Group', 'Count of Chronic Conditions', 'Count of Hospitalization Visits', 'Count of ED Visit']


In [41]:
# Per-column dtype and non-null count, plus memory usage, for the
# High-Risk Client Prediction sheet
high_risk_prediction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2128 entries, 0 to 2127
Data columns (total 5 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Client ID                        2128 non-null   object
 1   Age Group                        2128 non-null   object
 2   Count of Chronic Conditions      2128 non-null   int64 
 3   Count of Hospitalization Visits  2128 non-null   int64 
 4   Count of ED Visit                2128 non-null   int64 
dtypes: int64(3), object(2)
memory usage: 83.2+ KB


In [46]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numerical columns.
# NOTE(review): in the recorded output the maxima sit far above the 75th
# percentiles, which suggests extreme outliers; confirm against the source data.
high_risk_prediction.describe()

Unnamed: 0,Count of Chronic Conditions,Count of Hospitalization Visits,Count of ED Visit
count,2128.0,2128.0,2128.0
mean,7.948778,13.413534,2.643327
std,20.334086,27.477341,7.196689
min,0.0,0.0,0.0
25%,0.0,2.0,0.0
50%,1.0,6.0,0.0
75%,8.0,14.0,2.0
max,324.0,485.0,122.0


In [17]:
# Summary (count, unique, top, freq) for the non-numerical (object) columns
high_risk_prediction.describe(include = 'object')

Unnamed: 0,Client ID,Age Group
count,2128,2128
unique,1234,5
top,D2450E775D264802F1840A16DF41064A,Under 18
freq,5,714


In [48]:
# Number of rows whose Client ID repeats an earlier row
# (duplicated() flags every occurrence after the first)
high_risk_prediction["Client ID"].duplicated().sum()

np.int64(894)

**Out of 2128 client IDs, 894 duplicates were found in this tab**

In [19]:
# Missing values per column; show only the columns that have at least one null
h_null = high_risk_prediction.isna().sum()
h_null.loc[h_null.gt(0)]

Series([], dtype: int64)

***There is no null value***

### 3. Care Coordination Opportun (2)

In [20]:
# Row and column counts of the ' Care Coordination Opportun (2)' sheet
n_rows, n_cols = care_op2.shape
print(f'This dataset consists of {n_rows} rows and {n_cols} columns')

This dataset consists of 51369 rows and 13 columns


In [21]:
# Column names of the ' Care Coordination Opportun (2)' sheet, under a heading line
print('The Column list:', list(care_op2.columns), sep='\n')


The Column list:
['client_id', 'AgeGroup', 'ProgramACount', 'ProgramBCount', 'ProgramCCount', 'ProgramDCount', 'ProgramECount', 'ProgramFCount', 'ProgramGCount', 'ProgramHCount', 'ProgramICount', 'ProgramJCount', 'LastServiceDate']


In [22]:
# Per-column dtype and non-null count, plus memory usage, for the
# ' Care Coordination Opportun (2)' sheet
care_op2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51369 entries, 0 to 51368
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   client_id        51369 non-null  object
 1   AgeGroup         51369 non-null  object
 2   ProgramACount    51369 non-null  int64 
 3   ProgramBCount    51369 non-null  int64 
 4   ProgramCCount    51369 non-null  int64 
 5   ProgramDCount    51369 non-null  int64 
 6   ProgramECount    51369 non-null  int64 
 7   ProgramFCount    51369 non-null  int64 
 8   ProgramGCount    51369 non-null  int64 
 9   ProgramHCount    51369 non-null  int64 
 10  ProgramICount    51369 non-null  int64 
 11  ProgramJCount    51369 non-null  int64 
 12  LastServiceDate  51369 non-null  object
dtypes: int64(10), object(3)
memory usage: 5.1+ MB


In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numerical Program*Count columns
care_op2.describe()

Unnamed: 0,ProgramACount,ProgramBCount,ProgramCCount,ProgramDCount,ProgramECount,ProgramFCount,ProgramGCount,ProgramHCount,ProgramICount,ProgramJCount
count,51369.0,51369.0,51369.0,51369.0,51369.0,51369.0,51369.0,51369.0,51369.0,51369.0
mean,3.393272,0.588331,0.081781,0.017287,0.00438,0.001207,0.000701,0.000389,0.000292,0.000273
std,19.544574,5.062565,0.956655,0.361224,0.164499,0.096659,0.076418,0.062708,0.049329,0.045425
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,791.0,199.0,75.0,30.0,19.0,17.0,14.0,11.0,10.0,9.0


In [24]:
# Summary (count, unique, top, freq) for the non-numerical (object) columns.
# NOTE(review): the recorded output shows LastServiceDate with only 6 unique
# values and 'No visits' as the most frequent, so it looks like a bucketed
# label rather than a real date; confirm before parsing it as a date.
care_op2.describe(include = 'object')

Unnamed: 0,client_id,AgeGroup,LastServiceDate
count,51369,51369,51369
unique,51369,6,6
top,000115FC0570FC69983D6AFD07BD138E,Under 18,No visits
freq,1,25299,34245


In [25]:
# Number of rows whose client_id repeats an earlier row
# (duplicated() flags every occurrence after the first)
care_op2["client_id"].duplicated().sum()

np.int64(0)

***In this tab, there are no duplicate client IDs***

In [45]:
# Missing values per column; show only the columns that have at least one null.
# NOTE: this rebinds c_null, the same name the Social Determinants Hotspotting
# null check assigns, so after this cell c_null describes care_op2.
c_null = care_op2.isna().sum()
c_null.loc[c_null.gt(0)]

Series([], dtype: int64)

***In this tab, there are no null values***

### 4. Preventive Visit Gaps (2)

In [27]:
# Row and column counts of the 'Preventive Visit Gaps (2)' sheet
n_rows, n_cols = visit_gap2.shape
print(f'This dataset consists of {n_rows} rows and {n_cols} columns')

This dataset consists of 2379 rows and 5 columns


In [28]:
# Column names of the 'Preventive Visit Gaps (2)' sheet, under a heading line
print('The Column list:', list(visit_gap2.columns), sep='\n')

The Column list:
['client_id', 'AgeGroup', 'LastAnnualVisit', 'countOfChronicCondition', 'upcomingvisit']


In [60]:
# Per-column dtype and non-null count, plus memory usage, for the
# 'Preventive Visit Gaps (2)' sheet
visit_gap2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2379 entries, 0 to 2378
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   client_id                2379 non-null   object
 1   AgeGroup                 2379 non-null   object
 2   LastAnnualVisit          2379 non-null   object
 3   countOfChronicCondition  2379 non-null   int64 
 4   upcomingvisit            2379 non-null   object
dtypes: int64(1), object(4)
memory usage: 93.1+ KB


In [61]:
# Summary (count, unique, top, freq) for the non-numerical (object) columns
visit_gap2.describe(include = 'object')

Unnamed: 0,client_id,AgeGroup,LastAnnualVisit,upcomingvisit
count,2379,2379,2379,2379
unique,2358,6,2,2
top,B24DD235E72A894898230CE5EE49183E,22–35,Y,N
freq,2,740,2122,2192


In [62]:
# Summary statistics (count, mean, std, min, quartiles, max) for the only
# numerical column, countOfChronicCondition
visit_gap2.describe()

Unnamed: 0,countOfChronicCondition
count,2379.0
mean,0.383775
std,1.310919
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,18.0


In [63]:
# Number of rows whose client_id repeats an earlier row
# (duplicated() flags every occurrence after the first)
visit_gap2["client_id"].duplicated().sum()

np.int64(21)

***In this tab, out of 2379 client IDs, 21 duplicates were found***

In [46]:
# Missing values per column; show only the columns that have at least one null
v_null = visit_gap2.isna().sum()
v_null.loc[v_null.gt(0)]

Series([], dtype: int64)

***In this tab, there are no null values***

### 5. Primary care access equity

In [37]:
# Row and column counts of the 'Primary care access equity' sheet
n_rows, n_cols = primary_equity.shape
print(f'This dataset consists of {n_rows} rows and {n_cols} columns')

This dataset consists of 1711 rows and 5 columns


In [54]:
# Column names of the 'Primary care access equity' sheet, under a heading line
print('The Column list:', list(primary_equity.columns), sep='\n')

The Column list:
['client_id', 'AgeGroup', 'State', 'Visit Count with PCP', 'InsuranceStatus']


In [55]:
# Per-column dtype and non-null count, plus memory usage, for the
# 'Primary care access equity' sheet
primary_equity.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1711 entries, 0 to 1710
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   client_id             1711 non-null   object
 1   AgeGroup              1701 non-null   object
 2   State                 1711 non-null   object
 3   Visit Count with PCP  1711 non-null   int64 
 4   InsuranceStatus       1711 non-null   object
dtypes: int64(1), object(4)
memory usage: 67.0+ KB


In [56]:
# Summary statistics (count, mean, std, min, quartiles, max) for the only
# numerical column, 'Visit Count with PCP'
primary_equity.describe()

Unnamed: 0,Visit Count with PCP
count,1711.0
mean,10.779661
std,16.074897
min,1.0
25%,2.0
50%,5.0
75%,14.0
max,336.0


In [57]:
# Summary (count, unique, top, freq) for the non-numerical (object) columns.
# NOTE(review): the recorded output reports 'OH' as the most frequent AgeGroup
# value (10 unique values). That looks like state codes inside the AgeGroup
# column; verify the sheet's column alignment before using AgeGroup.
primary_equity.describe(include = 'object')

Unnamed: 0,client_id,AgeGroup,State,InsuranceStatus
count,1711,1701,1711,1711
unique,1699,10,8,2
top,5671C10B45254A2C3801483EB42B1582,OH,OH,Insured
freq,2,1146,530,1707


In [58]:
# Number of rows whose client_id repeats an earlier row
# (duplicated() flags every occurrence after the first)
primary_equity["client_id"].duplicated().sum()

np.int64(12)

**Out of 1711 client IDs, 12 duplicates were found**

In [32]:
# Missing values per column; show only the columns that have at least one null
p_null = primary_equity.isna().sum()
p_null.loc[p_null.gt(0)]

AgeGroup    10
dtype: int64

**10 null values were found in the AgeGroup column**

### 6. Chronic Disease Follow-up Optim

In [39]:
# Row and column counts of the 'Chronic Disease Follow-up Optim' sheet
n_rows, n_cols = chronic_follow_up.shape
print(f'This dataset consists of {n_rows} rows and {n_cols} columns')


This dataset consists of 1149 rows and 6 columns


In [49]:
# Column names of the 'Chronic Disease Follow-up Optim' sheet, under a heading line
print('The Column list:', list(chronic_follow_up.columns), sep='\n')

The Column list:
['Client ID', 'Age Group', 'Provider ID', 'Last Follow-Up Date', 'Visit Frequency Days (2025)', 'Has Chronic Condition']


In [50]:
# Per-column dtype and non-null count, plus memory usage, for the
# 'Chronic Disease Follow-up Optim' sheet
chronic_follow_up.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149 entries, 0 to 1148
Data columns (total 6 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Client ID                    1149 non-null   object
 1   Age Group                    1149 non-null   object
 2   Provider ID                  1098 non-null   object
 3   Last Follow-Up Date          1149 non-null   object
 4   Visit Frequency Days (2025)  1149 non-null   int64 
 5   Has Chronic Condition        1149 non-null   object
dtypes: int64(1), object(5)
memory usage: 54.0+ KB


In [51]:
# Summary statistics (count, mean, std, min, quartiles, max) for the only
# numerical column, 'Visit Frequency Days (2025)'
chronic_follow_up.describe()

Unnamed: 0,Visit Frequency Days (2025)
count,1149.0
mean,6.395126
std,14.464682
min,1.0
25%,1.0
50%,2.0
75%,4.0
max,160.0


In [52]:
# Summary (count, unique, top, freq) for the non-numerical (object) columns.
# NOTE(review): the recorded output shows 'Last Follow-Up Date' with only 2
# unique values and 'Y' as the most frequent, so it appears to hold a flag
# rather than a date; confirm its meaning with the data owner.
chronic_follow_up.describe(include = 'object')

Unnamed: 0,Client ID,Age Group,Provider ID,Last Follow-Up Date,Has Chronic Condition
count,1149,1149,1098,1149,1149
unique,847,5,22,2,2
top,A637F0E042D24FE6C3C3D85BEBF01BFA,Over 45,8AC43464663F94DD38239FB5F17C3B2E,Y,N
freq,7,413,460,964,1139


In [53]:
# Number of rows whose Client ID repeats an earlier row
# (duplicated() flags every occurrence after the first)
chronic_follow_up["Client ID"].duplicated().sum()

np.int64(302)

**Out of 1149 client IDs, 302 duplicates were found**

In [33]:
# Missing values per column; show only the columns that have at least one null
f_null = chronic_follow_up.isna().sum()
f_null.loc[f_null.gt(0)]

Provider ID    51
dtype: int64

**51 null values were found in the Provider ID column**

### 7. High_Risk Client Prediction NEW

In [38]:
# Load the separate "High_Risk Client Prediction NEW" CSV file; unlike the
# sheets above, this one is not read from the Excel workbook.
hc_new = pd.read_csv("High_Risk Client Prediction NEW.csv")

In [39]:
# Row and column counts of the High_Risk Client Prediction NEW dataset
n_rows, n_cols = hc_new.shape
print(f'This dataset consists of {n_rows} rows and {n_cols} columns')

This dataset consists of 1489 rows and 5 columns


In [40]:
# Column names of the High_Risk Client Prediction NEW dataset, under a heading line
print('The Column list:', list(hc_new.columns), sep='\n')

The Column list:
['Client ID', 'Age Group', 'Count of Chronic Conditions', 'Count of Hospitalization Visits', 'Count of ED Visit']


In [41]:
# Per-column dtype and non-null count, plus memory usage, for the
# High_Risk Client Prediction NEW dataset
hc_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1489 entries, 0 to 1488
Data columns (total 5 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Client ID                        1489 non-null   object
 1   Age Group                        1489 non-null   object
 2   Count of Chronic Conditions      1489 non-null   int64 
 3   Count of Hospitalization Visits  1489 non-null   int64 
 4   Count of ED Visit                1489 non-null   int64 
dtypes: int64(3), object(2)
memory usage: 58.3+ KB


In [42]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numerical columns
hc_new.describe()

Unnamed: 0,Count of Chronic Conditions,Count of Hospitalization Visits,Count of ED Visit
count,1489.0,1489.0,1489.0
mean,1.726662,8.303559,1.038952
std,0.843919,13.907016,2.778777
min,1.0,1.0,0.0
25%,1.0,2.0,0.0
50%,2.0,4.0,0.0
75%,2.0,9.0,1.0
max,6.0,246.0,61.0


In [43]:
# Summary (count, unique, top, freq) for the non-numerical (object) columns;
# comparing `unique` with `count` for Client ID shows whether IDs repeat
hc_new.describe(include = 'object')

Unnamed: 0,Client ID,Age Group
count,1489,1489
unique,1489,5
top,001036E8EDD1937665BAAAD27D9569F3,Under 18
freq,1,580


Every client ID in this dataset is **unique**: all 1489 rows have distinct client IDs.

In [44]:
# Missing values per column; show only the columns that have at least one null
hc_null = hc_new.isna().sum()
hc_null.loc[hc_null.gt(0)]

Series([], dtype: int64)

***In this dataset, there are no null values***