Notebook này chạy trên local.

# Healthcare Dataset (2019-2024)

## Exploring Data

In [1]:
import os
# Set KAGGLE_CONFIG_DIR to the current directory.
# NOTE(review): presumably this is where the Kaggle client looks for its
# credentials file (kaggle.json) — confirm, and keep that file out of version control.
os.environ['KAGGLE_CONFIG_DIR'] = '.'

In [2]:
from opendatasets import download_kaggle_dataset

In [3]:
# Kaggle dataset to fetch and the directory to download it into
# ('.' = the notebook's current working directory).
dataset_url = 'https://www.kaggle.com/datasets/muhammadehsan000/healthcare-dataset-2019-2024'
data_dir = '.'
download_kaggle_dataset(dataset_url, data_dir)

Skipping, found downloaded files in ".\healthcare-dataset-2019-2024" (use force=True to force download)


In [4]:
import pandas as pd
raw_df = pd.read_csv('healthcare-dataset-2019-2024/healthcare_dataset.csv')

In [5]:
raw_df

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.782410,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrIENNE bEll,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55495,eLIZABeTH jaCkSOn,42,Female,O+,Asthma,2020-08-16,Joshua Jarvis,Jones-Thompson,Blue Cross,2650.714952,417,Elective,2020-09-15,Penicillin,Abnormal
55496,KYle pEREz,61,Female,AB-,Obesity,2020-01-23,Taylor Sullivan,Tucker-Moyer,Cigna,31457.797307,316,Elective,2020-02-01,Aspirin,Normal
55497,HEATher WaNG,38,Female,B+,Hypertension,2020-07-13,Joe Jacobs DVM,"and Mahoney Johnson Vasquez,",UnitedHealthcare,27620.764717,347,Urgent,2020-08-10,Ibuprofen,Abnormal
55498,JENniFER JOneS,43,Male,O-,Arthritis,2019-05-25,Kimberly Curry,"Jackson Todd and Castro,",Medicare,32451.092358,321,Elective,2019-05-31,Ibuprofen,Abnormal


In [6]:
raw_df.isna().sum()

Name                  0
Age                   0
Gender                0
Blood Type            0
Medical Condition     0
Date of Admission     0
Doctor                0
Hospital              0
Insurance Provider    0
Billing Amount        0
Room Number           0
Admission Type        0
Discharge Date        0
Medication            0
Test Results          0
dtype: int64

In [7]:
raw_df['Test Results'].value_counts()

Test Results
Abnormal        18627
Normal          18517
Inconclusive    18356
Name: count, dtype: int64

### Age

In [8]:
raw_df['Age'].describe()

count    55500.000000
mean        51.539459
std         19.602454
min         13.000000
25%         35.000000
50%         52.000000
75%         68.000000
max         89.000000
Name: Age, dtype: float64

In [9]:
# Bucket ages into five ordered groups. pd.cut uses right-closed intervals by
# default, so the buckets are (-1, 14], (14, 18], (18, 30], (30, 60], (60, 100].
raw_df['AgeGroup'] = pd.cut(raw_df['Age'],
                            bins=[-1,14,18,30,60,100],
                            labels=['Child','Teenager','Adult','OldAdult','Old'])

In [10]:
raw_df['AgeGroup'].value_counts()

AgeGroup
OldAdult    24631
Old         20370
Adult        9611
Teenager      856
Child          32
Name: count, dtype: int64

In [11]:
raw_df.drop(columns='Age', inplace=True)

In [12]:
def view_accurary(columns: str, df=None):
    """Share of each 'Test Results' outcome within every category of a column.

    Parameters
    ----------
    columns : str
        Name of the column whose categories are inspected.
    df : pandas.DataFrame, optional
        Frame to analyse. It must contain ``columns`` and 'Test Results'.
        Defaults to the notebook-level ``raw_df``.

    Returns
    -------
    tuple of pandas.Series
        ``(abnormal_percent, normal_percent, inconclusive_percent)``. Each
        series is indexed by the categories of ``columns`` in the same order
        (that of ``value_counts()`` on the whole frame) and holds the
        percentage of rows in that category with the given outcome.
    """
    frame = raw_df if df is None else df
    total = frame[columns].value_counts()

    def _percent(outcome):
        counts = frame.loc[frame['Test Results'] == outcome, columns].value_counts()
        # Align to the overall category order and treat categories that never
        # show this outcome as 0 rather than NaN.
        return counts.reindex(total.index, fill_value=0) / total * 100

    return _percent('Abnormal'), _percent('Normal'), _percent('Inconclusive')

In [13]:
view_accurary('AgeGroup')

(AgeGroup
 OldAdult    33.681134
 Old         33.637703
 Adult       33.180730
 Teenager    32.359813
 Child       40.625000
 Name: count, dtype: float64,
 AgeGroup
 OldAdult    33.177703
 Old         33.087874
 Adult       34.231610
 Teenager    35.514019
 Child       34.375000
 Name: count, dtype: float64,
 AgeGroup
 OldAdult    33.141164
 Old         33.274423
 Adult       32.587660
 Teenager    32.126168
 Child       25.000000
 Name: count, dtype: float64)

In [14]:
low_feature_cols = []
low_feature_cols.append('AgeGroup')

### Gender

In [15]:
raw_df['Gender'].value_counts()

Gender
Male      27774
Female    27726
Name: count, dtype: int64

In [16]:
view_accurary('Gender')

(Gender
 Female    33.679579
 Male      33.444949
 Name: count, dtype: float64,
 Gender
 Male      33.646576
 Female    33.080863
 Name: count, dtype: float64,
 Gender
 Female    33.239559
 Male      32.908476
 Name: count, dtype: float64)

In [17]:
low_feature_cols.append('Gender')

### Blood Type

In [18]:
raw_df['Blood Type'].value_counts()

Blood Type
A-     6969
A+     6956
AB+    6947
AB-    6945
B+     6945
B-     6944
O+     6917
O-     6877
Name: count, dtype: int64

In [19]:
view_accurary('Blood Type')

(Blood Type
 A+     33.539390
 A-     33.519874
 AB+    33.222974
 AB-    33.592513
 B+     33.160547
 B-     33.813364
 O+     33.930895
 O-     33.721099
 Name: count, dtype: float64,
 Blood Type
 A+     32.964347
 A-     33.505524
 AB+    33.309342
 AB-    34.010079
 B+     32.771778
 B-     33.525346
 O+     33.367067
 O-     33.459357
 Name: count, dtype: float64,
 Blood Type
 A+     33.496262
 A-     32.974602
 AB+    33.467684
 AB-    32.397408
 B+     34.067675
 B-     32.661290
 O+     32.702038
 O-     32.819543
 Name: count, dtype: float64)

In [20]:
low_feature_cols.append('Blood Type')

### Medical Condition

In [21]:
raw_df['Medical Condition'].value_counts()

Medical Condition
Arthritis       9308
Diabetes        9304
Hypertension    9245
Obesity         9231
Cancer          9227
Asthma          9185
Name: count, dtype: int64

In [22]:
view_accurary('Medical Condition')

(Medical Condition
 Arthritis       34.250107
 Asthma          32.759935
 Cancer          33.792132
 Diabetes        34.049871
 Hypertension    32.579773
 Obesity         33.929152
 Name: count, dtype: float64,
 Medical Condition
 Arthritis       32.574130
 Asthma          34.262384
 Cancer          33.044326
 Diabetes        33.211522
 Hypertension    33.985938
 Obesity         33.116672
 Name: count, dtype: float64,
 Medical Condition
 Arthritis       33.175763
 Asthma          32.977681
 Cancer          33.163542
 Diabetes        32.738607
 Hypertension    33.434289
 Obesity         32.954176
 Name: count, dtype: float64)

In [23]:
low_feature_cols.append('Medical Condition')

### Doctor

In [24]:
raw_df['Doctor'].value_counts()

Doctor
Michael Smith           27
Robert Smith            22
John Smith              22
Michael Johnson         20
James Smith             20
                        ..
Audrey Zimmerman DDS     1
Justin Banks             1
Joseph Williams Jr.      1
Jason Franklin           1
Jeffrey Moore            1
Name: count, Length: 40341, dtype: int64

In [25]:
# Count rows per doctor, then keep the names that occur at least 5 times
# (order follows value_counts, i.e. most frequent first).
doctors = raw_df['Doctor'].value_counts()
handle_many_doctors = doctors[doctors >= 5].index.tolist()

In [26]:
len(handle_many_doctors)

576

In [27]:
# Flag rows whose doctor is in handle_many_doctors. isin does a vectorised
# membership test instead of scanning the Python list once per row.
raw_df['Is Handle Many Doctor'] = raw_df['Doctor'].isin(handle_many_doctors).map({True: 'Yes', False: 'No'})

In [28]:
raw_df['Is Handle Many Doctor'].value_counts()

Is Handle Many Doctor
No     51598
Yes     3902
Name: count, dtype: int64

In [29]:
view_accurary('Is Handle Many Doctor')

(Is Handle Many Doctor
 No     33.536184
 Yes    33.905689
 Name: count, dtype: float64,
 Is Handle Many Doctor
 No     33.284236
 Yes    34.418247
 Name: count, dtype: float64,
 Is Handle Many Doctor
 No     33.179581
 Yes    31.676064
 Name: count, dtype: float64)

In [30]:
low_feature_cols.append('Is Handle Many Doctor')

In [31]:
raw_df.drop(columns='Doctor', inplace=True)

### Hospital

In [32]:
raw_df['Hospital'].value_counts()

Hospital
LLC Smith                      44
Ltd Smith                      39
Johnson PLC                    38
Smith Ltd                      37
Smith PLC                      36
                               ..
and Montoya Flores, Boyer       1
Carter and Dunn King,           1
Hall, Brown Black and           1
Peterson Scott and Thomas,      1
Moreno Murphy, Griffith and     1
Name: count, Length: 39876, dtype: int64

In [33]:
# Count rows per hospital, then collect the names seen at least 4 times.
hospitals = raw_df['Hospital'].value_counts()
has_many_patients_hospital = [name for name, n_rows in hospitals.items() if n_rows >= 4]

In [34]:
# Flag rows whose hospital is in has_many_patients_hospital. isin does a
# vectorised membership test instead of scanning the Python list once per row.
raw_df['Is Many Patients Hospital'] = raw_df['Hospital'].isin(has_many_patients_hospital).map({True: 'Yes', False: 'No'})

In [35]:
raw_df['Is Many Patients Hospital'].value_counts()

Is Many Patients Hospital
No     46449
Yes     9051
Name: count, dtype: int64

In [36]:
view_accurary('Is Many Patients Hospital')

(Is Many Patients Hospital
 No     33.453896
 Yes    34.117777
 Name: count, dtype: float64,
 Is Many Patients Hospital
 No     33.313957
 Yes    33.620594
 Name: count, dtype: float64,
 Is Many Patients Hospital
 No     33.232147
 Yes    32.261629
 Name: count, dtype: float64)

In [37]:
low_feature_cols.append('Is Many Patients Hospital')

In [38]:
raw_df.drop(columns='Hospital', inplace=True)

### Insurance Provider

In [39]:
raw_df['Insurance Provider'].value_counts()

Insurance Provider
Cigna               11249
Medicare            11154
UnitedHealthcare    11125
Blue Cross          11059
Aetna               10913
Name: count, dtype: int64

In [40]:
view_accurary('Insurance Provider')

(Insurance Provider
 Cigna               33.843008
 Medicare            33.925049
 UnitedHealthcare    33.860674
 Blue Cross          32.896284
 Aetna               33.272244
 Name: count, dtype: float64,
 Insurance Provider
 Aetna               33.189774
 Blue Cross          33.357446
 Cigna               33.105165
 Medicare            33.333333
 UnitedHealthcare    33.833708
 Name: count, dtype: float64,
 Insurance Provider
 Aetna               33.537982
 Blue Cross          33.746270
 Cigna               33.051827
 Medicare            32.741617
 UnitedHealthcare    32.305618
 Name: count, dtype: float64)

In [41]:
low_feature_cols.append('Insurance Provider')

### Room Number

In [42]:
raw_df['Room Number'].value_counts()

Room Number
393    181
491    177
104    175
420    175
209    171
      ... 
253    112
254    111
257    111
381    110
398    109
Name: count, Length: 400, dtype: int64

In [43]:
raw_df.drop(columns='Room Number', inplace=True)

### Admission Type

In [44]:
raw_df['Admission Type'].value_counts()

Admission Type
Elective     18655
Urgent       18576
Emergency    18269
Name: count, dtype: int64

In [45]:
view_accurary('Admission Type')

(Admission Type
 Elective     33.749665
 Urgent       33.591731
 Emergency    33.340632
 Name: count, dtype: float64,
 Admission Type
 Elective     33.454838
 Urgent       33.214901
 Emergency    33.422738
 Name: count, dtype: float64,
 Admission Type
 Elective     32.795497
 Emergency    33.236630
 Urgent       33.193368
 Name: count, dtype: float64)

In [46]:
low_feature_cols.append('Admission Type')

### Medication

In [47]:
raw_df['Medication'].value_counts()

Medication
Lipitor        11140
Ibuprofen      11127
Aspirin        11094
Paracetamol    11071
Penicillin     11068
Name: count, dtype: int64

In [48]:
view_accurary('Medication')

(Medication
 Aspirin        33.639805
 Ibuprofen      33.674845
 Lipitor        33.276481
 Paracetamol    33.754855
 Penicillin     33.465847
 Name: count, dtype: float64,
 Medication
 Aspirin        33.693889
 Ibuprofen      33.566999
 Lipitor        33.096948
 Paracetamol    32.996116
 Penicillin     33.465847
 Name: count, dtype: float64,
 Medication
 Aspirin        32.666306
 Ibuprofen      32.758156
 Lipitor        33.626571
 Paracetamol    33.249029
 Penicillin     33.068305
 Name: count, dtype: float64)

In [49]:
low_feature_cols.append('Medication')

### Number of Days (Length of Stay)

In [50]:
raw_df[['Date of Admission', 'Discharge Date']]

Unnamed: 0,Date of Admission,Discharge Date
0,2024-01-31,2024-02-02
1,2019-08-20,2019-08-26
2,2022-09-22,2022-10-07
3,2020-11-18,2020-12-18
4,2022-09-19,2022-10-09
...,...,...
55495,2020-08-16,2020-09-15
55496,2020-01-23,2020-02-01
55497,2020-07-13,2020-08-10
55498,2019-05-25,2019-05-31


In [51]:
raw_df['Year'] = pd.to_datetime(raw_df['Discharge Date']).dt.year

In [52]:
# Length of stay in whole days: discharge date minus admission date.
raw_df['Num Of Days In'] = (pd.to_datetime(raw_df['Discharge Date']) - pd.to_datetime(raw_df['Date of Admission'])).dt.days

In [53]:
raw_df['Num Of Days In']

0         2
1         6
2        15
3        30
4        20
         ..
55495    30
55496     9
55497    28
55498     6
55499    27
Name: Num Of Days In, Length: 55500, dtype: int64

In [54]:
raw_df.drop(columns=['Date of Admission', 'Discharge Date'], inplace=True)

In [55]:
raw_df.drop(columns='Name', inplace=True)

In [56]:
raw_df

Unnamed: 0,Gender,Blood Type,Medical Condition,Insurance Provider,Billing Amount,Admission Type,Medication,Test Results,AgeGroup,Is Handle Many Doctor,Is Many Patients Hospital,Year,Num Of Days In
0,Male,B-,Cancer,Blue Cross,18856.281306,Urgent,Paracetamol,Normal,Adult,Yes,Yes,2024,2
1,Male,A+,Obesity,Medicare,33643.327287,Emergency,Ibuprofen,Inconclusive,Old,No,Yes,2019,6
2,Female,A-,Obesity,Aetna,27955.096079,Emergency,Aspirin,Normal,Old,No,Yes,2022,15
3,Female,O+,Diabetes,Medicare,37909.782410,Elective,Ibuprofen,Abnormal,Adult,No,No,2020,30
4,Female,AB+,Cancer,Aetna,14238.317814,Urgent,Penicillin,Abnormal,OldAdult,No,No,2022,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...
55495,Female,O+,Asthma,Blue Cross,2650.714952,Elective,Penicillin,Abnormal,OldAdult,No,No,2020,30
55496,Female,AB-,Obesity,Cigna,31457.797307,Elective,Aspirin,Normal,Old,No,No,2020,9
55497,Female,B+,Hypertension,UnitedHealthcare,27620.764717,Urgent,Ibuprofen,Abnormal,OldAdult,No,No,2020,28
55498,Male,O-,Arthritis,Medicare,32451.092358,Elective,Ibuprofen,Abnormal,OldAdult,No,No,2019,6


## Preprocessing

In [57]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

In [58]:
raw_df

Unnamed: 0,Gender,Blood Type,Medical Condition,Insurance Provider,Billing Amount,Admission Type,Medication,Test Results,AgeGroup,Is Handle Many Doctor,Is Many Patients Hospital,Year,Num Of Days In
0,Male,B-,Cancer,Blue Cross,18856.281306,Urgent,Paracetamol,Normal,Adult,Yes,Yes,2024,2
1,Male,A+,Obesity,Medicare,33643.327287,Emergency,Ibuprofen,Inconclusive,Old,No,Yes,2019,6
2,Female,A-,Obesity,Aetna,27955.096079,Emergency,Aspirin,Normal,Old,No,Yes,2022,15
3,Female,O+,Diabetes,Medicare,37909.782410,Elective,Ibuprofen,Abnormal,Adult,No,No,2020,30
4,Female,AB+,Cancer,Aetna,14238.317814,Urgent,Penicillin,Abnormal,OldAdult,No,No,2022,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...
55495,Female,O+,Asthma,Blue Cross,2650.714952,Elective,Penicillin,Abnormal,OldAdult,No,No,2020,30
55496,Female,AB-,Obesity,Cigna,31457.797307,Elective,Aspirin,Normal,Old,No,No,2020,9
55497,Female,B+,Hypertension,UnitedHealthcare,27620.764717,Urgent,Ibuprofen,Abnormal,OldAdult,No,No,2020,28
55498,Male,O-,Arthritis,Medicare,32451.092358,Elective,Ibuprofen,Abnormal,OldAdult,No,No,2019,6


In [59]:
# Feature columns, grouped by how they are preprocessed below:
# numeric_cols are min-max scaled, categorical_cols are label-encoded.
# 'Year' is in neither list; it is only used later to split train/validation.
numeric_cols = ['Billing Amount', 'Num Of Days In']
categorical_cols = ['Gender', 'Blood Type', 'Medical Condition',
                    'Insurance Provider', 'Admission Type',
                    'Medication', 'AgeGroup',
                    'Is Handle Many Doctor',
                    'Is Many Patients Hospital']

In [60]:
scaler = MinMaxScaler()

In [61]:
# Fit the scaler on the training years only (Year <= 2023, the same cut-off
# used for the train/validation split below) so validation rows do not leak
# into the min/max statistics. Scaled validation values may therefore fall
# slightly outside [0, 1].
scaler.fit(raw_df.loc[raw_df['Year'] <= 2023, numeric_cols])

In [62]:
raw_df[numeric_cols] = scaler.transform(raw_df[numeric_cols])

In [63]:
raw_df[numeric_cols].describe()

Unnamed: 0,Billing Amount,Num Of Days In
count,55500.0,55500.0
mean,0.502947,0.500311
std,0.259462,0.298607
min,0.0,0.0
25%,0.278418,0.241379
50%,0.502924,0.482759
75%,0.727168,0.758621
max,1.0,1.0


In [64]:
from sklearn.preprocessing import LabelEncoder

In [65]:
# Replace every categorical feature with integer codes, using a fresh
# encoder per column so each column gets its own label-to-code mapping.
for col in categorical_cols:
    encoder = LabelEncoder().fit(raw_df[col])
    raw_df[col] = encoder.transform(raw_df[col])

In [66]:
input_cols = numeric_cols + categorical_cols
target_col = 'Test Results'

In [67]:
# Encode the three target labels as integer class ids.
# Series.map yields NaN for values that are not keys of the dict, so running
# this cell a second time on the already-encoded column would blank it out.
raw_df[target_col] = raw_df[target_col].map({'Abnormal': 0,
                                             'Normal': 1,
                                             'Inconclusive': 2})

In [77]:
# Time-based hold-out: rows with Year up to 2023 are used for training,
# rows after 2023 for validation.
train_df = raw_df.loc[raw_df['Year'].le(2023)]
val_df = raw_df.loc[raw_df['Year'].gt(2023)]

In [78]:
train_inputs = train_df[input_cols]
train_targets = train_df[target_col]
val_inputs = val_df[input_cols]
val_targets = val_df[target_col]

In [79]:
train_df

Unnamed: 0,Gender,Blood Type,Medical Condition,Insurance Provider,Billing Amount,Admission Type,Medication,Test Results,AgeGroup,Is Handle Many Doctor,Is Many Patients Hospital,Year,Num Of Days In
1,1,0,5,3,0.650904,1,1,2,2,0,1,2019,0.172414
2,0,1,5,0,0.547053,1,0,1,2,0,1,2022,0.482759
3,0,6,3,3,0.728798,0,1,0,0,0,0,2020,1.000000
4,0,2,2,0,0.296622,2,4,0,3,0,0,2022,0.655172
5,1,0,1,4,0.915667,2,1,1,3,0,0,2023,0.103448
...,...,...,...,...,...,...,...,...,...,...,...,...,...
55493,0,5,0,3,0.108874,1,0,0,3,0,1,2022,0.862069
55495,0,6,1,1,0.085064,0,4,0,3,0,0,2020,1.000000
55496,0,3,5,2,0.611002,0,0,1,2,0,0,2020,0.275862
55497,0,4,4,4,0.540949,2,1,0,3,0,0,2020,0.931034


## Model

In [80]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [81]:
# Candidate models to compare. The tree-based models are stochastic, so they
# get a fixed random_state to make the reported accuracies reproducible
# across kernel restarts.
classifier = {
    'LogisticRegression': LogisticRegression(),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    'XGB': XGBClassifier(random_state=42)
}
# Accumulator for the comparison table: one entry per fitted classifier.
result = {'Classifier': [], 'Accurary': []}

In [82]:
from sklearn.metrics import accuracy_score

In [83]:
# Start from an empty accumulator so that re-running this cell does not
# append duplicate rows to the comparison table.
result = {'Classifier': [], 'Accurary': []}
# Fit every candidate on the training split and record its accuracy on the
# validation split.
for clf_name, clf in classifier.items():
    clf.fit(train_inputs, train_targets)
    acc = accuracy_score(val_targets, clf.predict(val_inputs))
    result['Classifier'].append(clf_name)
    result['Accurary'].append(acc)
result_df = pd.DataFrame(result)
result_df

Unnamed: 0,Classifier,Accurary
0,LogisticRegression,0.321013
1,DecisionTree,0.32636
2,RandomForest,0.334031
3,XGB,0.335193


In [84]:
def test_params(**params):
    """Fit a random forest with the given hyperparameters and report its fit.

    ``params`` are forwarded unchanged to ``RandomForestClassifier``. The
    model is trained on the notebook-level ``train_inputs``/``train_targets``.

    Returns a 3-tuple: accuracy on the training split, accuracy on the
    validation split, and the fitted model's ``feature_importances_`` array.
    """
    forest = RandomForestClassifier(**params).fit(train_inputs, train_targets)
    splits = ((train_inputs, train_targets), (val_inputs, val_targets))
    train_acc, val_acc = (forest.score(features, labels) for features, labels in splits)
    return train_acc, val_acc, forest.feature_importances_

In [85]:
test_params(n_jobs=-1, random_state=42)

(1.0,
 0.34216643421664344,
 array([0.28484209, 0.18862952, 0.03037234, 0.1098848 , 0.09393767,
        0.07591021, 0.05221175, 0.07942636, 0.05442538, 0.01229711,
        0.01806276]))