In [1]:
import pandas as pd
import numpy as np
import pyforest
from summarytools import dfSummary

import warnings
warnings.filterwarnings('ignore')

from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
import pyforest
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
import warnings
warnings.filterwarnings('ignore')


In [2]:
from     datetime   import datetime 

In [3]:
# Allow up to 50 columns on display so the wide encounter table is not truncated.
pd.set_option('display.max_columns', 50)

# Read the raw encounter data from the working directory and preview the first rows.
data = pd.read_csv(filepath_or_buffer='diabetic_data.csv')
data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,?,?,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,11,5,13,2,0,1,648.0,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,?,?,44,1,16,0,0,0,8.0,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,?,?,51,0,8,0,0,0,197.0,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [4]:
# Turn cells that were read as missing into the literal string 'None'. This runs
# before '?' is converted to NaN, so only the '?' placeholders end up as missing.
# NOTE(review): presumably targets the "none" entries of max_glu_serum / A1Cresult - confirm.
data.fillna(value='None', inplace=True)

In [5]:
# Missing entries are written as '?' in this dataset; convert them to real NaN
# so pandas' missing-value tooling recognises them.
data.replace(to_replace='?', value=np.nan, inplace=True)

In [6]:
# Work on an independent copy so the loaded frame `data` stays untouched.
df = data.copy(deep=True)

# 1. Industry Review

# 2. Dataset and Domain

## 2.1. Data Dictionary

| Sl No | Variable Name	| Role | Type | Demographic | Description | Units | Missing Values | 
| ------  | ------  |   ------  |  ------  |  ------  |  ------  |  ------  |  ------  | 
| 1 | encounter_id | ID |  |   | Unique identifier of an encounter |  | no | 
| 2 | patient_nbr | ID |  |   | Unique identifier of a patient  |  | no | 
| 3 | race | Feature | Categorical | Race | Values: Caucasian, Asian, African American, Hispanic, and other  |  | yes |
| 4 | gender | Feature| Categorical| Gender| Values: male, female, and unknown/invalid  |  | no |
| 5 | age | Feature | Categorical | Age | Grouped in 10-year intervals: [0, 10), [10, 20),..., [90, 100)   |  | no | 
| 6 | weight| Feature| Categorical|  |  Weight in pounds.|  |  yes| 
| 7 | admission_type_id| Feature| Categorical |  |  Integer identifier corresponding to 9 distinct values, for example, emergency, urgent, elective, newborn, and not available |  | no | 
| 8 | discharge_disposition_id | Feature| Categorical |  | Integer identifier corresponding to 29 distinct values, for example, discharged to home, expired, and not available  | | no | 
| 9 | admission_source_id | Feature | Categorical |  | Integer identifier corresponding to 21 distinct values, for example, physician referral, emergency room, and transfer from a hospital  |  | no | 
| 10 | time_in_hospital | Feature | Integer |  | Integer   number of days between admission and discharge  | | no  | 
| 11 | payer_code | Feature | Categorical |  | Integer  identifier corresponding to 23 distinct values, for example, Blue Cross/Blue Shield, Medicare, and self-pay |  | yes  | 
| 12 | medical_specialty | Feature | Categorical |  | Integer   identifier of a specialty of the admitting physician, corresponding to 84 distinct values, for example, cardiology, internal medicine, family/general practice, and surgeon |  | yes |
| 13 | num_lab_procedures | Feature | Integer | |  Number of lab tests performed during the encounter |  | no |
| 14 | num_procedures | Feature | Integer |  | Number of procedures (other than lab tests) performed during the encounter |   |  no | 
| 15 | num_medications | Feature | Integer  |  | Number of distinct generic names administered during the encounter |   |  no |  
| 16 | number_outpatient | Feature | Integer  |  | Number of outpatient visits of the patient in the year preceding the encounter |  | no| 
| 17 | number_emergency | Feature | Integer |  |  Number of emergency visits of the patient in the year preceding the encounter |  | no| 
| 18 | number_inpatient | Feature | Integer |  | Number of inpatient visits of the patient in the year preceding the encounter |  | no| 
| 19 | diag_1 | Feature | Categorical |  |  The primary diagnosis (coded as first three digits of ICD9); 848 distinct values |  | yes | 
| 20 | diag_2 | Feature | Categorical |  |  Secondary diagnosis (coded as first three digits of ICD9); 923 distinct values|  | yes| 
| 21 | diag_3 | Feature | Categorical |  |  Additional secondary diagnosis (coded as first three digits of ICD9); 954 distinct values |  | yes | 
| 22 | number_diagnoses | Feature | Integer | |  Number of diagnoses entered to the system	 |  | no | 
| 23 | max_glu_serum | Feature | Categorical | |  Indicates the range of the result or if the test was not taken. Values: >200, >300, normal, and none if not measured |  | no | 
| 24 | A1Cresult | Feature | Categorical | |  Indicates the range of the result or if the test was not taken. Values: >8 if the result was greater than 8%, >7 if the result was greater than 7% but less than 8%, normal if the result was less than 7%, and none if not measured. |  | no | 
| 25 | metformin | Feature | Categorical | |  The feature indicates whether the drug was prescribed or there was a change in the dosage. Values: up if the dosage was increased during the encounter, down if the dosage was decreased, steady if the dosage did not change, and no if the drug was not prescribed	 |  | no | 
| 26 | repaglinide | Feature | Categorical |  | The feature indicates whether the drug was prescribed or there was a change in the dosage. Values: up if the dosage was increased during the encounter, down if the dosage was decreased, steady if the dosage did not change, and no if the drug was not prescribed |  | no | 
| 27 | nateglinide | Feature | Categorical |  | The feature indicates whether the drug was prescribed or there was a change in the dosage. Values: up if the dosage was increased during the encounter, down if the dosage was decreased, steady if the dosage did not change, and no if the drug was not prescribed |  | no | 
| 28 | chlorpropamide | Feature	 | Categorical |  | The feature indicates whether the drug was prescribed or there was a change in the dosage. Values: up if the dosage was increased during the encounter, down if the dosage was decreased, steady if the dosage did not change, and no if the drug was not prescribed |  | no | 
| 29 | glimepiride | Feature | Categorical |  | The feature indicates whether the drug was prescribed or there was a change in the dosage. Values: up if the dosage was increased during the encounter, down if the dosage was decreased, steady if the dosage did not change, and no if the drug was not prescribed |  | no | 
| 30 | acetohexamide | Feature | Categorical |  | The feature indicates whether the drug was prescribed or there was a change in the dosage. Values: up if the dosage was increased during the encounter, down if the dosage was decreased, steady if the dosage did not change, and no if the drug was not prescribed |  | no | 
| 31 | glipizide | Feature | Categorical |  | The feature indicates whether the drug was prescribed or there was a change in the dosage. Values: up if the dosage was increased during the encounter, down if the dosage was decreased, steady if the dosage did not change, and no if the drug was not prescribed |  | no | 
| 32 | glyburide | 	Feature | 	Categorical	 |  | 	The feature indicates whether the drug was prescribed or there was a change in the dosage. Values: up if the dosage was increased during the encounter, down if the dosage was decreased, steady if the dosage did not change, and no if the drug was not prescribed |  | no | 
| 33 | tolbutamide | Feature | Categorical |  | The feature indicates whether the drug was prescribed or there was a change in the dosage. Values: up if the dosage was increased during the encounter, down if the dosage was decreased, steady if the dosage did not change, and no if the drug was not prescribed |  | no | 
| 34 | pioglitazone | Feature | Categorical |  | The feature indicates whether the drug was prescribed or there was a change in the dosage. Values: up if the dosage was increased during the encounter, down if the dosage was decreased, steady if the dosage did not change, and no if the drug was not prescribed |  | no | 
| 35 | rosiglitazone | Feature | Categorical |  | The feature indicates whether the drug was prescribed or there was a change in the dosage. Values: up if the dosage was increased during the encounter, down if the dosage was decreased, steady if the dosage did not change, and no if the drug was not prescribed |  | no | 
| 36 | acarbose | Feature | Categorical |  | The feature indicates whether the drug was prescribed or there was a change in the dosage. Values: up if the dosage was increased during the encounter, down if the dosage was decreased, steady if the dosage did not change, and no if the drug was not prescribed |  | no | 
| 37 | miglitol | Feature | Categorical	 |  | The feature indicates whether the drug was prescribed or there was a change in the dosage. Values: up if the dosage was increased during the encounter, down if the dosage was decreased, steady if the dosage did not change, and no if the drug was not prescribed |  | no | 
| 38 | troglitazone | Feature | Categorical |  | The feature indicates whether the drug was prescribed or there was a change in the dosage. Values: up if the dosage was increased during the encounter, down if the dosage was decreased, steady if the dosage did not change, and no if the drug was not prescribed |  | no | 
| 39 | tolazamide | Feature | Categorical |  | The feature indicates whether the drug was prescribed or there was a change in the dosage. Values: up if the dosage was increased during the encounter, down if the dosage was decreased, steady if the dosage did not change, and no if the drug was not prescribed |  | no | 
| 40 | examide | Feature | Categorical |  | The feature indicates whether the drug was prescribed or there was a change in the dosage. Values: up if the dosage was increased during the encounter, down if the dosage was decreased, steady if the dosage did not change, and no if the drug was not prescribed |  | no | 
| 41| citoglipton | Feature | Categorical |  | The feature indicates whether the drug was prescribed or there was a change in the dosage. Values: up if the dosage was increased during the encounter, down if the dosage was decreased, steady if the dosage did not change, and no if the drug was not prescribed |  | no | 
| 42 | insulin | Feature | Categorical |  | The feature indicates whether the drug was prescribed or there was a change in the dosage. Values: up if the dosage was increased during the encounter, down if the dosage was decreased, steady if the dosage did not change, and no if the drug was not prescribed |  | no | 
| 43 | glyburide-metformin | Feature | Categorical |  | The feature indicates whether the drug was prescribed or there was a change in the dosage. Values: up if the dosage was increased during the encounter, down if the dosage was decreased, steady if the dosage did not change, and no if the drug was not prescribed |  | no | 
| 44 | glipizide-metformin | Feature | Categorical |  | The feature indicates whether the drug was prescribed or there was a change in the dosage. Values: up if the dosage was increased during the encounter, down if the dosage was decreased, steady if the dosage did not change, and no if the drug was not prescribed |  | no | 
| 45 | glimepiride-pioglitazone | Feature | Categorical |  | The feature indicates whether the drug was prescribed or there was a change in the dosage. Values: up if the dosage was increased during the encounter, down if the dosage was decreased, steady if the dosage did not change, and no if the drug was not prescribed |  | no | 
| 46 | metformin-rosiglitazone | Feature | Categorical |  | The feature indicates whether the drug was prescribed or there was a change in the dosage. Values: up if the dosage was increased during the encounter, down if the dosage was decreased, steady if the dosage did not change, and no if the drug was not prescribed |  | no | 
| 47 | metformin-pioglitazone | Feature | Categorical |  | The feature indicates whether the drug was prescribed or there was a change in the dosage. Values: up if the dosage was increased during the encounter, down if the dosage was decreased, steady if the dosage did not change, and no if the drug was not prescribed |  | no | 
| 48 | change | Feature | Categorical |  | Indicates if there was a change in diabetic medications (either dosage or generic name). Values: change and no change |  | no | 
| 49 | diabetesMed | Feature | Categorical |  | Indicates if there was any diabetic medication prescribed. Values: yes and no |  | no | 
| 50 | readmitted | Target | Categorical |  | Days to inpatient readmission. Values: <30 if the patient was readmitted in less than 30 days, >30 if the patient was readmitted in more than 30 days, and No for no record of readmission. |  | no | 

## 2.2.	Variable categorization (count of numeric and categorical)

In [13]:
# Summarise every column: dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      99493 non-null   object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    3197 non-null    object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                61510 non-null   object
 11  medical_specialty         51817 non-null   object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

In [14]:
# Names of the categorical (object-typed) columns.
df.select_dtypes(include=['object']).columns

Index(['race', 'gender', 'age', 'weight', 'payer_code', 'medical_specialty',
       'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [15]:
# Names of the numeric columns.
df.select_dtypes(include=['number']).columns

Index(['encounter_id', 'patient_nbr', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient',
       'number_diagnoses'],
      dtype='object')

## 2.3.	Pre-Processing Data Analysis (count of missing/ null values, redundant columns, etc.)

In [17]:
# Per-column count of missing cells and their share of all rows (2 decimals).
null_count = df.isna().sum()
null_percentage = (null_count / len(df) * 100).round(2)

# Side-by-side table of both measures, one row per column of `df`.
null_summary = pd.concat(
    [
        null_count.rename('Missing Count'),
        null_percentage.rename('Missing Percentage'),
    ],
    axis=1,
)

# Display only the columns that actually have missing values.
null_summary.loc[null_summary['Missing Count'] > 0]

Unnamed: 0,Missing Count,Missing Percentage
race,2273,2.23
weight,98569,96.86
payer_code,40256,39.56
medical_specialty,49949,49.08
diag_1,21,0.02
diag_2,358,0.35
diag_3,1423,1.4


#### Observation:
The dataset contains several columns with missing values. Notably, weight has the highest percentage of missing data at 96.86%, followed by medical_specialty (49.08%) and payer_code (39.56%). Race shows moderate missing data at 2.23%. Columns diag_1, diag_2, and diag_3 have minimal missing values at 0.02%, 0.35%, and 1.40%, respectively. Proper imputation or handling strategies will be required to address these missing values to maintain data integrity.

In [21]:
# Remove the 'weight' column (almost entirely missing per the summary above).
df.drop(columns=['weight'], inplace=True)

### 2.3.1 Unwanted Columns
ID variables do not add value to our analysis and we need to drop them from the dataset.
There are two ID variables, namely: 
* 1) encounter_id
* 2) patient_nbr

In [23]:
# Remove the two identifier columns; they carry no predictive information.
df.drop(columns=['encounter_id', 'patient_nbr'], inplace=True)

### Create a derived column, Discharged to, from the variable discharge_disposition_id

Delete the rows for patients who expired, indicated by the values 11, 12, 18, 19, 20, 21, 25, 26 in the column discharge_disposition_id.

In [25]:
# Groups of discharge_disposition_id codes used to derive 'Discharged to'.
# The groups are kept disjoint so the derived label never depends on the order
# in which the groups are applied.
Home                       = [1, 13]
# Codes 6 and 8 are intentionally absent here: they belong to Home_w_homecare.
Facility                   = [2, 3, 4, 5, 9, 10, 14, 15, 16, 17, 22, 23, 24, 27, 28, 29, 30]
Home_w_homecare            = [6, 8]
# Rows with these codes are dropped as "expired" further down.
# NOTE(review): confirm against the dataset's ID-mapping file that every code in
# this list denotes death; some (e.g. 12, 18, 25, 26) may instead be
# still-patient / null / unmapped codes, which would inflate the expired count.
Expired                    = [11, 12, 18, 19, 20, 21, 25, 26]
Others                     = [7]

### Remove the records pertaining to the patients who expired

In [27]:
# Select the encounters whose disposition code is in the `Expired` group and report how many there are.
expired_mask = df['discharge_disposition_id'].isin(Expired)
expired_patients = df.loc[expired_mask, :]
print("\n%d patients expired" % len(expired_patients))


6335 patients expired


In [28]:
# Drop, in place, every row whose disposition code is in the `Expired` group.
df.drop(index=df.index[df['discharge_disposition_id'].isin(Expired)], inplace=True)

In [29]:
# Placeholder value 999 marks rows whose disposition code has not been mapped yet.
# The column is created with object dtype because it is filled with text labels
# next; writing strings into an int64 column relies on a silent upcast that
# recent pandas versions deprecate.
df['Discharged to'] = pd.Series(999, index=df.index, dtype=object)

In [30]:
# Translate disposition codes into coarse destination labels. The groups are
# applied in this order, so a code listed in two groups takes the later label.
for destination, disposition_ids in (
    ('Facility', Facility),
    ('Home with home care', Home_w_homecare),
    ('Home', Home),
    ('Others', Others),
):
    in_group = df['discharge_disposition_id'].isin(disposition_ids)
    df.loc[in_group, 'Discharged to'] = destination

### After creating the variable, Discharged to, we need to drop the column, discharge_disposition_id

In [32]:
# 'Discharged to' holds text labels (or the 999 placeholder), so filtering it
# against the numeric `Expired` codes can never match anything. The expired
# encounters were already removed through discharge_disposition_id, so verify
# that on the ID column instead of running a filter that silently does nothing.
leftover_expired = int(df['discharge_disposition_id'].isin(Expired).sum())
assert leftover_expired == 0, "%d expired encounters are still present" % leftover_expired

In [33]:
# The code column is now represented by 'Discharged to'; remove it.
df.drop(columns=['discharge_disposition_id'], inplace=True)

### 2.3.2 Check the Target variable
We shall derive the target variable, 'Target' as follows:
* Target value: 0 -- Readmitted NO and Readmitted > 30
* Target value: 1 -- Readmitted < 30

In [35]:
# Boolean masks for the two outcome groups.
not_early_readmission = df['readmitted'].isin(['>30', 'NO'])
early_readmission = df['readmitted'].eq('<30')

# Conditions and the value assigned where each one holds (same order).
conditions = [not_early_readmission, early_readmission]
values = [0, 1]

# np.select takes the first matching condition; rows matching none get its default of 0.
df['Target'] = np.select(conditions, values)

In [36]:
# 'readmitted' has been folded into 'Target', so the source column is removed.
drop_cols = 'readmitted'
df.drop(columns=drop_cols, inplace=True)

### 2.3.3 Checking for the Constant Value Columns

In [38]:
# Columns that hold a single distinct (non-missing) value.
df.columns[df.nunique().eq(1)]

Index(['examide', 'citoglipton'], dtype='object')

#### Observation:

We need to remove the columns with constant values from our dataset.
We have constant values in the two columns, namely:
* examide
* citoglipton

In [40]:
# Remove the two constant-valued drug columns identified above.
df.drop(columns=['examide', 'citoglipton'], inplace=True)

### 2.3.4 Check for duplicated rows

In [46]:
# Number of rows that exactly repeat an earlier row (first occurrence not counted).
df.duplicated().sum()

0

#### Observation:
There are no duplicates in our dataset.

### 2.3.5 Handling Missing Values

In [54]:
# Independent copy of the cleaned frame, used as input to label encoding.
enc_df = df.copy(deep=True)

In [58]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.tree import DecisionTreeRegressor

# Function to reverse a dictionary
def rev_dict(d1):
    """Return a new dict with the keys and values of ``d1`` swapped.

    When several keys share one value, the key that appears last in ``d1`` wins.
    """
    return {value: key for key, value in d1.items()}

# Single shared encoder instance; `label` refits it for each column it encodes.
le = LabelEncoder()

# Categorical (non-numeric) columns to label-encode. They are selected from
# `enc_df`, the frame that is actually encoded, so the column list cannot drift
# away from the data if `df` is modified later.
LABEL_COL = enc_df.select_dtypes(exclude=np.number).columns

# Label encoding function
def label(df, LABEL_COL):
    """Label-encode the given columns of a copy of ``df``, leaving NaN cells as NaN.

    Parameters
    ----------
    df : DataFrame to encode; it is copied, not modified.
    LABEL_COL : iterable of column names to encode.

    Returns
    -------
    tuple of (encoded copy, list of ``{original value: integer code}`` dicts),
    with one dict per column in the order of ``LABEL_COL``.

    Uses the module-level encoder ``le``, which is refitted for every column.
    """
    encoded = df.copy()
    dics_list = []

    for col in LABEL_COL:
        # Encode only the observed cells; missing ones are left for the imputer.
        present = encoded[col].notna()
        observed = encoded.loc[present, col]
        le.fit(observed)
        encoded.loc[present, col] = le.transform(observed)
        # Record which integer code each original value received.
        dics_list.append({cls: code for code, cls in enumerate(le.classes_)})

    return (encoded, dics_list)

print("\nBefore Label Encoding")
print(enc_df.head(12).T)

# Integer-encode every categorical column; NaN cells stay NaN so the imputer
# can recognise them as missing.
transformed, dicts_l = label(enc_df, LABEL_COL)

print("\nAfter Label Encoding")
print("Dataset with Missing Values:")
print(transformed.head(12).T)

### Impute missing values
# NOTE(review): every column of `transformed`, including 'Target', is visible to
# the imputer, and imputation happens before any train/test split - confirm this
# is acceptable for the modelling stage.
# The tree estimator is seeded explicitly so repeated runs give the same result.
imputer = IterativeImputer(
    estimator=DecisionTreeRegressor(random_state=0),
    max_iter=10,
    random_state=0,
)
imputed_data = imputer.fit_transform(transformed)

# A regression tree predicts the mean of the training targets in a leaf, so an
# imputed category code can be fractional. Such a value has no entry in the
# code -> label mapping and would remain in the data as a meaningless number.
# Snap each categorical column to the nearest valid integer code.
for col, mapping in zip(LABEL_COL, dicts_l):
    position = transformed.columns.get_loc(col)
    imputed_data[:, position] = np.clip(
        np.rint(imputed_data[:, position]), 0, len(mapping) - 1
    )

# The frame gets a fresh 0..n-1 index; it is not aligned with the index labels of `df`.
imputed_df = pd.DataFrame(imputed_data, columns=transformed.columns)

print("\nImputed Dataset:")
print(imputed_df.head(12))

# Some columns have hundreds of classes; print only the class count per column.
# The full mappings stay available in `dicts_l`.
print("\nMapping")
print({col: len(mapping) for col, mapping in zip(LABEL_COL, dicts_l)})

# Reverse the label encoding for the categorical columns
Null_cols = LABEL_COL  # All the columns that were label-encoded
reversed_dicts = [rev_dict(dic) for dic in dicts_l]

for col, code_to_label in zip(Null_cols, reversed_dicts):
    # Codes are whole numbers after the rounding above; map them back to labels.
    imputed_df[col] = imputed_df[col].astype(int).map(code_to_label)

# Every code must have been translated back to a label.
unmapped_cells = int(imputed_df[list(Null_cols)].isna().sum().sum())
assert unmapped_cells == 0, "%d categorical cells could not be decoded" % unmapped_cells

# Only imputed data is now used, without combining with the original DataFrame
print("\nFinal Imputed Dataset:")
print(imputed_df.head(12))


Before Label Encoding
                                 1                2          3          4   \
race                      Caucasian  AfricanAmerican  Caucasian  Caucasian   
gender                       Female           Female       Male       Male   
age                         [10-20)          [20-30)    [30-40)    [40-50)   
admission_type_id                 1                1          1          1   
admission_source_id               7                7          7          7   
time_in_hospital                  3                2          2          1   
payer_code                      NaN              NaN        NaN        NaN   
medical_specialty               NaN              NaN        NaN        NaN   
num_lab_procedures               59               11         44         51   
num_procedures                    0                5          1          0   
num_medications                  18               13         16          8   
number_outpatient                 0      

In [62]:
# Number of rows in the imputed dataset.
len(imputed_df)

95431

In [65]:
# Report the (rows, columns) shape of the imputed dataset.
shape_message = "\nDataset available for processing {}"
print(shape_message.format(imputed_df.shape))


Dataset available for processing (95431, 45)


In [68]:
# Preview the first five rows of the imputed dataset.
imputed_df.head(5)

Unnamed: 0,race,gender,age,admission_type_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,Discharged to,Target
0,Caucasian,Female,[10-20),1.0,7.0,3.0,MD,Emergency/Trauma,59.0,0.0,18.0,0.0,0.0,0.0,276,250.01,255,9.0,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,Home,0.0
1,AfricanAmerican,Female,[20-30),1.0,7.0,2.0,HM,Urology,11.0,5.0,13.0,2.0,0.0,1.0,648,250.0,V27,6.0,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,Home,0.0
2,Caucasian,Male,[30-40),1.0,7.0,2.0,OG,InternalMedicine,44.0,1.0,16.0,0.0,0.0,0.0,8,250.43,403,7.0,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,Home,0.0
3,Caucasian,Male,[40-50),1.0,7.0,1.0,OG,InternalMedicine,51.0,0.0,8.0,0.0,0.0,0.0,197,157.0,250,5.0,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,Home,0.0
4,Caucasian,Male,[50-60),2.0,2.0,3.0,HM,Radiologist,31.0,6.0,16.0,0.0,0.0,0.0,414,411.0,250,9.0,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,Home,0.0


In [82]:
from datetime import datetime
from pathlib import Path

# Timestamp suffix so repeated runs never overwrite an earlier export.
current_datetime = datetime.now().strftime("%Y-%m-%d %H-%M-%S")

# Export relative to the working directory (the same place the input CSV is
# read from) instead of a user-specific absolute path, so the notebook runs on
# any machine. Point OUTPUT_DIR elsewhere to redirect both files.
OUTPUT_DIR = Path(".")

# Both names remain plain strings, as before, so downstream string handling keeps working.
clean_file_name_ = str(OUTPUT_DIR / ("Cleaned_data_" + current_datetime + ".csv"))
model_file_name_ = str(OUTPUT_DIR / ("Model_data_" + current_datetime + ".csv"))


In [84]:
# Write the imputed dataset (category labels restored) to CSV without the index column.
imputed_df.to_csv(path_or_buf=clean_file_name_, index=False)

In [86]:
# Numeric (label-encoded) view of the imputed matrix, presumably the modelling input.
df_for_model = pd.DataFrame(data=imputed_data, columns=transformed.columns)
df_for_model.head()

Unnamed: 0,race,gender,age,admission_type_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,Discharged to,Target
0,2.0,0.0,1.0,1.0,7.0,3.0,8.0,8.0,59.0,0.0,18.0,0.0,0.0,0.0,140.0,78.0,121.0,9.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,0.0,0.0,2.0,1.0,7.0,2.0,6.0,71.0,11.0,5.0,13.0,2.0,0.0,1.0,451.0,77.0,760.0,6.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,2.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
2,2.0,1.0,3.0,1.0,7.0,2.0,10.0,18.0,44.0,1.0,16.0,0.0,0.0,0.0,551.0,96.0,247.0,7.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,2.0,1.0,4.0,1.0,7.0,1.0,10.0,18.0,51.0,0.0,8.0,0.0,0.0,0.0,51.0,23.0,86.0,5.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,2.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,2.0,1.0,5.0,2.0,2.0,3.0,6.0,52.0,31.0,6.0,16.0,0.0,0.0,0.0,260.0,245.0,86.0,9.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0


In [88]:
# Write the numeric dataset to CSV without the index column.
df_for_model.to_csv(path_or_buf=model_file_name_, index=False)

## 2.4.	Alternate sources of data that can supplement the core dataset (at least 2-3 columns)

## 2.5. Project Justification - Project Statement, Complexity involved, Project Outcome

## 2.6. Commercial, Academic or Social value