In [20]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [93]:
df = pd.read_csv('/Users/taddbackus/School/fall23/qtw/cs2/dataset_diabetes/diabetic_data.csv')
mappings = pd.read_csv('/Users/taddbackus/School/fall23/qtw/cs2/dataset_diabetes/IDs_mapping.csv')
print(mappings)

   admission_type_id                                        description
0                  1                                          Emergency
1                  2                                             Urgent
2                  3                                           Elective
3                  4                                            Newborn
4                  5                                      Not Available
..               ...                                                ...
62                22   Transfer from hospital inpt/same fac reslt in...
63                23                          Born inside this hospital
64                24                         Born outside this hospital
65                25            Transfer from Ambulatory Surgery Center
66                26                              Transfer from Hospice

[67 rows x 2 columns]


In [94]:
df = df.replace('?',np.NaN)
print(df.isnull().sum())
print('\n')

encounter_id                    0
patient_nbr                     0
race                         2273
gender                          0
age                             0
weight                      98569
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                  40256
medical_specialty           49949
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         21
diag_2                        358
diag_3                       1423
number_diagnoses                0
max_glu_serum                   0
A1Cresult                       0
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide 

In [95]:
print(df.isnull().sum() / len(df) * 100)
print('\n')

encounter_id                 0.000000
patient_nbr                  0.000000
race                         2.233555
gender                       0.000000
age                          0.000000
weight                      96.858479
admission_type_id            0.000000
discharge_disposition_id     0.000000
admission_source_id          0.000000
time_in_hospital             0.000000
payer_code                  39.557416
medical_specialty           49.082208
num_lab_procedures           0.000000
num_procedures               0.000000
num_medications              0.000000
number_outpatient            0.000000
number_emergency             0.000000
number_inpatient             0.000000
diag_1                       0.020636
diag_2                       0.351787
diag_3                       1.398306
number_diagnoses             0.000000
max_glu_serum                0.000000
A1Cresult                    0.000000
metformin                    0.000000
repaglinide                  0.000000
nateglinide 

I just removed the columns that had a large amount of values. We can look into potential better ways to do this later.

In [96]:
df = df.drop(['weight','payer_code','medical_specialty'],axis=1)
print(df.isnull().sum() / len(df) * 100)
print('\n')

encounter_id                0.000000
patient_nbr                 0.000000
race                        2.233555
gender                      0.000000
age                         0.000000
admission_type_id           0.000000
discharge_disposition_id    0.000000
admission_source_id         0.000000
time_in_hospital            0.000000
num_lab_procedures          0.000000
num_procedures              0.000000
num_medications             0.000000
number_outpatient           0.000000
number_emergency            0.000000
number_inpatient            0.000000
diag_1                      0.020636
diag_2                      0.351787
diag_3                      1.398306
number_diagnoses            0.000000
max_glu_serum               0.000000
A1Cresult                   0.000000
metformin                   0.000000
repaglinide                 0.000000
nateglinide                 0.000000
chlorpropamide              0.000000
glimepiride                 0.000000
acetohexamide               0.000000
g

In [97]:
raceEncoded = pd.get_dummies(df['race'])
genderEncoded = pd.get_dummies(df['gender'])
maxGluEncoded = pd.get_dummies(df['max_glu_serum'])
dfEncoded = df.drop(['race','gender','max_glu_serum'],axis=1)
dfEncoded = dfEncoded.join(raceEncoded)
dfEncoded = dfEncoded.join(genderEncoded)
dfEncoded = dfEncoded.join(maxGluEncoded)


One hot encoding race, gender, and max glucose.

We need to figure out what to do with the following columns still

In [98]:
varToChange = ['A1Cresult','metformin','repaglinide','nateglinide','chlorpropamide',
               'glimepiride','acetohexamide','glipizide','glyburide','tolbutamide',
               'pioglitazone','acarbose','miglitol','troglitazone','tolazamide',
               'examide','citoglipton','insulin','glyburide-metformin','glimepiride-pioglitazone',
               'metformin-rosiglitazone','metformin-pioglitazone']
for i in varToChange:
    print(i)
    for j in dfEncoded[i].unique():
        print(j)
    print('==============')

A1Cresult
None
>7
>8
Norm
metformin
No
Steady
Up
Down
repaglinide
No
Up
Steady
Down
nateglinide
No
Steady
Down
Up
chlorpropamide
No
Steady
Down
Up
glimepiride
No
Steady
Down
Up
acetohexamide
No
Steady
glipizide
No
Steady
Up
Down
glyburide
No
Steady
Up
Down
tolbutamide
No
Steady
pioglitazone
No
Steady
Up
Down
acarbose
No
Steady
Up
Down
miglitol
No
Steady
Down
Up
troglitazone
No
Steady
tolazamide
No
Steady
Up
examide
No
citoglipton
No
insulin
No
Up
Steady
Down
glyburide-metformin
No
Steady
Down
Up
glimepiride-pioglitazone
No
Steady
metformin-rosiglitazone
No
Steady
metformin-pioglitazone
No
Steady


In [99]:
dfEncoded.loc[dfEncoded['change'] == 'Ch', 'change'] = 1
dfEncoded.loc[dfEncoded['change'] == 'No', 'change'] = 0
dfEncoded.loc[dfEncoded['diabetesMed'] == 'No', 'diabetesMed'] = 0
dfEncoded.loc[dfEncoded['diabetesMed'] == 'Yes', 'diabetesMed'] = 1

In [100]:
dfEncoded.loc[dfEncoded['age'] == '[0-10)', 'age'] = 5
dfEncoded.loc[dfEncoded['age'] == '[10-20)', 'age'] = 15
dfEncoded.loc[dfEncoded['age'] == '[20-30)', 'age'] = 25
dfEncoded.loc[dfEncoded['age'] == '[30-40)', 'age'] = 35
dfEncoded.loc[dfEncoded['age'] == '[40-50)', 'age'] = 45
dfEncoded.loc[dfEncoded['age'] == '[50-60)', 'age'] = 55
dfEncoded.loc[dfEncoded['age'] == '[60-70)', 'age'] = 65
dfEncoded.loc[dfEncoded['age'] == '[70-80)', 'age'] = 75
dfEncoded.loc[dfEncoded['age'] == '[80-90)', 'age'] = 85
dfEncoded.loc[dfEncoded['age'] == '[90-100)', 'age'] = 95

Changed age to midpoint of the given range. Should be easy enough to change if theres a better way

Below:   
In the diagonsis columns there were values like V57 and E909. I dont know what these mean so i temporarily changed them to just the number following the letter so that i could make the column a float. I asked about this column in the presession but he didnt respond so we should ask in class.

In [101]:
dfEncoded['diag_1'] = dfEncoded['diag_1'].astype(str)
for i in dfEncoded['diag_1'].unique():
    if i[0] == 'V' or i[0] == 'E':
        dfEncoded.loc[dfEncoded['diag_1'] == i, 'diag_1'] = i[1:]
dfEncoded['diag_1'] = dfEncoded['diag_1'].astype(float)

In [102]:
dfEncoded['diag_2'] = dfEncoded['diag_2'].astype(str)
for i in dfEncoded['diag_2'].unique():
    if i[0] == 'V' or i[0] == 'E':
        dfEncoded.loc[dfEncoded['diag_2'] == i, 'diag_2'] = i[1:]
dfEncoded['diag_2'] = dfEncoded['diag_2'].astype(float)

In [103]:
dfEncoded['diag_3'] = dfEncoded['diag_3'].astype(str)
for i in dfEncoded['diag_3'].unique():
    if i[0] == 'V' or i[0] == 'E':
        dfEncoded.loc[dfEncoded['diag_3'] == i, 'diag_3'] = i[1:]
dfEncoded['diag_3'] = dfEncoded['diag_3'].astype(float)

In [104]:
X = dfEncoded.drop(['readmitted'],axis=1)
y = dfEncoded['readmitted']
print(X.shape)
print(y.shape)

(101766, 55)
(101766,)


In [None]:
model = LogisticRegression()

In [None]:
cross_val_score(model,X,y,scoring='accuracy')

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/taddbackus/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/taddbackus/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1196, in fit
    X, y = self._validate_data(
  File "/Users/taddbackus/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 584, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Users/taddbackus/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1106, in check_X_y
    X = check_array(
  File "/Users/taddbackus/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 879, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/Users/taddbackus/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/_array_api.py", line 185, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "/Users/taddbackus/opt/anaconda3/lib/python3.9/site-packages/numpy/core/_asarray.py", line 102, in asarray
    return array(a, dtype, copy=False, order=order)
  File "/Users/taddbackus/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py", line 1993, in __array__
    return np.asarray(self._values, dtype=dtype)
  File "/Users/taddbackus/opt/anaconda3/lib/python3.9/site-packages/numpy/core/_asarray.py", line 102, in asarray
    return array(a, dtype, copy=False, order=order)
ValueError: could not convert string to float: 'None'
