In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw dataset (path is relative to the notebook's working directory)
# and preview the first five rows.
data = pd.read_csv("diabetes_data.csv")
data.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


In [3]:
# Normalise the headers to snake_case (lower-case, spaces -> underscores) so the
# columns can be referenced with consistent names in the cells that follow.
data.columns = [header.lower().replace(' ', '_') for header in data.columns]
data.columns

Index(['age', 'gender', 'polyuria', 'polydipsia', 'sudden_weight_loss',
       'weakness', 'polyphagia', 'genital_thrush', 'visual_blurring',
       'itching', 'irritability', 'delayed_healing', 'partial_paresis',
       'muscle_stiffness', 'alopecia', 'obesity', 'class'],
      dtype='object')

In [4]:
# Columns removed from `data` before encoding.
# NOTE(review): the selection criterion is not shown in this notebook — presumably
# a feature-importance analysis done elsewhere; confirm.
unimportant_features = [
    'muscle_stiffness',
    'partial_paresis',
    'visual_blurring',
    'polyphagia',
    'sudden_weight_loss',
]

In [5]:
# Remove the columns listed in `unimportant_features`.
# The result is reassigned instead of using inplace=True, and errors='ignore'
# skips columns that are already gone, so re-running this cell does not raise
# KeyError. `axis=1` was redundant alongside `columns=` and has been dropped.
data = data.drop(columns=unimportant_features, errors='ignore')
data.columns

Index(['age', 'gender', 'polyuria', 'polydipsia', 'weakness', 'genital_thrush',
       'itching', 'irritability', 'delayed_healing', 'alopecia', 'obesity',
       'class'],
      dtype='object')

In [6]:
# Preview the frame after the unimportant features were dropped.
data.head()

Unnamed: 0,age,gender,polyuria,polydipsia,weakness,genital_thrush,itching,irritability,delayed_healing,alopecia,obesity,class
0,40,Male,No,Yes,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,Yes,No,No,No,No,Yes,No,Positive
2,41,Male,Yes,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,No,Yes,No,No,Positive
4,60,Male,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Positive


In [7]:
# Check dtypes and non-null counts before encoding the text columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              520 non-null    int64 
 1   gender           520 non-null    object
 2   polyuria         520 non-null    object
 3   polydipsia       520 non-null    object
 4   weakness         520 non-null    object
 5   genital_thrush   520 non-null    object
 6   itching          520 non-null    object
 7   irritability     520 non-null    object
 8   delayed_healing  520 non-null    object
 9   alopecia         520 non-null    object
 10  obesity          520 non-null    object
 11  class            520 non-null    object
dtypes: int64(1), object(11)
memory usage: 48.9+ KB


In [8]:
# Encoding features: create the LabelEncoder instance (`LE`) that the encoding
# loop applies to each column in `col_to_label_encode`.
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()

In [9]:
# Columns to label-encode. `gender` and `class` are left out on purpose: they are
# encoded separately with explicit maps (gender_map / class_map); `age` is
# already numeric.
col_to_label_encode = [
    'polyuria',
    'polydipsia',
    'weakness',
    'genital_thrush',
    'itching',
    'irritability',
    'delayed_healing',
    'alopecia',
    'obesity',
]

In [10]:
# Encode the Yes/No symptom columns as 0/1.
#
# The encoder is fitted once on the fixed vocabulary rather than re-fitted on each
# column's own values. With fit_transform per column the integer codes depend on
# which values happen to be present: a column containing only 'Yes' would encode
# it as 0, and a stray value ('yes', 'nan', ...) would silently become an extra
# class. Fitting on ['No', 'Yes'] pins No -> 0 / Yes -> 1 (classes are sorted) and
# makes transform() raise ValueError on any unseen label.
YES_NO_CLASSES = ['No', 'Yes']
LE.fit(YES_NO_CLASSES)

for col in col_to_label_encode:
    if pd.api.types.is_numeric_dtype(data[col]):
        # Already encoded (e.g. this cell was run before): leave the column as is.
        continue
    data[col] = LE.transform(data[col].astype(str))

In [11]:
# Confirm the symptom columns now hold 0/1 codes; gender and class are still text.
data.head()

Unnamed: 0,age,gender,polyuria,polydipsia,weakness,genital_thrush,itching,irritability,delayed_healing,alopecia,obesity,class
0,40,Male,0,1,1,0,1,0,1,1,1,Positive
1,58,Male,0,0,1,0,0,0,0,1,0,Positive
2,41,Male,1,0,1,0,1,0,1,1,0,Positive
3,45,Male,0,0,1,1,1,0,1,0,0,Positive
4,60,Male,1,1,1,0,1,1,1,1,1,Positive


In [12]:
# Re-check dtypes after label encoding: only gender and class should remain object.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              520 non-null    int64 
 1   gender           520 non-null    object
 2   polyuria         520 non-null    int32 
 3   polydipsia       520 non-null    int32 
 4   weakness         520 non-null    int32 
 5   genital_thrush   520 non-null    int32 
 6   itching          520 non-null    int32 
 7   irritability     520 non-null    int32 
 8   delayed_healing  520 non-null    int32 
 9   alopecia         520 non-null    int32 
 10  obesity          520 non-null    int32 
 11  class            520 non-null    object
dtypes: int32(9), int64(1), object(2)
memory usage: 30.6+ KB


In [13]:
# Explicit integer codes for the two text columns that are not label-encoded.
gender_map = dict(Female=0, Male=1)
class_map = dict(Negative=0, Positive=1)

In [14]:
# Encode `gender` and `class` with the explicit maps.
#
# Series.map() turns every value that is missing from the mapping into NaN, so:
#   * re-running the plain assignment on already-encoded (0/1) columns would wipe
#     them out -> numeric columns are skipped;
#   * a typo or unexpected category in the raw data would silently become NaN ->
#     unmapped values raise instead.
for column, mapping in (('gender', gender_map), ('class', class_map)):
    if pd.api.types.is_numeric_dtype(data[column]):
        # Already encoded by a previous run of this cell.
        continue
    encoded = data[column].map(mapping)
    if encoded.isna().any():
        unexpected = sorted(data.loc[encoded.isna(), column].astype(str).unique())
        raise ValueError(f"Unmapped values in column '{column}': {unexpected}")
    data[column] = encoded.astype('int64')

In [15]:
# Every column should now be numeric, including gender and class.
data.head()

Unnamed: 0,age,gender,polyuria,polydipsia,weakness,genital_thrush,itching,irritability,delayed_healing,alopecia,obesity,class
0,40,1,0,1,1,0,1,0,1,1,1,1
1,58,1,0,0,1,0,0,0,0,1,0,1
2,41,1,1,0,1,0,1,0,1,1,0,1
3,45,1,0,0,1,1,1,0,1,0,0,1
4,60,1,1,1,1,0,1,1,1,1,1,1


In [16]:
# Work on an independent (deep) copy so `data` keeps its `class` column when the
# label is dropped from `data_new`.
data_new = data.copy(deep=True)

In [17]:
# Sanity check: the copy should show the same rows as `data`.
data_new.head()

Unnamed: 0,age,gender,polyuria,polydipsia,weakness,genital_thrush,itching,irritability,delayed_healing,alopecia,obesity,class
0,40,1,0,1,1,0,1,0,1,1,1,1
1,58,1,0,0,1,0,0,0,0,1,0,1
2,41,1,1,0,1,0,1,0,1,1,0,1
3,45,1,0,0,1,1,1,0,1,0,0,1
4,60,1,1,1,1,0,1,1,1,1,1,1


In [18]:
# Row at position 1 of the copy, with the class label still present at this point.
data_new.iloc[1]

age                58
gender              1
polyuria            0
polydipsia          0
weakness            1
genital_thrush      0
itching             0
irritability        0
delayed_healing     0
alopecia            1
obesity             0
class               1
Name: 1, dtype: int64

In [59]:
# Same row taken from the original frame, which keeps its class label.
data.iloc[1]

age                58
gender              1
polyuria            0
polydipsia          0
weakness            1
genital_thrush      0
itching             0
irritability        0
delayed_healing     0
alopecia            1
obesity             0
class               1
Name: 1, dtype: int64

In [58]:
# Row at position 200 of the original frame, including its class label
# (class_map encodes 'Negative' as 0 and 'Positive' as 1).
data.iloc[200]

age                40
gender              1
polyuria            0
polydipsia          1
weakness            1
genital_thrush      0
itching             1
irritability        0
delayed_healing     0
alopecia            0
obesity             0
class               0
Name: 200, dtype: int64

In [19]:
# Remove the target column so `data_new` holds model inputs only.
# errors='ignore' keeps the cell re-runnable: a second run would otherwise raise
# KeyError because 'class' has already been dropped.
data_new = data_new.drop(columns=['class'], errors='ignore')

In [20]:
# Verify that `class` is gone and only the feature columns remain.
data_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   age              520 non-null    int64
 1   gender           520 non-null    int64
 2   polyuria         520 non-null    int32
 3   polydipsia       520 non-null    int32
 4   weakness         520 non-null    int32
 5   genital_thrush   520 non-null    int32
 6   itching          520 non-null    int32
 7   irritability     520 non-null    int32
 8   delayed_healing  520 non-null    int32
 9   alopecia         520 non-null    int32
 10  obesity          520 non-null    int32
dtypes: int32(9), int64(2)
memory usage: 26.5 KB


In [21]:
# Preview the feature-only frame.
data_new.head()

Unnamed: 0,age,gender,polyuria,polydipsia,weakness,genital_thrush,itching,irritability,delayed_healing,alopecia,obesity
0,40,1,0,1,1,0,1,0,1,1,1
1,58,1,0,0,1,0,0,0,0,1,0
2,41,1,1,0,1,0,1,0,1,1,0
3,45,1,0,0,1,1,1,0,1,0,0
4,60,1,1,1,1,0,1,1,1,1,1


In [22]:
# Row at position 1 again, now without the class label.
data_new.iloc[1]

age                58
gender              1
polyuria            0
polydipsia          0
weakness            1
genital_thrush      0
itching             0
irritability        0
delayed_healing     0
alopecia            1
obesity             0
Name: 1, dtype: int64

In [26]:
# Sample patient: the feature values (no class label) of the row at position 1,
# as a pandas Series indexed by feature name.
patient_1 = data_new.iloc[1]

In [27]:
# Display the sample patient's feature values.
patient_1

age                58
gender              1
polyuria            0
polydipsia          0
weakness            1
genital_thrush      0
itching             0
irritability        0
delayed_healing     0
alopecia            1
obesity             0
Name: 1, dtype: int64

In [56]:
# Second sample patient: feature values of the row at position 200
# (that row's class is 0 in `data`).
# NOTE(review): patient_2 is not used by the remaining cells visible here.
patient_2 = data_new.iloc[200]
patient_2

age                40
gender              1
polyuria            0
polydipsia          1
weakness            1
genital_thrush      0
itching             1
irritability        0
delayed_healing     0
alopecia            0
obesity             0
Name: 200, dtype: int64

In [30]:
# Serialise the sample patient to a JSON object of {feature name: value}.
# 'index' is the documented orient for a Series ('columns' is a DataFrame orient);
# it yields the same {label: value} text shown in the output.
pat_json = patient_1.to_json(orient='index')
pat_json

'{"age":58,"gender":1,"polyuria":0,"polydipsia":0,"weakness":1,"genital_thrush":0,"itching":0,"irritability":0,"delayed_healing":0,"alopecia":1,"obesity":0}'

In [45]:
# to_json returns the JSON text as a plain str, not a dict.
type(pat_json)

str

In [34]:
# The object that was serialised is a pandas Series.
type(patient_1)

pandas.core.series.Series

In [41]:
# Rebuild a DataFrame from the JSON text. With orient='index' the JSON keys become
# the row labels and the values land in a single column named 0.
#
# The text is wrapped in StringIO because passing a literal JSON string to
# pd.read_json is deprecated (pandas >= 2.1); a file-like object works on both
# old and new versions.
from io import StringIO

df = pd.read_json(StringIO(pat_json), orient='index')
df

Unnamed: 0,0
age,58
gender,1
polyuria,0
polydipsia,0
weakness,1
genital_thrush,0
itching,0
irritability,0
delayed_healing,0
alopecia,1


In [66]:
# Turn the single-column frame into one row of feature values, shape (1, n_values),
# i.e. a 2-D array holding a single sample.
res = np.reshape(np.array(df), (1, -1))

In [67]:
# Inspect the model input: a single row holding the patient's feature values.
res

array([[58,  1,  0,  0,  1,  0,  0,  0,  0,  1,  0]], dtype=int64)

In [46]:
import joblib
# Load the saved model from the notebook's working directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load model files that come from a trusted source.
# NOTE(review): judging by the file name this is a fitted random-forest classifier — confirm.
rf = joblib.load('random_forest_model_diabetes_refined_31_5_2021.pkl') # Load the serialized model file


In [77]:
# Class-probability estimates for the single-row input `res`: one row per sample,
# one column per class.
# NOTE(review): column order presumably follows rf.classes_ — confirm it is [0, 1].
prediction = rf.predict_proba(res)
prediction

array([[0.4, 0.6]])

In [78]:
# Probability in the second class column for the first (only) sample.
# NOTE(review): presumably the 'Positive' probability, assuming the model was
# trained with the same 0/1 class encoding as class_map — confirm.
prediction[0, 1]

0.6