In [1]:
import pandas as pd
import numpy as np

# Get Data

In [2]:
# Load the raw CSV straight from the UCI Machine Learning Repository (needs network access).
DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00529/diabetes_data_upload.csv"
data = pd.read_csv(DATA_URL)

In [3]:
# Inspect the raw column names (mixed case, some with spaces) before normalizing them.
data.columns

Index(['Age', 'Gender', 'Polyuria', 'Polydipsia', 'sudden weight loss',
       'weakness', 'Polyphagia', 'Genital thrush', 'visual blurring',
       'Itching', 'Irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'Alopecia', 'Obesity', 'class'],
      dtype='object')

In [4]:
# (rows, columns) of the raw frame, before any cleaning.
data.shape

(520, 17)

# Data Prep

In [5]:
# Normalize headers to snake_case: lower-case, with spaces replaced by underscores.
data.columns = data.columns.str.lower().str.replace(" ", "_", regex=False)

### Check for missing values

In [6]:
# Count of missing values in each column.
data.isna().sum()

age                   0
gender                0
polyuria              0
polydipsia            0
sudden_weight_loss    0
weakness              0
polyphagia            0
genital_thrush        0
visual_blurring       0
itching               0
irritability          0
delayed_healing       0
partial_paresis       0
muscle_stiffness      0
alopecia              0
obesity               0
class                 0
dtype: int64

### Check for duplicate rows

In [7]:
# Number of rows that exactly repeat an earlier row (compared across all columns).
data.duplicated().sum()

269

In [8]:
# Keep the first occurrence of each distinct row and renumber the index from 0.
# NOTE(review): more than half of the rows (269 of 520) are flagged as duplicates.
# Each row holds only an age plus Yes/No answers, so identical rows could presumably
# be different respondents rather than repeated entries — confirm that dropping
# them is intended.
data = data.drop_duplicates(ignore_index=True)

### Data Types

In [9]:
# Column dtypes after de-duplication.
data.dtypes

age                    int64
gender                object
polyuria              object
polydipsia            object
sudden_weight_loss    object
weakness              object
polyphagia            object
genital_thrush        object
visual_blurring       object
itching               object
irritability          object
delayed_healing       object
partial_paresis       object
muscle_stiffness      object
alopecia              object
obesity               object
class                 object
dtype: object

### Clean target column

In [10]:
# Label distribution after de-duplication.
data["class"].value_counts()

Positive    173
Negative     78
Name: class, dtype: int64

In [11]:
# Encode the label as a boolean: True for "Positive", False for anything else.
# A vectorized comparison replaces the row-wise `apply`, which ran a Python lambda
# once per row and does not yield a plain boolean column on an empty frame.
data["target"] = data["class"] == "Positive"

In [12]:
# Sanity check: the counts should match the "class" distribution above.
data["target"].value_counts()

True     173
False     78
Name: target, dtype: int64

In [13]:
# The raw string label is redundant now that the boolean `target` column exists.
data = data.drop("class", axis="columns")

### Clean feature columns

In [14]:
# Column list after replacing "class" with the boolean "target".
data.columns

Index(['age', 'gender', 'polyuria', 'polydipsia', 'sudden_weight_loss',
       'weakness', 'polyphagia', 'genital_thrush', 'visual_blurring',
       'itching', 'irritability', 'delayed_healing', 'partial_paresis',
       'muscle_stiffness', 'alopecia', 'obesity', 'target'],
      dtype='object')

In [15]:
# Name of the label column, kept in one variable for reuse in later cells.
target_col = "target"

In [16]:
# Every column except the label is a feature. Compare by equality against
# `target_col` so a feature whose name merely contains "target" is not dropped.
feature_cols = [col for col in data.columns if col != target_col]

In [17]:
# Feature dtypes before any encoding.
data[feature_cols].dtypes

age                    int64
gender                object
polyuria              object
polydipsia            object
sudden_weight_loss    object
weakness              object
polyphagia            object
genital_thrush        object
visual_blurring       object
itching               object
irritability          object
delayed_healing       object
partial_paresis       object
muscle_stiffness      object
alopecia              object
obesity               object
dtype: object

In [18]:
# Yes/No symptom flags: every feature except the numeric `age` and the string `gender`.
# Names are matched exactly; a substring test would also exclude any column whose
# name merely contains "age" (e.g. a hypothetical "average_glucose").
bool_cols = [col for col in feature_cols if col not in ("age", "gender")]

In [19]:
# Turn the "Yes"/"No" symptom flags into booleans.
# Columns that are already boolean are skipped, so re-running this cell does not
# reset every flag to False. Unexpected labels (typos, different casing, missing
# values) raise instead of silently becoming False.
pending_cols = [col for col in bool_cols if not pd.api.types.is_bool_dtype(data[col])]
if pending_cols:
    unexpected = set(pd.unique(data[pending_cols].to_numpy().ravel())) - {"Yes", "No"}
    if unexpected:
        raise ValueError(
            f"Unexpected labels in Yes/No columns: {sorted(map(str, unexpected))}"
        )
    data[pending_cols] = data[pending_cols] == "Yes"

In [27]:
# Transposed preview: one row per column, which is easier to scan for a wide frame.
# NOTE(review): this cell's execution count (27) is out of sequence and the saved
# output below has no `gender` row, so it appears to have been produced after the
# gender column was dropped further down — re-run the notebook top to bottom.
data.head().T

Unnamed: 0,0,1,2,3,4
age,40,58,41,45,60
polyuria,False,False,True,False,True
polydipsia,True,False,False,False,True
sudden_weight_loss,False,False,False,True,True
weakness,True,True,True,True,True
polyphagia,False,False,True,True,True
genital_thrush,False,False,False,True,False
visual_blurring,False,True,False,False,True
itching,True,False,True,True,True
irritability,False,False,False,False,True


### Encode gender as a boolean `is_male` flag

In [21]:
# Binary gender flag: True where gender is exactly "Male", False for any other value.
data["is_male"] = data["gender"].eq("Male")

In [25]:
# The string column is no longer needed once `is_male` carries the same information.
data = data.drop("gender", axis="columns")

# EDA

In [30]:
# Rebuild the feature list now that `gender` has been replaced by `is_male`.
# Compare by equality against `target_col` so a feature whose name merely
# contains "target" is not dropped.
feature_cols = [col for col in data.columns if col != target_col]

In [31]:
# Final feature list after encoding (`gender` replaced by `is_male`).
feature_cols

['age',
 'polyuria',
 'polydipsia',
 'sudden_weight_loss',
 'weakness',
 'polyphagia',
 'genital_thrush',
 'visual_blurring',
 'itching',
 'irritability',
 'delayed_healing',
 'partial_paresis',
 'muscle_stiffness',
 'alopecia',
 'obesity',
 'is_male']

In [32]:
# Label column used in the analysis that follows.
target_col

'target'

## Age