In [1]:
import pandas as pd
import numpy as np

In [2]:
diabetes = pd.read_csv('diabetes.csv')
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
diabetes['Glucose'].replace(0, np.nan, inplace= True)
diabetes['BloodPressure'].replace(0, np.nan, inplace= True)
diabetes['SkinThickness'].replace(0, np.nan, inplace= True)
diabetes['Insulin'].replace(0, np.nan, inplace= True)
diabetes['BMI'].replace(0, np.nan, inplace= True)


In [6]:
diabetes.isnull().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [13]:
diabetes['Glucose'].isnull().sum() / len(diabetes)*100
diabetes['BloodPressure'].isnull().sum() / len(diabetes)*100
diabetes['SkinThickness'].isnull().sum() / len(diabetes)*100
diabetes['Insulin'].isnull().sum() / len(diabetes)*100
diabetes['BMI'].isnull().sum() / len(diabetes)*100


1.4322916666666665

In [17]:
#drop columns with more than 10% missing values

diabetes_trim = diabetes.dropna(thresh= int(diabetes.shape[0]*.9), axis= 1)
diabetes_trim.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'BMI',
       'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [19]:
diabetes = pd.read_csv('dataset/diabetes_processed.csv')
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,219.028414,33.6,0.627,50.0,1
1,1.0,85.0,66.0,29.0,70.34155,26.6,0.351,31.0,0
2,8.0,183.0,64.0,32.0,270.573172,23.3,0.672,32.0,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1


In [22]:
X = diabetes.drop('Outcome', axis = 1)
Y = diabetes['Outcome']

In [24]:
#gives variance col wise
X.var(axis=0)

Pregnancies                   11.354056
Glucose                      926.489244
BloodPressure                146.321591
SkinThickness                 78.969986
Insulin                     9449.130490
BMI                           47.270664
DiabetesPedigreeFunction       0.109779
Age                          138.303046
dtype: float64

In [29]:
from sklearn.preprocessing import minmax_scale

x_scaled = pd.DataFrame(minmax_scale(X, feature_range=(0,10)), columns=X.columns)
x_scaled.var()

Pregnancies                 3.928739
Glucose                     3.856355
BloodPressure               1.523548
SkinThickness               0.933010
Insulin                     1.270004
BMI                         1.976851
DiabetesPedigreeFunction    2.001447
Age                         3.841751
dtype: float64

In [31]:
#selecting features with variance higher than 1

from sklearn.feature_selection import VarianceThreshold

select_features = VarianceThreshold(threshold= 1.0)

In [35]:
X_new = select_features.fit_transform(x_scaled)
X_new.shape

(768, 7)

In [37]:
var_df = pd.DataFrame({
    'feature names': list(x_scaled),
    'variances': select_features.variances_
})
var_df

Unnamed: 0,feature names,variances
0,Pregnancies,3.923624
1,Glucose,3.851334
2,BloodPressure,1.521565
3,SkinThickness,0.931795
4,Insulin,1.26835
5,BMI,1.974277
6,DiabetesPedigreeFunction,1.998841
7,Age,3.836749


In [39]:
X_new = pd.DataFrame(X_new)
X_new.head()

Unnamed: 0,0,1,2,3,4,5,6
0,3.529412,6.709677,4.897959,2.731339,3.149284,2.34415,4.833333
1,0.588235,2.645161,4.285714,1.007569,1.717791,1.16567,1.666667
2,4.705882,8.967742,4.081633,3.328912,1.042945,2.536294,1.833333
3,0.588235,2.903226,4.285714,1.281848,2.02454,0.380017,0.0
4,0.0,6.0,1.632653,2.139752,5.092025,9.436379,2.0


In [42]:
selected_fe = []
for i in range(len(X_new.columns)):
    for j in range(len(x_scaled.columns)):
        if(X_new.iloc[:,i].equals(x_scaled.iloc[:,j])):
            selected_fe.append(x_scaled.columns[j])

selected_fe

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age']