In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import scipy.stats as stats

from sklearn import preprocessing 
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
from sklearn.svm import SVR, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB 
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
# from sklearn.model_selection import GridSearchCV
# from sklearn.preprocessing import Imputer

#import xgboost as xgb
from functools import reduce
from imblearn.over_sampling import SMOTE
import itertools
import copy
import warnings
warnings.filterwarnings('ignore') 
%matplotlib inline

## Data normalization

In [2]:
def scaler(df,feature):
    """Standardize ``df[feature]`` in place and return the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to transform. It is modified in place.
    feature : str
        Label of the column to standardize.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``df[feature]`` overwritten by the
        output of ``sklearn.preprocessing.scale``.
    """
    # With its default arguments, preprocessing.scale centers the values to
    # zero mean and rescales them to unit variance.
    df[feature] = preprocessing.scale(np.array(df[feature]))
    return df

In [3]:
# Load the raw dataset. df0 is kept unscaled so that later cells can compare
# the original values against the scaled copy.
df0 = pd.read_csv('Diabetes Project Part 1.csv')

In [4]:
# Work on an independent copy so df0 keeps the original, unscaled values
# (DataFrame.copy makes a deep copy by default).
df = df0.copy()
print(df.columns.values)
df.head()

['12drinksayear' '5drinks2hours' '5drinksaday' 'BMI' 'LDL' 'age'
 'diabetes' 'fasting_glucose' 'race' 'sitting' 'sleeping' 'sugar' 'male'
 'female' 'sysBP' 'diaBP']


Unnamed: 0,12drinksayear,5drinks2hours,5drinksaday,BMI,LDL,age,diabetes,fasting_glucose,race,sitting,sleeping,sugar,male,female,sysBP,diaBP
0,1.0,0.0,1.0,30.8,3.0,53.0,0.0,5.59,3.0,5.0,8.0,0.75,1,0,140.0,86.0
1,1.0,0.0,1.0,28.8,2.0,78.0,1.0,4.66,3.0,8.0,7.0,1.6,1,0,135.333333,45.333333
2,0.0,0.0,0.0,20.3,2.0,42.0,0.0,4.66,4.0,9.0,7.718424,2.75,0,1,104.0,60.0
3,0.0,0.0,0.0,28.6,2.0,72.0,0.0,5.93,1.0,0.166667,9.0,0.07,0,1,119.333333,58.666667
4,1.0,0.0,1.0,28.0,2.0,22.0,0.0,5.27,4.0,9.0,6.5,1.085,1,0,111.333333,72.666667


* Numerical variables: '5drinks2hours', 'BMI', 'age', 'fasting_glucose', 'sitting', 'sleeping', 'sugar', 'sysBP', 'diaBP'
* Categorical variables: '12drinksayear', '5drinksaday', 'LDL', 'diabetes', 'race', 'male', 'female'

In [5]:
# Columns that are standardized (zero mean, unit variance) in the next step.
numerical_variables = [
    '5drinks2hours',
    'BMI',
    'age',
    'fasting_glucose',
    'sitting',
    'sleeping',
    'sugar',
    'sysBP',
    'diaBP',
]

# Columns treated as discrete levels; these are left unscaled.
categorical_variables = [
    '12drinksayear',
    '5drinksaday',
    'LDL',
    'diabetes',
    'race',
    'male',
    'female',
]

### 1. Scale numerical variables

In [6]:
# Standardize every numerical column. Looping over numerical_variables keeps
# this step in sync with the list instead of repeating each column name by
# hand, so adding or removing a numerical column only requires editing the list.
for feature in numerical_variables:
    df = scaler(df, feature)

Now compare before scaling and after scaling: 

In [7]:
# For each numerical column, print summary statistics of the raw values (df0)
# followed by the scaled values (df), then a separator line.
for feature in numerical_variables:
    print(
        'Before scaling:',
        df0[feature].describe(),
        'After scaling:',
        df[feature].describe(),
        '-------------------------',
        sep='\n',
    )

Before scaling:
count    2600.000000
mean        0.203846
std         1.399439
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max        20.000000
Name: 5drinks2hours, dtype: float64
After scaling:
count    2.600000e+03
mean     6.410470e-16
std      1.000192e+00
min     -1.456907e-01
25%     -1.456907e-01
50%     -1.456907e-01
75%     -1.456907e-01
max      1.414849e+01
Name: 5drinks2hours, dtype: float64
-------------------------
Before scaling:
count    2600.000000
mean       29.215271
std         6.930430
min        15.100000
25%        24.300000
50%        28.200000
75%        32.900000
max        64.500000
Name: BMI, dtype: float64
After scaling:
count    2.600000e+03
mean    -1.253484e-16
std      1.000192e+00
min     -2.037101e+00
25%     -7.093667e-01
50%     -1.465229e-01
75%      5.317761e-01
max      5.092255e+00
Name: BMI, dtype: float64
-------------------------
Before scaling:
count    2600.000000
mean       48.601538
std        18.47

In [8]:
# First rows of the numerical columns before scaling.
df0.head()[numerical_variables]

Unnamed: 0,5drinks2hours,BMI,age,fasting_glucose,sitting,sleeping,sugar,sysBP,diaBP
0,0.0,30.8,53.0,5.59,5.0,8.0,0.75,140.0,86.0
1,0.0,28.8,78.0,4.66,8.0,7.0,1.6,135.333333,45.333333
2,0.0,20.3,42.0,4.66,9.0,7.718424,2.75,104.0,60.0
3,0.0,28.6,72.0,5.93,0.166667,9.0,0.07,119.333333,58.666667
4,0.0,28.0,22.0,5.27,9.0,6.5,1.085,111.333333,72.666667


In [9]:
# First rows of the numerical columns after scaling.
df.head()[numerical_variables]

Unnamed: 0,5drinks2hours,BMI,age,fasting_glucose,sitting,sleeping,sugar,sysBP,diaBP
0,-0.145691,0.228706,0.238124,-0.305181,-0.346925,0.1779442,-0.748791,0.861028,1.443065
1,-0.145691,-0.059931,1.591575,-0.722354,0.545482,-0.4540141,-0.177046,0.598762,-1.928683
2,-0.145691,-1.286642,-0.357394,-0.722354,0.84295,-1.122584e-15,0.596492,-1.162165,-0.712643
3,-0.145691,-0.088795,1.266747,-0.152666,-1.78469,0.8099026,-1.206187,-0.300435,-0.823192
4,-0.145691,-0.175387,-1.440155,-0.448724,0.84295,-0.7699933,-0.523456,-0.750033,0.337574


### 2. Reduce levels in categorical variables

In [10]:
# Show how many rows fall into each level of every categorical column,
# with a blank line after each table.
for feature in categorical_variables:
    print(df[feature].value_counts(), end='\n\n')

1.0    1817
0.0     783
Name: 12drinksayear, dtype: int64

0.0    2224
1.0     376
Name: 5drinksaday, dtype: int64

2.0    1358
1.0    1007
3.0     235
Name: LDL, dtype: int64

0.0    2210
1.0     390
Name: diabetes, dtype: int64

3.0    872
4.0    545
1.0    439
2.0    348
6.0    298
7.0     98
Name: race, dtype: int64

0    1321
1    1279
Name: male, dtype: int64

1    1321
0    1279
Name: female, dtype: int64



The number of levels in each categorical variable is manageable, so we leave them as they are for now.

### Export

In [11]:
# Write the scaled frame for the next part of the project. The index is
# omitted, and the comma separator is to_csv's default.
df.to_csv('Diabetes Project Part 2.csv', index=False)