In [142]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels as sm
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, Binarizer

# Prepare data for Machine Learning

In [183]:
#Scripts for preprocessing data for machine learning
#Borrowed heavily from https://machinelearningmastery.com/prepare-data-machine-learning-python-scikit-learn/


def prepare_data_from_csv(file_path_as_string, y_column_name, column_names=None, header=0):
    
    '''Import and prepare data'''
    
    dataframe = pd.read_csv(file_path_as_string, names=column_names, delimiter=',',header=header)
    if header != None:
        dataframe.columns = [x.replace(' ', '_') for x in dataframe.columns]
    print(dataframe.head())
    
    X = dataframe.drop(y_column_name, axis=1)
    y = dataframe[y_column_name]
    
    prepared_df = dataframe
    return prepared_df, X, y


def describe_data(prepared_df):
    
    ''' Print shape and descriptive statistics
        Returns printed shape and descriptive statistics
    '''
    
    print('Shape: ','\n'+'--'*25 + f'\n{prepared_df.shape}')
    print('--'*25)
    print('Nulls: ', '\n'+'--'*25 + f'\n{prepared_df.isnull().sum()}')
    print('--'*25, '\n'+'--'*25)
    print('Describe: ', '\n'+'--'*25 + f'\n{prepared_df.describe()}')
    print('--'*25, '\n'+'--'*25)
    
def rescale_data(X): 
    
    '''Rescale data to range from 0 to 1 
        Returns: scaler and rescaledX'''
    
    scaler = MinMaxScaler(feature_range=(0, 1))
    rescaledX = scaler.fit_transform(X)
    
    # summarize transformed data
    np.set_printoptions(precision=3)
    print(rescaledX[0:5,:])
    
    return scaler, rescaledX


def standardize_data(X):
    
    ''''''
    
    stand_scaler = StandardScaler().fit(X)
    stand_rescaledX = scaler.transform(X)
    
    # summarize transformed data
    np.set_printoptions(precision=3)
    print(rescaledX[0:5,:])
    
    return stand_scaler, stand_rescaledX

    
    
def normalize_data(X):
    
    '''rescaling each observation (row) to have a length of 1 (called a unit norm in linear algebra). This 
    preprocessing can be useful for sparse datasets (lots of zeros) with attributes of varying scales when 
    using algorithms that weight input values such as neural networks and algorithms that use distance measures 
    such as K-Nearest Neighbors.
    
    Input: data
    Return: normalizedX(df)'''
    
    scaler = Normalizer().fit(X)
    normalizedX = scaler.transform(X)
    
    # summarize transformed data
    np.set_printoptions(precision=3)
    print(normalizedX[0:5,:])
    
    return scaler, normalizedX
    
def binarize_data(X):
    
    binarizer = Binarizer(threshold=0.0).fit(X)
    binaryX = binarizer.transform(X)
    
    # summarize transformed data
    np.set_printoptions(precision=3)
    print(binaryX[0:5,:])
    
    return binarizer, binaryX

In [184]:
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
prepared_df, X, y = prepare_data_from_csv('pima-indians-diabetes.data copy.csv', 'class', column_names=names, header=0)

   preg  plas  pres  skin  test  mass   pedi  age  class
0     1    85    66    29     0  26.6  0.351   31      0
1     8   183    64     0     0  23.3  0.672   32      1
2     1    89    66    23    94  28.1  0.167   21      0
3     0   137    40    35   168  43.1  2.288   33      1
4     5   116    74     0     0  25.6  0.201   30      0


In [185]:
describe_data(prepared_df)

Shape:  
--------------------------------------------------
(767, 9)
--------------------------------------------------
Nulls:  
--------------------------------------------------
preg     0
plas     0
pres     0
skin     0
test     0
mass     0
pedi     0
age      0
class    0
dtype: int64
-------------------------------------------------- 
--------------------------------------------------
Describe:  
--------------------------------------------------
             preg        plas        pres        skin        test        mass  \
count  767.000000  767.000000  767.000000  767.000000  767.000000  767.000000   
mean     3.842243  120.859192   69.101695   20.517601   79.903520   31.990482   
std      3.370877   31.978468   19.368155   15.954059  115.283105    7.889091   
min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
25%      1.000000   99.000000   62.000000    0.000000    0.000000   27.300000   
50%      3.000000  117.000000   72.000000   23.000000   

In [189]:
resc_x = rescale_data(X)

[[0.059 0.427 0.541 0.293 0.    0.396 0.117 0.167]
 [0.471 0.92  0.525 0.    0.    0.347 0.254 0.183]
 [0.059 0.447 0.541 0.232 0.111 0.419 0.038 0.   ]
 [0.    0.688 0.328 0.354 0.199 0.642 0.944 0.2  ]
 [0.294 0.583 0.607 0.    0.    0.382 0.053 0.15 ]]


In [190]:
stand_x = standardize_data(X)

[[0.353 0.744 0.59  0.354 0.    0.501 0.234 0.483]
 [0.059 0.427 0.541 0.293 0.    0.396 0.117 0.167]
 [0.471 0.92  0.525 0.    0.    0.347 0.254 0.183]
 [0.059 0.447 0.541 0.232 0.111 0.419 0.038 0.   ]
 [0.    0.688 0.328 0.354 0.199 0.642 0.944 0.2  ]]


In [191]:
norm_x = normalize_data(X)

[[0.008 0.716 0.556 0.244 0.    0.224 0.003 0.261]
 [0.04  0.924 0.323 0.    0.    0.118 0.003 0.162]
 [0.007 0.588 0.436 0.152 0.622 0.186 0.001 0.139]
 [0.    0.596 0.174 0.152 0.731 0.188 0.01  0.144]
 [0.035 0.81  0.517 0.    0.    0.179 0.001 0.209]]


In [192]:
bin_x = binarize_data(X)

[[1. 1. 1. 1. 0. 1. 1. 1.]
 [1. 1. 1. 0. 0. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 0. 0. 1. 1. 1.]]
