In [79]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels as sm
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, Binarizer

# Prepare data for Machine Learning

In [104]:
#Scripts for preprocessing data for machine learning
#Borrowed generously from https://machinelearningmastery.com/prepare-data-machine-learning-python-scikit-learn/


class Prepare:  
    '''
    Prepare data for machine learning in Python using scikit-learn.
    
    Functions: prepare_data_from_csv, describe_data, rescale_data, standardize_data, normalize_data, binarize_data
    Input: file_path_as_string, y_column_name, column_names=None, header=0
    '''
    
    def __init__(self, file_path_as_string, y_column_name, column_names, header):
        self.file_path_as_string = file_path_as_string 
        self.y_column_name  = y_column_name
        self.column_names = column_names
        self.header = 0



    def prepare_data_from_csv(self):

        '''Import and prepare data'''

        dataframe = pd.read_csv(self.file_path_as_string, names=self.column_names, delimiter=',',header=self.header)
        
        if self.header != None:
            dataframe.columns = [x.replace(' ', '_') for x in dataframe.columns]
        print(dataframe.head())

        self.X = dataframe.drop(self.y_column_name, axis=1)
        self.y = dataframe[self.y_column_name]

        prepared_df = dataframe
        #self.prepared_df = dataframe
        
        return prepared_df, self.X, self.y #self.prepared_df


    def describe_data(self, prepared_df):

        ''' Print shape and descriptive statistics'''
        
        print('Columns: ','\n'+'--'*25 + f'\n{prepared_df.columns}')
        print('--'*25)
        print('Info: ','\n'+'--'*25 + f'\n{prepared_df.info()}')
        print('--'*25)
        print('Unique: ','\n'+'--'*25 + f'\n{prepared_df.nunique()}')
        print('--'*25)
        print('Nulls: ', '\n'+'--'*25 + f'\n{prepared_df.isnull().sum()}')
        print('--'*25, '\n'+'--'*25)
        print('Describe: ', '\n'+'--'*25 + f'\n{prepared_df.describe()}')
        print('--'*25, '\n'+'--'*25)
        print('Head: ', '\n'+'--'*25 + f'\n{prepared_df.head()}')
        print('--'*25, '\n'+'--'*25)

        
    def rescale_data(self, X): 

        '''
        When your data is comprised of attributes with varying scales, many machine learning algorithms 
        can benefit from rescaling the attributes to all have the same scale.Often this is referred to as 
        normalization and attributes are often rescaled into the range between 0 and 1. This is useful for 
        optimization algorithms in used in the core of machine learning algorithms like gradient descent. 
        It is also useful for algorithms that weight inputs like regression and neural networks and algorithms 
        that use distance measures like K-Nearest Neighbors.

        Input: dataframe data to be used for features
        Return: scaled data
        '''

        scaler = MinMaxScaler(feature_range=(0, 1))
        rescaledX = scaler.fit_transform(self.X)

        # summarize transformed data
        np.set_printoptions(precision=3)
        print(rescaledX[0:5,:])

        return rescaledX


    def standardize_data(self, X):

        '''
        Standardize attributes with a Gaussian distribution and differing means and standard deviations
        to a standard Gaussian distribution with a mean of 0 and a standard deviation of 1. It is most suitable
        for techniques that assume a Gaussian distribution in the input variables and work better with rescaled 
        data, such as linear regression, logistic regression and linear discriminate analysis.

        Input: dataframe data to be used for features
        Return: standardized data
        '''

        stand_scaler = StandardScaler().fit(X)
        stand_rescaledX = stand_scaler.transform(self.X)

        # summarize transformed data
        np.set_printoptions(precision=3)
        print(stand_rescaledX[0:5,:])

        return stand_rescaledX


    def normalize_data(self, X):

        '''
        Rescale each observation (row) to have a length of 1 (called a unit norm in linear algebra). This 
        preprocessing can be useful for sparse datasets (lots of zeros) with attributes of varying scales when 
        using algorithms that weight input values such as neural networks and algorithms that use distance measures 
        such as K-Nearest Neighbors.

        Input: dataframe data to be used for features
        Return: normalized data
        '''

        norm_scaler = Normalizer().fit(self.X)
        normalizedX = norm_scaler.transform(self.X)

        # summarize transformed data
        np.set_printoptions(precision=3)
        print(normalizedX[0:5,:])

        return normalizedX
    

    def binarize_data(self, X):

        '''
        Transform data using a binary threshold. All values above the threshold are marked 1 and all
        equal to or below are marked as 0. This is called binarizing your data or threshold your data. It can
        be useful when you have probabilities that you want to make crisp values. It is also useful when feature
        engineering and you want to add new features that indicate something meaningful.

        Input: dataframe data to be used for features
        Return: binarized data
        '''

        binarizer = Binarizer(threshold=0.0).fit(self.X)
        binaryX = binarizer.transform(X)

        # summarize transformed data
        np.set_printoptions(precision=3)
        print(binaryX[0:5,:])

        return binaryX

### Tests

In [105]:
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
prep_obj = Prepare('data/pima-indians-diabetes.data copy.csv', 'class', names, 0) 
prepared_df, X, y = prep_obj.prepare_data_from_csv()

   preg  plas  pres  skin  test  mass   pedi  age  class
0     1    85    66    29     0  26.6  0.351   31      0
1     8   183    64     0     0  23.3  0.672   32      1
2     1    89    66    23    94  28.1  0.167   21      0
3     0   137    40    35   168  43.1  2.288   33      1
4     5   116    74     0     0  25.6  0.201   30      0


In [106]:
prep_obj.describe_data(prepared_df)

Columns:  
--------------------------------------------------
Index(['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'], dtype='object')
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 767 entries, 0 to 766
Data columns (total 9 columns):
preg     767 non-null int64
plas     767 non-null int64
pres     767 non-null int64
skin     767 non-null int64
test     767 non-null int64
mass     767 non-null float64
pedi     767 non-null float64
age      767 non-null int64
class    767 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.0 KB
Info:  
--------------------------------------------------
None
--------------------------------------------------
Unique:  
--------------------------------------------------
preg      17
plas     136
pres      47
skin      51
test     186
mass     248
pedi     516
age       52
class      2
dtype: int64
--------------------------------------------------
Nulls:  
--------------

In [85]:
#resc_x = prep_obj.rescale_data(X)

In [75]:
#stand_x = prep_obj.standardize_data(X)

In [74]:
#norm_x = prep_obj.normalize_data(X)

In [73]:
#bin_x = prep_obj.binarize_data(X)

In [96]:
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = pd.read_csv('data/pima-indians-diabetes.data copy.csv', delimiter=',', header=None, names=names )

In [97]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
preg     768 non-null int64
plas     768 non-null int64
pres     768 non-null int64
skin     768 non-null int64
test     768 non-null int64
mass     768 non-null float64
pedi     768 non-null float64
age      768 non-null int64
class    768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
