In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels as sm
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, Binarizer

# Prepare data for Machine Learning

In [25]:
#Scripts for preprocessing data for machine learning
#Borrowed generously from https://machinelearningmastery.com/prepare-data-machine-learning-python-scikit-learn/


def prepare_data_from_csv(file_path_as_string, y_column_name, column_names=None, header=0):

    '''Import a comma-separated file and split it into features and target.

    Input:
        file_path_as_string: location of the CSV file; passed straight to
            pandas.read_csv.
        y_column_name: label of the target column. When header is not None
            the column labels have their spaces replaced by underscores, and
            the same replacement is applied to this name, so both
            'my target' and 'my_target' select the same column.
        column_names: optional list of column labels forwarded to
            pandas.read_csv as `names`.
        header: row number to use as the header, or None when the file has
            no header row (forwarded to pandas.read_csv).
    Return: (prepared_df, X, y) -- the full dataframe, the dataframe without
        the target column, and the target column.
    '''

    def _underscored(label):
        # Only string labels can contain spaces; leave any other label as is.
        return label.replace(' ', '_') if isinstance(label, str) else label

    dataframe = pd.read_csv(file_path_as_string, names=column_names, delimiter=',', header=header)
    if header is not None:
        dataframe.columns = [_underscored(label) for label in dataframe.columns]
        # Keep the requested target name in sync with the renamed columns;
        # otherwise a name containing a space could never be found.
        y_column_name = _underscored(y_column_name)
    print(dataframe.head())

    X = dataframe.drop(y_column_name, axis=1)
    y = dataframe[y_column_name]

    return dataframe, X, y


def describe_data(prepared_df):

    '''Print an overview of a dataframe: its shape, the number of nulls in
    each column, and the output of describe(), separated by dashed rules.

    Input: dataframe to summarize
    Return: None (the report is written to standard output)
    '''

    rule = '-' * 50
    below_title = '\n' + rule          # rule printed on the line under a title
    section_end = (rule, below_title)  # closing pair of rules after a section

    print('Shape: ', f'{below_title}\n{prepared_df.shape}')
    print(rule)
    print('Nulls: ', f'{below_title}\n{prepared_df.isnull().sum()}')
    print(*section_end)
    print('Describe: ', f'{below_title}\n{prepared_df.describe()}')
    print(*section_end)
    
def rescale_data(X, feature_range=(0, 1)):

    '''When your data is comprised of attributes with varying scales, many machine learning algorithms can benefit
    from rescaling the attributes to all have the same scale. Often this is referred to as normalization and
    attributes are often rescaled into the range between 0 and 1. This is useful for optimization algorithms used
    in the core of machine learning algorithms like gradient descent. It is also useful for algorithms that weight
    inputs like regression and neural networks and algorithms that use distance measures like K-Nearest Neighbors.

    Input:
        X: dataframe data to be used for features
        feature_range: (min, max) tuple passed to MinMaxScaler; defaults to (0, 1)
    Return: scaled data, as returned by MinMaxScaler.fit_transform
    '''

    scaler = MinMaxScaler(feature_range=feature_range)
    rescaledX = scaler.fit_transform(X)

    # Summarize transformed data. The 3-digit precision is scoped to this
    # preview so NumPy's global print options are left as the caller set them.
    with np.printoptions(precision=3):
        print(rescaledX[:5])

    return rescaledX


def standardize_data(X):

    '''Standardization is a useful technique to transform attributes with a Gaussian distribution and differing
    means and standard deviations to a standard Gaussian distribution with a mean of 0 and a standard deviation of
    1. It is most suitable for techniques that assume a Gaussian distribution in the input variables and work better
    with rescaled data, such as linear regression, logistic regression and linear discriminant analysis.

    Input: dataframe data to be used for features
    Return: standardized data, as returned by StandardScaler.transform
    '''

    stand_scaler = StandardScaler().fit(X)
    stand_rescaledX = stand_scaler.transform(X)

    # Summarize transformed data. The 3-digit precision is scoped to this
    # preview so NumPy's global print options are left as the caller set them.
    with np.printoptions(precision=3):
        print(stand_rescaledX[:5])

    return stand_rescaledX

    
    
def normalize_data(X, norm='l2'):

    '''Rescale each observation (row) to have a length of 1 (called a unit norm in linear algebra). This
    preprocessing can be useful for sparse datasets (lots of zeros) with attributes of varying scales when
    using algorithms that weight input values such as neural networks and algorithms that use distance measures
    such as K-Nearest Neighbors.

    Input:
        X: dataframe data to be used for features
        norm: name of the norm passed to Normalizer; defaults to 'l2'
    Return: normalized data, as returned by Normalizer.transform
    '''

    norm_scaler = Normalizer(norm=norm).fit(X)
    normalizedX = norm_scaler.transform(X)

    # Summarize transformed data. The 3-digit precision is scoped to this
    # preview so NumPy's global print options are left as the caller set them.
    with np.printoptions(precision=3):
        print(normalizedX[:5])

    return normalizedX
    
def binarize_data(X, threshold=0.0):

    '''You can transform your data using a binary threshold. All values above the threshold are marked 1 and all
    equal to or below are marked as 0. This is called binarizing or thresholding your data. It can be useful
    when you have probabilities that you want to make crisp values. It is also useful when feature engineering and
    you want to add new features that indicate something meaningful.

    Input:
        X: dataframe data to be used for features
        threshold: cut-off value passed to Binarizer; defaults to 0.0
    Return: binarized data, as returned by Binarizer.transform
    '''

    binarizer = Binarizer(threshold=threshold).fit(X)
    binaryX = binarizer.transform(X)

    # Summarize transformed data. The 3-digit precision is scoped to this
    # preview so NumPy's global print options are left as the caller set them.
    with np.printoptions(precision=3):
        print(binaryX[:5])

    return binaryX

### Tests

In [32]:
# names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
# prepared_df, X, y = prepare_data_from_csv('pima-indians-diabetes.data copy.csv', 'class', column_names=names, header=0)

In [33]:
#describe_data(prepared_df)

In [34]:
#resc_x = rescale_data(X)

In [35]:
#stand_x = standardize_data(X)

In [36]:
#norm_x = normalize_data(X)

In [37]:
#bin_x = binarize_data(X)