#### Scaling - Normalization & Standardization

In [0]:
# imports
# hides all warnings
import warnings
warnings.filterwarnings('ignore')
# import pandas for spark
import pyspark.pandas as ps
# pandas 
import pandas as pd
# numpy
import numpy as np

In [0]:
# handle nulls
# replace nulls in the data frame columns with a per-column statistic chosen by replBy
def HandleNulls(df, replBy, lExclCols=None):
    """
    desc:
        HandleNulls - handle nulls in all cols of df except lExclCols;
                    nulls in a column are replaced by the statistic named
                    in replBy, computed on that same column
    usage:
        HandleNulls(df, replBy, lExclCols)
    params:
        df - dataframe (modified in place and also returned)
        replBy - "mean", "median", "minimum" (smaller of mean & median),
                 "maximum" (larger of mean & median)
        lExclCols - col name or list of col names to ignore while
                    transforming; default is no exclusions
    returns:
        the same df object with nulls replaced
    raises:
        ValueError - if an excluded col is not a column of df, or if replBy
                     is not one of the supported values and at least one
                     column actually contains nulls
    """
    # normalise exclusions to a list (None -> nothing excluded)
    if lExclCols is None:
        lExclCols = []
    elif not isinstance(lExclCols, list):
        lExclCols = [lExclCols]

    # columns to process = all columns minus the excluded ones
    colNames = df.columns.to_list()
    for vExclCol in lExclCols:
        if vExclCol not in colNames:
            raise ValueError(
                f"excluded column {vExclCol!r} not found in dataframe columns"
            )
        colNames.remove(vExclCol)

    # handle nulls for each col
    for colName in colNames:
        # columns without nulls are left untouched
        if df[colName].isnull().sum() == 0:
            continue
        if replBy == "mean":
            replVals = df[colName].mean()
        elif replBy == "median":
            replVals = df[colName].median()
        elif replBy == "minimum":
            replVals = min(df[colName].mean(), df[colName].median())
        elif replBy == "maximum":
            replVals = max(df[colName].mean(), df[colName].median())
        else:
            # previously this fell through and failed on an unbound replVals
            raise ValueError(
                "replBy must be 'mean', 'median', 'minimum' or 'maximum'; "
                f"got {replBy!r}"
            )
        # replace
        df[colName] = df[colName].fillna(replVals)
    return df

# convenience wrapper: nulls -> column mean
def HandleNullsWithMean(df, lExclCols=[]):
    """Replace nulls in every col of df (except lExclCols) with the col mean.

    Delegates to HandleNulls with replBy="mean" and returns its result.
    """
    return HandleNulls(df, "mean", lExclCols)

# convenience wrapper: nulls -> column median
def HandleNullsWithMedian(df, lExclCols=[]):
    """Replace nulls in every col of df (except lExclCols) with the col median.

    Delegates to HandleNulls with replBy="median" and returns its result.
    """
    return HandleNulls(df, "median", lExclCols)

# convenience wrapper: nulls -> min(mean, median) of the column
def HandleNullsWithMinOfMM(df, lExclCols=[]):
    """Replace nulls in every col of df (except lExclCols) with the smaller
    of the col mean and col median.

    Delegates to HandleNulls with replBy="minimum" and returns its result.
    """
    return HandleNulls(df, "minimum", lExclCols)

# convenience wrapper: nulls -> max(mean, median) of the column
def HandleNullsWithMaxOfMM(df, lExclCols=[]):
    """Replace nulls in every col of df (except lExclCols) with the larger
    of the col mean and col median.

    Delegates to HandleNulls with replBy="maximum" and returns its result.
    """
    return HandleNulls(df, "maximum", lExclCols)

In [0]:
# normalization - single col
# x_scaled = (x - min(x)) / (max(x) - min(x))
def colNormalization(colName, colValues):
    """Min-max normalise the values of one column.

    params:
        colName - name of the column; not used in the calculation
        colValues - values of the column; must support len(), .min(), .max()
                    and element-wise arithmetic (ScaleData passes
                    df[colName].values)
    returns:
        the scaled values, (x - min) / (max - min), so the smallest value
        maps to 0 and the largest to 1. A column whose values are all equal
        maps to all zeros, and an empty input is returned unchanged.
    """
    # nothing to scale; .min() on an empty NumPy array would raise
    if len(colValues) == 0:
        return colValues
    lowest = colValues.min()
    highest = colValues.max()
    valueRange = highest - lowest
    # constant column: avoid 0/0 (NaN) by dividing the all-zero numerator by 1
    if valueRange == 0:
        valueRange = 1
    return (colValues - lowest) / valueRange

# standardization - single col
# x_scaled = (x - mean(x)) / stddev(x)
def colStandardization(colName, colValues):
    """Standardise the values of one column to zero mean and unit spread.

    params:
        colName - name of the column; not used in the calculation
        colValues - values of the column; must support .mean(), .std() and
                    element-wise arithmetic (ScaleData passes
                    df[colName].values)
    returns:
        the scaled values, (x - mean) / std. A column with zero standard
        deviation (all values equal, or a single value) maps to all zeros.

    NOTE(review): the standard deviation is whatever colValues.std() returns;
    for a NumPy array that is the population value (ddof=0), for a pandas
    Series it would be the sample value (ddof=1) - confirm which is intended.
    """
    centre = colValues.mean()
    spread = colValues.std()
    # zero spread: avoid 0/0 (NaN) by dividing the all-zero numerator by 1
    if spread == 0:
        spread = 1
    return (colValues - centre) / spread

# scale data
def ScaleData(df, type='N', lExclCols=None):
    """
    desc:
        scale data - every col of df is scaled except lExclCols and cols of
        dtype 'object' (these are skipped)

        type 'N' - normalize:
            x_scaled = (x - min(x)) / (max(x) - min(x))
            the smallest value of a col maps to 0 and the largest to 1
        type 'S' - standardize:
            x_scaled = (x - mean(x)) / stddev(x)
            transforms a col to have a mean of zero and a standard
            deviation of 1
    usage:
        ScaleData(df, 'N', lExclCols)
        ScaleData(df, 'S', lExclCols)
    params:
        df - dataframe (modified in place and also returned)
        type - 'N' or 'S'; any other value is treated as 'N'
               (the name shadows the builtin but is kept for callers that
               pass it by keyword)
        lExclCols - col name or list of col names to ignore while
                    transforming; default is no exclusions
    returns:
        the same df object with the selected cols scaled
    raises:
        ValueError - if an excluded col is not a column of df
    """
    # check type - anything unrecognised falls back to normalization
    if type not in ('N', 'S'):
        type = 'N'

    # normalise exclusions to a list (None -> nothing excluded)
    if lExclCols is None:
        lExclCols = []
    elif not isinstance(lExclCols, list):
        lExclCols = [lExclCols]

    # columns to process = all columns minus the excluded ones
    colNames = df.columns.to_list()
    for vExclCol in lExclCols:
        if vExclCol not in colNames:
            raise ValueError(
                f"excluded column {vExclCol!r} not found in dataframe columns"
            )
        colNames.remove(vExclCol)

    # pick the per-column scaler once instead of re-testing type per column
    scaleCol = colStandardization if type == 'S' else colNormalization

    # scale each col
    for colName in colNames:
        # object cols cannot be scaled arithmetically
        if df[colName].dtypes == 'object':
            continue
        colValues = scaleCol(colName, df[colName].values)
        # assigned back as a plain list; presumably because pandas-on-Spark
        # column assignment takes lists rather than NumPy arrays - confirm
        df[colName] = colValues.tolist()

    return df


In [0]:
# Read CSV

# Where the file lives and what format it is
file_location = "/FileStore/tables/test/california_housing.csv"
file_type = "csv"

# Reader options for CSV input
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# These options only matter for CSV; other file types ignore them.
df = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load(file_location)
)

display(df)

In [0]:
# Wrap the Spark dataframe read above in a pandas-on-Spark dataframe so the
# pandas-style helper functions in this notebook can be applied to it.
psdf = ps.DataFrame(df)
# preview the first five rows
print(psdf.head(5))

In [0]:
# info
# info() prints the summary itself and returns None, so it is called directly;
# wrapping it in print() only added a stray "None" line to the output.
psdf.info()

In [0]:
# drop ocean proximity
# del col
psdf = psdf.drop('ocean_proximity', axis=1)
# info() prints the summary itself and returns None, so it is called directly;
# wrapping it in print() only added a stray "None" line to the output.
psdf.info()

In [0]:
# check nulls
# per-column count of null values, taken before the nulls are filled
print('\n*** Columns With Nulls ***')
print(psdf.isnull().sum()) 

In [0]:
# handle nulls if required
# every column that contains nulls gets them replaced by that column's mean;
# no columns are excluded here
print('\n*** Handle Nulls ***')
psdf = HandleNullsWithMean(psdf)
print("Done ...")

In [0]:
# check nulls
# per-column count of null values again, to see the result of the fill step
print('\n*** Columns With Nulls ***')
print(psdf.isnull().sum()) 

**Normalize Data**

In [0]:
# copy dataframe
# work on a copy so psdf stays available for the standardization run below
psdfn = psdf.copy()

In [0]:
# check mean
# column means before normalization, for comparison with the means printed
# after scaling
print('\n*** Mean In Columns ***')
print(psdfn.mean())

In [0]:
# handle normalization
# type 'N' = normalization; the four listed columns are excluded from scaling
# and keep their original values
print('\n*** Normalize Data ***')
psdfn = ScaleData(psdfn, 'N', ['ser','latitude','longitude','median_house_value'])
print('Done ...')

In [0]:
# check mean
# column means after normalization; the excluded columns should be unchanged
print('\n*** Mean In Columns ***')
print(psdfn.mean())

**Standardize Data**

In [0]:
# copy dataframe
# fresh copy of the null-handled data, so standardization does not touch psdf
psdfs = psdf.copy()

In [0]:
# check mean
# column means before standardization, for comparison with the means printed
# after scaling
print('\n*** Mean In Columns ***')
print(psdfs.mean())

In [0]:
# handle standardization
# type 'S' = standardization; the four listed columns are excluded from
# scaling and keep their original values
print('\n*** Standardize Data ***')
psdfs = ScaleData(psdfs, 'S', ['ser','latitude','longitude','median_house_value'])
print('Done ...')

In [0]:
# check mean
# column means after standardization; the excluded columns should be unchanged
print('\n*** Mean In Columns ***')
print(psdfs.mean())