In [0]:
%run "../Util/Util Functions"

In [0]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

import random

class ClenseNumeric:
  """
  Methods for cleaning and validating dataframe columns of numeric type.
  
  Attributes
  ----------
  ErrorValue : str
    Value to return on error of user defined function.
    NOTE(review): not referenced by any method visible in this class.
  
  Methods
  -------
  clnMinMaxNormalise(df, columns, ErrorAction, ErrorValue, ErrorReplaceValue=None)
    Adds a min-max scaled "<column>_Scaled" column for each listed column.
  """
  
  ErrorValue = "Failed Numeric Cleansing"
  
  @staticmethod
  def clnMinMaxNormalise(df, columns: list, ErrorAction: str, ErrorValue, ErrorReplaceValue=None):
    """
    Applies Min Max Normalisation to all values in a column for each column listed in the dataframe.
    
    For every name in `columns` a new column "<name>_Scaled" is appended,
    holding the MinMaxScaler output rounded to 3 decimal places. The original
    column is kept; the intermediate "<name>_Vect" column is dropped.
    
    Declared as a static method because it uses no instance or class state;
    it can be called on the class or on an instance.
    
    Parameters
    ----------
    df : DataFrame
    columns : list
      List of string column names to normalise. A single column name passed
      as a bare string is treated as a one-element list.
    ErrorAction : str {'ContinueAndNullValue', 'ContinueAndDropRow', 'ContinueAndReplaceValue', 'Stop'}
      Accepted for interface compatibility; not used by this implementation.
    ErrorValue
      Accepted for interface compatibility; not used by this implementation.
    ErrorReplaceValue : str
      Accepted for interface compatibility; not used by this implementation.
    
    Returns
    -------
    df_cleaned : DataFrame
      Spark DataFrame with the additional "<name>_Scaled" columns
    df_errors : DataFrame
      Empty DataFrame with the same schema as df_cleaned (this method does
      not collect failed records)
    """
    
    # A bare string would otherwise be iterated character by character,
    # producing confusing "column not found" failures.
    if isinstance(columns, str):
      columns = [columns]
    
    # UDF for converting the single-element vector produced by the scaler
    # back to a double, rounded to 3 decimal places
    unlist = udf(lambda x: round(float(list(x)[0]),3), DoubleType())

    # Iterating over columns to be scaled
    for c_name in columns:
      vect_col = c_name + "_Vect"
      scaled_col = c_name + "_Scaled"
      # VectorAssembler Transformation - MinMaxScaler needs a vector input column
      assembler = VectorAssembler(inputCols=[c_name], outputCol=vect_col)
      # MinMaxScaler Transformation
      scaler = MinMaxScaler(inputCol=vect_col, outputCol=scaled_col)
      # Pipeline of VectorAssembler and MinMaxScaler
      pipeline = Pipeline(stages=[assembler, scaler])
      # Fit on the dataframe, then flatten the scaled vector and drop the temporary vector column
      df = (
        pipeline.fit(df)
        .transform(df)
        .withColumn(scaled_col, unlist(scaled_col))
        .drop(vect_col)
      )
    
    # blank df for error handling; relies on the `spark` session global
    # being defined outside this block (e.g. by the notebook environment)
    df_blank = spark.createDataFrame([], df.schema)
    
    return df, df_blank