## Cleansing Functions for String Columns

In [0]:
%run "../Util/Util Functions"

In [0]:
from pyspark.sql.functions import when, col
from pyspark.sql.functions import initcap, lower, upper
from pyspark.sql.types import StringType

import re
from rapidfuzz import fuzz

class CleanseString:
  """
  Methods for cleaning and validating dataframe columns of string type.

  Every ``cln_*`` method wraps a plain Python string function with
  ``util.safeUdf``, writes its result for each requested column into a
  companion column named ``<column>__transformed__``, and then hands the
  DataFrame to ``util.handleErrors`` together with the chosen error action.

  Attributes
  ----------
  error_value : str
    Value to return on error of user defined function

  Methods
  -------
  cln_title_case(df, df_errors, columns, error_action, error_replace_value=None)
    Convert strings to Proper Case.
  cln_lower_case(df, df_errors, columns, error_action, error_replace_value=None)
    Convert strings to lower case.
  cln_upper_case(df, df_errors, columns, error_action, error_replace_value=None)
    Convert strings to UPPER CASE.
  cln_regex_replace(df, df_errors, columns, regex_exp, replace_string, error_action, error_replace_value=None)
    Replace regex matches with a fixed string.
  cln_fuzzy_match_replace(df, df_errors, columns, common_strings, similarity_threshold, error_action, error_replace_value=None)
    Replace values with the first sufficiently similar common string.
  """

  error_value = "$$_CleansingTransformationFailed_$$"

  # Suffix of the staging column that receives each transformed value.
  _TRANSFORMED_SUFFIX = "__transformed__"

  # Shared plumbing

  def _apply_transform(self, df, df_errors, columns: list, transform, function_name: str, error_action: str, error_replace_value=None):
    """
    Applies a Python string function to each listed column and resolves errors.

    Parameters
    ----------
    df : DataFrame
    df_errors: DataFrame
      Spark DataFrame to write errors to
    columns : list
      List of string columns to transform
    transform : callable
      Function taking one string and returning the cleaned string
    function_name : str
      Name of the public cleaning method, forwarded to util.handleErrors
    error_action : str {'continue_and_null_value', 'continue_and_drop_row', 'continue_and_replace_value', 'stop'}
    error_replace_value : str

    Returns
    -------
    df_cleaned : DataFrame
      Cleaned Spark DataFrame
    df_errors : DataFrame
      DataFrame containing records where cleaning failed
    """
    # safeUdf is given error_value as the value to return when `transform` fails.
    safe_transform = util.safeUdf(transform, StringType(), self.error_value)

    # NOTE(review): results are staged in "<column>__transformed__" for every
    # method so that all of them feed util.handleErrors the same way;
    # presumably handleErrors reads that staging column - confirm against
    # "Util Functions".
    for c_name in columns:
      df = df.withColumn(c_name + self._TRANSFORMED_SUFFIX, safe_transform(col(c_name)))

    df_cleaned, df_errors = util.handleErrors(df, df_errors, columns, function_name, error_action, self.error_value, error_replace_value)

    return df_cleaned, df_errors

  # Cleaning Methods

  def cln_title_case(self, df, df_errors, columns: list, error_action: str, error_replace_value=None):
    """
    Converts all strings to Proper Case for each column listed in the dataframe.

    Parameters
    ----------
    df : DataFrame
    df_errors: DataFrame
      Spark DataFrame to write errors to
    columns : list
      List of string columns to convert to Proper Case
    error_action : str {'continue_and_null_value', 'continue_and_drop_row', 'continue_and_replace_value', 'stop'}
    error_replace_value : str

    Returns
    -------
    df_cleaned : DataFrame
      Cleaned Spark DataFrame
    df_errors : DataFrame
      DataFrame containing records where cleaning failed
    """
    # Written as a Python UDF (str.title) rather than the built-in initcap so
    # that failures go through util.safeUdf and are reported via error_value.
    def title_case(string: str):
      return string.title()

    return self._apply_transform(df, df_errors, columns, title_case, "cln_title_case", error_action, error_replace_value)

  def cln_lower_case(self, df, df_errors, columns: list, error_action: str, error_replace_value=None):
    """
    Converts all strings to lower case for each column listed in the dataframe.

    Parameters
    ----------
    df : DataFrame
    df_errors: DataFrame
      Spark DataFrame to write errors to
    columns : list
      List of string columns to convert to lower case
    error_action : str {'continue_and_null_value', 'continue_and_drop_row', 'continue_and_replace_value', 'stop'}
    error_replace_value : str

    Returns
    -------
    df_cleaned : DataFrame
      Cleaned Spark DataFrame
    df_errors : DataFrame
      DataFrame containing records where cleaning failed
    """
    # Python UDF (str.lower) so that failures are reported via error_value.
    def lower_case(string: str):
      return string.lower()

    return self._apply_transform(df, df_errors, columns, lower_case, "cln_lower_case", error_action, error_replace_value)

  def cln_upper_case(self, df, df_errors, columns: list, error_action: str, error_replace_value=None):
    """
    Converts all strings to UPPER CASE for each column listed in the dataframe.

    Parameters
    ----------
    df : DataFrame
    df_errors: DataFrame
      Spark DataFrame to write errors to
    columns : list
      List of string columns to convert to UPPER CASE
    error_action : str {'continue_and_null_value', 'continue_and_drop_row', 'continue_and_replace_value', 'stop'}
    error_replace_value : str

    Returns
    -------
    df_cleaned : DataFrame
      Cleaned Spark DataFrame
    df_errors : DataFrame
      DataFrame containing records where cleaning failed
    """
    # Python UDF (str.upper) so that failures are reported via error_value.
    def upper_case(string: str):
      return string.upper()

    return self._apply_transform(df, df_errors, columns, upper_case, "cln_upper_case", error_action, error_replace_value)

  def cln_regex_replace(self, df, df_errors, columns: list, regex_exp: str, replace_string: str, error_action: str, error_replace_value=None):
    """
    Replaces values matching a regex expression with the specified value for each column listed in the dataframe.

    Parameters
    ----------
    df : DataFrame
    df_errors: DataFrame
      Spark DataFrame to write errors to
    columns : list
      List of string columns to apply the regex replacement to
    regex_exp : str
      Regular expression used to identify values to replace
    replace_string : str
      The value to replace strings matched by the regex
    error_action : str {'continue_and_null_value', 'continue_and_drop_row', 'continue_and_replace_value', 'stop'}
    error_replace_value : str

    Returns
    -------
    df_cleaned : DataFrame
      Cleaned Spark DataFrame
    df_errors : DataFrame
      DataFrame containing records where cleaning failed
    """
    # re.sub replaces every non-overlapping match of regex_exp in the value.
    def regex_replace(string: str):
      return re.sub(regex_exp, replace_string, string)

    return self._apply_transform(df, df_errors, columns, regex_replace, "cln_regex_replace", error_action, error_replace_value)

  def cln_fuzzy_match_replace(self, df, df_errors, columns: list, common_strings: list, similarity_threshold: float, error_action: str, error_replace_value=None):
    """
    Replaces each value with the first string in a common list that it is similar to, for each column listed in the dataframe.

    Values that are not similar enough to any of the common strings are left unchanged.

    Parameters
    ----------
    df : DataFrame
    df_errors: DataFrame
      Spark DataFrame to write errors to
    columns : list
      List of string columns to fuzzy match
    common_strings : list
      List of common strings to fuzzy search for, checked in list order
    similarity_threshold : float [0,100]
      The threshold for similarity before replacing
    error_action : str {'continue_and_null_value', 'continue_and_drop_row', 'continue_and_replace_value', 'stop'}
    error_replace_value : str

    Returns
    -------
    df_cleaned : DataFrame
      Cleaned Spark DataFrame
    df_errors : DataFrame
      DataFrame containing records where cleaning failed
    """
    def fuzzy_match(string: str):
      for common_string in common_strings:
        # With score_cutoff set, fuzz.ratio returns 0 for scores below the
        # threshold, so a truthy score means the threshold was met.
        if fuzz.ratio(string, common_string, score_cutoff=similarity_threshold, processor=None):
          return common_string
      # No candidate was similar enough (or the list is empty): keep the value.
      return string

    return self._apply_transform(df, df_errors, columns, fuzzy_match, "cln_fuzzy_match_replace", error_action, error_replace_value)
  
  # TODO: Validating Methods