#### Checks - SameValuesCols | UniqValuesCols | NullValuesCols

In [0]:
# imports
# hides all warnings
import warnings
warnings.filterwarnings('ignore')
# import pandas for spark
import pyspark.pandas as ps
# pandas 
import pandas as pd
# numpy
import numpy as np
# variance
from pyspark.sql.functions import variance

In [0]:
# identify columns holding at most uniqVals distinct values
# uniqVals=1 -> constant columns, uniqVals=2 -> two or fewer distinct values, ...
def SameValuesCols(df, lExclCols=None, uniqVals=1, Verbose=False):
    """Return the columns of ``df`` that hold at most ``uniqVals`` distinct values.

    Distinct values are counted with ``df[col].nunique()`` using its default
    arguments.  NOTE(review): pandas skips nulls by default there, so an
    all-null column counts 0 distinct values - confirm this is also what the
    DataFrame flavour in use does.

    Args:
        df: DataFrame-like object exposing ``columns.to_list()`` and a
            per-column ``nunique()``.
        lExclCols: column label, or list of labels, to leave out of the
            check.  ``None`` or an empty list checks every column.
        uniqVals: largest distinct-value count for which a column is still
            reported.
        Verbose: when true, print the distinct-value count of every checked
            column.

    Returns:
        List of column labels, in ``df`` column order, whose distinct-value
        count is ``<= uniqVals``.

    Raises:
        ValueError: if ``lExclCols`` names a column that is not in ``df``.
    """
    # a None default avoids sharing one mutable list object between calls
    if lExclCols is None:
        lExclCols = []
    elif not isinstance(lExclCols, list):
        # a single label was passed
        lExclCols = [lExclCols]
    colNames = df.columns.to_list()
    # fail with a readable message instead of list.remove()'s generic one
    lMissing = [vExclCol for vExclCol in lExclCols if vExclCol not in colNames]
    if lMissing:
        raise ValueError("lExclCols not found in df: %s" % lMissing)
    # filtering (rather than remove()) also tolerates a label listed twice
    colNames = [colName for colName in colNames if colName not in lExclCols]
    # distinct-value count for each remaining column
    dUniqCnt = {colName: df[colName].nunique() for colName in colNames}
    if Verbose:
        # explicit object dtype: holds the right-aligned strings, also when empty
        dsRetValue = pd.Series(
            {colName: '%7d' % cntUniq for colName, cntUniq in dUniqCnt.items()},
            dtype=object,
        )
        print(dsRetValue)
    return [colName for colName, cntUniq in dUniqCnt.items() if cntUniq <= uniqVals]

In [0]:
# identify columns whose share of distinct values is at least Percent
def UniqValuesCols(df, lExclCols=None, Percent=0.95, Verbose=False):
    """Return the columns of ``df`` whose distinct-value ratio reaches ``Percent``.

    The ratio of a column is ``df[col].nunique() / number of rows``.
    NOTE(review): pandas' ``nunique()`` skips nulls by default, so a column
    containing nulls stays below a ratio of 1 - confirm for the DataFrame
    flavour in use.

    Args:
        df: DataFrame-like object exposing ``columns.to_list()``, ``index``
            and a per-column ``nunique()``.
        lExclCols: column label, or list of labels, to leave out of the
            check.  ``None`` or an empty list checks every column.
        Percent: threshold as a fraction (0.95 means 95%); a column is
            reported when its ratio is ``>= Percent``.
        Verbose: when true, print the ratio of every checked column.

    Returns:
        List of column labels, in ``df`` column order, whose ratio is
        ``>= Percent``.

    Raises:
        ValueError: if ``lExclCols`` names a column that is not in ``df``.
    """
    # a None default avoids sharing one mutable list object between calls
    if lExclCols is None:
        lExclCols = []
    elif not isinstance(lExclCols, list):
        # a single label was passed
        lExclCols = [lExclCols]
    colNames = df.columns.to_list()
    # fail with a readable message instead of list.remove()'s generic one
    lMissing = [vExclCol for vExclCol in lExclCols if vExclCol not in colNames]
    if lMissing:
        raise ValueError("lExclCols not found in df: %s" % lMissing)
    colNames = [colName for colName in colNames if colName not in lExclCols]
    # the row count is the same for every column, so take it once
    cntRecs = len(df.index)
    dPerRecs = {}
    for colName in colNames:
        # a frame without rows has no distinct values: use 0 instead of
        # dividing by zero
        dPerRecs[colName] = df[colName].nunique() / cntRecs if cntRecs else 0.0
    if Verbose:
        # explicit object dtype: holds the formatted strings, also when empty
        dsRetValue = pd.Series(
            {colName: '%.2f' % perRecs for colName, perRecs in dPerRecs.items()},
            dtype=object,
        )
        print(dsRetValue)
    return [colName for colName, perRecs in dPerRecs.items() if perRecs >= Percent]


In [0]:
# identify columns whose share of null values is at least Percent
def NullValuesCols(df, lExclCols=None, Percent=0.50, Verbose=False):
    """Return the columns of ``df`` whose null ratio reaches ``Percent``.

    The ratio of a column is ``df[col].isnull().sum() / number of rows``.

    Args:
        df: DataFrame-like object exposing ``columns.to_list()``, ``index``
            and a per-column ``isnull()``.
        lExclCols: column label, or list of labels, to leave out of the
            check.  ``None`` or an empty list checks every column.
        Percent: threshold as a fraction between 0 and 1 (0.50 means 50%);
            a value outside that range is replaced by 0.5.
        Verbose: when true, print the null ratio of every checked column.

    Returns:
        List of column labels, in ``df`` column order, whose null ratio is
        ``>= Percent``.

    Raises:
        ValueError: if ``lExclCols`` names a column that is not in ``df``.
    """
    # an out-of-range threshold falls back to 50%; this must be "or", since a
    # value can never be below 0 and above 1 at the same time
    if Percent < 0 or Percent > 1:
        Percent = 0.5
    # a None default avoids sharing one mutable list object between calls
    if lExclCols is None:
        lExclCols = []
    elif not isinstance(lExclCols, list):
        # a single label was passed
        lExclCols = [lExclCols]
    colNames = df.columns.to_list()
    # fail with a readable message instead of list.remove()'s generic one
    lMissing = [vExclCol for vExclCol in lExclCols if vExclCol not in colNames]
    if lMissing:
        raise ValueError("lExclCols not found in df: %s" % lMissing)
    colNames = [colName for colName in colNames if colName not in lExclCols]
    # the row count is the same for every column, so take it once
    cntRecs = len(df.index)
    dPerRecs = {}
    for colName in colNames:
        # a frame without rows has no nulls: use 0 instead of dividing by zero
        dPerRecs[colName] = df[colName].isnull().sum() / cntRecs if cntRecs else 0.0
    if Verbose:
        # explicit object dtype: holds the formatted strings, also when empty
        dsRetValue = pd.Series(
            {colName: '%.2f' % perRecs for colName, perRecs in dPerRecs.items()},
            dtype=object,
        )
        print(dsRetValue)
    return [colName for colName, perRecs in dPerRecs.items() if perRecs >= Percent]

In [0]:
# Load the CSV file into a Spark DataFrame

# where the file lives and which reader format to use
file_location = "/FileStore/tables/test/california_housing.csv"
file_type = "csv"

# reader settings: infer column types, first row holds the header, comma-separated
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# the options set below are meant for CSV input
df = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load(file_location)
)

# show the loaded frame in the notebook
display(df)

In [0]:
# wrap the Spark DataFrame in a pandas-on-Spark DataFrame so that
# pandas-style calls can be used on it
psdf = ps.DataFrame(df)
# preview the first five rows
print(psdf.head(n=5))

In [0]:
# info
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only added a stray "None" line
psdf.info()

In [0]:
# drop the ocean_proximity column
psdf = psdf.drop('ocean_proximity', axis=1)
# info() prints the summary itself and returns None, so no print() wrapper
# (it would only add a stray "None" line)
psdf.info()


In [0]:
# report columns in which every value is the same (one distinct value);
# median_house_value is excluded from the check
print("\n*** Same Value Cols Drop ***")
lDropCols = SameValuesCols(
    psdf,
    lExclCols="median_house_value",
    uniqVals=1,
    Verbose=True,
)
print(lDropCols)
# the drop itself is currently commented out, so this cell only reports
#if lDropCols != []:
#    df = df.drop(lDropCols, axis=1)
print("Done ...")

In [0]:

# report columns in which 100% of the values are distinct;
# median_house_value is excluded from the check
print("\n*** Uniq Value Cols Drop ***")
lDropCols = UniqValuesCols(
    psdf,
    lExclCols="median_house_value",
    Percent=1,
)
print(lDropCols)
# the drop itself is currently commented out, so this cell only reports
#if lDropCols != []:
#    df = df.drop(lDropCols, axis=1)
print("Done ...")



In [0]:
# report columns in which at least 50% of the values are null;
# median_house_value is excluded from the check
print("\n*** Null Value Cols Drop ***")
lDropCols = NullValuesCols(
    psdf,
    lExclCols="median_house_value",
    Percent=0.50,
)
print(lDropCols)
# the drop itself is currently commented out, so this cell only reports
#if lDropCols != []:
#    psdf = psdf.drop(lDropCols, axis=1)
print("Done ...")
