In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from utils.common_transformers import NullPct, OutlierHandler, IsNull

from sklearn.feature_selection import VarianceThreshold

from altair import Chart,X,Y

# Remove Duplicates

In [2]:
df = pd.DataFrame({
    "A" : [1,1,3,4],
    "B" : [2,2,3,4],
    "C" : [3,3,4,5]
})

duplicated = df[df.duplicated()]
df.drop(duplicated.index,inplace=True)

display(duplicated)

Unnamed: 0,A,B,C
1,1,2,3


In [3]:
df = pd.DataFrame({
    "ID" : [1,2,3,4,5,6,7],
    "A" : [1,4,8,None,32,64,500],
    "B" : [1,8,27,65,125,255,300]
})

# Outliers

In [4]:
df = OutlierHandler(exclude=["ID"]).fit_transform(df)
df.tail()

Unnamed: 0,ID,A,B,A_is_outlier,B_is_outlier
2,3,8.0,27,False,False
3,4,,65,False,False
4,5,32.0,125,False,False
5,6,64.0,255,False,False
6,7,500.0,300,True,False


# Null Analysis

In [5]:
print("Columns")
pd.DataFrame(df.isnull().mean() * 100,columns=["NullPerc"]).query("NullPerc>0")

Columns


Unnamed: 0,NullPerc
A,14.285714


In [6]:
df = NullPct().fit_transform(df)

In [7]:
Chart(df).mark_bar().encode(
    Y("null_pct:N"),
    X("count(null_pct):Q")
)

In [8]:
df = IsNull(exclude=["ID"]).fit_transform(df)

In [9]:
df.head()

Unnamed: 0,ID,A,B,A_is_outlier,B_is_outlier,null_pct,A_is_null,B_is_null
0,1,1.0,1,False,False,0.0,False,False
1,2,4.0,8,False,False,0.0,False,False
2,3,8.0,27,False,False,0.0,False,False
3,4,,65,False,False,33.33,True,False
4,5,32.0,125,False,False,0.0,False,False


In [10]:
var_thresh = VarianceThreshold()
var_thresh.fit(df)
cols_removed = [j for i,j in enumerate(list(df.columns)) if not var_thresh.get_support()[i]]

print("Cols Removed: ", cols_removed)

df = df.iloc[:,var_thresh.get_support()]
df

Cols Removed:  ['B_is_outlier', 'B_is_null']


Unnamed: 0,ID,A,B,A_is_outlier,null_pct,A_is_null
0,1,1.0,1,False,0.0,False
1,2,4.0,8,False,0.0,False
2,3,8.0,27,False,0.0,False
3,4,,65,False,33.33,True
4,5,32.0,125,False,0.0,False
5,6,64.0,255,False,0.0,False
6,7,500.0,300,True,0.0,False


In [11]:
null_cols = pd.DataFrame(df.isnull().sum(),columns=["Nulls"]).query("Nulls>0").index.to_list()
print("Rows where col value is null")

for col in null_cols:
    print("Column: " + col)
    display(df[df[col].isnull()])

Rows where col value is null
Column: A


Unnamed: 0,ID,A,B,A_is_outlier,null_pct,A_is_null
3,4,,65,False,33.33,True


In [12]:
print("Rows With More than a single null value in a row")
df.query(f"null_pct>{100/(df.shape[1]-1)}")

Rows With More than a single null value in a row


Unnamed: 0,ID,A,B,A_is_outlier,null_pct,A_is_null
3,4,,65,False,33.33,True
