In [304]:
import numpy as np 
import pandas as pd 
import warnings
warnings.simplefilter(action='ignore')

import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb

In [517]:
train_data = pd.read_csv('train.csv.zip')

In [518]:
float_col = list(train_data.select_dtypes(include=['float','float32','float64']).columns)
object_col = list(train_data.select_dtypes(include=['object']).columns)

In [519]:
## Check and filter variables with missing values beyond criteria

del_criteria = 0.8

def iden_missing_val(df, del_criteria = 0.8):
    
    col_del_1 = [i for i in df.columns if df[i].isnull().sum()/len(df)>=del_criteria]
    col_complete = [i for i in df.columns if i not in col_del_1]
    
    print(f"Variables with missing value more than {del_criteria*100}% is \n {col_del_1}" )
    output = df[col_complete]
    return output, col_del_1

train_df_1, col_del_1 = iden_missing_val(train_data)

Variables with missing value more than 80.0% is 
 ['stem-root', 'veil-type', 'veil-color', 'spore-print-color']


In [520]:
float_col_1 = [i for i in float_col if i not in col_del_1]
object_col_1 = [i for i in object_col if i not in col_del_1]

In [507]:
## Check and filter error numeric values in object variables - set up func for repeatable checks

def var_unique_val (df):
    object_col = list(df.select_dtypes(include=['object']).columns)
    output = pd.DataFrame([df[i].unique() for i in object_col]).T
    output.columns = object_col
    return output

def var_top_val (df, col_name, top_rk=10):
    object_col = list(df[[col_name]].select_dtypes(include=['object']).columns)
    output = pd.DataFrame()
    for i in object_col:
        col = df[i].value_counts().to_frame().sort_values(by=['count'],ascending=False).head(top_rk)
        col = col.reset_index()
        col['count'] = round(col['count'],2)
    output = pd.concat([output, col], ignore_index=True, axis=1)
    output.columns = ['val', 'count']
    return output

In [516]:
def clean_object_col_by_step(df, col_name, uci_ref, low_counts_criteria=100):

    ## step 0: find all invalid entries per uci reference
    uci_ref = list(set(uci_ref))
    invalids = [i for i in df[col_name].unique() if i not in uci_ref]
    var_val = var_top_val (df, col_name, top_rk=len(df[col_name].unique()))

    ## step 1: clean np.nan into unknown to make processing easier
    nan_replace = {np.nan: 'unknown'}
    nan_replacer = nan_replace.get
    invalids = [nan_replacer(i,i) for i in invalids]
    uci_ref = [nan_replacer(i,i) for i in uci_ref]

    ## step 2: replace numeric values in the categorical variable
    def find_numerics(list):    
        for i in list:
            try:
                yield float(i)
            except ValueError:
                pass
    numerics_float = list(find_numerics(invalids))
    numerics_str =list(str(i) for i in numerics_float)
    numerics_str += [str(int(i)) for i in numerics_float if str(i) not in invalids]
    numeric_replace = dict(zip(numerics_str, ['error']*len(numerics_str)))

    ## step 3: identify single alphabet entries that are not included in uci reference
        ### note that some entries are not included in uci but have great footprints in the data
        ### will include in case there are meanings to them, but will record into outputs to further investigate
    singles = [i for i in invalids if i not in numerics_str and len(i)==1]
    low_counts = list(var_val[var_val['count'] < 100]['val'])
    low_singles = [i for i in singles if i in low_counts not in uci_ref]
    low_singles_replace = dict(zip(low_singles, ['other']*len(low_singles)))
    high_singles = [i for i in singles if i not in low_counts] ## return for investigation

    ## step 4: identify entries that were recorded incorrectly potentially due to formatting
    formatted = []
    corrected = []
    for i in invalids:
        for j in uci_ref:
            if ' '+j in i:
                formatted += [i]
                corrected += [j]
    format_replace = dict(zip(formatted, corrected))

    ## step 5: all other undefined long strings
    replaced = ['unknown', 'error','other']
    undefined_str = [i for i in invalids if i not in numerics_str+singles+formatted+replaced 
                     and isinstance(i, str)]
    undefined_str_replace = dict(zip(undefined_str, ['error']*len(undefined_str)))

    final_replace = {**nan_replace, **numeric_replace, **low_singles_replace, **format_replace, 
                     **undefined_str_replace}
    df[col_name] = df[col_name].replace(final_replace, inplace=False)

    return high_singles, final_replace, df[col_name]

In [509]:
def arrays_to_df (arrays, col_names, fill_with=np.nan):
    max_length = max(len(i) for i in arrays)
    output = pd.DataFrame([i + [fill_with]*(max_length-len(i)) for i in arrays]).T
    output.columns = col_names
    return output

In [510]:
uci_cap_shape = ['b', 'c', 'x', 'f', 'k', 's']
uci_cap_surface = ['f', 'g', 'y', 's']
uci_cap_color = ['n', 'b', 'c', 'g', 'r', 'p', 'u', 'e', 'w', 'y']
uci_bruises = ['t', 'f']
uci_gill_att = ['a', 'd', 'f', 'n']
uci_gill_spac = ['c', 'w', 'd']
uci_gill_color =  ['k', 'n', 'b', 'h', 'g', 'r', 'o', 'p', 'u', 'e', 'w', 'y']
uci_stem_surf = ['f', 'y', 'k', 's']
uci_stem_color = ['n', 'b', 'c', 'g', 'o', 'p', 'e', 'w', 'y']
uci_has_ring = ['n', 'o', 't']
uci_ring_type = ['c', 'e', 'f', 'l', 'n', 'p', 's', 'z']
uci_habitat = ['g', 'l', 'm', 'p', 'u', 'w', 'd']
season = ['a', 'w', 'u', 's'] ## uci does not have this attribute, added by the task hence also less noise

## merge all lists into one big list
ref_arrays = [uci_cap_shape, uci_cap_surface, uci_cap_color, uci_bruises, uci_gill_att, uci_gill_spac,
              uci_gill_color, uci_stem_surf, uci_stem_color, uci_has_ring, uci_ring_type, uci_habitat, season]

object_col_1_ex = [i for i in object_col_1 if i not in ['class']]
uci_refs = arrays_to_df(ref_arrays, object_col_1_ex)

In [521]:
invalid_high_singles = [0] * len(object_col_1_ex)
replaces_by_object = [0] * len(object_col_1_ex)

train_df_2 = train_df_1.copy()
for ind, i in enumerate(object_col_1_ex):
    invalid_high_singles[ind], replaces_by_object[ind], train_df_2[i] = clean_object_col_by_step(
                                                                       train_df_1, i, uci_refs[i])

In [524]:
invalid_high_singles = arrays_to_df(invalid_high_singles, object_col_1_ex)
invalid_high_singles

Unnamed: 0,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-surface,stem-color,has-ring,ring-type,habitat,season
0,p,h,o,,x,f,f,t,u,f,g,h,
1,o,l,k,,s,,,g,f,,r,,
2,,t,l,,e,,,h,r,,m,,
3,,e,,,p,,,i,k,,,,
4,,d,,,,,,,l,,,,
5,,i,,,,,,,,,,,
6,,w,,,,,,,,,,,
7,,k,,,,,,,,,,,


In [522]:
def get_replaced_dict(replaces_by_object, col_name, object_cols =object_col_1_ex):
    print(f'{col_name}:\n {replaces_by_object[object_cols.index(col_name)]}')

get_replaced_dict(replaces_by_object, 'ring-type')

ring-type:
 {nan: 'unknown', '4.0': 'error', '15.0': 'error', '3.12': 'error', '11.0': 'error', '23.6': 'error', '1.0': 'error', '14.0': 'error', '2.0': 'error', '2.87': 'error', '8.25': 'error', '4': 'error', '15': 'error', '11': 'error', '1': 'error', '14': 'error', '2': 'error', 'y': 'other', 'h': 'other', 'o': 'other', 't': 'other', 'a': 'other', 'd': 'other', 'x': 'other', 'b': 'other', 'u': 'other', 'w': 'other', 'i': 'other', 'k': 'other', 'does f': 'f', 'ring-type': 'error', 'season': 'error', 'does-bruise-or-bleed': 'error', 'spore-print-color': 'error', 'class': 'error', 'sp': 'error'}


In [526]:
train_data['has-ring'].value_counts()

has-ring
f             2368820
t              747982
r                  16
h                  13
c                  11
l                  11
s                  11
p                  11
g                   8
z                   6
e                   6
x                   5
m                   4
y                   3
d                   3
k                   2
o                   2
n                   2
f has-ring          1
i                   1
10.3                1
w                   1
a                   1
Name: count, dtype: int64