# Define configurations


Here, the user needs to provide some configuration data for the program to run.

The key feature will be the fill_up_methods.

fill_up_methods takes a list of dictionaries; each dictionary provides instructions about which method to use to fill up the missing data of one column.
For now, there are 4 methods - median, mean, fixed_value and most_frequent.
Other features such as forward/backward filling or filling with the last value can also be added.
In the future, we can also support filling up missing data with AI algorithms such as KNN or other possible approaches.

The result will always be a complete data set with all missing data filled up / dropped, but with as little bias as possible.

In [56]:
# Restrict processing to these columns only; everything else is ignored.
# Example:
# columns_of_interest = ['Team', 'Number', 'Position', 'Age', 'Height', 'Weight', 'Salary', 'Some Error']
columns_of_interest = None  # Default to None

# The reverse of columns_of_interest: these columns are ignored.
# It should not be used together with columns_of_interest.
columns_to_ignore = ['Name']  # Default to None

# Data source (URL or local path)
data_src = "https://media.geeksforgeeks.org/wp-content/uploads/nba.csv"
# data_src = "~/Desktop/iris.data"

# Set of row indexes to ignore
rows_to_ignore = [457]

# Default method used to fill up missing data
default_fill_up_method = 'median'

# If the provided CSV data does not contain headers, provide them as a list
# of strings, e.g.
# headers = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
headers = None  # Default to None

# Per-column method for filling missing data.
# Methods: median, mean, fixed_value, most_frequent
fill_up_methods = [
    {'column': 'Salary', 'method': 'mean'},
    {'column': 'College', 'method': 'fixed_value', 'value': 'Unknown'},
    {'column': 'Team', 'method': 'most_frequent'},
    {'column': 'Position', 'method': 'most_frequent'},
]

# Data normalization method; None means "do not normalize".
# Reference on data normalization:
# https://www.geeksforgeeks.org/data-normalization-with-pandas/
# Options: 'maximum_absolute_scaling', 'min_max_feature_scaling', 'z_score'
normalization_method = 'maximum_absolute_scaling'  # Default to None

# Request schema from user
# Category type (hierarchy / no hierarchy)
# Ordinal / with order

# TODO: Standardization


# Read Data


In [57]:
import pandas as pd
from pandas.api.types import is_numeric_dtype


# Display settings: keep wide frames on one line and never truncate rows
pd.set_option('expand_frame_repr', False)
pd.set_option('display.max_rows', None)

# names=None is read_csv's default (header taken from the file), so a single
# call covers both the "headers supplied" and "headers in file" cases.
data = pd.read_csv(data_src, names=headers)

# Apply the rows_to_ignore configuration, which was previously never used.
# Indexes that do not exist are skipped silently, mirroring how missing
# entries of columns_to_ignore are handled.
if rows_to_ignore:
    data = data.drop(index=rows_to_ignore, errors='ignore')

# Pre-process data


In [58]:
def feet_inch_to_cm(s):
    """Convert a ``"feet-inches"`` string (e.g. ``"6-2"``) to centimetres.

    Values that are not strings are returned unchanged.
    """
    if not isinstance(s, str):
        return s

    parts = s.split('-')
    # 1 foot = 30.48 cm, 1 inch = 2.54 cm
    return float(parts[0]) * 30.4800 + float(parts[1]) * 2.54


### Convert feet-inch height strings to numbers (cm)


In [59]:
# This is only applicable to the basketball data set, so skip the conversion
# when the loaded data has no 'Height' column instead of failing with KeyError.
if 'Height' in data.columns:
    data['Height'] = data['Height'].apply(feet_inch_to_cm)


### Focus on the columns of interest


In [60]:
# Requested columns that do not exist in the data; collected so that an
# error message can be generated from them.
missing_columns_of_interests = []

if columns_of_interest is not None:
    # Record every column of interest that is absent from the data frame
    missing_columns_of_interests.extend(
        name for name in columns_of_interest if name not in data.columns)

    # Drop every column the user is not interested in
    not_interested = [
        name for name in data.columns if name not in columns_of_interest]
    data.drop(columns=not_interested, inplace=True)

# Drop the columns that were explicitly marked to be ignored
for name in (columns_to_ignore or ()):
    if name in data.columns:
        data.drop(columns=name, inplace=True)


# Fill up missing data for each column


### Define fill up methods


In [61]:
def median(d):
    """Fill the missing entries of ``d`` in place with its median.

    Non-numeric input is left untouched. Returns None.
    """
    if not is_numeric_dtype(d):
        return
    middle = d.median()
    d.fillna(middle, inplace=True)


def mean(d):
    """Fill the missing entries of ``d`` in place with its mean.

    The mean is rounded to two decimals before it is used as the fill value.
    Non-numeric input is left untouched. Returns None.
    """
    if not is_numeric_dtype(d):
        return
    average = round(d.mean(), 2)
    d.fillna(average, inplace=True)


def most_frequent(d):
    """Fill the missing entries of ``d`` in place with its most frequent value.

    When several values are equally frequent, the first one returned by
    ``Series.mode`` is used. If ``d`` contains no non-missing value there is
    no mode to fill with, so ``d`` is left unchanged instead of raising.
    Returns None.
    """
    modes = d.mode(dropna=True)
    if modes.empty:
        # Column is entirely missing: nothing sensible to fill with
        return
    d.fillna(modes.iloc[0], inplace=True)


def fixed_value(d, v):
    """Fill the missing entries of ``d`` in place with the constant ``v``.

    Returns None.
    """
    d.fillna(value=v, inplace=True)


# Registry mapping each configurable method name to the function that
# implements it; looked up by the 'method' value of a fill-up rule.
all_fill_up_methods = dict(
    median=median,
    mean=mean,
    fixed_value=fixed_value,
    most_frequent=most_frequent,
)


### Fill up each columns based on the provided method


This part is a bit tricky: it would be too much for the user to fill in if we requested separate numeric and string methods.
But if the user only supplies one method, it is hard to handle all the different types.


In [62]:
# Column names that have an explicit rule, in the same order as fill_up_methods
fill_up_method_columns = [rule['column'] for rule in fill_up_methods]

for column in data.columns:
    if column not in fill_up_method_columns:
        # No column-specific rule was provided: use the default method
        all_fill_up_methods[default_fill_up_method](data[column])
        continue

    # Column-specific rule; the first rule listed for a column is the one used
    rule = fill_up_methods[fill_up_method_columns.index(column)]
    method = rule['method']

    if method == 'fixed_value':
        # fixed_value is the only method that takes an extra argument
        value = rule['value']
        all_fill_up_methods[method](data[column], value)
    else:
        all_fill_up_methods[method](data[column])

# If any row still has a missing value, the fill-up configuration is not
# good enough and clearer rules are needed.
if data.isna().any(axis=1).any():
    raise Warning(
        'There are still missing values in data set no been filled up, consider provide more clear fill_up_methods rules')


### Flatten all non-numerical columns


The function here seems unable to process very large data sets. I think NumPy is needed to flatten the non-numerical data, but I don't know how yet.


In [63]:
def check_is_same(a, b):
    """Return 1 when ``a == b`` is truthy, otherwise 0."""
    return 1 if a == b else 0


# Flatten every non-numeric column into one 0/1 indicator column per distinct
# value, then remove the original column.
for column in data.columns:
    # This only applies to non-numeric data
    if is_numeric_dtype(data[column]):
        continue

    # One indicator Series per unique value, keyed by that value so it becomes
    # the new column name; order follows the value's first appearance.
    indicators = {
        unique_column: data[column].apply(
            lambda row, target=unique_column: check_is_same(row, target))
        for unique_column in data[column].unique()
    }

    if indicators:
        # Concatenate once per source column. Concatenating inside the loop
        # over unique values copied the whole frame for every distinct value,
        # which is what made large data sets slow.
        data = pd.concat((data, pd.DataFrame(indicators)), axis=1)

    data.drop(column, inplace=True, axis=1)

# Post-process - Normalize data


In [64]:
def maximum_absolute_scaling(_d):
    """Return a copy of ``_d`` with each column divided by its maximum
    absolute value.

    The input frame is not modified.
    """
    d = _d.copy()
    for column in d.columns:
        d[column] = d[column] / d[column].abs().max()
    return d


def min_max_feature_scaling(_d):
    """Return a copy of ``_d`` with each column rescaled as
    ``(x - min) / (max - min)``.

    The input frame is not modified. A column whose values are all equal has
    ``max - min == 0`` and therefore yields NaN.
    """
    d = _d.copy()
    for column in d.columns:
        low = d[column].min()
        high = d[column].max()
        d[column] = (d[column] - low) / (high - low)
    # The scaled copy was previously never returned, so callers received None
    return d


def z_score(_d):
    """Return a copy of ``_d`` with each column standardised as
    ``(x - mean) / std``.

    The input frame is not modified. ``std`` is pandas' ``Series.std`` with
    its default arguments.
    """
    d = _d.copy()
    for column in d.columns:
        d[column] = (d[column] - d[column].mean()) / d[column].std()
    # The scaled copy was previously never returned, so callers received None
    return d


# Registry mapping every supported normalization_method name to its function.
# All three documented methods are registered so that selecting any of them
# in the configuration works.
all_normalization_methods = {
    'maximum_absolute_scaling': maximum_absolute_scaling,
    'min_max_feature_scaling': min_max_feature_scaling,
    'z_score': z_score,
}


In [65]:
# Start from the un-normalized data; it is the final result when
# normalization is disabled (normalization_method is None).
final_result = data
if normalization_method is not None:
    # Look up the configured normalization function and apply it
    normalize = all_normalization_methods[normalization_method]
    final_result = normalize(data)


In [66]:
# Bare expression: evaluates final_result so the notebook cell displays it
final_result


Unnamed: 0,Number,Age,Height,Weight,Salary,Boston Celtics,Brooklyn Nets,New York Knicks,Philadelphia 76ers,Toronto Raptors,...,Colorado State,Virginia Tech,DePaul,Morehead State,Central Michigan,Weber State,Lehigh,Westchester CC,Dayton,Butler
0,0.0,0.625,0.850575,0.586319,0.309213,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.625,0.896552,0.765472,0.271845,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.30303,0.675,0.885057,0.667752,0.193707,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.282828,0.55,0.885057,0.602606,0.045946,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.080808,0.725,0.942529,0.752443,0.2,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.909091,0.725,0.931034,0.781759,0.48,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.555556,0.525,0.91954,0.765472,0.046838,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.414141,0.625,0.965517,0.775244,0.086606,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.121212,0.55,0.850575,0.618893,0.072974,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.363636,0.55,0.873563,0.716612,0.137242,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
