In [2]:
from numpy import nan, zeros
from pandas import read_csv, get_dummies, Series


class Automation:
    """CSV cleaning pipeline.

    The pipeline loads a CSV file, marks ``'?'`` cells as missing, drops rows
    with too many missing fields, imputes the remaining gaps (mean for numeric
    columns, mode for the others), removes 3-sigma outliers, one-hot encodes
    the categorical columns and writes the result to disk.

    Column lists named ``real`` / ``discrete`` are lists of *positional*
    column indices, not column labels.
    """

    def __init__(self):
        self.name = 'ABC'

    def clean(self, filename, output_path='CLEAN.csv'):
        """Load, clean and save user-supplied data.

        Args:
            filename: path (or anything ``read_csv`` accepts) of the raw CSV.
            output_path: where the cleaned CSV is written. The default keeps
                the historical behaviour of writing ``CLEAN.csv`` into the
                current working directory.

        Returns:
            The cleaned, one-hot encoded data frame (also written to
            ``output_path``).
        """

        print('CLEANING DATA....')

        # load data as a pandas data frame
        df = self.load_csv(filename)

        print('\tDetecting missing values...')
        df = self.replace_missing_data(df)

        # Classify columns by dtype rather than by the Python type of the first
        # row: a text column whose first cell is empty holds a float NaN there
        # and would otherwise be mistaken for a numeric column.
        real, discrete = self._split_columns(df)

        print('\tImputing missing values...')
        df = self.interpolate_missing_data(df, real, discrete)

        print('\tRemoving outliers...')
        df = self.remove_outliers(df, real)

        print('\tTransforming categorical data using one-hot encoding...')
        df = self.one_hot_encode(df)

        # save cleaned data (the row index is written as the first column)
        df.to_csv(output_path)

        print('DONE.')
        return df

    @staticmethod
    def _split_columns(data):
        """Return ``(real, discrete)`` positional indices of numeric / other columns."""
        numeric_labels = set(data.select_dtypes(include='number').columns)
        real = [i for i, label in enumerate(data.columns) if label in numeric_labels]
        discrete = [i for i, label in enumerate(data.columns) if label not in numeric_labels]
        return real, discrete

    def load_csv(self, filePath, missing_headers=False):
        """Read data as csv and return as pandas data frame.

        Args:
            filePath: location of the CSV file.
            missing_headers: when true the file has no header row and the
                columns are labelled 0..n-1; otherwise the first row is used
                as column labels.
        """
        # Still published for any external code that reads these globals;
        # nothing inside this class depends on them any more.
        global rows, cols

        data = read_csv(filePath, header=None if missing_headers else 0)
        rows, cols = data.shape

        return data

    def one_hot_encode(self, data):
        """Perform a one-hot encoding and return as pandas data frame."""

        return get_dummies(data)

    def replace_missing_data(self, data, max_missing=2):
        """Mark missing cells as NaN and drop rows with too many of them.

        Text cells are stripped of surrounding whitespace first so that
        padded markers such as ``' ? '`` are recognised.

        Args:
            data: raw data frame; it is not modified.
            max_missing: rows with more than this many missing fields are
                dropped. The default of 2 matches the previous hard-coded
                threshold.

        Returns:
            A new data frame with ``'?'`` replaced by NaN and sparse rows
            removed (the original row index is preserved).
        """

        def strip_text(value):
            # Only touch real strings; numbers or NaN stored in an object
            # column must pass through unchanged.
            return value.strip() if isinstance(value, str) else value

        # dtype kind 'O' covers object columns as well as pandas string columns
        data = data.apply(
            lambda column: column.map(strip_text) if column.dtype.kind == 'O' else column
        )

        # replace missing values with the sentinel NaN value
        data = data.replace('?', nan)

        # keep only the samples whose number of missing fields is acceptable
        missing_per_row = data.isna().sum(axis=1)
        return data[missing_per_row <= max_missing]

    def interpolate_missing_data(self, data, real, discrete):
        """Impute missing values and return a new pandas data frame.

        Args:
            data: frame that may contain NaN values.
            real: positional indices of the columns to fill with their mean.
            discrete: positional indices of the columns to fill with their
                mode (the smallest one when several values tie).

        Columns that contain no observed value at all are left untouched
        because no replacement can be derived for them.
        """
        fill_values = {}

        # mean replacements for continuous fields
        for position in real:
            column = data.iloc[:, position]
            if column.notna().any():
                fill_values[data.columns[position]] = column.mean()

        # mode replacements for discrete fields
        for position in discrete:
            modes = data.iloc[:, position].mode()
            if not modes.empty:
                fill_values[data.columns[position]] = modes.iloc[0]

        # fillna aligns the mapping on column *labels*, hence the label keys
        return data.fillna(value=fill_values)

    def remove_outliers(self, data, real, n_std=3):
        """Remove outlier rows and return the remaining rows as a data frame.

        A row is dropped when, in any of the ``real`` columns, its value lies
        more than ``n_std`` sample standard deviations away from that column's
        mean (in either direction). Means and deviations are all computed on
        the incoming frame, before any row is removed.

        Args:
            data: frame to filter; it is not modified.
            real: positional indices of the numeric columns to check.
            n_std: width of the accepted band in standard deviations.

        Note:
            Rows holding NaN in a checked column are dropped as well, since
            NaN never compares as inside the band.
        """
        keep = Series(True, index=data.index)

        for position in real:
            column = data.iloc[:, position]
            # The sample standard deviation is undefined with fewer than two
            # observations; skip instead of discarding every row.
            if column.count() < 2:
                continue
            center = column.mean()
            spread = column.std()
            # "<=" keeps constant columns (spread == 0) intact
            keep &= (column - center).abs() <= n_std * spread

        return data[keep]

In [3]:
if __name__ == "__main__":
    # Build the cleaner and run the complete pipeline on the sample data set;
    # the cleaned result is written to CLEAN.csv.
    obj = Automation()
    obj.clean(filename='car.csv')

CLEANING DATA....
	Detecting missing values...
	Imputing missing values...
	Removing outliers...
	Transforming categorical data using one-hot encoding...
DONE.
