<h1>Data Processing Model</h1>

In [1]:
import pandas as pd
import numpy as np
import os
from scipy import stats

from sklearn.preprocessing import OneHotEncoder

In [2]:
# Global configuration used by the cells below.

# Directory prefix of the input CSV files. It is joined to file names by plain
# string concatenation later in the notebook, so keep the trailing slash.
data_filepath = "./data/"


In [3]:
def is_int(x):
    """Return True when ``int(x)`` succeeds, False otherwise.

    Values that ``int`` cannot convert never raise here:
    a non-numeric string or NaN (``ValueError``), ``None`` or another
    unsupported type (``TypeError``), and infinity (``OverflowError``) all
    yield False.
    """
    try:
        int(x)
    except (ValueError, TypeError, OverflowError):
        return False
    return True

In [4]:
def NONE(X):
    """Identity transform: hand the input back untouched.

    Acts as the no-op choice wherever a processing step is optional.
    """
    unchanged = X
    return unchanged

In [5]:
# Encoding Functions

def MAP(X):
    """Map every distinct value of a Series to an integer code.

    Codes are handed out in order of first appearance (0, 1, 2, ...), so the
    result is reproducible between runs, and two Series that start with the
    same rows receive the same codes for the values they share.
    All missing values (NaN/None) share one extra code equal to the number of
    distinct non-missing values.

    Parameters
    ----------
    X : pandas.Series
        Values to encode.

    Returns
    -------
    pandas.Series
        Integer codes with the same index and name as ``X``.
    """
    codes, uniques = pd.factorize(X)

    # factorize flags missing values with -1; move them to their own
    # non-negative code so the output is usable with np.bincount
    codes = np.where(codes == -1, len(uniques), codes)
    return pd.Series(codes, index=X.index, name=X.name).astype(int)

def LOO(X):
    """Replace each value by the number of *other* entries sharing that value."""

    # integer codes are required so np.bincount can tally the occurrences
    codes = MAP(X)

    # occurrences per code, not counting the entry itself
    others = {}
    for code, total in enumerate(np.bincount(codes)):
        others[code] = total - 1
    return codes.map(others).astype(int)
    

def OHE(df_cv, df_all, col_name, feature_names, feature_threshold=0.02):
    """One-hot encode a categorical column consistently across two frames.

    The category list is taken from ``df_all`` and shared with ``df_cv``, so
    indicator column ``<col_name><i>`` means the same category in both frames.
    Categories are numbered in order of first appearance in ``df_all``;
    missing values, when present, form one additional category.

    Categories whose relative frequency in ``df_all`` is not greater than
    ``feature_threshold`` get no column of their own. They are merged into a
    single indicator named ``<col_name><number of categories>``.

    Parameters
    ----------
    df_cv, df_all : pandas.DataFrame
        Frames containing ``col_name``. They are not modified; new frames are
        returned.
    col_name : str
        Column to encode. It is removed from the returned frames.
    feature_names : list of str
        Updated in place: the new indicator names are appended and
        ``col_name`` is removed.
    feature_threshold : float
        Minimum relative frequency (exclusive) for a dedicated column.

    Returns
    -------
    (df_cv, df_all, feature_names)

    Raises
    ------
    ValueError
        If ``df_cv`` holds a value that never occurs in ``df_all``, or if
        ``col_name`` is not in ``feature_names``.
    """
    all_values = df_all[col_name]
    cv_values = df_cv[col_name]

    # one shared category list keeps the indicator columns aligned
    all_codes, categories = pd.factorize(all_values)
    cv_codes = pd.Index(categories).get_indexer(cv_values)

    # missing values become a category of their own, after the real ones
    all_missing = all_values.isna().to_numpy()
    cv_missing = cv_values.isna().to_numpy()
    na_code = len(categories)
    n_categories = na_code + int(all_missing.any())
    all_codes = np.where(all_missing, na_code, all_codes)
    cv_codes = np.where(cv_missing, na_code, cv_codes)

    if ((cv_codes < 0) | (cv_codes >= n_categories)).any():
        raise ValueError(
            "column %r of df_cv contains values not present in df_all" % (col_name,))

    # relative frequency of every category in the full data
    n_all = all_codes.shape[0]
    if n_all:
        frequencies = np.bincount(all_codes, minlength=n_categories) / n_all
    else:
        frequencies = np.zeros(n_categories)
    is_frequent = frequencies > feature_threshold

    # drop() returns new frames, so the caller's frames stay untouched
    df_cv = df_cv.drop(col_name, axis=1)
    df_all = df_all.drop(col_name, axis=1)

    for code in np.flatnonzero(is_frequent):
        new_feature = col_name + str(code)
        df_cv[new_feature] = (cv_codes == code).astype(float)
        df_all[new_feature] = (all_codes == code).astype(float)
        feature_names.append(new_feature)

    # aggregate low frequency categories into one indicator
    rare_codes = np.flatnonzero(~is_frequent)
    if rare_codes.size > 0:
        extra_label = col_name + str(n_categories)
        df_cv[extra_label] = np.isin(cv_codes, rare_codes).astype(int)
        df_all[extra_label] = np.isin(all_codes, rare_codes).astype(int)
        feature_names.append(extra_label)

    feature_names.remove(col_name)
    return df_cv, df_all, feature_names



In [6]:
# Scaling Functions

def NRM1(X):
    """Scale by dividing by the 1-norm (sum of absolute values).

    When the norm is zero every entry is zero, so the input is returned as
    floats unchanged instead of producing NaN through 0/0.
    """
    norm = np.linalg.norm(X, ord=1)
    if norm == 0:
        return X * 1.0
    return X / norm

def SCL1(X):
    """Centre on the mean and divide by the range, giving values in (-1, 1).

    A constant input has zero range; it is returned centred (all zeros)
    rather than being turned into NaN through 0/0.
    """
    centered = X - X.mean()
    spread = X.max() - X.min()
    if spread == 0:
        return centered
    return centered / spread

def TRSH(X, threshold_value=20):
    """Binarise values against a cut-off.

    Entries strictly below ``threshold_value`` become 0, all others become 1.
    The result is returned as a plain list.
    """
    flags = []
    for val in X:
        flags.append(0 if val < threshold_value else 1)
    return flags

In [7]:
# Imputing Functions

def UNIQ(X, value=-1):
    """Replace missing values with a dedicated marker value.

    Returns a new Series; the Series passed in is left unmodified so that
    data shared with other frames is never changed behind the caller's back.
    """
    return X.fillna(value=value)

def MEAN(X):
    """Replace missing values with the mean of the observed values.

    Returns a new Series; the Series passed in is left unmodified.
    """
    # Series.mean skips missing entries
    mean = X.mean()
    return X.fillna(value=mean)

def MED(X):
    """Replace missing values with the median of the observed values.

    Returns a new Series; the Series passed in is left unmodified.
    """
    median = np.nanmedian(X)
    return X.fillna(value=median)

def CONST(X, value=0):
    """Replace missing values with a constant.

    ``value`` is converted with ``int`` first, so integer strings such as
    ``"7"`` are accepted. Returns a new Series; the Series passed in is left
    unmodified.
    """
    return X.fillna(value=int(value))

def MODE(X):
    """Replace missing values with the most frequent observed value.

    Missing entries are ignored when determining the mode, and ties are
    resolved in favour of the smallest value. If there is no observed value
    at all, an unmodified copy is returned. The Series passed in is never
    modified.
    """
    # Series.mode returns the modes sorted ascending, missing values excluded
    modes = X.mode(dropna=True)
    if modes.empty:
        return X.copy()
    return X.fillna(value=modes.iloc[0])

def DEL(df_cv, df_all, col_name, feature_names):
    """Remove a feature column from both frames and from the name list.

    The frames are returned as new objects without ``col_name``;
    ``feature_names`` is edited in place and returned as well.
    """
    trimmed_cv, trimmed_all = (
        frame.drop(col_name, axis=1) for frame in (df_cv, df_all)
    )
    feature_names.remove(col_name)

    return trimmed_cv, trimmed_all, feature_names

In [8]:
class Preprocessor:
    """Load, transform and export tabular data for use in learning models.

    Typical use: ``read_data()`` -> ``process()`` -> ``select_features()``
    -> ``write_data()``.
    """

    def __init__(self, train_data_file, train_label_file, train_ids_file,
                 instr_file, test_data_file=None, test_ids_file=None):
        """Store the CSV file names; no file is opened here.

        Parameters
        ----------
        train_data_file, train_label_file, train_ids_file : str
            CSV files with the training features, labels and ids.
        instr_file : str
            CSV file with one column per feature holding the per-feature
            processing instructions (see ``process``).
        test_data_file, test_ids_file : str, optional
            CSV files with the test features and ids. When ``test_data_file``
            is given, ``test_ids_file`` is read as well.
        """

        # initialize the data filenames
        self.train_data_file = train_data_file
        self.train_label_file = train_label_file
        self.train_ids_file = train_ids_file
        self.instr_file = instr_file

        # test data is optional
        self.test_data_file = test_data_file
        self.test_ids_file = test_ids_file

    def read_data(self):
        """Read the files passed to the constructor and record their shapes."""

        # read in the data
        train_X_df = pd.read_csv(self.train_data_file)
        train_y_df = pd.read_csv(self.train_label_file)
        train_ids_df = pd.read_csv(self.train_ids_file)
        self.instr_df = pd.read_csv(self.instr_file)

        self.feature_names = list(train_X_df)
        self.original_feature_names = list(train_X_df)
        self.label_names = list(train_y_df)
        self.id_names = list(train_ids_df)

        # create cross validation data; explicit copies so that processing
        # the cv frame can never alter the combined frame (or vice versa)
        self.cv_X_df = train_X_df.copy()
        self.cv_y_df = train_y_df.copy()
        self.cv_ids_df = train_ids_df.copy()

        # read in the test data if it exists
        if self.test_data_file is not None:
            self.test_X_df = pd.read_csv(self.test_data_file)
            self.test_ids_df = pd.read_csv(self.test_ids_file)
            # training rows first, test rows after; process() relies on
            # this order to split the combined array again
            self.all_X_df = pd.concat([train_X_df, self.test_X_df],
                                      ignore_index=True)
        else:
            self.test_X_df = None
            self.test_ids_df = None
            self.all_X_df = train_X_df.copy()

        # determine the shape of the input data
        self.train_X_shape = train_X_df.shape
        self.train_y_shape = train_y_df.shape
        self.train_ids_shape = train_ids_df.shape
        self.instr_shape = self.instr_df.shape
        self.all_shape = self.all_X_df.shape

        # get size of test data if it exists
        if self.test_data_file is not None:
            self.test_X_shape = self.test_X_df.shape
            self.test_ids_shape = self.test_ids_df.shape
        else:
            self.test_X_shape = None
            self.test_ids_shape = None

    def _apply_to_column(self, func, col):
        """Apply ``func`` to column ``col`` of the cv frame and the combined frame."""
        self.cv_X_df[col] = func(self.cv_X_df[col])
        self.all_X_df[col] = func(self.all_X_df[col])

    def process(self, shuffle_train_data=False, verbose=True):
        """Perform the processing on cross validation and train/test data.

        For every original feature the instruction column of the same name is
        read: row 1 is the encoding, row 2 the scaling and row 3 the
        imputation instruction (row 0 is not used here). Imputation runs
        first so that later steps do not see missing values.

        Parameters
        ----------
        shuffle_train_data : bool
            Currently ignored; shuffling is not implemented yet.
        verbose : bool
            Print each column name as it is processed.
        """

        # ADD OPTION TO SHUFFLE DATA HERE

        imputers = {
            'UNIQ': lambda column: UNIQ(column, value=-1),
            'MEAN': MEAN,
            'MODE': MODE,
            'MED': MED,
        }
        encoders = {'MAP': MAP, 'LOO': LOO}
        scalers = {'NRM1': NRM1, 'SCL1': SCL1, 'TRSH': TRSH}

        # processing on all data - every step is applied to cv_X and all_X
        for col in self.original_feature_names:
            if verbose:
                print(col)

            # determine what to perform at each of the steps
            col_instr = self.instr_df[col].values
            col_enc = col_instr[1]
            col_scl = col_instr[2]
            col_imp = col_instr[3]

            # a deleted column has nothing left to encode or scale
            if col_imp == 'DEL':
                self.cv_X_df, self.all_X_df, self.feature_names = DEL(
                    self.cv_X_df, self.all_X_df, col, self.feature_names)
                continue

            # impute values
            if col_imp in imputers:
                self._apply_to_column(imputers[col_imp], col)
            elif is_int(col_imp):
                self._apply_to_column(
                    lambda column: CONST(column, col_imp), col)

            # perform encoding of data
            if col_enc == 'OHE':
                self.cv_X_df, self.all_X_df, self.feature_names = OHE(
                    df_cv=self.cv_X_df, df_all=self.all_X_df, col_name=col,
                    feature_names=self.feature_names)
                # the original column was replaced by indicator columns,
                # so there is no column of this name left to scale
                continue
            if col_enc in encoders:
                self._apply_to_column(encoders[col_enc], col)

            # perform scaling
            if col_scl in scalers:
                self._apply_to_column(scalers[col_scl], col)

        # get the values from the dataframes
        self.cv_X = self.cv_X_df.values
        self.cv_y = self.cv_y_df.values
        self.cv_ids = self.cv_ids_df.values

        # the first rows of the combined frame are the training rows
        n_train = self.train_X_shape[0]
        all_X = self.all_X_df.values
        self.train_X = all_X[:n_train, :]
        self.train_y = self.cv_y_df.values
        self.train_ids = self.cv_ids_df.values

        if self.test_data_file is not None:
            self.test_X = all_X[n_train:, :]
            self.test_ids = self.test_ids_df.values
        else:
            self.test_X = None
            self.test_ids = None

    def write_data(self, out_dir='./processed_data/'):
        """Write all processed arrays to CSV files inside ``out_dir``.

        The directory is created when missing and existing files are
        overwritten. ``out_dir`` may be given with or without a trailing
        path separator.
        """

        # create the output directory if it does not exist
        os.makedirs(out_dir, exist_ok=True)

        # convert arrays back into DataFrames
        outputs = {
            'cv_X.csv': pd.DataFrame(self.cv_X, columns=self.feature_names),
            'cv_y.csv': pd.DataFrame(self.cv_y, columns=self.label_names),
            'cv_ids.csv': pd.DataFrame(self.cv_ids, columns=self.id_names),
            'train_X.csv': pd.DataFrame(self.train_X, columns=self.feature_names),
            'train_y.csv': pd.DataFrame(self.train_y, columns=self.label_names),
            'train_ids.csv': pd.DataFrame(self.train_ids, columns=self.id_names),
        }
        if self.test_data_file is not None:
            outputs['test_X.csv'] = pd.DataFrame(
                self.test_X, columns=self.feature_names)
            outputs['test_ids.csv'] = pd.DataFrame(
                self.test_ids, columns=self.id_names)

        # write the dataframes to file
        for file_name, frame in outputs.items():
            frame.to_csv(os.path.join(out_dir, file_name), index=False)

    def select_features(self):
        """Placeholder for feature selection / compression algorithms like PCA.

        Not implemented yet; calling it leaves the feature list unchanged.
        """
        # intentionally a no-op until a selection method is implemented
        return None

In [9]:
# Smoke run of the whole pipeline on one dataset
dataset = 'mnist'

# input files follow the pattern <data_filepath><dataset>_<kind>.csv
train_data = f"{data_filepath}{dataset}_data_train.csv"
train_labels = f"{data_filepath}{dataset}_labels_train.csv"
train_ids = f"{data_filepath}{dataset}_ids_train.csv"
test_data = f"{data_filepath}{dataset}_data_test.csv"
test_ids = f"{data_filepath}{dataset}_ids_test.csv"
description = f"{data_filepath}{dataset}_feature_descriptions.csv"

proc = Preprocessor(
    train_data_file=train_data,
    train_label_file=train_labels,
    train_ids_file=train_ids,
    instr_file=description,
    test_data_file=test_data,
    test_ids_file=test_ids,
)

# load, then transform
proc.read_data()
proc.process()

# currently a placeholder: feature selection is not implemented yet
proc.select_features()

# results go to the output directory, replacing any files already there
proc.write_data()


pixel0
pixel1
pixel2
pixel3
pixel4
pixel5
pixel6
pixel7
pixel8
pixel9
pixel10
pixel11
pixel12
pixel13
pixel14
pixel15
pixel16
pixel17
pixel18
pixel19
pixel20
pixel21
pixel22
pixel23
pixel24
pixel25
pixel26
pixel27
pixel28
pixel29
pixel30
pixel31
pixel32
pixel33
pixel34
pixel35
pixel36
pixel37
pixel38
pixel39
pixel40
pixel41
pixel42
pixel43
pixel44
pixel45
pixel46
pixel47
pixel48
pixel49
pixel50
pixel51
pixel52
pixel53
pixel54
pixel55
pixel56
pixel57
pixel58
pixel59
pixel60
pixel61
pixel62
pixel63
pixel64
pixel65
pixel66
pixel67
pixel68
pixel69
pixel70
pixel71
pixel72
pixel73
pixel74
pixel75
pixel76
pixel77
pixel78
pixel79
pixel80
pixel81
pixel82
pixel83
pixel84
pixel85
pixel86
pixel87
pixel88
pixel89
pixel90
pixel91
pixel92
pixel93
pixel94
pixel95
pixel96
pixel97
pixel98
pixel99
pixel100
pixel101
pixel102
pixel103
pixel104
pixel105
pixel106
pixel107
pixel108
pixel109
pixel110
pixel111
pixel112
pixel113
pixel114
pixel115
pixel116
pixel117
pixel118
pixel119
pixel120
pixel121
pixel122
pix