In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Global configuration

# Directory prefix for the input CSV files. File names are appended to it by
# plain string concatenation, so the value must end with a path separator.
data_filepath = "./data/"


In [3]:
def NONE(X):
    """Identity operation: hand the input back untouched.

    Serves as the "do nothing" placeholder where one of the other
    operations would otherwise be applied.
    """
    unchanged = X
    return unchanged

In [4]:
# Encoding Functions

def MAP(X):
    """Map all values to integer numbers.

    Every distinct value receives its own integer code. Codes are handed out
    from 0 in order of first appearance, so the encoding is reproducible
    between runs. All NaN values are treated as one unique value and share a
    single extra code (equal to the number of distinct non-missing values).

    Parameters
    ----------
    X : pandas.Series
        Column to encode.

    Returns
    -------
    pandas.Series
        Integer codes carrying the same index and name as ``X``.
    """
    # factorize numbers the distinct values and marks missing entries with -1.
    # Building the mapping from set(X) instead is fragile: iterating a float
    # column yields a fresh nan object per row, which gives duplicate NaN keys,
    # and set ordering makes the codes differ from run to run.
    codes, uniques = pd.factorize(X)

    # Fold every missing entry into one code of its own.
    codes = np.where(codes == -1, len(uniques), codes)
    return pd.Series(codes, index=X.index, name=X.name).astype(int)

def OHE(df, col):
    """Map categorical values to a one hot encoding scheme.

    The selected column is removed and replaced by one 0/1 indicator column
    per distinct value. The new columns are named ``<col>-1``, ``<col>-2``,
    ... (numbered in the sorted category order used by ``pd.get_dummies``)
    and are appended after the remaining columns. Rows whose value is missing
    get 0 in every indicator column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to encode. It is not modified.
    col : str
        Name of the column to one-hot encode.

    Returns
    -------
    pandas.DataFrame
        New frame without ``col`` and with the indicator columns added.

    Raises
    ------
    KeyError
        If ``col`` is not a column of ``df``.
    """
    # perform one-hot encoding
    dummies = pd.get_dummies(df[col])

    # remove the selected column from the df (drop returns a new frame)
    encoded = df.drop(col, axis=1)

    # create new column names of form oldcolname-1, oldcolname-2, ...
    # and add the cols back to the df. Raw values are assigned so that no
    # index alignment takes place (the index may contain duplicate labels).
    for position, category in enumerate(dummies.columns, start=1):
        new_name = '{}-{}'.format(col, position)
        encoded[new_name] = dummies[category].values.astype(int)

    return encoded


In [5]:
# Scaling Functions

def NRM1(X):
    """Scale by dividing by the 1-norm.

    The 1-norm is the sum of the absolute values of ``X``. When that norm is
    zero (all-zero or empty input) there is nothing to scale, so the values
    are returned unscaled (as floats) instead of being turned into NaN by a
    division by zero.

    Parameters
    ----------
    X : pandas.Series or array-like
        Numeric values to scale.

    Returns
    -------
    Same type as ``X / scalar``
        The scaled values.
    """
    norm = np.linalg.norm(X, ord=1)

    # Dividing by 1.0 keeps the float result type of the regular path.
    if norm == 0:
        return X / 1.0
    return X / norm

In [6]:
# Imputing Functions

def UNIQ(X, value=-1):
    """Replace missing values with a unique value.

    Parameters
    ----------
    X : pandas.Series
        Column that may contain missing values. It is not modified.
    value : scalar, default -1
        Value written in place of every missing entry.

    Returns
    -------
    pandas.Series
        Copy of ``X`` with the missing entries filled.
    """
    # Return a filled copy instead of filling in place, so the caller's data
    # is never changed behind its back.
    return X.fillna(value=value)

def MEAN(X):
    """Replace missing values with the mean of the others.

    Parameters
    ----------
    X : pandas.Series
        Numeric column that may contain missing values. It is not modified.

    Returns
    -------
    pandas.Series
        Copy of ``X`` in which every missing entry holds the mean of the
        non-missing entries. If every entry is missing the mean is NaN and
        the values stay missing.
    """
    # Series.mean skips missing entries by default.
    mean = X.mean()

    # Return a filled copy instead of filling in place, so the caller's data
    # is never changed behind its back.
    return X.fillna(value=mean)

In [9]:
class Preprocessor:
    """Process and reformat data for use in learning models.

    Intended call order: ``read_data()``, ``process()``, optionally
    ``select_features()``, then ``write_data()``.

    Two views of the features are maintained:

    * ``cv_*``  - the training data on its own (cross validation), and
    * ``all_X`` - training rows followed by test rows, processed together and
      split back into ``train_X`` / ``test_X`` afterwards.
    """

    def __init__(self, train_data_file, train_label_file, train_ids_file,
                 instr_file, test_data_file=None, test_ids_file=None):
        """Store the locations of the input files; nothing is read yet.

        Parameters
        ----------
        train_data_file, train_label_file, train_ids_file : str
            CSV files holding the training features, labels and ids.
        instr_file : str
            CSV file with the per-feature processing instructions
            (see ``process``).
        test_data_file, test_ids_file : str, optional
            CSV files holding the test features and ids.
        """
        # initialize the data filenames
        self.train_data_file = train_data_file
        self.train_label_file = train_label_file
        self.train_ids_file = train_ids_file
        self.instr_file = instr_file

        # test data is optional
        self.test_data_file = test_data_file
        self.test_ids_file = test_ids_file

        # set to the feature names of the train data by read_data()
        self.features = []

    def read_data(self):
        """Reads in data from the files passed to constructor."""

        # read in the data
        train_X_df = pd.read_csv(self.train_data_file)
        train_y_df = pd.read_csv(self.train_label_file)
        train_ids_df = pd.read_csv(self.train_ids_file)
        self.instr_df = pd.read_csv(self.instr_file)

        # feature names as found in the train data
        self.features = list(train_X_df.columns)

        # create cross validation data; explicit copies so that processing
        # the cv frames can never leak into the combined train/test frame
        self.cv_X_df = train_X_df.copy()
        self.cv_y_df = train_y_df.copy()
        self.cv_ids_df = train_ids_df.copy()

        # read in the test data if it exists
        if self.test_data_file is not None:
            self.test_X_df = pd.read_csv(self.test_data_file)
            self.test_ids_df = pd.read_csv(self.test_ids_file)
            # train rows first, test rows after; a fresh index avoids
            # duplicate row labels in the combined frame
            self.all_X_df = pd.concat([train_X_df, self.test_X_df],
                                      ignore_index=True)
            self.test_X_shape = self.test_X_df.shape
            self.test_ids_shape = self.test_ids_df.shape
        else:
            self.test_X_df = None
            self.test_ids_df = None
            self.all_X_df = train_X_df.copy()
            self.test_X_shape = None
            self.test_ids_shape = None

        # determine the shape of the input data
        self.train_X_shape = train_X_df.shape
        self.train_y_shape = train_y_df.shape
        self.train_ids_shape = train_ids_df.shape
        self.instr_shape = self.instr_df.shape
        self.all_shape = self.all_X_df.shape

    @staticmethod
    def _process_column(df, col, enc, scl, imp):
        """Apply the imputing, encoding and scaling steps to one column.

        Returns the resulting frame, which is a new object when the column
        was one-hot encoded and ``df`` itself otherwise.
        """
        # impute values
        # imputed first so that other functions will not use nan values
        if imp == 'UNIQ':
            df[col] = UNIQ(df[col], value=-1)
        elif imp == 'MEAN':
            df[col] = MEAN(df[col])

        # perform encoding of data
        if enc == 'MAP':
            df[col] = MAP(df[col])
        elif enc == 'OHE':
            df = OHE(df, col)

        # perform scaling; skipped when one-hot encoding replaced the column
        if scl == 'NRM1' and col in df.columns:
            df[col] = NRM1(df[col])

        return df

    def process(self):
        """Performs the processing on cross validation and train/test data.

        For every feature, the column of the same name in the instruction
        file is consulted: its rows 1, 2 and 3 name the encoding ('MAP',
        'OHE'), scaling ('NRM1') and imputing ('UNIQ', 'MEAN') step. Any
        other value leaves the column unchanged for that step.

        Afterwards the results are available as numpy arrays in ``cv_X``,
        ``cv_y``, ``cv_ids``, ``train_X``, ``train_y``, ``train_ids`` and,
        when test data was supplied, ``test_X`` and ``test_ids`` (otherwise
        both are None).
        """
        # snapshot of the column names: one-hot encoding may replace columns
        # while the loop is running
        for col in list(self.cv_X_df.columns):

            # determine what to perform at each of the steps
            col_instr = self.instr_df[col].values
            col_enc = col_instr[1]
            col_scl = col_instr[2]
            col_imp = col_instr[3]

            # identical treatment for both views, each based on its own data
            self.cv_X_df = self._process_column(
                self.cv_X_df, col, col_enc, col_scl, col_imp)
            self.all_X_df = self._process_column(
                self.all_X_df, col, col_enc, col_scl, col_imp)

        # get the values from the dataframes
        self.cv_X = self.cv_X_df.values
        self.cv_y = self.cv_y_df.values
        self.cv_ids = self.cv_ids_df.values

        # the first train_X_shape[0] rows of the combined frame are the train
        # rows, everything after that belongs to the test data
        n_train = self.train_X_shape[0]
        all_X = self.all_X_df.values
        self.train_X = all_X[:n_train, :]
        self.train_y = self.cv_y_df.values
        self.train_ids = self.cv_ids_df.values

        if self.test_data_file is not None:
            self.test_X = all_X[n_train:, :]
            self.test_ids = self.test_ids_df.values
        else:
            self.test_X = None
            self.test_ids = None

    def write_data(self, out_dir='./processed_data/'):
        """Writes all of the data to output files.

        One CSV file per array is written into ``out_dir`` (created when
        missing): cv_X, cv_y, cv_ids, train_X, train_y, train_ids and, when
        test data was supplied, test_X and test_ids. Existing files are
        overwritten.
        """
        # create the output directory if it does not exist
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        # TODO:
        #   - CREATE A WAY TO ADD BACK THE FEATURE NAMES

        outputs = [
            ('cv_X.csv', self.cv_X),
            ('cv_y.csv', self.cv_y),
            ('cv_ids.csv', self.cv_ids),
            ('train_X.csv', self.train_X),
            ('train_y.csv', self.train_y),
            ('train_ids.csv', self.train_ids),
        ]
        if self.test_data_file is not None:
            outputs.append(('test_X.csv', self.test_X))
            outputs.append(('test_ids.csv', self.test_ids))

        # convert arrays back into DataFrames and write them to file;
        # os.path.join works with or without a trailing separator on out_dir
        for file_name, values in outputs:
            pd.DataFrame(values).to_csv(os.path.join(out_dir, file_name))

    def select_features(self):
        """Perform features selection / compression algs like PCA.

        Not implemented yet: calling it leaves ``self.features`` and all
        data untouched.
        """
        return None

In [10]:
# Smoke test: push the bundled example data through the whole pipeline.

# input locations (data_filepath already ends with a path separator)
train_data, train_labels, train_ids = (
    data_filepath + 'example_data_train.csv',
    data_filepath + 'example_labels_train.csv',
    data_filepath + 'example_ids_train.csv',
)
test_data, test_ids = (
    data_filepath + 'example_data_test.csv',
    data_filepath + 'example_ids_test.csv',
)
description = data_filepath + 'FeatureDescriptions.csv'

proc = Preprocessor(
    train_data, train_labels, train_ids, description,
    test_data_file=test_data, test_ids_file=test_ids,
)

# load -> transform
proc.read_data()
proc.process()

# currently a no-op: feature selection has not been implemented yet
proc.select_features()

# results go to the default output directory, replacing any existing files
proc.write_data()
