In [1]:
import pandas as pd
import numpy as np
import os
from scipy import stats

In [2]:
# Directory that holds the input CSV files; the example run at the bottom
# of the notebook prefixes every input file name with this value.

data_filepath = "./data/"


In [3]:
def is_int(x):
    """Report whether ``x`` can be converted with ``int()``.

    Parameters
    ----------
    x : object
        Candidate value, e.g. an entry read from an instruction file.

    Returns
    -------
    bool
        True when ``int(x)`` succeeds.  False when the conversion raises
        ``ValueError`` (non-numeric text, NaN), ``TypeError`` (``None`` and
        other unsupported types) or ``OverflowError`` (infinite floats).
    """
    try:
        int(x)
    except (ValueError, TypeError, OverflowError):
        # Anything int() refuses is simply "not an integer" for our purposes.
        return False
    return True

In [4]:
def NONE(X):
    """Identity transform: hand ``X`` back exactly as it was received.

    Serves as the "do nothing" choice wherever a transform function is
    expected.
    """
    return X

In [5]:
# Encoding Functions

def MAP(X):
    """Encode the values of ``X`` as consecutive integer codes.

    Codes are assigned in order of first appearance (0, 1, 2, ...), so the
    encoding is reproducible from run to run.  All missing values
    (NaN/None) share a single extra code that comes after the codes of the
    observed values.

    Parameters
    ----------
    X : pandas.Series
        Column to encode.

    Returns
    -------
    pandas.Series
        Integer codes carrying the same index and name as ``X``.
    """
    # factorize marks missing values with -1; copy so the codes are writable.
    codes, uniques = pd.factorize(X)
    codes = np.array(codes, copy=True)

    # Give every missing value the same, non-negative code so that callers
    # which need non-negative integers (e.g. np.bincount) keep working.
    codes[codes == -1] = len(uniques)

    return pd.Series(codes, index=X.index, name=X.name).astype(int)

def LOO(X):
    """Leave-one-out count encoding.

    Each entry is replaced by the number of *other* entries in ``X`` that
    hold the same value, i.e. the frequency of its value minus one.
    """
    # Integer codes first, so the occurrences can be tallied with bincount.
    codes = MAP(X)

    # occurrences[k] is how often code k appears; subtract the entry itself.
    occurrences = np.bincount(codes)
    others = dict(enumerate(occurrences - 1))

    return codes.map(others).astype(int)
    

def OHE(df, col, idx):
    """One-hot encoding placeholder: currently hands ``df`` back untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to encode.
    col : str
        Name of the column to encode (not used yet).
    idx : int
        Position of the column (not used yet).

    Returns
    -------
    pandas.DataFrame
        The very same ``df`` object, unmodified.
    """
    # TODO: NEEDS TO BE IMPLEMENTED.  Planned steps:
    #   1. remove the selected column from the df
    #   2. perform one-hot encoding
    #   3. create new column names of form oldcolname-1, oldcolname-2, ...
    #   4. add the cols back to the df
    return df


In [6]:
# Scaling Functions

def NRM1(X):
    """Scale ``X`` by dividing by its 1-norm (the sum of absolute values).

    Parameters
    ----------
    X : pandas.Series or numpy.ndarray
        Numeric values to scale.

    Returns
    -------
    Same kind of object as ``X``, holding floats.  When the 1-norm is zero
    (every entry is zero, or ``X`` is empty) the values are returned
    unscaled instead of being turned into NaN by a division by zero.
    """
    norm = np.linalg.norm(X, ord=1)
    if norm == 0:
        # Nothing to scale; keep the float dtype the division would produce.
        return X.astype(float)
    return X / norm

def SCL1(X):
    """Centre ``X`` on its mean and divide by its range (max - min).

    The result lies between -1 and 1.

    Parameters
    ----------
    X : pandas.Series
        Numeric values to scale.

    Returns
    -------
    pandas.Series
        ``(X - mean) / (max - min)``.  For a constant column the range is
        zero, so the centred values (all zeros) are returned as they are
        rather than being turned into NaN by a division by zero.
    """
    centred = X - X.mean()
    span = X.max() - X.min()
    if span == 0:
        return centred
    return centred / span

In [7]:
# Imputing Functions

def UNIQ(X, value=-1):
    """Replace missing values with a dedicated marker value.

    Parameters
    ----------
    X : pandas.Series
        Column that may contain missing values.
    value : scalar, default -1
        Marker written in place of every missing value.

    Returns
    -------
    pandas.Series
        A filled copy; the Series passed in is left unmodified.
    """
    return X.fillna(value=value)

def MEAN(X):
    """Replace missing values with the mean of the observed values.

    Parameters
    ----------
    X : pandas.Series
        Numeric column that may contain missing values.

    Returns
    -------
    pandas.Series
        A filled copy; the Series passed in is left unmodified.
    """
    # Series.mean skips missing values, so only observed entries contribute.
    return X.fillna(value=X.mean())

def MED(X):
    """Replace missing values with the median of the observed values.

    Parameters
    ----------
    X : pandas.Series
        Numeric column that may contain missing values.

    Returns
    -------
    pandas.Series
        A filled copy; the Series passed in is left unmodified.
    """
    # Series.median skips missing values, like np.nanmedian.
    return X.fillna(value=X.median())

def CONST(X, value=0):
    """Replace missing values with a constant.

    Parameters
    ----------
    X : pandas.Series
        Column that may contain missing values.
    value : scalar, default 0
        Constant written in place of every missing value.

    Returns
    -------
    pandas.Series
        A filled copy; the Series passed in is left unmodified.
    """
    return X.fillna(value=value)

def MODE(X):
    """Replace missing values with the most frequent observed value.

    Parameters
    ----------
    X : pandas.Series
        Column (numeric or not) that may contain missing values.

    Returns
    -------
    pandas.Series
        A filled copy; the Series passed in is left unmodified.  When
        several values tie for most frequent, the first one returned by
        ``Series.mode`` is used.  When ``X`` has no observed values at all
        there is nothing to fill with and an unchanged copy is returned.
    """
    # Series.mode ignores missing values by default, so NaN can never be
    # picked as the fill value.
    modes = X.mode()
    if modes.empty:
        return X.copy()
    return X.fillna(value=modes.iloc[0])

In [8]:
class Preprocessor:
    """Load, transform and export tabular data for use in learning models.

    Typical use is ``read_data()`` -> ``process()`` -> ``write_data()``.
    Two versions of the features are maintained:

    * ``cv_*``: the training rows only, transformed on their own;
    * ``train_*`` / ``test_*``: training and test rows transformed together
      and then split apart again.

    The instruction file has one column per feature.  ``process`` reads
    rows 1, 2 and 3 of that column as the encoding, scaling and imputation
    instruction for the feature (row 0 is not used by this class).
    """

    def __init__(self, train_data_file, train_label_file, train_ids_file,
                 instr_file, test_data_file=None, test_ids_file=None):
        """Remember the input file names; nothing is read until ``read_data``.

        Parameters
        ----------
        train_data_file, train_label_file, train_ids_file : str
            CSV files with the training features, labels and row ids.
        instr_file : str
            CSV file with the per-feature processing instructions.
        test_data_file, test_ids_file : str, optional
            CSV files with the test features and row ids.
        """
        # initialize the data filenames
        self.train_data_file = train_data_file
        self.train_label_file = train_label_file
        self.train_ids_file = train_ids_file
        self.instr_file = instr_file

        # test data is optional
        self.test_data_file = test_data_file
        self.test_ids_file = test_ids_file

    def read_data(self):
        """Read the files passed to the constructor into DataFrames.

        Sets the ``*_df`` frames, the ``feature_names`` / ``label_names`` /
        ``id_names`` lists and the ``*_shape`` tuples.  The test frames and
        shapes are ``None`` when no test data file was given.
        """
        # read in the data
        train_X_df = pd.read_csv(self.train_data_file)
        train_y_df = pd.read_csv(self.train_label_file)
        train_ids_df = pd.read_csv(self.train_ids_file)
        self.instr_df = pd.read_csv(self.instr_file)

        self.feature_names = list(train_X_df.columns)
        self.label_names = list(train_y_df.columns)
        self.id_names = list(train_ids_df.columns)

        # Cross validation data: real copies, so that processing one frame
        # can never leak into another through shared underlying data.
        self.cv_X_df = train_X_df.copy()
        self.cv_y_df = train_y_df.copy()
        self.cv_ids_df = train_ids_df.copy()

        # read in the test data if it exists
        if self.test_data_file is not None:
            self.test_X_df = pd.read_csv(self.test_data_file)
            self.test_ids_df = pd.read_csv(self.test_ids_file)
            # Train rows first, test rows after; process() splits the
            # combined frame again by position.
            self.all_X_df = pd.concat([train_X_df, self.test_X_df],
                                      ignore_index=True)
            self.test_X_shape = self.test_X_df.shape
            self.test_ids_shape = self.test_ids_df.shape
        else:
            self.test_X_df = None
            self.test_ids_df = None
            self.all_X_df = train_X_df.copy()
            self.test_X_shape = None
            self.test_ids_shape = None

        # determine the shape of the input data
        self.train_X_shape = train_X_df.shape
        self.train_y_shape = train_y_df.shape
        self.train_ids_shape = train_ids_df.shape
        self.instr_shape = self.instr_df.shape
        self.all_shape = self.all_X_df.shape

    def _apply_to_column(self, col, transform):
        """Run ``transform`` on column ``col`` of both feature frames."""
        self.cv_X_df[col] = transform(self.cv_X_df[col])
        self.all_X_df[col] = transform(self.all_X_df[col])

    def process(self):
        """Impute, encode and scale every feature as the instructions say.

        Each step is applied to the cross validation frame and to the
        combined train/test frame.  Afterwards the results are exposed as
        arrays: ``cv_X`` / ``cv_y`` / ``cv_ids``, ``train_X`` / ``train_y`` /
        ``train_ids`` and, when test data was given, ``test_X`` /
        ``test_ids`` (otherwise ``None``).
        """
        # Built here rather than at class level so the module-level helper
        # functions are only looked up when process() actually runs.
        imputers = {
            'UNIQ': lambda series: UNIQ(series, value=-1),
            'MEAN': MEAN,
            'MODE': MODE,
            'MED': MED,
        }
        encoders = {'MAP': MAP, 'LOO': LOO}
        scalers = {'NRM1': NRM1, 'SCL1': SCL1}

        for idx, col in enumerate(self.feature_names):

            # determine what to perform at each of the steps
            col_instr = self.instr_df[col].values
            col_enc = col_instr[1]
            col_scl = col_instr[2]
            col_imp = col_instr[3]

            # impute values
            # imputed first so that other functions will not use nan values
            # in calculations
            if col_imp in imputers:
                self._apply_to_column(col, imputers[col_imp])
            elif is_int(col_imp):
                # Instructions read from CSV arrive as text; fill with the
                # number itself rather than with the string '0', '1', ...
                fill = int(col_imp) if isinstance(col_imp, str) else col_imp
                self._apply_to_column(
                    col, lambda series, fill=fill: CONST(series, fill))

            # perform encoding of data
            if col_enc == 'OHE':
                # OHE works on the whole frame, not on a single column.
                self.cv_X_df = OHE(self.cv_X_df, col, idx)
                self.all_X_df = OHE(self.all_X_df, col, idx)
            elif col_enc in encoders:
                self._apply_to_column(col, encoders[col_enc])

            # perform scaling
            if col_scl in scalers:
                self._apply_to_column(col, scalers[col_scl])

        # get the values from the dataframes
        self.cv_X = self.cv_X_df.values
        self.cv_y = self.cv_y_df.values
        self.cv_ids = self.cv_ids_df.values

        # The combined frame holds the training rows first, then the test rows.
        n_train = self.train_X_shape[0]
        all_X = self.all_X_df.values
        self.train_X = all_X[:n_train, :]
        self.train_y = self.cv_y_df.values
        self.train_ids = self.cv_ids_df.values

        if self.test_data_file is not None:
            self.test_X = all_X[n_train:, :]
            self.test_ids = self.test_ids_df.values
        else:
            self.test_X = None
            self.test_ids = None

    def write_data(self, out_dir='./processed_data/'):
        """Write the processed arrays to CSV files inside ``out_dir``.

        Parameters
        ----------
        out_dir : str, default './processed_data/'
            Output directory, created when missing.  A trailing path
            separator is optional.  Existing files are overwritten.

        The cv and train files are always written; ``test_X.csv`` and
        ``test_ids.csv`` only when test data was given.
        """
        # create the output directory if it does not exist
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        outputs = [
            ('cv_X.csv', self.cv_X, self.feature_names),
            ('cv_y.csv', self.cv_y, self.label_names),
            ('cv_ids.csv', self.cv_ids, self.id_names),
            ('train_X.csv', self.train_X, self.feature_names),
            ('train_y.csv', self.train_y, self.label_names),
            ('train_ids.csv', self.train_ids, self.id_names),
        ]
        if self.test_data_file is not None:
            outputs.append(('test_X.csv', self.test_X, self.feature_names))
            outputs.append(('test_ids.csv', self.test_ids, self.id_names))

        # convert all arrays back into DataFrames before writing anything,
        # so a bad array is reported before any file is touched
        frames = [(filename, pd.DataFrame(values, columns=columns))
                  for filename, values, columns in outputs]

        # write the dataframes to file
        for filename, frame in frames:
            frame.to_csv(os.path.join(out_dir, filename), index=False)

    def select_features(self):
        """Placeholder for feature selection / compression algs like PCA.

        Not implemented yet: calling it changes nothing.
        """

In [9]:
# Example run: push the house-prices files through the whole pipeline.
train_data, train_labels, train_ids = (
    data_filepath + 'houseprices_data_train.csv',
    data_filepath + 'houseprices_labels_train.csv',
    data_filepath + 'houseprices_ids_train.csv',
)
test_data, test_ids = (
    data_filepath + 'houseprices_data_test.csv',
    data_filepath + 'houseprices_ids_test.csv',
)
description = data_filepath + 'houseprices_feature_descriptions.csv'

proc = Preprocessor(train_data, train_labels, train_ids, description,
                    test_data_file=test_data, test_ids_file=test_ids)

# load -> transform -> (feature selection) -> export
proc.read_data()
proc.process()

# select_features is not implemented yet, so this call changes nothing
proc.select_features()

# written to the default output directory; existing files are overwritten
proc.write_data()
