## Summary

Columns with only a few unique values (fewer than 60) were treated as categorical data. Those with a numerical dtype were not standardized in the original data-preprocessing step; this notebook standardizes these numerically typed categorical columns.

Note that the cat_num data also include columns that have object dtype but too many unique values. Those columns were encoded with a label encoder and appended to the cat_num data as the last three columns.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
#import seaborn as sns
%matplotlib inline
import requests
#from pattern import web
import operator
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn import cross_validation
import gc
from sklearn import metrics
from scipy import stats
from sklearn.base import TransformerMixin
from datetime import datetime as dt
from math import isnan
from numpy import ma

In [2]:
from scipy.sparse import hstack,csr_matrix
import pickle

In [3]:
# Load the pickled training data of numerically typed categorical features.
# NOTE(review): presumably a pandas DataFrame (later cells read .shape/.dtypes) — confirm.
# NOTE(review): pickle.load executes code embedded in the file; only load trusted data.
with open('pickledata/cat_numeric_th60_train2.dat','rb') as infile1:
    cat_num_train = pickle.load(infile1)

In [4]:
# Load the pickled test data of numerically typed categorical features.
# NOTE(review): presumably a pandas DataFrame with the same columns as the training data — confirm.
# NOTE(review): pickle.load executes code embedded in the file; only load trusted data.
with open('pickledata/cat_numeric_th60_test2.dat','rb') as infile2:
    cat_num_test = pickle.load(infile2)

In [5]:
# Display the shape of the loaded training data as a sanity check.
cat_num_train.shape

(145231, 244)

### Shift feature values to positive

In [7]:
class ShiftPostive(TransformerMixin):
    """Shift numeric columns that contain non-positive values so they become positive.

    The transformer is stateless: ``fit`` learns nothing and every call to
    ``transform`` derives the shift of each column from the minimum of the
    frame it is given.

    Only ``int64``, ``int32`` and ``float64`` columns are processed; columns of
    any other dtype are not part of the returned frame.
    """

    def __init__(self):
        """
        shift selected numerical columns to positive value
        """

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (no state is learned)."""
        return self

    def test_int(self, colvector):
        """
        test if the float dtype columns have no non-integer values other than NA
        input: column vector (pandas Series of floats)
        output: boolean, True if the column has no non-integer value other than NA
                (an empty or all-NA column therefore yields True)
        """
        non_null = colvector[colvector.notnull()]
        # ``value % 1 == 0`` is the vectorised equivalent of float.is_integer():
        # for +/-inf the remainder is NaN, which compares unequal to 0.
        return bool((non_null % 1 == 0).all())

    def transform(self, X, y=None, int_amount=1, deci_amount=0.1):
        """Return a shifted copy of the numeric columns of ``X``.

        A column is shifted only when its minimum is <= 0; its new minimum is
        then ``int_amount`` (integer-valued columns) or ``deci_amount`` (other
        float columns). Columns whose minimum is already positive are kept as
        they are.

        input:
            X: pandas DataFrame
            y: ignored
            int_amount: new minimum of shifted integer-valued columns
            deci_amount: new minimum of shifted float-valued columns
        output: tuple ``(shifted_df, int_cols, float_cols)``
            shifted_df: integer-valued columns followed by float-valued columns
            int_cols: int64 columns, then int32 columns, then float64 columns
                      holding only integers/NA
            float_cols: remaining float64 columns, in their order in ``X``
        """
        # separate out integer vs float-valued columns
        dtypes = X.dtypes
        int_cols = dtypes[dtypes == np.dtype('int64')].index.tolist()
        int_cols += dtypes[dtypes == np.dtype('int32')].index.tolist()
        float_cols = dtypes[dtypes == np.dtype('float64')].index.tolist()

        # float64 columns that only hold integers (plus NA) are treated as integer columns
        int_with_nans = [col for col in float_cols if self.test_int(X[col])]
        int_cols.extend(int_with_nans)
        # keep the order of X instead of relying on arbitrary set ordering
        int_like = set(int_with_nans)
        float_cols = [col for col in float_cols if col not in int_like]

        def shift(frame, amount):
            return frame.apply(
                lambda col: col - col.min() + amount if col.min() <= 0 else col)

        # shift integer-valued columns and float valued columns separately
        new_int_df = shift(X[int_cols], int_amount)
        new_float_df = shift(X[float_cols], deci_amount)

        # Both pieces share X's index, so a column-wise concat lines the rows up
        # one-to-one; an index join would multiply rows for duplicated labels.
        shifted = pd.concat([new_int_df, new_float_df], axis=1)
        return shifted, int_cols, float_cols

In [8]:
# Create the shifting transformer; the same instance is applied to the train and test data below.
shiftpos = ShiftPostive()

In [9]:
# Shift the training data; transform() also returns the names of the columns it
# treated as integer-valued and as float-valued.
# NOTE(review): the shift of each column is computed from this frame's own minimum,
# so the training and test data are shifted independently — confirm this is intended.
cat_num_train_shifted,int_cols1,float_cols1 = shiftpos.transform(cat_num_train)

In [10]:
# Shift the test data; transform() also returns the names of the columns it
# treated as integer-valued and as float-valued.
# NOTE(review): the shift of each column is computed from this frame's own minimum,
# so the test data are not shifted by the same offsets as the training data — confirm this is intended.
cat_num_test_shifted,int_cols2,float_cols2 = shiftpos.transform(cat_num_test)

### Log-transform highly right-skewed features

In [11]:
class LogTransform(TransformerMixin):
    """Apply a natural-log transform to columns that fail a normality test badly.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    decides which columns to transform from the frame it is given.
    """

    def __init__(self):
        """
        perform log transformation to columns that are uni-modal and right skewed
        """

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (no state is learned)."""
        return self

    def normaltest(self, colvec, test='normal'):
        """
        test if a column feature has normal distribution using the stats.mstats.normaltest, skewtest, or kurtosistest
        NA values are dropped before the test is run
        input: colvec, column vector, in the format of pandas series
                test: 'normal', 'skew','kurtosis'
        output:
            for normal:
            k^2 + s^2, where k and s are the Z-score returned by the kurtosis test and the skew test
            (both z-scores are expected to be close to zero for normally distributed data)
            for skew or kurtosis:
            z-score
            for any other test name a message is printed and None is returned
        """
        observed = colvec[colvec.notnull()]
        if test == 'normal':
            return stats.mstats.normaltest(observed)[0]
        elif test == 'skew':
            return stats.mstats.skewtest(observed)[0]
        elif test == 'kurtosis':
            return stats.mstats.kurtosistest(observed)[0]
        else:
            print ('unknown test type')
            return

    def transform(self, X, y=None, thresh=5000):
        """Return a copy of ``X`` with strongly non-normal columns log-transformed.

        A column is replaced by ``np.log`` of itself when its normality-test
        statistic is greater than ``thresh``; every other column (including
        one whose statistic does not compare greater than ``thresh``, e.g.
        NaN) is left unchanged. Column order and index of ``X`` are preserved.

        NOTE(review): the statistic grows with any departure from normality,
        not only right skew — confirm the threshold selects the intended columns.
        NOTE(review): values are assumed to be strictly positive; np.log
        yields -inf/NaN otherwise.

        input:
            X: pandas DataFrame of numeric columns
            y: ignored
            thresh: statistic above which a column is log-transformed
        output: pandas DataFrame with the same columns as ``X``
        """
        # 1. apply normal test and determine cols to transform
        test_results = X.apply(self.normaltest)
        to_transform_cols = test_results[test_results > thresh].index.tolist()
        # 2. perform log transform in place on a copy, so the column order of X
        #    is kept and no index join (which breaks on duplicated labels) is needed
        transformed = X.copy()
        for col in to_transform_cols:
            transformed[col] = np.log(X[col])
        return transformed

In [12]:
# Create the log transformer; the same instance is applied to the train and test data below.
logtrans = LogTransform()

In [13]:
# Log-transform the shifted training data.
# NOTE(review): the columns to transform are chosen from this frame's own test
# statistics, so the selection may differ from the one made for the test data — confirm.
cat_num_train_shifted_log = logtrans.transform(cat_num_train_shifted)

In [14]:
# Log-transform the shifted test data.
# NOTE(review): the columns to transform are chosen from this frame's own test
# statistics, so the selection may differ from the one made for the training data — confirm.
cat_num_test_shifted_log = logtrans.transform(cat_num_test_shifted)

### Standardize numerical data using StandardScaler (missing values need to be imputed first)

In [15]:
# Scaler used to standardize the shifted, log-transformed features.
scaler = StandardScaler()

In [16]:
# Fit the scaler on the training data and standardize it.
# .values is used instead of DataFrame.as_matrix(), which was deprecated and
# later removed from pandas; both return the frame as a NumPy array.
cat_num_train_standard = scaler.fit_transform(cat_num_train_shifted_log.values)

In [17]:
# Standardize the test data with the statistics the scaler learned from the
# training data; re-fitting on the test set would scale the two sets differently.
# Columns are selected in the training frame's order so they line up positionally
# with the columns the scaler was fitted on (a missing column raises KeyError).
cat_num_test_standard = scaler.transform(
    cat_num_test_shifted_log[cat_num_train_shifted_log.columns].values)

In [20]:
# Display the shape of the standardized training matrix as a sanity check.
cat_num_train_standard.shape

(145231L, 244L)

## Save processed data to disk

In [18]:
# Save the standardized training matrix with pickle protocol 2.
# NOTE(review): the file is written to the current working directory, whereas the
# inputs were read from 'pickledata/' — confirm the intended location.
with open('cat_numeric_th60_standard_train2.dat', 'wb') as cat_outfile1:
    pickle.dump(cat_num_train_standard, cat_outfile1, protocol =2)

In [19]:
# Save the standardized test matrix with pickle protocol 2.
# NOTE(review): the file is written to the current working directory, whereas the
# inputs were read from 'pickledata/' — confirm the intended location.
with open('cat_numeric_th60_standard_test2.dat', 'wb') as cat_outfile2:
    pickle.dump(cat_num_test_standard, cat_outfile2, protocol =2)

In [20]:
# Save the column labels of the log-transformed training frame with pickle protocol 2.
# NOTE(review): presumably these label the columns of the saved matrices; that only
# holds for the test matrix if its columns are in the same order — verify.
with open('cat_numeric_th60_cols2.dat', 'wb') as cat_outfile3:
    pickle.dump(cat_num_train_shifted_log.columns, cat_outfile3, protocol =2)