###One-hot Encoding Selected Categorial Features
This notebook is to perform one-hot encoding for those categorical features selected using xgboost (xgb) and random forest (rf). Different combinations of the features selected by these two methods are also considered, including 
rf_or_xgb: all the features selected by rf or xgb.
rf_and_xgb: the features selected by both rf and xgb.
rf: the features selected by rf.
xgb: the features selected by xgb. 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
#import seaborn as sns
%matplotlib inline
import requests
#from pattern import web
import operator
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn import cross_validation
import gc
from sklearn import metrics
from scipy import stats
from sklearn.base import TransformerMixin
from datetime import datetime as dt
from math import isnan
from numpy import ma
from scipy.sparse import hstack,csr_matrix
import pickle

In [2]:
cat_cols_rf = pickle.load(open("cat_cols_selected_rf.p","rb"))

In [3]:
cat_cols_xgb = pickle.load(open("cat_cols_selected_xgb.p","rb"))

In [4]:
na_values = ['[]','',-1]
train = pd.read_csv('./train.csv',na_values = na_values)

  data = self._reader.read(nrows)


In [5]:
test = pd.read_csv('./test.csv',na_values = na_values)

  data = self._reader.read(nrows)


In [6]:
xtrain = train.drop(['target','ID'],axis = 1)
indices_train = xtrain.index #to be used later for separating train and test data parts

In [7]:
xtest = test.drop(['ID'],axis = 1)
#shift the index of test data before concat
indices_test = xtest.index
indices_test = indices_test + xtrain.shape[0] 
xtest.index = indices_test

In [8]:
#release memory
%xdel train
%xdel test

In [9]:
cat_cols_rf = cat_cols_rf.tolist()

In [10]:
cat_cols_xgb = cat_cols_xgb.tolist()

In [11]:
cat_cols_rf_plus_xgb = list(set(cat_cols_rf + cat_cols_xgb))

In [12]:
cat_cols_rf_plus_xgb

['VAR_0005',
 'VAR_0237',
 'VAR_0232',
 'VAR_1934',
 'VAR_0342',
 'VAR_0274',
 'VAR_0467',
 'VAR_0354',
 'VAR_0352',
 'VAR_0353']

In [13]:
xtrain.loc[:,cat_cols_rf_plus_xgb].head()

Unnamed: 0,VAR_0005,VAR_0237,VAR_0232,VAR_1934,VAR_0342,VAR_0274,VAR_0467,VAR_0354,VAR_0352,VAR_0353
0,C,FL,True,IAPS,CF,FL,,O,O,U
1,B,CA,False,IAPS,EC,MI,Discharged,R,O,R
2,C,WV,True,IAPS,UU,WV,,,R,R
3,C,TX,False,RCC,,TX,,,R,R
4,N,IL,True,BRANCH,,IL,,O,R,U


In [14]:
cxtotal = pd.concat([xtrain.loc[:,cat_cols_rf_plus_xgb], xtest.loc[:,cat_cols_rf_plus_xgb]])
indices_total = cxtotal.index

In [15]:
%xdel xtest
%xdel xtrain
print (cxtotal.shape)

(290463, 10)


In [16]:
class CatVectorize(TransformerMixin):
    def __init__(self):
        """
        wrapper to use DictVectorizer to transform obj-type categorical features
        """
    def fit(self,X,y=None):
        return
    
    def cat_vectorize(self,X):
        """
        vectorize the categorical features of the input dataframe
        take the two columns VAR_0001, and VAR_0005 for example,
            cat_vectorize(cxtrain,obj_cols[:2])
            for each feature, each value is used to construct a binary feature, since there are 7 values in total
            for the two features combined, the transformed matrix has seven columns, corresponding column values are
            stored in encoder.feature_names

        Notably, NA is encoded as a value by filling NAs with 'NA' first

        inputs: df, dataframe; cols, list of columns to be transformed
        output: data_mt, encoded sparse matrix, can be converted to array with "toarray" method
                encoder.feature_names, explained above
        """
        data = X.fillna('NA')
        datadict = data.T.to_dict().values()
        encoder = DictVectorizer()
        data_mt = encoder.fit_transform(datadict)
        return encoder.feature_names_,data_mt
    
    def transform(self,X,y=None,thresh=None):
        """
        inputs
            thresh: is the maximum num of values a column can have to be qualified for the transformation
            columns that have more than thresh number of values are likely states, or description that may need NLP
            X: df of categorical features
        outputs:
            X_vect, the sparse matrix contains only the converted features
            other_cols, list of the other obj-type columns that have more than thresh number of values
            feature_names, the corresponding feature values used as columns in X_vect
        """
        if thresh:
            value_counts = X.apply(lambda x: x.nunique(dropna = False))
            obj_cols = value_counts[value_counts < thresh].index.tolist()
            other_cols = list(set(X.columns).difference(set(obj_cols)))
            X_to_trans = X[obj_cols]
        else:
            X_to_trans = X.copy()
        feature_names,X_vect = self.cat_vectorize(X_to_trans)   
        return X_vect,feature_names,other_cols

In [17]:
vect = CatVectorize()

In [18]:
obj_sparse_or,feature_names_or,other_cols_or = vect.transform(cxtotal,thresh = 60)

In [19]:
obj_sparse_or.shape

(290463, 181)

In [20]:
obj_sparse_rf,feature_names_rf,other_cols_rf = vect.transform(cxtotal.loc[:,cat_cols_rf],thresh = 60)

In [21]:
obj_sparse_rf.shape

(290463, 27)

In [22]:
obj_sparse_xgb,feature_names_xgb,other_cols_xgb = vect.transform(cxtotal.loc[:,cat_cols_xgb],thresh = 60)

In [23]:
obj_sparse_xgb.shape

(290463, 163)

In [24]:
cat_cols_same = list(set(cat_cols_rf) & set(cat_cols_xgb))

In [25]:
cat_cols_same

['VAR_0005', 'VAR_1934']

In [26]:
obj_sparse_and,feature_names_and,other_cols_and = vect.transform(cxtotal.loc[:,cat_cols_same],thresh = 60)

In [27]:
obj_sparse_and.shape

(290463, 9)

In [28]:
obj_sparse_or_train = obj_sparse_or[:len(indices_train), :]
obj_sparse_or_test = obj_sparse_or[len(indices_train):, :]

In [29]:
obj_sparse_rf_train = obj_sparse_rf[:len(indices_train), :]
obj_sparse_rf_test = obj_sparse_rf[len(indices_train):, :]

In [30]:
obj_sparse_xgb_train = obj_sparse_xgb[:len(indices_train), :]
obj_sparse_xgb_test = obj_sparse_xgb[len(indices_train):, :]

In [31]:
obj_sparse_and_train = obj_sparse_and[:len(indices_train), :]
obj_sparse_and_test = obj_sparse_and[len(indices_train):, :]

In [32]:
with open('cat_sparse_rf_or_xgb_train2.dat', 'wb') as outfile1:
    pickle.dump(obj_sparse_or_train, outfile1, protocol =2)

In [33]:
with open('cat_sparse_rf_or_xgb_test2.dat', 'wb') as outfile2:
    pickle.dump(obj_sparse_or_test, outfile2, protocol =2)

In [34]:
with open('cat_sparse_rf_train2.dat', 'wb') as outfile3:
    pickle.dump(obj_sparse_rf_train, outfile3, protocol =2)

In [35]:
with open('cat_sparse_rf_test2.dat', 'wb') as outfile4:
    pickle.dump(obj_sparse_rf_test, outfile4, protocol =2)

In [36]:
with open('cat_sparse_xgb_train2.dat', 'wb') as outfile5:
    pickle.dump(obj_sparse_xgb_train, outfile5, protocol =2)

In [37]:
with open('cat_sparse_xgb_test2.dat', 'wb') as outfile6:
    pickle.dump(obj_sparse_xgb_test, outfile6, protocol =2)

In [38]:
with open('cat_sparse_rf_and_xgb_train2.dat', 'wb') as outfile7:
    pickle.dump(obj_sparse_and_train, outfile7, protocol =2)

In [39]:
with open('cat_sparse_rf_and_xgb_test2.dat', 'wb') as outfile8:
    pickle.dump(obj_sparse_and_test, outfile8, protocol =2)

In [40]:
cat_cols_xgb

['VAR_1934', 'VAR_0237', 'VAR_0274', 'VAR_0005', 'VAR_0342']

In [41]:
cat_cols_rf

['VAR_0467',
 'VAR_0232',
 'VAR_1934',
 'VAR_0354',
 'VAR_0352',
 'VAR_0353',
 'VAR_0005']

In [42]:
obj_sparse_or_test.shape

(145232, 181)