# Feature Combination

In [62]:
import pandas as pd
import numpy as np
import xgboost as xgb
import seaborn as sns
from scipy.stats import skew, boxcox
from scipy import sparse
from sklearn.model_selection import KFold,GridSearchCV
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics, preprocessing
import time
import matplotlib.pyplot as plt
%matplotlib inline

## Load data

In [63]:
# Read the three project CSV files, then record the row count of each table.
train_label = pd.read_csv('../ProjectFiles/File1.csv')
train = pd.read_csv('../ProjectFiles/File2.csv')
test = pd.read_csv('../ProjectFiles/File3.csv')

train_label_size, train_size, test_size = (
    len(frame) for frame in (train_label, train, test)
)

In [64]:
# Place the columns of `test` and `train` side by side (axis=1 aligns rows on
# the index), then keep only the first occurrence of any repeated column name.
df_test = (
    pd.concat([test, train], axis=1)
    .pipe(lambda frame: frame.loc[:, ~frame.columns.duplicated()])
)
df_test.shape

(29231, 149)

In [65]:
# Append the columns of `train_label` to the combined feature frame and again
# drop any column whose name has already appeared.
df_full = (
    pd.concat([df_test, train_label], axis=1)
    .pipe(lambda frame: frame.loc[:, ~frame.columns.duplicated()])
)
df_full.shape

(29231, 150)

In [66]:
# Encode the target: "Spend to Improve Economy" -> 1,
# "Reduce National Debt and Deficit" -> 0.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `df_full.SPENDINGRESPONSE`: that chained form
# mutates an intermediate Series, which pandas warns about and which no longer
# updates `df_full` once Copy-on-Write is enabled.
df_full['SPENDINGRESPONSE'] = df_full['SPENDINGRESPONSE'].replace(
    ["Spend to Improve Economy", "Reduce National Debt and Deficit"], [1, 0]
)

In [71]:
# Names of the identifier and target columns; both are excluded from the
# numerical feature list below.
id_col = 'ID'
target_col = 'SPENDINGRESPONSE'

# Split the columns by dtype: object -> categorical; int64 followed by
# float64 -> numerical (that order is kept in the resulting list).
data_types = df_full.dtypes
cat_cols = data_types[data_types == 'object'].index.tolist()
con_cols = (
    data_types[data_types == 'int64'].index.tolist()
    + data_types[data_types == 'float64'].index.tolist()
)

# list.remove raises ValueError if either column is not among the numeric ones.
con_cols.remove(id_col)
con_cols.remove(target_col)

print("# of Categorical features:", len(cat_cols))
print("Categorical features:", cat_cols)
print("# of Numerical features:", len(con_cols))
print("Numerical features:", con_cols)
print("ID: %s, target: %s" % (id_col, target_col))

# of Categorical features: 24
Categorical features: ['State', 'f1', 'f3', 'f12', 'f13', 'f95', 'f96', 'f97', 'f98', 'f99', 'f100', 'f101', 'f102', 'f103', 'f108', 'f110', 'f114', 'f115', 'f118', 'f119', 'f120', 'f121', 'f122', 'f126']
# of Numerical features: 124
Numerical features: ['f2', 'f93', 'f94', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50', 'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60', 'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69', 'f70', 'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'f78', 'f79', 'f80', 'f81', 'f82', 'f83', 'f84', 'f85', 'f86', 'f87', 'f88', 'f89', 'f90', 'f91', 'f92', 'f104', 'f105', 'f106', 'f107', 'f109', 'f111', 'f112', 'f113', 'f116', 'f117', 'f123', 'f12

In [72]:
# Categorical columns with at least 15 distinct values (value_counts ignores
# missing values); these are kept out of the pairwise combinations below.
freq_encod_cat = [
    col for col in cat_cols
    if len(df_full[col].value_counts()) >= 15
]
freq_encod_cat

['State', 'f1', 'f3', 'f108', 'f110', 'f114', 'f115', 'f121', 'f122', 'f126']

## Feature combination

### Categorical features

In [73]:
# Keep only the categorical columns that were not flagged in freq_encod_cat.
# Note: this rebinds cat_cols, so the cell is not a no-op if the dtype-split
# cell is re-run afterwards.
cat_cols = list(filter(lambda col: col not in freq_encod_cat, cat_cols))

In [74]:
import itertools

# Add one new column per unordered pair of the remaining categorical columns.
# The new value is the element-wise `+` of the two columns, so a row that is
# missing in either source column is missing in the combined column as well.
for left, right in itertools.combinations(cat_cols, 2):
    feat = "_".join((left, right))
    df_full[feat] = df_full[left] + df_full[right]
    print('Combining Columns:', feat)

Combining Columns: f12_f13
Combining Columns: f12_f95
Combining Columns: f12_f96
Combining Columns: f12_f97
Combining Columns: f12_f98
Combining Columns: f12_f99
Combining Columns: f12_f100
Combining Columns: f12_f101
Combining Columns: f12_f102
Combining Columns: f12_f103
Combining Columns: f12_f118
Combining Columns: f12_f119
Combining Columns: f12_f120
Combining Columns: f13_f95
Combining Columns: f13_f96
Combining Columns: f13_f97
Combining Columns: f13_f98
Combining Columns: f13_f99
Combining Columns: f13_f100
Combining Columns: f13_f101
Combining Columns: f13_f102
Combining Columns: f13_f103
Combining Columns: f13_f118
Combining Columns: f13_f119
Combining Columns: f13_f120
Combining Columns: f95_f96
Combining Columns: f95_f97
Combining Columns: f95_f98
Combining Columns: f95_f99
Combining Columns: f95_f100
Combining Columns: f95_f101
Combining Columns: f95_f102
Combining Columns: f95_f103
Combining Columns: f95_f118
Combining Columns: f95_f119
Combining Columns: f95_f120
Combini

In [75]:
# Preview the first rows of df_full, which now also holds the pairwise
# combined columns.
df_full.head()

Unnamed: 0,ID,State,f1,f2,f3,f4,f5,f6,f7,f8,...,f102_f103,f102_f118,f102_f119,f102_f120,f103_f118,f103_f119,f103_f120,f118_f119,f118_f120,f119_f120
0,3094,AK,AK01,69,E,61.0,55.33333,66.0,52.33333,49.25,...,AA,AG,,,AG,,,,,
1,13856,AK,AK01,59,M,61.0,55.33333,66.0,52.33333,49.25,...,BA,BG,BR,BP,AG,AR,AP,GR,GP,RP
2,16213,AK,AK01,63,D,61.0,55.33333,66.0,52.33333,49.25,...,AJ,AG,,AP,JG,,JP,,GP,
3,17196,AK,AK01,55,D,61.0,55.33333,66.0,52.33333,49.25,...,AJ,AG,AN,AP,JG,JN,JP,GN,GP,NP
4,17762,AK,AK01,75,D,61.0,55.33333,66.0,52.33333,49.25,...,AJ,AG,AN,AC,JG,JN,JC,GN,GC,NC


In [76]:
# Re-scan the dtypes after the pairwise columns were added and collect every
# object-typed column, in column order.
data_types_2 = df_full.dtypes
cat_cols_2 = [
    name for name, dtype in data_types_2.items()
    if dtype == 'object'
]

In [77]:
# Show every object-dtype column name followed by how many there are.
print(cat_cols_2, len(cat_cols_2))

['State', 'f1', 'f3', 'f12', 'f13', 'f95', 'f96', 'f97', 'f98', 'f99', 'f100', 'f101', 'f102', 'f103', 'f108', 'f110', 'f114', 'f115', 'f118', 'f119', 'f120', 'f121', 'f122', 'f126', 'f12_f13', 'f12_f95', 'f12_f96', 'f12_f97', 'f12_f98', 'f12_f99', 'f12_f100', 'f12_f101', 'f12_f102', 'f12_f103', 'f12_f118', 'f12_f119', 'f12_f120', 'f13_f95', 'f13_f96', 'f13_f97', 'f13_f98', 'f13_f99', 'f13_f100', 'f13_f101', 'f13_f102', 'f13_f103', 'f13_f118', 'f13_f119', 'f13_f120', 'f95_f96', 'f95_f97', 'f95_f98', 'f95_f99', 'f95_f100', 'f95_f101', 'f95_f102', 'f95_f103', 'f95_f118', 'f95_f119', 'f95_f120', 'f96_f97', 'f96_f98', 'f96_f99', 'f96_f100', 'f96_f101', 'f96_f102', 'f96_f103', 'f96_f118', 'f96_f119', 'f96_f120', 'f97_f98', 'f97_f99', 'f97_f100', 'f97_f101', 'f97_f102', 'f97_f103', 'f97_f118', 'f97_f119', 'f97_f120', 'f98_f99', 'f98_f100', 'f98_f101', 'f98_f102', 'f98_f103', 'f98_f118', 'f98_f119', 'f98_f120', 'f99_f100', 'f99_f101', 'f99_f102', 'f99_f103', 'f99_f118', 'f99_f119', 'f99_f120'

In [78]:
# Drop both the columns listed in freq_encod_cat and the original low-cardinality
# columns in cat_cols, leaving only the remaining object-typed columns.
cat_cols_2 = [
    col for col in cat_cols_2
    if col not in freq_encod_cat and col not in cat_cols
]

In [79]:
# Number of columns remaining in cat_cols_2.
print(len(cat_cols_2))

91


In [80]:
# Label-encode every column in cat_cols_2 in place. Values are cast to str
# first, so missing values become their string form and get their own code.
# LE_map keeps, per column, the mapping from each level to its integer code.
LBL = preprocessing.LabelEncoder()
start = time.time()
LE_map = {}
for cat_col in cat_cols_2:
    encoded = LBL.fit_transform(df_full[cat_col].astype(str))
    df_full[cat_col] = encoded
    level_codes = LBL.transform(LBL.classes_)
    LE_map[cat_col] = dict(zip(LBL.classes_, level_codes))
elapsed = time.time() - start
print('Label enconding finished in %f seconds' % elapsed)

Label enconding finished in 2.045571 seconds


In [81]:
# One-hot encode the label-encoded columns into a dense array.
# The encoder is created with its defaults and the sparse result is densified
# with .toarray(): the `sparse=False` keyword was renamed to `sparse_output`
# and later removed from scikit-learn, so this form runs on both old and new
# releases while producing the same dense matrix.
OHE = preprocessing.OneHotEncoder()
start = time.time()
OHE.fit(df_full[cat_cols_2])
OHE_data = OHE.transform(df_full[cat_cols_2]).toarray()
print ('One-hot-encoding finished in %f seconds' % (time.time()-start))

# Column names are "<column>_<level>", with spaces in the level replaced by "_".
# Levels are read from LE_map in insertion order (the LabelEncoder class order);
# this presumably matches the encoder's per-column output order because the
# integer codes were assigned in that same order -- verify if either encoder changes.
OHE_vars = [
    var + '_' + str(level).replace(' ', '_')
    for var in cat_cols_2
    for level in LE_map[var]
]

# Guard against a silent mismatch between generated names and encoded columns.
assert len(OHE_vars) == OHE_data.shape[1], \
    "number of one-hot column names does not match the encoded matrix width"

print (OHE_data.shape)

One-hot-encoding finished in 2.152405 seconds
(29231, 3967)


There is no obvious relationship among the numerical features, and the feature importance from XGBoost shows that numerical features dominate the model. The feature combination here is limited to categorical features.

In [88]:
# Final frame: the ID column followed by the one-hot columns.
# The one-hot frame is given df_full's own index because pd.concat(axis=1)
# aligns rows by index label; a fresh default index would only line up when
# df_full happens to be indexed 0..n-1.
# Note: this rebinds df_full, so the earlier feature columns are no longer
# available under that name after this cell runs.
df_full = pd.concat(
    (
        df_full['ID'],
        pd.DataFrame(OHE_data, columns=OHE_vars, index=df_full.index),
    ),
    axis=1,
)

In [91]:
# Write the ID column plus the one-hot columns to CSV, omitting the index.
df_full.to_csv('../data/input_cat_comb1.csv', index=False)