# feature selection & modelling 

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import featuretools as ft




In [2]:
print(pd.__version__)
print(np.__version__)
print(ft.__version__)

1.1.5
1.19.2
0.23.1


In [72]:
df_target_merged = pd.read_csv('./dataset/target_merged.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [73]:
df_user_feature = pd.read_csv('./dataset/user_feature_total_remove_highcorr.csv')
df_merchant_feature = pd.read_csv('./dataset/merchant_feature_total_remove_highcorr.csv')
df_coupon_feature = pd.read_csv('./dataset/coupon_feature_total_remove_highcorr.csv')

In [74]:
df_user_feature = df_user_feature.add_prefix('user_')
df_merchant_feature = df_merchant_feature.add_prefix('merchant_')
df_coupon_feature = df_coupon_feature.add_prefix('coupon_')

## feature seletion 
To reduce modelling time and avoid the curse of dimensionality, feature selection is necessary for our project.

Traditional Methods of feature selection:

1. remove features with high missing rate 
2. remove features with singal value 
3. remove high correlated features 
4. univariate statistical test based 
5. model based

In [94]:
print('# user feature {}'.format(df_user_feature.shape[1]-2))
print('# merchant feature {}'.format(df_merchant_feature.shape[1]-2))
print('# coupon feature {}'.format(df_coupon_feature.shape[1]-2))

# user feature 73
# merchant feature 60
# coupon feature 39


In [95]:
df_target_merged[df_target_merged.data == 'Train']

Unnamed: 0,User_id,Merchant_id,Coupon_id,Date_received,Date,datediff,target,Month_received,data
0,1439408,4663,11002.0,2016-05-28,,,0.0,2016-05-01,Train
1,1439408,2632,8591.0,2016-02-17,,,0.0,2016-02-01,Train
2,1439408,2632,1078.0,2016-03-19,,,0.0,2016-03-01,Train
3,1439408,2632,8591.0,2016-06-13,,,0.0,2016-06-01,Train
4,1439408,2632,8591.0,2016-05-16,2016-06-13,28.0,-1.0,2016-05-01,Train
...,...,...,...,...,...,...,...,...,...
1053277,212662,3021,3739.0,2016-05-04,2016-05-08,4.0,1.0,2016-05-01,Train
1053278,212662,2934,5686.0,2016-03-21,2016-03-22,1.0,1.0,2016-03-01,Train
1053279,212662,3021,3739.0,2016-05-08,2016-06-02,25.0,-1.0,2016-05-01,Train
1053280,752472,7113,1633.0,2016-06-13,,,0.0,2016-06-01,Train


In [96]:
def cast_categorical_variable(df):
    cate_feat = list(
    filter(lambda x: 'MODE' in x , 
           df.columns)
    )
    df[cate_feat] = df[cate_feat].astype(object)
    return df

In [97]:
df_user_feature = cast_categorical_variable(df_user_feature)
df_merchant_feature = cast_categorical_variable(df_merchant_feature)
df_coupon_feature = cast_categorical_variable(df_coupon_feature)

In [98]:
df_train_target = df_target_merged[(df_target_merged['data'] == 'Train') & 
                                  (df_target_merged['target'] >= 0)][['User_id', 'Merchant_id',
                                     'Coupon_id', 'datediff','target','Month_received']]

In [140]:
df_train_target

Unnamed: 0,User_id,Merchant_id,Coupon_id,datediff,target,Month_received
0,1439408,4663,11002.0,,0.0,2016-05-01
1,1439408,2632,8591.0,,0.0,2016-02-01
2,1439408,2632,1078.0,,0.0,2016-03-01
3,1439408,2632,8591.0,,0.0,2016-06-01
5,1832624,3381,7610.0,,0.0,2016-04-01
...,...,...,...,...,...,...
1053276,212662,3532,5267.0,,0.0,2016-03-01
1053277,212662,3021,3739.0,4.0,1.0,2016-05-01
1053278,212662,2934,5686.0,1.0,1.0,2016-03-01
1053280,752472,7113,1633.0,,0.0,2016-06-01


### statistical based feature selection
Now, we have about 180 variables with some null values and categorical variables. In this part, we will use WoE(Weight of Evidence) transformation to save us from encoding and imputing, which is widely used in research credit risk world.

blog: https://multithreaded.stitchfix.com/blog/2015/08/13/weight-of-evidence/

Steps:
1. to split (a continuous) variable into few categories or to group (a discrete) variable into few categories (and in both cases you assume that all observations in one category have "same" effect on dependent variable)
2. to calculate WoE value for each category (then the original x values are replaced by the WoE values)

Prons: (from the blog)
1. Seamlessly compare the strength of continuous and categorical variables without creating dummy variables.
2. Seamlessly handle missing values without imputation.
3. Assess the predictive power of missing values.

In this part, we use Python Package toad for WoE transformation and IV calculating.

Doc: https://toad.readthedocs.io/en/stable/

github: https://github.com/amphibian-dev/toad

In [99]:
import toad

from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split

In [100]:
def woeTransform(df):
    ### binning data use chi2 binning algorithm, the minimum threshold in each bin is 2.5%
    ### return a combiner 
    c = toad.transform.Combiner() 
    df_binned = c.fit_transform(df, y = 'target', method = 'chi', min_samples = 0.025, n_bins=10) 
    woe_trans = toad.transform.WOETransformer()
    df_woe = woe_trans.fit_transform(df_binned,df_binned['target'],exclude=['target'])
    return c,woe_trans, df_woe

In [101]:
df_user = df_train_target.merge(right=df_user_feature, how='left', 
                                left_on= ['User_id','Month_received'],
                               right_on=['user_User_id','user_time'],
                                )
df_merchant_user = df_user.merge(right=df_merchant_feature, how='left',
                                left_on=['Merchant_id','Month_received'],
                               right_on=['merchant_Merchant_id','merchant_time'],
                                )
df_total = df_merchant_user.merge(right=df_coupon_feature, how='left',
                                left_on=['Coupon_id','Month_received'],
                               right_on=['coupon_Coupon_id','coupon_time'],
                                )

In [102]:
del df_user
del df_merchant_user

In [103]:
df_total

Unnamed: 0,User_id,Merchant_id,Coupon_id,datediff,target,Month_received,user_User_id,user_time,"user_AVG_TIME_BETWEEN(offline.Date, unit=days)","user_AVG_TIME_BETWEEN(offline.Date_received, unit=days)",...,coupon_NUM_UNIQUE(offline.WEEKDAY(Date)),coupon_NUM_UNIQUE(offline.WEEKDAY(Date_received)),coupon_NUM_UNIQUE(offline.WEEKDAY(Date_received) WHERE use_coupon = 1),coupon_SUM(offline.Distance WHERE use_coupon = 1),coupon_SUM(offline.Distance WHERE discount_type = 0.0),coupon_SUM(offline.datediff WHERE discount_type = 0.0),coupon_SUM(offline.promotion_amonut WHERE use_coupon = 1),coupon_SUM(offline.promotion_condition WHERE use_coupon = 1),coupon_SUM(offline.purchase WHERE discount_type = 1.0),coupon_SUM(offline.purchase WHERE discount_type = 0.0)
0,1439408,4663,11002.0,,0.0,2016-05-01,1439408.0,2016-05-01,,31.0,...,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1439408,2632,8591.0,,0.0,2016-02-01,,,,,...,,,,,,,,,,
2,1439408,2632,1078.0,,0.0,2016-03-01,,,,,...,,,,,,,,,,
3,1439408,2632,8591.0,,0.0,2016-06-01,1439408.0,2016-06-01,,70.0,...,2.0,7.0,3.0,3.0,0.0,0.0,3.0,60.0,3.0,0.0
4,1832624,3381,7610.0,,0.0,2016-04-01,1832624.0,2016-04-01,,,...,4.0,6.0,4.0,2.0,0.0,0.0,160.0,1600.0,8.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1042290,212662,3532,5267.0,,0.0,2016-03-01,,,,,...,,,,,,,,,,
1042291,212662,3021,3739.0,4.0,1.0,2016-05-01,212662.0,2016-05-01,5.5,0.5,...,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1042292,212662,2934,5686.0,1.0,1.0,2016-03-01,,,,,...,,,,,,,,,,
1042293,752472,7113,1633.0,,0.0,2016-06-01,752472.0,2016-06-01,,,...,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [104]:
df_train_target.shape

(1042295, 6)

In [105]:
features_list = list(df_user_feature.columns)[2:]+list(df_merchant_feature.columns)[2:]+list(df_coupon_feature.columns)[2:]

In [106]:
df_woe = df_total[features_list + ['target']]

In [108]:
#Conduct preliminary feature selection according to missing percentage, IV and correlation (with other features)
train_selected, dropped = toad.selection.select(df_woe,target = 'target',
                                                empty = 0.95, iv = 0.01, corr = 0.8,
                                                return_drop=True)
print(dropped)
print(train_selected.shape)

{'empty': array(['user_MEAN(online.uniform_discount_rate)',
       'user_MODE(online.Discount_rate WHERE Action = 2)',
       'user_NUM_UNIQUE(online.Merchant_id WHERE Action = 2)',
       'user_NUM_UNIQUE(online.WEEKDAY(Date_received) WHERE Action = 2)'],
      dtype='<U63'), 'iv': array(['user_MEAN(online.promotion_amonut)',
       'user_MEAN(online.promotion_condition)',
       'user_MODE(online.Discount_rate)',
       'user_MEAN(online.use_coupon WHERE Action = 1)',
       'user_MODE(online.WEEKDAY(Date) WHERE Action = 1)',
       'user_MODE(online.WEEKDAY(Date_received))',
       'user_NUM_UNIQUE(online.Coupon_id WHERE Action = 1)',
       'user_NUM_UNIQUE(online.Merchant_id WHERE Action = 1)',
       'user_NUM_UNIQUE(online.WEEKDAY(Date) WHERE Action = 1)',
       'user_NUM_UNIQUE(online.WEEKDAY(Date_received) WHERE Action = 1)',
       'merchant_MIN(merchant_offline.Distance WHERE discount_type = 1.0)'],
      dtype=object), 'corr': array(['merchant_SUM(merchant_offline.Distance

In [None]:
combiner,woe_transformer,transformed = woeTransform(train_selected)

In [None]:
transformed.to_csv('./dataset/woe_feature_train.csv',index=False)

### MODEL based selection

Use Transformed feature for LogisticRegression, Random Forest, XGBoost

In [117]:
X = transformed.drop('target',axis=1)
y = transformed['target']

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state = 1024,test_size=0.3)

In [118]:
print(X_train.shape)
print(X_test.shape)
print('Positive rate in train set:{:.2%}'.format(y_train.mean()))
print('Positive rate in validation set:{:.2%}'.format(y_test.mean()))

(729606, 116)
(312689, 116)
Positive rate in train set:6.19%
Positive rate in validation set:6.15%


#### LogisticRegression

In [119]:
selector = SelectFromModel(estimator=
                           LogisticRegression(solver='lbfgs',class_weight='balanced', max_iter=5000)
                          ,threshold='mean').fit(
    StandardScaler().fit_transform(X_train), y_train)

In [120]:
lr_features = set(X_train.columns[selector.get_support()])

In [121]:
len(lr_features)

39

#### Random Forest

In [122]:
rf = RandomForestClassifier(n_estimators=200, verbose=1, n_jobs=-1)

In [123]:
rf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  1.6min finished


RandomForestClassifier(n_estimators=200, n_jobs=-1, verbose=1)

In [124]:
rf_features = set(X_train.columns[rf.feature_importances_ > np.mean(rf.feature_importances_)])

In [125]:
len(rf_features)

50

#### XGBoost

In [126]:
xgb = XGBClassifier(n_estimators=200, max_depth=3,n_jobs=-1)

In [127]:
xgb.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=-1, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [128]:
xgb_features = set(X_train.columns[xgb.feature_importances_ >  np.mean(xgb.feature_importances_)])

In [132]:
(lr_features.union(rf_features.union(xgb_features))).difference(set(features_list))

set()

In [81]:
result_features = list(lr_features.union(rf_features.union(xgb_features)))

In [48]:
from collections import defaultdict
selected_count = defaultdict(int)

In [49]:
for f in lr_features:
    selected_count[f]+=1
for f in rf_features:
    selected_count[f]+=1
for f in xgb_features:
    selected_count[f]+=1

In [141]:
df_target_total = df_target_merged[['User_id', 'Merchant_id',
                                     'Coupon_id', 'datediff','target','Month_received','data']]

In [142]:
df_target_total

Unnamed: 0,User_id,Merchant_id,Coupon_id,datediff,target,Month_received,data
0,1439408,4663,11002.0,,0.0,2016-05-01,Train
1,1439408,2632,8591.0,,0.0,2016-02-01,Train
2,1439408,2632,1078.0,,0.0,2016-03-01,Train
3,1439408,2632,8591.0,,0.0,2016-06-01,Train
4,1439408,2632,8591.0,28.0,-1.0,2016-05-01,Train
...,...,...,...,...,...,...,...
1166917,5828093,5717,10418.0,,,2016-07-01,Test
1166918,6626813,1699,7595.0,,,2016-07-01,Test
1166919,6626813,7321,7590.0,,,2016-07-01,Test
1166920,4547069,760,13602.0,,,2016-07-01,Test


In [143]:
df_user = df_target_total.merge(right=df_user_feature, how='left', 
                                left_on= ['User_id','Month_received'],
                               right_on=['user_User_id','user_time'],
                                )
df_merchant_user = df_user.merge(right=df_merchant_feature, how='left',
                                left_on=['Merchant_id','Month_received'],
                               right_on=['merchant_Merchant_id','merchant_time'],
                                )
df_total = df_merchant_user.merge(right=df_coupon_feature, how='left',
                                left_on=['Coupon_id','Month_received'],
                               right_on=['coupon_Coupon_id','coupon_time'],
                                )

In [144]:
df_feature_selected = df_total[['User_id', 'Merchant_id','Coupon_id', 
                                'datediff','target','Month_received','data'] 
                               + result_features]

In [92]:
df_feature_selected.to_csv('feature_selected_train&test.csv',index=False)

In [145]:
df_total_woe = woe_transformer.transform(df_total[features_list])

  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  mask &= (ar1 != a)
  res[X == value[i]] = woe[i]
  res[X == value[i]] = woe[i]
  res[X == value[i]] = woe[i]
  res[X == value[i]] = woe[i]
  res[X == value[i]] = woe[i]
  res[X == value[i]] = woe[i]
  res[X == value[i]] = woe[i]
  res[X == value[i]] = woe[i]
  res[X == value[i]] = woe[i]
  res[X == value[i]] = woe[i]
  res[X == value[i]] = woe[i]
  res[X == value[i]] = woe[i]
  res[X ==