In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm; tqdm.pandas()
# Raise pandas' display limits to 202 columns / 300 rows for notebook output.
pd.set_option("display.max_columns", 202)
pd.set_option("display.max_rows", 300)

In [2]:
# Load the pickled train and test frames from the local data directory.
train = pd.read_pickle('data/train_original.pkl')
test = pd.read_pickle('data/test_original.pkl')

# Partition train's columns by dtype: float64 columns are the features,
# every other column is kept aside as "special".
special_cols = [name for name, dtype in train.dtypes.items() if dtype != np.float64]
feature_cols = [name for name, dtype in train.dtypes.items() if dtype == np.float64]

In [3]:
# Stack train rows on top of test rows (feature columns only) under a fresh
# 0..n-1 index, so the first train.shape[0] rows are the train rows.
all_df = pd.concat(
    [train[feature_cols], test[feature_cols]],
    ignore_index=True,
)

In [4]:
# Build the per-column '<col>_pos_pos_diff' / '<col>_pos_neg_diff' features
# from a running signed tally of the values seen in earlier feature columns.
#
# `cumulative_vc` is indexed by positive value. Each occurrence of +v in a
# processed column adds 1 to the tally at v; each occurrence of -v subtracts 1.
# It is seeded from the strictly positive values of the first feature column
# only (that column's zeros and negative values are not counted).
cumulative_vc = all_df[feature_cols[0]].value_counts()
cumulative_vc = cumulative_vc[cumulative_vc.index > 0]

for col in tqdm(feature_cols[1:]):
    pos_filt, neg_filt = all_df[col] > 0, all_df[col] < 0
    vc = all_df[col].value_counts()

    # Positive rows: tally accumulated so far for the same value (0 if the
    # value has not been seen) minus the value's count within this column.
    # Rows outside the mask are left as NaN in the new column.
    all_df.loc[pos_filt, col + '_pos_pos_diff'] = (
        all_df.loc[pos_filt, col].map(cumulative_vc).fillna(0)
        - all_df.loc[pos_filt, col].map(vc)
    )

    # Negative rows: same computation, but the tally is looked up under the
    # mirrored (sign-flipped) value.
    all_df.loc[neg_filt, col + '_pos_neg_diff'] = (
        (all_df.loc[neg_filt, col] * -1).map(cumulative_vc).fillna(0)
        - all_df.loc[neg_filt, col].map(vc)
    )

    pos_idx, neg_idx = vc.index[vc.index > 0], vc.index[vc.index < 0]

    # Fold this column's positive values into the tally: add counts for values
    # already tracked, then attach the values seen for the first time.
    comm_idx = np.intersect1d(pos_idx, cumulative_vc.index)
    new_idx = pos_idx[~pos_idx.isin(comm_idx)]
    cumulative_vc.loc[comm_idx] += vc.loc[comm_idx]
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    cumulative_vc = pd.concat([cumulative_vc, vc.loc[new_idx]])

    # Fold the negative values in under their mirrored positive key: subtract
    # counts for tracked values, and start untracked ones at minus their count.
    comm_idx = np.intersect1d(-1 * neg_idx, cumulative_vc.index)
    new_idx = neg_idx[~neg_idx.isin(-1 * comm_idx)]
    cumulative_vc.loc[comm_idx] -= vc.loc[-1 * comm_idx].values
    new_vc = pd.Series(index=new_idx * -1, data=vc.loc[new_idx].values * -1)
    cumulative_vc = pd.concat([cumulative_vc, new_vc])

100%|████████████████████████████████████████████████████████████████████████████| 199/199 [09:24<00:00,  4.12s/it]


In [5]:
# Gather every generated '*_diff' column and show how many distinct
# non-null values each one holds.
feat_cols = [name for name in all_df.columns if '_diff' in name]
all_df.loc[:, feat_cols].nunique()

var_1_pos_pos_diff       36
var_1_pos_neg_diff       38
var_2_pos_pos_diff       41
var_2_pos_neg_diff        0
var_3_pos_pos_diff       57
var_3_pos_neg_diff        2
var_4_pos_pos_diff       62
var_4_pos_neg_diff        0
var_5_pos_pos_diff       67
var_5_pos_neg_diff       70
var_6_pos_pos_diff       79
var_6_pos_neg_diff        0
var_7_pos_pos_diff       68
var_7_pos_neg_diff        0
var_8_pos_pos_diff       80
var_8_pos_neg_diff       80
var_9_pos_pos_diff       80
var_9_pos_neg_diff        0
var_10_pos_pos_diff      95
var_10_pos_neg_diff      95
var_11_pos_pos_diff      95
var_11_pos_neg_diff      97
var_12_pos_pos_diff     149
var_12_pos_neg_diff       0
var_13_pos_pos_diff     170
var_13_pos_neg_diff      59
var_14_pos_pos_diff     112
var_14_pos_neg_diff       0
var_15_pos_pos_diff     183
var_15_pos_neg_diff       0
var_16_pos_pos_diff     183
var_16_pos_neg_diff       0
var_17_pos_pos_diff     130
var_17_pos_neg_diff     195
var_18_pos_pos_diff     198
var_18_pos_neg_diff 

In [7]:
# Source columns (the first column is skipped) whose '_pos_neg_diff' feature
# has no non-null value at all.
unique_cols = [
    name
    for name in all_df.columns[1:]
    if '_diff' not in name and all_df[name + '_pos_neg_diff'].nunique() == 0
]

# Print the counts of negative values for each such column.
for col in unique_cols:
    vc = all_df[col].value_counts()
    print(col, vc.loc[vc.index < 0])

var_2 Series([], Name: var_2, dtype: int64)
var_4 Series([], Name: var_4, dtype: int64)
var_6 Series([], Name: var_6, dtype: int64)
var_7 Series([], Name: var_7, dtype: int64)
var_9 Series([], Name: var_9, dtype: int64)
var_12 Series([], Name: var_12, dtype: int64)
var_14 Series([], Name: var_14, dtype: int64)
var_15 Series([], Name: var_15, dtype: int64)
var_16 Series([], Name: var_16, dtype: int64)
var_23 Series([], Name: var_23, dtype: int64)
var_25 Series([], Name: var_25, dtype: int64)
var_28 Series([], Name: var_28, dtype: int64)
var_31 Series([], Name: var_31, dtype: int64)
var_33 Series([], Name: var_33, dtype: int64)
var_34 Series([], Name: var_34, dtype: int64)
var_42 Series([], Name: var_42, dtype: int64)
var_43 Series([], Name: var_43, dtype: int64)
var_46 Series([], Name: var_46, dtype: int64)
var_50 Series([], Name: var_50, dtype: int64)
var_53 Series([], Name: var_53, dtype: int64)
var_56 Series([], Name: var_56, dtype: int64)
var_57 Series([], Name: var_57, dtype: int64

In [8]:
# Keep only the '*_diff' columns that contain at least one non-null value.
selected_feats = [
    name
    for name in all_df.columns
    if '_diff' in name and all_df[name].nunique() > 0
]
len(selected_feats)

331

In [10]:
# Split the selected features back into train / test by row position (the
# first len(train) rows of all_df are the train rows) and pickle each part;
# the test part gets a fresh 0-based index.
n_train = len(train)
all_df[selected_feats].iloc[:n_train].to_pickle('features/buy_sell_feat_train.pkl')
all_df[selected_feats].iloc[n_train:].reset_index(drop=True).to_pickle('features/buy_sell_feat_test.pkl')