In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm; tqdm.pandas()
# Widen pandas' display limits so wide frames and long listings are shown in
# full rather than truncated with "...".
pd.set_option('display.max_columns', 202)
pd.set_option('display.max_rows', 300)

In [2]:
# Load the cached train/test frames.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
train = pd.read_pickle('data/train_original.pkl')
test = pd.read_pickle('data/test_original.pkl')

# Partition train's columns by dtype: float64 columns are the model features,
# every other column (presumably the row ID and the target - TODO confirm)
# is treated as "special". Column order is preserved in both lists.
special_cols, feature_cols = [], []
for col_name, col_dtype in train.dtypes.items():
    if col_dtype == np.float64:
        feature_cols.append(col_name)
    else:
        special_cols.append(col_name)

In [3]:
# Drop the first non-float column (presumably the row identifier - TODO confirm)
# from both frames. errors='ignore' keeps this cell re-runnable: on a second
# run the column is already gone and the drop becomes a no-op instead of
# raising KeyError.
id_col = special_cols[0]
train = train.drop(columns=id_col, errors='ignore')
test = test.drop(columns=id_col, errors='ignore')

# Not used by the visible cells; kept so any later cell relying on it still works.
unique_samples = []

# Materialise the test matrix once instead of calling `.values` per feature.
test_values = test.values

# unique_count[i, j] becomes 1 when row i holds a value that occurs exactly
# once in column j of the test set, and stays 0 otherwise.
unique_count = np.zeros_like(test_values)
for feature in tqdm(range(test_values.shape[1])):
    # index_ holds the first-occurrence row of every distinct value and
    # count_ its frequency; a frequency of 1 marks a value unique to that row.
    _, index_, count_ = np.unique(test_values[:, feature], return_counts=True, return_index=True)
    unique_count[index_[count_ == 1], feature] += 1

# Rows with at least one unique value are treated as real samples; rows with
# no unique value in any column are treated as synthetic (fake) ones.
unique_values_per_row = unique_count.sum(axis=1)
real_samples_indexes = np.flatnonzero(unique_values_per_row > 0)
synthetic_samples_indexes = np.flatnonzero(unique_values_per_row == 0)

# Keep only the real test rows, renumbered from 0.
real_test = test.iloc[real_samples_indexes].reset_index(drop=True)

100%|████████████████████████████████████████████████████████████████████████████| 200/200 [00:06<00:00, 30.05it/s]


In [4]:
# For every raw feature, encode each value as the difference between its own
# frequency and the frequency of its mirror-ranked value (k-th smallest vs.
# k-th largest). Frequencies are counted over train plus the real test rows.
for col in tqdm(feature_cols):
    # Series.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent and also works on older versions.
    combined = pd.concat([train[col], real_test[col]])

    # value_counts() drops NaN by default; sort_index() orders the counts by
    # the feature value itself rather than by frequency.
    counts = combined.value_counts().sort_index(ascending=True)

    # Reversing the sorted counts lines each value up with its opposite rank.
    opposite_counts = counts.values[::-1]
    count_opposite_diff = counts - opposite_counts

    # Values absent from the lookup (e.g. values that occur only in the
    # filtered-out test rows) map to NaN.
    train[col + '_cnt_enc'] = train[col].map(count_opposite_diff)
    test[col + '_cnt_enc'] = test[col].map(count_opposite_diff)

100%|████████████████████████████████████████████████████████████████████████████| 200/200 [00:18<00:00,  5.41it/s]


In [5]:
# Names of the per-feature encoded columns, in feature order.
cnt_enc_cols = [f'{name}_cnt_enc' for name in feature_cols]

# Row-wise sum and standard deviation across the encoded columns only; the
# selection is taken before the aggregates are added, so they never feed
# into each other.
for frame in (train, test):
    encoded = frame[cnt_enc_cols]
    frame['cnt_enc_sum'] = encoded.sum(axis=1)
    frame['cnt_enc_std'] = encoded.std(axis=1)

# Full list of engineered features: encoded columns followed by the aggregates.
new_feats = cnt_enc_cols + ['cnt_enc_sum', 'cnt_enc_std']

In [6]:
# Spearman rank correlation of every engineered feature with the target,
# computed on the first 10,000 training rows and listed strongest-positive first.
corr_sample = train[['target'] + new_feats].iloc[:10000]
spearman_corr = corr_sample.corr(method='spearman')
ranked_by_target = spearman_corr.sort_values(by='target', ascending=False)
ranked_by_target[['target']]

Unnamed: 0,target
target,1.0
var_183_cnt_enc,0.023054
var_134_cnt_enc,0.02283
var_53_cnt_enc,0.020099
var_120_cnt_enc,0.018373
var_193_cnt_enc,0.018119
var_97_cnt_enc,0.018007
var_197_cnt_enc,0.017847
cnt_enc_sum,0.017398
var_12_cnt_enc,0.017199


In [7]:
# Persist only the engineered feature columns, one pickle per split.
feature_outputs = {
    'features/per_feature_opposite_rank_count_encode_train_v2.pkl': train,
    'features/per_feature_opposite_rank_count_encode_test_v2.pkl': test,
}
for out_path, frame in feature_outputs.items():
    frame[new_feats].to_pickle(out_path)