In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm; tqdm.pandas()
# Widen the notebook's DataFrame display limits so wide frames are shown
# in full instead of being truncated.
pd.set_option('display.max_columns', 202)
pd.set_option('display.max_rows', 300)

In [2]:
# Load the pickled train/test frames.
# NOTE(review): unpickling executes arbitrary code; only load files from a
# trusted source.
train = pd.read_pickle('data/train_original.pkl')
test = pd.read_pickle('data/test_original.pkl')

# Partition train's columns by dtype in a single pass: float64 columns are the
# model features, everything else goes to `special_cols`.
special_cols, feature_cols = [], []
for col_name, col_dtype in train.dtypes.items():
    if col_dtype == np.float64:
        feature_cols.append(col_name)
    else:
        special_cols.append(col_name)

In [3]:
# Stack the feature columns of train and test (train rows first) under a fresh
# 0..n-1 index.
# NOTE(review): `all_df` is not referenced by the other cells visible here —
# confirm it is still needed before keeping this copy in memory.
all_df = pd.concat(
    [train[feature_cols], test[feature_cols]],
    axis=0,
    ignore_index=True,
)

In [4]:
# Positional indices into `feature_cols` (not column names) of the 63 features
# that the ranking cell below transforms with `rank(ascending=True)`.
# NOTE(review): the criterion used to pick these indices is not recorded in
# this notebook — presumably chosen by inspecting each feature; TODO confirm
# and document.
ascending_cols = [
    0, 1, 2, 5, 6, 8, 18, 22, 24, 26, 32, 35, 40, 48, 49, 51, 52, 53,
    67, 70, 71, 78, 82, 89, 90, 91, 94, 95, 99, 105, 106, 110, 111, 112,
    118, 119, 125, 128, 130, 133, 134, 135, 137, 144, 145, 147, 151, 155,
    157, 162, 163, 164, 167, 170, 173, 179, 180, 184, 190, 191,195, 196, 199
]

# Positional indices into `feature_cols` (not column names) of the 53 features
# that the ranking cell below transforms with `rank(ascending=False)`.
# NOTE(review): the criterion used to pick these indices is not recorded in
# this notebook — presumably chosen by inspecting each feature; TODO confirm
# and document.
descending_cols = [
    9, 12, 13, 20, 21, 23, 28, 31, 33, 36, 43, 56, 75, 76, 80, 81, 83,
    85, 86, 87, 88, 92, 93, 104, 107, 108, 109, 115, 116, 121, 122, 123,
    127, 131, 139, 141, 142, 146, 148, 149, 150, 154, 165, 166, 169, 172,
    174, 177, 186, 188, 192, 197, 198, 
]

In [5]:
def _rank_normalize(train_values, test_values, ascending):
    """Rank train and test values jointly and rescale the ranks to [0, 1].

    Parameters
    ----------
    train_values, test_values : pd.Series
        The same feature taken from the train and test frames.
    ascending : bool
        Forwarded to ``Series.rank``. With ``False`` the largest value gets
        rank 1 and therefore maps to 0.0 after rescaling.

    Returns
    -------
    tuple
        ``(train_ranks, test_ranks)`` as arrays in the original row order of
        each input. If every rank is identical the rescaling divides by
        zero and the result is NaN.
    """
    # `Series.append` was removed in pandas 2.0; `pd.concat` is the supported
    # replacement. `ignore_index=True` avoids the duplicated index labels that
    # stacking two frames with their own 0..n-1 indexes would otherwise create.
    combined = pd.concat([train_values, test_values], ignore_index=True)
    ranks = combined.rank(ascending=ascending)
    lowest, highest = ranks.min(), ranks.max()
    scaled = (ranks - lowest) / (highest - lowest)
    n_train = len(train_values)
    # Return plain arrays so the caller assigns by position rather than by
    # index alignment.
    return scaled.iloc[:n_train].values, scaled.iloc[n_train:].values


# NOTE: the raw feature columns are overwritten in place, so this cell is not
# idempotent. Re-running it ranks the already-ranked values, which flips the
# direction of every column in `descending_cols`. Reload train/test first.
for indices, ascending in ((ascending_cols, True), (descending_cols, False)):
    for a in tqdm(indices):
        col = feature_cols[a]
        train[col], test[col] = _rank_normalize(train[col], test[col], ascending)

100%|██████████████████████████████████████████████████████████████████████████████| 63/63 [00:06<00:00, 10.36it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 53/53 [00:05<00:00, 10.09it/s]


In [6]:
# Names of every rank-transformed feature, ascending group first.
selected_cols = np.array(feature_cols)[ascending_cols + descending_cols]

# Row-wise summary statistics over the selected columns, added to both frames
# as new columns in this order.
leak_stats = (
    ('i_am_leaking_mean', pd.DataFrame.mean),
    ('i_am_leaking_std', pd.DataFrame.std),
    ('i_am_leaking_max', pd.DataFrame.max),
    ('i_am_leaking_min', pd.DataFrame.min),
)

for df in (train, test):
    # Select once: the new columns are not part of `selected_cols`, so adding
    # them does not change what the later statistics see.
    ranked_block = df[selected_cols]
    for stat_name, reducer in leak_stats:
        df[stat_name] = reducer(ranked_block, axis=1)

In [7]:
# Keep only the generated columns (those whose name contains 'leaking') and
# write them out for train and test; the final expression displays the list.
selected_cols = [name for name in train.columns if 'leaking' in name]
outputs = (
    (train, 'features/leaking_trend_train.pkl'),
    (test, 'features/leaking_trend_test.pkl'),
)
for frame, out_path in outputs:
    frame[selected_cols].to_pickle(out_path)
selected_cols

['i_am_leaking_mean',
 'i_am_leaking_std',
 'i_am_leaking_max',
 'i_am_leaking_min']