In [26]:
import os
import sys
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing, model_selection, metrics, ensemble
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

## Make use of the leaky data
Use the leaked rows to recover target values rather than just dropping them.

Leak data from: 

https://www.kaggle.com/johnfarrell/giba-s-property-extended-extended-result

Reference:

https://www.kaggle.com/johnfarrell/breaking-lb-fresh-start-with-lag-selection

https://www.kaggle.com/johnfarrell/baseline-with-lag-select-fake-rows-dropped

https://www.kaggle.com/prashantkikani/santad-label-is-present-in-row

In [33]:
# Folder holding the competition CSVs. The trailing slash is needed because
# file names are appended to it by plain string concatenation.
data_dir = 'data/'
print('Data Files are: ', os.listdir(data_dir))

# Read train first, then test, from the same folder.
train, test = (pd.read_csv(data_dir + csv_name) for csv_name in ('train.csv', 'test.csv'))

print('Train DataFrame Shape: ', train.shape)
print('Test DataFrame Shape: ', test.shape)
train.head()

Data Files are:  ['sample_submission.csv.zip', 'test.csv', 'test.csv.zip', 'train.csv', 'train.csv.zip']
Train DataFrame Shape:  (4459, 4993)
Test DataFrame Shape:  (49342, 4992)


Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [5]:
# Ordered column names passed as `cols` to get_leaks / fast_get_leak.
# The order is significant: fast_get_leak compares the slice cols[:-lag-2] of one
# row with the slice cols[lag+2:] of every row, and reads the leaked value from
# the column at position `lag`, so reordering this list changes the matches.
leak_cols =  ['f190486d6', '58e2e02e6', 'eeb9cd3aa', '9fd594eec', '6eef030c1', '15ace8c9f', 'fb0f5dbfe', '58e056e12',
              '20aa07010', '024c577b9', 'd6bb78916', 'b43a7cfd5', '58232a6fb', '1702b5bf0', '324921c7b', '62e59a501', 
              '2ec5b290f', '241f0f867', 'fb49e4212', '66ace2992', 'f74e8f13d', '5c6487af1', '963a49cdc', '26fc93eb7', 
              '1931ccfdd', '703885424', '70feb1494', '491b9ee45', '23310aa6f', 'e176a204a', '6619d81fc', '1db387535', 
              'fc99f9426', '91f701ba2', '0572565c2', '190db8488', 'adb64ff71', 'c47340d97', 'c5a231d81', '0ff32eb98']

#### Get Leak Data and Compile

In [21]:
# from: https://www.kaggle.com/dfrumkin/a-simple-way-to-use-giba-s-features-v2
def fast_get_leak(df, cols, lag=0):
    """Look up a leaked value for every row by matching shifted column sequences.

    The values of each row in ``cols[:-lag-2]`` form a tuple key. That key is
    searched among the tuple keys built from ``cols[lag+2:]`` of every row.
    When exactly one row carries the key, that row's ``cols[lag]`` value is
    returned as the leak for the row being looked up.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``cols``.
    cols : list of str
        Ordered column names; the order defines the sequence that is shifted.
    lag : int, default 0
        Extra offset added on top of the base shift of two columns.

    Returns
    -------
    pandas.Series
        One value per row of ``df``, in row order and carrying ``df``'s index.
        Rows without a unique match (or whose match is NaN) get 0.
    """
    lookup_keys = df[cols[:-lag-2]].apply(tuple, axis=1).to_frame().rename(columns={0: 'key'})
    candidates = df[cols[lag+2:]].apply(tuple, axis=1).to_frame().rename(columns={0: 'key'})
    candidates['pred'] = df[cols[lag]]

    # A key shared by several candidate rows is ambiguous, so every row carrying
    # it is discarded (keep=False) instead of arbitrarily picking one of them.
    unique_candidates = candidates[~candidates.duplicated(['key'], keep=False)]

    leak = lookup_keys.merge(unique_candidates, how='left', on='key').pred.fillna(0)

    # merge() hands back a fresh 0..n-1 index. A left join against unique keys
    # keeps the row order and row count of `df`, so the original index can be
    # restored positionally; without this, assigning the result to a frame whose
    # index is not 0..n-1 would silently misalign the values.
    leak.index = df.index
    return leak


def get_leaks(df, cols=None, lags=0):
    """Return a copy of ``df`` extended with one leak column per lag.

    For every ``i`` in ``range(lags)`` a column named ``leaked_target_<i>`` is
    added, holding the result of ``fast_get_leak(df, cols, i)``. The input frame
    is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``cols``.
    cols : list of str, optional
        Ordered column names forwarded to ``fast_get_leak``. Must be supplied
        whenever ``lags`` is greater than 0; the ``None`` default only works
        for the no-op case ``lags=0``.
    lags : int, default 0
        Number of lags to compute.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the additional ``leaked_target_<i>`` columns.
    """
    df = df.copy()
    for i in tqdm(range(lags)):
        # Assign by position: fast_get_leak yields one value per row in row
        # order, so stripping its index keeps the values on the right rows even
        # when `df` is not indexed 0..n-1 (label alignment would scatter them).
        df['leaked_target_'+str(i)] = fast_get_leak(df, cols, i).values
    return df

In [27]:
# Give the test rows a placeholder target (the train mean) so that train and
# test expose the same columns and can be stacked into a single frame.
test['target'] = train['target'].mean()

# Train rows first, test rows after, renumbered 0..n-1.
df_ = pd.concat(
    [frame[['ID', 'target'] + leak_cols] for frame in (train, test)],
    ignore_index=True,
)

# Number of lags for which a leaked_target_<i> column is computed.
lags = 25
df_ = get_leaks(df_, leak_cols, lags)
df_.head()

100%|██████████████████████████████████████████████████████████████████████████████████| 25/25 [01:05<00:00,  2.60s/it]


Unnamed: 0,ID,target,f190486d6,58e2e02e6,eeb9cd3aa,9fd594eec,6eef030c1,15ace8c9f,fb0f5dbfe,58e056e12,...,leaked_target_15,leaked_target_16,leaked_target_17,leaked_target_18,leaked_target_19,leaked_target_20,leaked_target_21,leaked_target_22,leaked_target_23,leaked_target_24
0,000d6aaf2,38000000.0,1866666.66,12066666.66,700000.0,600000.0,900000.0,4100000.0,0.0,0.0,...,0.0,0.0,38000000.0,0.0,38000000.0,0.0,0.0,0.0,0.0,0.0
1,000fbd867,600000.0,0.0,2850000.0,2225000.0,1800000.0,800000.0,0.0,0.0,3300000.0,...,600000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0027d6b71,10000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6000000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0028cbf45,2000000.0,2000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,002a68644,14400000.0,0.0,0.0,0.0,0.0,37662000.0,0.0,4000000.0,6700000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Names of the per-lag leak columns produced by get_leaks.
leak_targets = ['leaked_target_'+str(i) for i in range(lags)]

# ID-indexed lookup of the leak columns, built once and shared by both joins.
leak_by_id = df_.set_index('ID')[leak_targets]

# Drop leak columns left over from a previous run of this cell before joining:
# DataFrame.join raises on overlapping column names, so without the drop the
# cell could only ever be executed once per loaded train/test frame.
train = train.drop(columns=leak_targets, errors='ignore').join(leak_by_id, on='ID', how='left')
test = test.drop(columns=leak_targets, errors='ignore').join(leak_by_id, on='ID', how='left')
train[['target'] + leak_targets].head()

Unnamed: 0,target,leaked_target_0,leaked_target_1,leaked_target_2,leaked_target_3,leaked_target_4,leaked_target_5,leaked_target_6,leaked_target_7,leaked_target_8,...,leaked_target_15,leaked_target_16,leaked_target_17,leaked_target_18,leaked_target_19,leaked_target_20,leaked_target_21,leaked_target_22,leaked_target_23,leaked_target_24
0,38000000.0,38000000.0,38000000.0,38000000.0,0.0,38000000.0,0.0,38000000.0,0.0,0.0,...,0.0,0.0,38000000.0,0.0,38000000.0,0.0,0.0,0.0,0.0,0.0
1,600000.0,600000.0,0.0,0.0,0.0,0.0,0.0,600000.0,0.0,600000.0,...,600000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,14400000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
# Collapse the per-lag columns into a single `leak_target` column: every row
# takes the first non-zero leaked_target_<i> in lag order and stays 0 when no
# lag produced a leak.
# The column starts as float 0.0 because the values written into it are floats;
# starting from integer 0 would force a dtype change on the first assignment.
train['leak_target'] = 0.0
test['leak_target'] = 0.0
for i in tqdm(range(lags)):
    for frame in (train, test):
        # Only rows that have not received a leak from an earlier lag are filled.
        unfilled = frame['leak_target'] == 0
        frame.loc[unfilled, 'leak_target'] = frame.loc[unfilled, leak_targets[i]]

n_train_leaks = int((train['leak_target'] > 0).sum())
n_test_leaks = int((test['leak_target'] > 0).sum())
print('train, test, leak targets number is: ', n_train_leaks, n_test_leaks)

# Share of recovered train leaks that equal the true target exactly.
# Reported as NaN instead of failing when no leak was recovered at all.
n_train_correct = int((train['leak_target'] == train['target']).sum())
train_leak_accuracy = n_train_correct / n_train_leaks if n_train_leaks else float('nan')
print('Accuracy of train leak is :', train_leak_accuracy)

100%|██████████████████████████████████████████████████████████████████████████████████| 25/25 [00:02<00:00,  8.82it/s]


train, test, leak targets number is:  3577 7662
Accuracy of train leak is : 0.977914453452614


### Combine leak data and model predictions for the test set

In [47]:
# Load an existing submission file (columns ID and target) whose predictions
# serve as the fallback for test rows that have no leaked value.
# NOTE(review): presumably produced by an earlier modelling run - confirm.
pred_target = pd.read_csv(filepath_or_buffer='submission.csv')
pred_target.head(n=5)

Unnamed: 0,ID,target
0,000137c73,1985677.0
1,00021489f,1985677.0
2,0004d7953,3277046.0
3,00056a333,7564810.0
4,00056d8eb,1985677.0


In [51]:
# Start from the model predictions, matched to the test rows by ID instead of
# by row position, so a submission file stored in a different row order cannot
# hand a prediction to the wrong row. IDs absent from the file become NaN, and
# duplicate IDs in the file make the lookup raise rather than guess.
test['target'] = test['ID'].map(pred_target.set_index('ID')['target'])

# Wherever a leak was recovered (non-zero leak_target) it replaces the prediction.
has_leak = test['leak_target'] != 0
test.loc[has_leak, 'target'] = test.loc[has_leak, 'leak_target']
test[['ID', 'target', 'leak_target']].head()

Unnamed: 0,ID,target,leak_target
0,000137c73,1985677.0,0.0
1,00021489f,1985677.0,0.0
2,0004d7953,3277046.0,0.0
3,00056a333,7564810.0,0.0
4,00056d8eb,1985677.0,0.0


In [53]:
# Write the final ID/target pairs (leaks where available, predictions otherwise)
# without the frame's index column.
test.to_csv('submission_leak.csv', columns=['ID', 'target'], index=False)