In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
from tqdm import tqdm
from collections import defaultdict

In [2]:
# Raw train/test frames, read from local pickle snapshots.
# (read_pickle executes pickled code, so only point it at trusted files.)
train = pd.read_pickle('data/train_original.pkl')
test = pd.read_pickle('data/test_original.pkl')
# Split train's columns by dtype: float64 columns are the model features,
# every other column is set aside as "special".
feature_cols = [name for name in train.columns if train[name].dtype == np.float64]
special_cols = [name for name in train.columns if name not in feature_cols]

In [3]:
from sklearn.mixture import GaussianMixture

'''
def augment(x,y,t=2):
    xs,xn = [],[]
    for i in range(t):
        mask = y>0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xs.append(x1)

    for i in range(t//2):
        mask = y==0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xn.append(x1)

    xs = np.vstack(xs)
    xn = np.vstack(xn)
    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    x = np.vstack([x,xs,xn])
    y = np.concatenate([y,ys,yn])
    return x,y
'''

def augment(x, y, t=9):
    """Append column-shuffled copies of the rows of ``x`` to ``x``.

    The columns are handled as two halves of ``feat_len = x.shape[1] // 2``
    columns each.  Column ``c`` and column ``c + feat_len`` are permuted with
    the same row order, so the two stay paired on every synthetic row, while
    different pairs are permuted independently.  If ``x`` has an odd number
    of columns, the last column is left unshuffled.

    Rows with ``y > 0`` are resampled ``t`` times and get label 1; rows with
    ``y == 0`` are resampled ``t // 9`` times and get label 0.  Shuffling uses
    the global ``np.random`` state, so seed it beforehand for reproducibility.

    Parameters
    ----------
    x : 2-D np.ndarray of features.
    y : 1-D np.ndarray of labels, one per row of ``x``.
    t : int, number of shuffled copies of the positive rows.

    Returns
    -------
    (x_aug, y_aug) : the original rows followed by the positive copies and
        then the negative copies, with the matching label vector.

    Unlike the previous version, this does not raise when no copies are
    requested for a class (``t < 9`` or ``t == 0``); that class simply
    contributes no synthetic rows.
    """
    feat_len = x.shape[1] // 2

    def _shuffled_copies(mask, n_copies):
        # One independently shuffled copy of x[mask] per requested repetition.
        copies = []
        for _ in range(n_copies):
            x1 = x[mask]  # boolean indexing already returns a fresh copy
            ids = np.arange(x1.shape[0])
            for c in range(feat_len):
                np.random.shuffle(ids)
                # Index a single column instead of materialising x1[ids]
                # (a full-matrix copy) for every column.
                x1[:, c] = x1[ids, c]
                x1[:, c + feat_len] = x1[ids, c + feat_len]
            copies.append(x1)
        return copies

    # Positives first, then negatives: keeps the random-number consumption
    # order of the original implementation.
    xs = _shuffled_copies(y > 0, t)
    xn = _shuffled_copies(y == 0, t // 9)

    n_pos = sum(block.shape[0] for block in xs)
    n_neg = sum(block.shape[0] for block in xn)

    # Stacking [x] together with the copies is always non-empty, so an empty
    # xs or xn list can no longer make np.vstack fail.
    x = np.vstack([x] + xs + xn)
    y = np.concatenate([y, np.ones(n_pos), np.zeros(n_neg)])
    return x, y

def transform_freq_feature(df1,df2,df3_base,feat):
    """Add a ``<feat>_freq`` column to ``df1`` and ``df2`` in place.

    A value's frequency is the number of times it occurs in ``df1[feat]`` and
    ``df3_base[feat]`` combined; ``df2`` does not contribute to the counts.
    Values are matched through their ``str()`` form, and a value that was
    never counted gets 0.  Nothing is returned.
    """
    occurrences = defaultdict(int)
    for pool in (df1[feat].values, df3_base[feat].values):
        for raw in pool:
            # Float increments keep the stored counts as floats.
            occurrences[str(raw)] += 1.

    def lookup(raw):
        # Unseen keys fall back to the defaultdict's 0.
        return occurrences[str(raw)]

    freq_col = feat + "_freq"
    df1[freq_col] = df1[feat].apply(lookup)
    df2[freq_col] = df2[feat].apply(lookup)
    
def transform_gmm_feature(df1,df2,df3_base,feat,random_state=None):
    """Add a ``<feat>_gmm_prob`` column to ``df1`` and ``df2`` in place.

    A two-component ``GaussianMixture`` is fitted on the values of
    ``df1[feat]`` and ``df3_base[feat]`` combined (``df2`` is not used for
    fitting).  The new column holds column 0 of ``predict_proba`` for each
    row.  Nothing is returned.

    Parameters
    ----------
    df1, df2 : DataFrames that receive the new column.
    df3_base : DataFrame whose ``feat`` values are added to the fitting pool.
    feat : name of the column to model.
    random_state : forwarded to ``GaussianMixture``; the default ``None``
        keeps the previous unseeded behaviour.

    NOTE(review): nothing here fixes which mixture component ends up at
    index 0, so the meaning of the probability may differ between fits and
    between features; confirm whether that matters to the caller.
    """
    # Series.append was removed in pandas 2.0; pd.concat is the equivalent.
    vals = pd.concat([df1[feat], df3_base[feat]]).values
    gm = GaussianMixture(n_components=2, random_state=random_state)
    gm.fit(vals.reshape((-1,1)))

    prob_col = feat + "_gmm_prob"
    df1[prob_col] = gm.predict_proba(df1[[feat]].values)[:,0]
    df2[prob_col] = gm.predict_proba(df2[[feat]].values)[:,0]
    
def load_data():
    """Build the train and test feature frames used for modelling.

    Reads the notebook-level globals ``train``, ``test`` and ``feature_cols``.

    Steps:
      1. Copy the ``feature_cols`` columns of ``train`` and ``test``.
      2. Flag the test rows that hold at least one value occurring exactly
         once in its column; these are treated as real samples and the
         remaining rows as synthetic.  Both counts are printed.
      3. Add a ``<col>_freq`` column for every feature column, counted over
         train plus the real test rows (see ``transform_freq_feature``).
      4. Append the precomputed frames read from
         'features/magic_tuned_train' and 'features/magic_tuned_test'
         column-wise.
      5. Print the total number of nulls in each frame, then fill them
         with -999.

    Returns
    -------
    (train_df, test_df, real_samples_indexes) where ``real_samples_indexes``
    holds the positional indexes of the test rows flagged as real.
    """
    train_df = train[feature_cols].copy()
    test_df = test[feature_cols].copy()

    # Materialise the test matrix once rather than on every loop iteration.
    test_values = test_df.values
    unique_count = np.zeros_like(test_values)
    for feature in tqdm(range(test_values.shape[1])):
        _, index_, count_ = np.unique(test_values[:, feature], return_counts=True, return_index=True)
        # index_ is the first occurrence of each distinct value; keep only
        # the values that occur exactly once in this column.
        unique_count[index_[count_ == 1], feature] += 1

    # Samples which have unique values are real, the others are fake.
    unique_per_row = np.sum(unique_count, axis=1)
    real_samples_indexes = np.argwhere(unique_per_row > 0)[:, 0]
    synthetic_samples_indexes = np.argwhere(unique_per_row == 0)[:, 0]

    # Positional selection with an index array yields a new frame, taken
    # before any frequency column is added to test_df.
    real_test_df = test_df.iloc[real_samples_indexes]
    print(real_test_df.shape[0])
    print(len(synthetic_samples_indexes))

    columns = train_df.columns.values
    for col in tqdm(columns):
        transform_freq_feature(train_df, test_df, real_test_df, col)

    # --- disabled experiment: squash the raw features into (0, 1) ---
    # for f in tqdm(feature_cols[:200]): # transform into probability
    #     train_df[f] = 1/(1+np.exp(-train_df[f]))
    #     test_df[f] = 1/(1+np.exp(-test_df[f]))

    # --- disabled experiment: exponent features for non-unique values ---
    # from sklearn.preprocessing import QuantileTransformer, MinMaxScaler
    # for f in tqdm(feature_cols): # process for non-unique
    #     qt = MinMaxScaler(feature_range=(-3, 3)) #QuantileTransformer(output_distribution='normal')
    #     qt.fit(train_df[f].append(test_df[f]).values.reshape((-1,1)))
    #     train_val = qt.transform(train_df[[f]].values).reshape((-1,))
    #     test_val = qt.transform(test_df[[f]].values).reshape((-1,))
    #
    #     train_df[f+'_exp'] = np.where(train_df[f+'_freq']>1, 2, 1)**train_val
    #     test_df[f+'_exp'] = np.where(test_df[f+'_freq']>1, 2, 1)**test_val
    #
    #     #train_df[f] = np.where(train_df[f+'_freq']>1, train_df[f], np.nan)
    #     #test_df[f] = np.where(test_df[f+'_freq']>1, test_df[f], np.nan)

    # Precomputed feature frames, appended column-wise.
    # NOTE(review): pd.concat(axis=1) aligns on the index; confirm the pickled
    # frames share the index of train/test.
    train_df = pd.concat([train_df, pd.read_pickle('features/magic_tuned_train')], axis=1)
    test_df = pd.concat([test_df, pd.read_pickle('features/magic_tuned_test')], axis=1)

    # --- disabled experiment: poisson-probability and GMM features ---
    # train_df = pd.concat([train_df,
    #                       pd.read_pickle('features/poisson_prob_train.pkl')], axis=1)
    # test_df = pd.concat([test_df,
    #                     pd.read_pickle('features/poisson_prob_test.pkl')], axis=1)
    #
    # for col in tqdm(columns[[6,110, 26, 146, 139, 21, 76, 174, 133, 99, 198, 109, 80, 13, 190, 148, 0, 44, 164]]):
    #     transform_gmm_feature(train_df,test_df,real_test_df,col)

    print(train_df.isnull().sum().sum(), test_df.isnull().sum().sum())
    return train_df.fillna(-999), test_df.fillna(-999), real_samples_indexes

In [4]:
# Build the feature frames; load_data() also returns the positional indexes of
# the test rows it flagged as real.
train_df, test_df, real_samples_indexes = load_data()
# Cell output: the full list of column names in the training frame.
train_df.columns.tolist()

100%|████████████████████████████████████████████████████████████████████████████| 200/200 [00:06<00:00, 32.75it/s]


100000
100000


100%|████████████████████████████████████████████████████████████████████████████| 200/200 [02:28<00:00,  1.10s/it]


0 0


['var_0',
 'var_1',
 'var_2',
 'var_3',
 'var_4',
 'var_5',
 'var_6',
 'var_7',
 'var_8',
 'var_9',
 'var_10',
 'var_11',
 'var_12',
 'var_13',
 'var_14',
 'var_15',
 'var_16',
 'var_17',
 'var_18',
 'var_19',
 'var_20',
 'var_21',
 'var_22',
 'var_23',
 'var_24',
 'var_25',
 'var_26',
 'var_27',
 'var_28',
 'var_29',
 'var_30',
 'var_31',
 'var_32',
 'var_33',
 'var_34',
 'var_35',
 'var_36',
 'var_37',
 'var_38',
 'var_39',
 'var_40',
 'var_41',
 'var_42',
 'var_43',
 'var_44',
 'var_45',
 'var_46',
 'var_47',
 'var_48',
 'var_49',
 'var_50',
 'var_51',
 'var_52',
 'var_53',
 'var_54',
 'var_55',
 'var_56',
 'var_57',
 'var_58',
 'var_59',
 'var_60',
 'var_61',
 'var_62',
 'var_63',
 'var_64',
 'var_65',
 'var_66',
 'var_67',
 'var_68',
 'var_69',
 'var_70',
 'var_71',
 'var_72',
 'var_73',
 'var_74',
 'var_75',
 'var_76',
 'var_77',
 'var_78',
 'var_79',
 'var_80',
 'var_81',
 'var_82',
 'var_83',
 'var_84',
 'var_85',
 'var_86',
 'var_87',
 'var_88',
 'var_89',
 'var_90',
 'var_91'

In [5]:
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import time

# Tag used in the names of the files written for this run.
version = 'kh_lgb_4fold_testing'

# LightGBM training parameters.
# NOTE(review): LightGBM documents 'subsample' as an alias of
# 'bagging_fraction', and both are set here with different values; confirm
# which one the installed version actually applies.
params = {'num_leaves': 8,
         'min_data_in_leaf': 42,
         'objective': 'binary',
         'max_depth': 16,
         'learning_rate': 0.03,
         'boosting': 'gbdt',
         'bagging_freq': 5,
         'bagging_fraction': 0.8,
         'feature_fraction': 0.8201,
         'bagging_seed': 11,
         'reg_alpha': 1.7289,
         'reg_lambda': 4.984,
         'random_state': 42,
         'metric': 'auc',
         'verbosity': -1,
         'subsample': 0.81,
         'min_gain_to_split': 0.01,
         'min_child_weight': 19.428,
         'num_threads': 2}

# Out-of-fold predictions for train and fold-averaged predictions for test.
oof = np.zeros(len(train))
prediction = np.zeros(len(test))

n_fold = 4
folds = KFold(n_splits=n_fold, shuffle=True, random_state=0)
feature_importance_df = pd.DataFrame()

for fold_n, (train_index, valid_index) in enumerate(folds.split(train.target.values,train.target.values)):

    print('Fold', fold_n, 'started at', time.ctime())
    # KFold yields positional indices, so select rows with .iloc. The targets
    # below are indexed positionally too; label-based .loc only lines up with
    # them when train_df happens to carry a default 0..n-1 index.
    X_train, X_valid = train_df.iloc[train_index].values, train_df.iloc[valid_index].values
    y_train, y_valid = train.target.values[train_index], train.target.values[valid_index]
    #X_train, y_train = augment(X_train, y_train)

    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid)

    # NOTE(review): newer LightGBM releases expect callbacks instead of the
    # verbose_eval / early_stopping_rounds keywords; confirm against the
    # installed version before upgrading.
    model = lgb.train(params,train_data, num_boost_round=4000 ,
                    valid_sets = [train_data, valid_data], verbose_eval=100,
                    early_stopping_rounds = 200)

    # Per-fold gain importances, appended to the running table.
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = train_df.columns
    fold_importance_df["importance"] = model.feature_importance(importance_type='gain')
    fold_importance_df["fold"] = fold_n + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    oof[valid_index] = model.predict(X_valid, num_iteration=model.best_iteration)
    # Each fold contributes 1/n_fold of the final test prediction.
    prediction += model.predict(test_df.values, num_iteration=model.best_iteration)/n_fold
    gc.collect()
    print(roc_auc_score(y_valid, oof[valid_index]))

# AUC of the out-of-fold predictions over the whole training set.
full_auc = roc_auc_score(train.target.values, oof)
print(full_auc)
# baseline: raw features 10 fold: cv: 0.8984, lb 0.899
# kh_lgb_10fold_target_encode_ver1: cv: 0.8996, lb 0.898
# kh_lgb_10fold_all_item_target_encode_ver1 (min samples=100) (pure new feat): cv: 0.9933 lb: 0.539
# kh_lgb_10fold_all_item_target_encode_ver2 (min samples=500) ==> no help, cut off
# kh_lgb_10fold_all_item_vc_encode_ver1 ==> no help, cut off
# kh_lgb_10fold_per_day_vc_encode_ver1 (vc + le): cv: 0.9108 lb: 
# kh_lgb_10fold_per_day_vc_encode_ver2 (cumulative vc + le): cv: 0.8988 lb: 
# kh_lgb_10fold_per_day_vc_encode_ver2 (vc + pop feat): cv: 0.9108 lb: 0.900
# kh_lgb_10fold_future_appear_cnt_v1: cv: 0.8981, lb:
# kh_lgb_10fold_cum_encode_v1: cv: 0.995, lb: 0.693
# kh_lgb_10fold_target_encode_ver2 (top 51 + features): cv: 0.8996, lb 0.897
# kh_lgb_10fold_target_encode_by_vc_ver1: cv: 0.9071, lb:
# kh_lgb_10fold_has_pair_ver1: cv: lb: 
# kh_lgb_10fold_pos_sum_magic_ver2: cv: 0.8982, lb:
# kh_lgb_10fold_cumulative_vc_v1: cv: 0.8982
# kh_lgb_10fold_by_date_freq_cnt: cv: 0.8983
# kh_lgb_10fold_sdae_v1: cv: no help
# kh_lgb_10fold_all_items_opposite_rank_count_encode_v1:
# kh_lgb_10fold_per_feature_opposite_rank_count_encode_v1: cv: 0.9039, lb: 0.900
# kh_lgb_10fold_gp_time_feats_v1: cv: 0.8977694185263893
# kh_lgb_10fold_leaking_trend_v1: cv: no help
# new no fake (n: 9, p: 1): cv: 0.91099
# kh_lgb_10fold_per_feat_opposite_rank_count_diff_v2: no help
# kh_lgb_10fold_target_corr_sum_v1: no help
# kh_lgb_10fold_freq_aug_v1 (n: 9, p: 2): cv: 0.9153283467600679, lb: 0.916
# kh_lgb_10fold_freq_aug_v2 (n: 9, p: 3): cv: 0.9155296211906148, lb: 
# kh_lgb_10fold_freq_aug_v3 (n: 9, p: 9): cv: 0.9162549832155663, lb: 0.917
# kh_lgb_10fold_freq_aug_v4 (n: 18, p: 18), nround=5000: fold 1: 0.92163[4000], 0.923613[5000]: cv: lb: 
# kh_lgb_10fold_freq_aug_v5 (n: 9, p: 9), nround=5000: fold 1: 0.9220[4000], 0.923529[5000], cv: lb:
# kh_lgb_10fold_freq_aug_v5 (n: 9, p: 12), nround=4000: cv: lb:
# original 4 fold's 1st fold: 0.908x
# marcus minmax 1st fold: 0.917765
# marcus minmax, realtest only 1st fold: 0.917532
# clip_freq* prob (1/(1+exp(-x)), 1st fold: 0.913211
# freq** quantile(x), 1st fold: 0.913417
# clip_freq** quantile(x), 1st fold: 0.917749
# mark unique as np.nan: nothing
# marcus minmax, top 50 exp feats only, 1st fold:  0.911331
# predict unique vals: 0.909005
# nn per column oof raw, 1st fold: 0.908399
# magic untune 1st fold: 0.917456
# magic tuned 1st fold: 0.91634 (worse)

Fold 0 started at Mon Apr  8 19:47:18 2019
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.839297	valid_1's auc: 0.824869
[200]	training's auc: 0.872465	valid_1's auc: 0.85341
[300]	training's auc: 0.889264	valid_1's auc: 0.868275
[400]	training's auc: 0.901068	valid_1's auc: 0.878591
[500]	training's auc: 0.909811	valid_1's auc: 0.885862
[600]	training's auc: 0.916242	valid_1's auc: 0.891443
[700]	training's auc: 0.921588	valid_1's auc: 0.895391
[800]	training's auc: 0.925786	valid_1's auc: 0.898593
[900]	training's auc: 0.929179	valid_1's auc: 0.901113
[1000]	training's auc: 0.932161	valid_1's auc: 0.90335
[1100]	training's auc: 0.934712	valid_1's auc: 0.905116
[1200]	training's auc: 0.93704	valid_1's auc: 0.906651
[1300]	training's auc: 0.939065	valid_1's auc: 0.907955
[1400]	training's auc: 0.940892	valid_1's auc: 0.909261
[1500]	training's auc: 0.942483	valid_1's auc: 0.910342
[1600]	training's auc: 0.943945	valid_1's auc: 0.911243
[1700]	tra

KeyboardInterrupt: 

In [None]:
# Persist the out-of-fold and fold-averaged test predictions, then write the
# submission CSV; the overall CV AUC is embedded in the CSV file name.
out_prefix = 'oof+submission/' + version
pd.to_pickle(oof, out_prefix + '_oof_train')
pd.to_pickle(prediction, out_prefix + '_oof_test')
sub = pd.DataFrame({"ID_code": test.ID_code.values}).assign(target=prediction)
# Dots in the AUC are swapped for underscores to keep the file name tidy.
auc_tag = str(full_auc).replace('.', '_')
sub.to_csv(out_prefix + '_' + auc_tag + ".csv", index=False)

In [None]:
# Allow up to 600 rows in the rendered table.
pd.set_option('display.max_rows', 600)
# Mean gain importance per feature across folds, largest first.
mean_gain = (
    feature_importance_df
    .groupby('Feature')[['importance']]
    .mean()
    .sort_values('importance', ascending=False)
)
mean_gain

In [None]:
# Names from mean_gain's index that contain 'exp', with every '_exp'
# occurrence stripped; the order follows mean_gain.index.
# NOTE(review): the filter matches the substring 'exp' while the replacement
# targets '_exp'; confirm no other feature name contains 'exp'.
feats = [
    name.replace('_exp', '')
    for name in mean_gain.index
    if 'exp' in name
]
feats