In [1]:
import numpy as np; np.random.seed(0)
import tensorflow as tf; tf.set_random_seed(seed=0)
import pandas as pd
import lightgbm as lgb
import gc
from collections import defaultdict
import matplotlib
from tqdm import tqdm

In [2]:
# Load the pickled train/test frames (paths are relative to the working directory).
train = pd.read_pickle('data/train_original.pkl')
test = pd.read_pickle('data/test_original.pkl')

# Columns whose dtype is not float64 are set aside as "special"; every
# remaining column is used as a model feature.
special_cols = [name for name, dtype in train.dtypes.items() if dtype != np.float64]
feature_cols = [name for name in train.columns if name not in special_cols]
target = train['target'].values
# Debug switch: uncomment to restrict the run to the first two features.
#feature_cols = feature_cols[:2]

In [3]:
def transform_freq_feature(df1,df2,df3_base,feat):
    """Add a ``<feat>_freq`` column to ``df1`` and ``df2`` in place.

    The frequency of a value is the number of times it occurs in
    ``df1[feat]`` plus ``df3_base[feat]``; values are matched through their
    ``str()`` form.  Rows of ``df2`` are not counted, so a ``df2`` value that
    never occurs in the two counted frames gets frequency 0.

    Returns ``None``; both frames are modified as a side effect.
    """
    occurrences = defaultdict(int)
    for counted_frame in (df1, df3_base):
        for raw in counted_frame[feat].values:
            # Counts are accumulated as floats.
            occurrences[str(raw)] += 1.

    def lookup(raw):
        # Unseen values fall back to the defaultdict's 0.
        return occurrences[str(raw)]

    freq_col = feat + "_freq"
    df1[freq_col] = df1[feat].apply(lookup)
    df2[freq_col] = df2[feat].apply(lookup)
    
def load_data():
    """Build frequency-augmented feature frames for train and test.

    Reads the module-level ``train``, ``test`` and ``feature_cols``.

    Returns:
        (train_df, test_df, real_samples_indexes): the two feature frames
        with one ``<col>_freq`` column added per feature and NaNs replaced
        by -1, plus the positional indexes of the test rows that hold at
        least one value occurring exactly once in its column.
    """
    train_df = train[feature_cols].copy()
    test_df = test[feature_cols].copy()
    real_test_df = test[feature_cols].copy()

    # Flag, per column, the cells whose value occurs exactly once in that column.
    test_matrix = test_df.values
    unique_count = np.zeros_like(test_df)
    for feature in tqdm(range(test_df.shape[1])):
        _, first_pos, n_occurrences = np.unique(
            test_matrix[:, feature], return_counts=True, return_index=True)
        singletons = first_pos[n_occurrences == 1]
        unique_count[singletons, feature] += 1

    # Rows holding at least one unique value are treated as real samples;
    # rows with none are treated as synthetic (fake).
    uniques_per_row = np.sum(unique_count, axis=1)
    real_samples_indexes = np.argwhere(uniques_per_row > 0)[:, 0]
    synthetic_samples_indexes = np.argwhere(uniques_per_row == 0)[:, 0]

    real_test_df = real_test_df.iloc[real_samples_indexes]
    print(real_test_df.shape[0])
    print(len(synthetic_samples_indexes))

    # Frequencies are counted over train plus the real test rows only.
    for col in tqdm(train_df.columns.values):
        transform_freq_feature(train_df, test_df, real_test_df, col)

    return train_df.fillna(-1), test_df.fillna(-1), real_samples_indexes

# Build the feature frames once; real_samples_indexes is reused below when fitting the scaler.
train_df, test_df, real_samples_indexes = load_data()

100%|████████████████████████████████████████████████████████████████████████████| 200/200 [00:05<00:00, 33.82it/s]


100000
100000


100%|████████████████████████████████████████████████████████████████████████████| 200/200 [02:10<00:00,  1.00it/s]


In [4]:
from sklearn.preprocessing import StandardScaler

# Append the precomputed "magic" feature columns to both frames.
train_df = pd.concat([train_df, pd.read_pickle('features/magic_train')], axis=1)
test_df = pd.concat([test_df, pd.read_pickle('features/magic_test')], axis=1)
print('add magic done')

# Fit the scaler on train plus the "real" test rows only, then apply it to
# every row of both frames.
# - real_samples_indexes holds row positions (it comes from np.argwhere), so
#   the rows are selected with .iloc rather than the label-based .loc.
# - pd.concat is used for stacking because DataFrame.append was removed in
#   pandas 2.0.
std = StandardScaler()
std.fit(pd.concat([train_df, test_df.iloc[real_samples_indexes, :]]).values)
train_df.loc[:] = std.transform(train_df.values)
test_df.loc[:] = std.transform(test_df.values)
print('feature normalization done')

# Reserve three zero-filled "<col>_filt_<i>" columns per existing column.
# They are laid out column-major (col0_filt_0, col0_filt_1, col0_filt_2,
# col1_filt_0, ...) and added in one concat instead of one insert per column,
# which avoids fragmenting the frames.
new_feat_cols = train_df.columns.tolist()
filt_cols = ['{}_filt_{}'.format(f, i) for f in new_feat_cols for i in range(3)]
train_df = pd.concat(
    [train_df, pd.DataFrame(0, index=train_df.index, columns=filt_cols)], axis=1)
test_df = pd.concat(
    [test_df, pd.DataFrame(0, index=test_df.index, columns=filt_cols)], axis=1)
print('new column creation done')
print(train_df.shape[1])

train_vals = train_df.values
test_vals = test_df.values

def get_updated_vals(vals):
    """Move selected value ranges of each base column into its filter columns.

    ``vals`` is modified in place and also returned.  For every base column
    (one per entry of the module-level ``new_feat_cols``) three masks are
    computed from the column's values before anything is modified:

        0: value < -0.05
        1: -0.1 <= value < 0
        2: 0 <= value < 0.1

    The masks are applied in that order.  Each one copies the column's
    current values at the masked rows into the next free output column
    (output columns start right after the base columns, three per base
    column) and then zeroes those rows in the base column.  Masks 0 and 1
    overlap on [-0.1, -0.05): those rows are already zero when mask 1 is
    applied, so output column 1 effectively receives only [-0.05, 0).

    Prints the index one past the last output column written.
    """
    out_col = len(new_feat_cols)
    for base_col, _ in enumerate(tqdm(new_feat_cols)):
        column = vals[:, base_col]
        # All masks are taken from the untouched column up front.
        masks = (
            column < -0.05,
            (column >= -0.1) & (column < 0),
            (column >= 0) & (column < 0.1),
        )
        #masks = (column < 0,)
        for mask in masks:
            # Read the *current* base values: rows zeroed by an earlier mask
            # contribute 0 here.
            moved = np.where(mask, vals[:, base_col], 0.0)
            vals[mask, base_col] = 0
            vals[:, out_col] = moved
            out_col += 1
    print(out_col)
    return vals

# Redistribute values into the filter columns (the arrays are updated in place
# and returned).
train_vals = get_updated_vals(train_vals)
test_vals = get_updated_vals(test_vals)

print(train_vals.shape)
print(train_vals[:,-10:])

# Rebuild the frames from the updated arrays, reusing the existing column labels.
train_df = pd.DataFrame(train_vals, columns=train_df.columns)
test_df = pd.DataFrame(test_vals, columns=test_df.columns)

add magic done
feature normalization done


100%|████████████████████████████████████████████████████████████████████████████| 600/600 [00:52<00:00,  4.29it/s]


new column creation done
2400


100%|███████████████████████████████████████████████████████████████████████████| 600/600 [00:05<00:00, 118.23it/s]


2400


100%|███████████████████████████████████████████████████████████████████████████| 600/600 [00:05<00:00, 118.44it/s]


2400
(200000, 2400)
[[ 0.         -0.55467426  0.         ...  0.          0.
   0.        ]
 [ 0.         -0.37630699  0.         ...  0.          0.
   0.        ]
 [ 0.         -0.750562    0.         ...  0.          0.
   0.        ]
 ...
 [ 0.         -0.43745682  0.         ... -0.34104018  0.
   0.        ]
 [ 0.          0.          0.         ... -0.34104018  0.
   0.        ]
 [ 0.         -0.79801733  0.         ... -0.34104018  0.
   0.        ]]


In [5]:
# Spot-check: the last two columns of the first 30 training rows.
train_vals[:30,-2:]

array([[0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.05420939],
       [0.        , 0.        ],
       [0.        , 0.        ]])

In [6]:
# Confirm the dimensions of the rebuilt training frame.
train_df.shape

(200000, 2400)

In [7]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

version = 'lr_v11'
oof = np.zeros(len(train_df))          # out-of-fold predictions for train
prediction = np.zeros(len(test_df))    # fold-averaged predictions for test

n_fold = 4
folds = KFold(n_splits=n_fold, shuffle=True, random_state=0)

# Extract the arrays once.  KFold yields positional row indices, so the
# ndarrays are indexed directly instead of going through label-based .loc
# on the frame (which is only equivalent for a default RangeIndex).
X_all = train_df.values
X_test = test_df.values
y_all = train.target.values

for fold_n, (train_index, valid_index) in enumerate(folds.split(X_all, y_all)):

    print('Fold', fold_n)
    X_train, X_valid = X_all[train_index], X_all[valid_index]
    y_train, y_valid = y_all[train_index], y_all[valid_index]

    lr = LogisticRegression(solver='liblinear')
    lr.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    oof[valid_index] = lr.predict_proba(X_valid)[:,1]
    # Each fold contributes 1/n_fold of the final test prediction.
    prediction += lr.predict_proba(X_test)[:,1]/n_fold
    print(roc_auc_score(y_valid, oof[valid_index]))

full_auc = roc_auc_score(y_all, oof)
print(full_auc)

# v1: standardize original columns + untuned magic: aucs: [0.8839, 0.8891, 0.8893, 0.8883] overall aucs: 0.8876
# v2: no standardize original columns + untuned magic: aucs: [0.8816, 0.8877, 0.8879, 0.8872] overall aucs: 0.8860
# v3: standardize real only original columns + untuned magic: aucs: [0.8839, 0.8892, 0.8894, 0.8883] overall aucs: 0.8876
# v4: standardize original+freq columns + untuned magic: aucs: [0.8845, 0.8899, 0.8899, 0.8888] overall aucs: 0.8882
# v5: v4 + separate pos\neg: aucs: [0.8920, 0.8965, 0.8975, 0.8968] overall aucs: 0.8956
# v6: v5 + separate pos\neg freq: aucs: [0.8928, 0.8978, 0.8982, 0.8974] overall aucs: 0.8965
# v7: v6 + standardize magic: aucs: [0.8932, 0.8980, 0.8985, 0.8977] overall aucs: 0.8968
# v8: v7 bug fixed: aucs: [0.8967, 0.9006, 0.9022, 0.9003] overall aucs: 0.8999
# v9: v8 + separate pos\neg exp: aucs: [0.8975, 0.9019, 0.9039, 0.9018] overall aucs: 0.9013
# v10: v9 + l1 penalty: worse than v9
# v11: v9 + separate to using thresholds -0.05, 0, 0.05: aucs: [0.8975, 0.9019, 0.9039, 0.9018] overall aucs: 0.90156

Fold 0
0.8977139290781707
Fold 1
0.9025292158073883
Fold 2
0.9039871714337062
Fold 3
0.9023099443348579
0.9015973628587355


In [8]:
# Compare the mean out-of-fold prediction with the mean test prediction.
oof.mean(), prediction.mean()

(0.10065855459578511, 0.09170037567599487)

In [9]:
# Persist the out-of-fold and test predictions, then write the submission CSV
# whose file name embeds the overall AUC (with '.' replaced by '_').
out_prefix = 'oof+submission/' + version
pd.to_pickle(oof, out_prefix + '_oof_train')
pd.to_pickle(prediction, out_prefix + '_oof_test')

sub = pd.DataFrame({"ID_code": test.ID_code.values}).assign(target=prediction)
auc_tag = str(full_auc).replace('.', '_')
sub.to_csv(out_prefix + '_' + auc_tag + ".csv", index=False)