In [5]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

def denoise(df):
    df['D_63'] = df['D_63'].apply(lambda t: {'CR':0, 'XZ':1, 'XM':2, 'CO':3, 'CL':4, 'XL':5}[t]).astype(np.int8)
    df['D_64'] = df['D_64'].apply(lambda t: {np.nan:-1, 'O':0, '-1':1, 'R':2, 'U':3}[t]).astype(np.int8)
    for col in tqdm(df.columns):
        if col not in ['customer_ID','S_2','D_63','D_64']:
            df[col] = np.floor(df[col]*100)
    return df

test = pd.read_csv('test_data.csv', nrows=1000)
test = denoise(test)
test.to_feather('/workspaces/opt/test.feather')

train = pd.read_csv('train_data.csv', nrows=1000)
train = denoise(train)
train.to_feather('/workspaces/opt/train.feather')

100%|██████████| 190/190 [00:00<00:00, 9076.82it/s]
100%|██████████| 190/190 [00:00<00:00, 8863.01it/s]


In [7]:
import warnings
warnings.simplefilter('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc,os,random
import time,datetime
from tqdm import tqdm
from multiprocessing import Pool as ThreadPool


def one_hot_encoding(df,cols,is_drop=True):
    for col in cols:
        print('one hot encoding:',col)
        dummies = pd.get_dummies(pd.Series(df[col]),prefix='oneHot_%s'%col)
        df = pd.concat([df,dummies],axis=1)
    if is_drop:
        df.drop(cols,axis=1,inplace=True)
    return df

def cat_feature(df):
    one_hot_features = [col for col in df.columns if 'oneHot' in col]
    if lastk is None:
        num_agg_df = df.groupby("customer_ID",sort=False)[one_hot_features].agg(['mean', 'std', 'sum', 'last'])
    else:
        num_agg_df = df.groupby("customer_ID",sort=False)[one_hot_features].agg(['mean', 'std', 'sum'])
    num_agg_df.columns = ['_'.join(x) for x in num_agg_df.columns]

    if lastk is None:
        cat_agg_df = df.groupby("customer_ID",sort=False)[cat_features].agg(['last', 'nunique'])
    else:
        cat_agg_df = df.groupby("customer_ID",sort=False)[cat_features].agg(['nunique'])
    cat_agg_df.columns = ['_'.join(x) for x in cat_agg_df.columns]

    count_agg_df = df.groupby("customer_ID",sort=False)[['S_2']].agg(['count'])
    count_agg_df.columns = ['_'.join(x) for x in count_agg_df.columns]
    df = pd.concat([num_agg_df, cat_agg_df,count_agg_df], axis=1).reset_index()
    print('cat feature shape after engineering', df.shape )

    return df

def num_feature(df):
    if num_features[0][:5] == 'rank_':
        num_agg_df = df.groupby("customer_ID",sort=False)[num_features].agg(['last'])
    else:
        if lastk is None:
            num_agg_df = df.groupby("customer_ID",sort=False)[num_features].agg(['mean', 'std', 'min', 'max', 'sum', 'last'])
        else:
            num_agg_df = df.groupby("customer_ID",sort=False)[num_features].agg(['mean', 'std', 'min', 'max', 'sum'])
    num_agg_df.columns = ['_'.join(x) for x in num_agg_df.columns]
    if num_features[0][:5] != 'rank_':
        for col in num_agg_df.columns:
            num_agg_df[col] = num_agg_df[col] // 0.01
    df = num_agg_df.reset_index()
    print('num feature shape after engineering', df.shape )

    return df

def diff_feature(df):
    diff_num_features = [f'diff_{col}' for col in num_features]
    cids = df['customer_ID'].values
    df = df.groupby('customer_ID')[num_features].diff().add_prefix('diff_')
    df.insert(0,'customer_ID',cids)
    if lastk is None:
        num_agg_df = df.groupby("customer_ID",sort=False)[diff_num_features].agg(['mean', 'std', 'min', 'max', 'sum', 'last'])
    else:
        num_agg_df = df.groupby("customer_ID",sort=False)[diff_num_features].agg(['mean', 'std', 'min', 'max', 'sum'])
    num_agg_df.columns = ['_'.join(x) for x in num_agg_df.columns]
    for col in num_agg_df.columns:
        num_agg_df[col] = num_agg_df[col] // 0.01

    df = num_agg_df.reset_index()
    print('diff feature shape after engineering', df.shape )

    return df

n_cpu = 16
transform = [['','rank_','ym_rank_'],[''],['']]

for li, lastk in enumerate([None,3,6]):
    for prefix in transform[li]:
        train_df = pd.read_feather('/workspaces/opt/train.feather')
        test_df = pd.read_feather('/workspaces/opt/test.feather')
        df = pd.concat([train_df, test_df], ignore_index=True)
        all_cols = [c for c in list(df.columns) if c not in ['customer_ID','S_2']]
        cat_features = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
        num_features = [col for col in all_cols if col not in cat_features]
        for col in [col for col in df.columns if 'S_' in col or 'P_' in col]:
            if col != 'S_2':
                df[col] = df[col].fillna(0)

        if lastk is not None:
            prefix = f'last{lastk}_' + prefix
            print('all df shape',df.shape)
            df['rank'] = df.groupby('customer_ID')['S_2'].rank(ascending=False)
            df = df.loc[df['rank']<=lastk].reset_index(drop=True)
            df = df.drop(['rank'],axis=1)
            print(f'last {lastk} shape',df.shape)

        if prefix == 'rank_':
            cids = df['customer_ID'].values
            df = df.groupby('customer_ID')[num_features].rank(pct=True).add_prefix('rank_')
            df.insert(0,'customer_ID',cids)
            num_features = [f'rank_{col}' for col in num_features]

        if prefix == 'ym_rank_':
            cids = df['customer_ID'].values
            df['ym'] = df['S_2'].apply(lambda x:x[:7])
            df = df.groupby('ym')[num_features].rank(pct=True).add_prefix('ym_rank_')
            num_features = [f'ym_rank_{col}' for col in num_features]
            df.insert(0,'customer_ID',cids)

        if prefix in ['','last3_']:
            df = one_hot_encoding(df,cat_features,False)

        vc = df['customer_ID'].value_counts(sort=False).cumsum()
        batch_size = int(np.ceil(len(vc) / n_cpu))
        dfs = []
        start = 0
        for i in range(min(n_cpu,int(np.ceil(len(vc) / batch_size)))):
            vc_ = vc[i*batch_size:(i+1)*batch_size]
            dfs.append(df[start:vc_[-1]])
            start = vc_[-1]

        pool = ThreadPool(n_cpu)

        if prefix in ['','last3_']:
            cat_feature_df = pd.concat(pool.map(cat_feature,tqdm(dfs,desc='cat_feature'))).reset_index(drop=True)

            cat_feature_df.to_feather(f'/workspaces/opt/{prefix}cat_feature.feather')

        if prefix in ['','last3_','last6_','rank_','ym_rank_']:
            num_feature_df = pd.concat(pool.map(num_feature,tqdm(dfs,desc='num_feature'))).reset_index(drop=True)
            num_feature_df.to_feather(f'/workspaces/opt/{prefix}num_feature.feather')

        if prefix in ['','last3_']:
            diff_feature_df = pd.concat(pool.map(diff_feature,tqdm(dfs,desc='diff_feature'))).reset_index(drop=True)
            diff_feature_df.to_feather(f'/workspaces/opt/{prefix}diff_feature.feather')

        pool.close()

one hot encoding: B_30
one hot encoding: B_38
one hot encoding: D_114
one hot encoding: D_116
one hot encoding: D_117
one hot encoding: D_120
one hot encoding: D_126
one hot encoding: D_63
one hot encoding: D_64
one hot encoding: D_66
one hot encoding: D_68


cat_feature:   0%|          | 0/15 [00:00<?, ?it/s]

cat feature shape after engineeringcat feature shape after engineering cat feature shape after engineering(11, 196)
cat feature shape after engineering cat feature shape after engineering cat feature shape after engineering(11, 196) 
(11, 196)

cat_feature:  93%|█████████▎| 14/15 [00:00<00:00, 135.84it/s]

 
(11, 196)(11, 196) 

cat_feature: 100%|██████████| 15/15 [00:00<00:00, 120.13it/s]

(11, 196)cat feature shape after engineering

cat feature shape after engineering 
(11, 196) 
(11, 196)cat feature shape after engineering
cat feature shape after engineering  (11, 196)(11, 196)

cat feature shape after engineering (11, 196)
cat feature shape after engineeringcat feature shape after engineering  (11, 196)




(11, 196)
cat feature shape after engineering
 cat feature shape after engineering(11, 196) 
(8, 196)


num_feature: 100%|██████████| 15/15 [00:00<00:00, 212.74it/s]


num feature shape after engineeringnum feature shape after engineering  (11, 1063)(11, 1063)

num feature shape after engineeringnum feature shape after engineering (11, 1063)
num feature shape after engineering  (11, 1063)
num feature shape after engineeringnum feature shape after engineering (11, 1063)num feature shape after engineering 
(11, 1063)(11, 1063) 
(11, 1063)
num feature shape after engineering
num feature shape after engineeringnum feature shape after engineering  (11, 1063)(11, 1063) 
num feature shape after engineeringnum feature shape after engineering
(11, 1063)  
num feature shape after engineering(8, 1063)(11, 1063)
 
(11, 1063)
num feature shape after engineering (11, 1063)


diff_feature: 100%|██████████| 15/15 [00:00<00:00, 299.30it/s]


diff feature shape after engineeringdiff feature shape after engineering (11, 1063) (11, 1063)
diff feature shape after engineering
 (11, 1063)diff feature shape after engineering
 (11, 1063)
diff feature shape after engineering diff feature shape after engineering (11, 1063)diff feature shape after engineering(11, 1063) (8, 1063)


diff feature shape after engineering (11, 1063)
diff feature shape after engineering (11, 1063)
diff feature shape after engineering (11, 1063)
diff feature shape after engineering (11, 1063)
diff feature shape after engineering (11, 1063)
diff feature shape after engineering (11, 1063)
diff feature shape after engineering (11, 1063)
diff feature shape after engineering (11, 1063)


num_feature: 100%|██████████| 15/15 [00:00<00:00, 297.78it/s]

num feature shape after engineering (11, 178)num feature shape after engineeringnum feature shape after engineering
  num feature shape after engineeringnum feature shape after engineeringnum feature shape after engineeringnum feature shape after engineering(11, 178)  num feature shape after engineering  (11, 178)
num feature shape after engineering(11, 178)num feature shape after engineeringnum feature shape after engineering (11, 178)num feature shape after engineering
  (11, 178) (11, 178)
 (11, 178)
num feature shape after engineering(11, 178)
(11, 178)(11, 178)

(11, 178)



 num feature shape after engineering(11, 178) 
(11, 178)num feature shape after engineering
 (8, 178)



num_feature: 100%|██████████| 15/15 [00:00<00:00, 300.69it/s]


num feature shape after engineering (11, 1063)
num feature shape after engineering (11, 1063)
num feature shape after engineering (11, 1063)num feature shape after engineering
num feature shape after engineeringnum feature shape after engineering num feature shape after engineering (11, 1063)num feature shape after engineering (11, 1063)(11, 1063)
num feature shape after engineering  
 num feature shape after engineering(11, 1063)

 (11, 1063)num feature shape after engineering(11, 1063)
 
num feature shape after engineering(11, 1063)
 (11, 1063)(11, 1063)num feature shape after engineering
num feature shape after engineering 
 (8, 1063)(11, 1063)

num feature shape after engineering (11, 1063)
all df shape (2000, 190)
last 3 shape (481, 190)
one hot encoding: B_30
one hot encoding: B_38
one hot encoding: D_114
one hot encoding: D_116
one hot encoding: D_117
one hot encoding: D_120
one hot encoding: D_126
one hot encoding: D_63
one hot encoding: D_64
one hot encoding: D_66
one hot enco

cat_feature: 100%|██████████| 15/15 [00:00<00:00, 357.60it/s]

cat feature shape after engineeringcat feature shape after engineeringcat feature shape after engineeringcat feature shape after engineeringcat feature shape after engineering cat feature shape after engineeringcat feature shape after engineeringcat feature shape after engineering   cat feature shape after engineering (11, 133)  cat feature shape after engineering(11, 133)
(11, 133)
cat feature shape after engineering (11, 133)  
cat feature shape after engineering(11, 133)
(11, 133)(11, 133)(11, 133)
cat feature shape after engineering(11, 133)
(11, 133) cat feature shape after engineering
(11, 133) 

 (11, 133)

(11, 133) (11, 133)


cat feature shape after engineering (8, 133)



num_feature: 100%|██████████| 15/15 [00:00<00:00, 448.36it/s]


num feature shape after engineeringnum feature shape after engineering num feature shape after engineering (11, 886)
 (11, 886)
(11, 886)
num feature shape after engineeringnum feature shape after engineeringnum feature shape after engineering  num feature shape after engineeringnum feature shape after engineering (11, 886)num feature shape after engineering
 (11, 886)(11, 886)
num feature shape after engineering
 num feature shape after engineering(11, 886)   (11, 886)(11, 886)
num feature shape after engineering(11, 886)


(11, 886) num feature shape after engineering(8, 886)
 
(11, 886)
num feature shape after engineering (11, 886)
num feature shape after engineering (11, 886)


diff_feature: 100%|██████████| 15/15 [00:00<00:00, 404.31it/s]

diff feature shape after engineering diff feature shape after engineeringdiff feature shape after engineering  (11, 886)(11, 886)





(11, 886)

diff feature shape after engineering (11, 886)
diff feature shape after engineeringdiff feature shape after engineering (8, 886) 
diff feature shape after engineering (11, 886)diff feature shape after engineering
(11, 886)diff feature shape after engineering 
(11, 886)
 diff feature shape after engineering(11, 886) 
diff feature shape after engineering(11, 886) (11, 886)

diff feature shape after engineering (11, 886)
diff feature shape after engineeringdiff feature shape after engineering (11, 886) (11, 886)
diff feature shape after engineering
 (11, 886)
all df shape (2000, 190)
last 6 shape (951, 190)


num_feature: 100%|██████████| 15/15 [00:00<00:00, 230.59it/s]


num feature shape after engineering (11, 886)
num feature shape after engineering num feature shape after engineeringnum feature shape after engineering  num feature shape after engineering(11, 886) (11, 886)
num feature shape after engineeringnum feature shape after engineering(11, 886)

 (11, 886)num feature shape after engineering
num feature shape after engineering  (11, 886)(11, 886)(11, 886)

num feature shape after engineering 
 (11, 886)(11, 886)num feature shape after engineeringnum feature shape after engineering
 
 num feature shape after engineeringnum feature shape after engineering(11, 886)(11, 886)
  
(8, 886)num feature shape after engineering(11, 886)

 (11, 886)


In [17]:
import warnings
warnings.simplefilter('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc,os,random
import time,datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

from utils import *
root = args.root
seed = args.seed


train = pd.read_feather(f'/workspaces/opt/train.feather')
test = pd.read_feather(f'/workspaces/opt/test.feather')

def one_hot_encoding(df,cols,is_drop=True):
    for col in cols:
        print('one hot encoding:',col)
        dummies = pd.get_dummies(pd.Series(df[col]),prefix='oneHot_%s'%col)
        df = pd.concat([df,dummies],axis=1)
    if is_drop:
        df.drop(cols,axis=1,inplace=True)
    return df
cat_features = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
eps = 1e-3


train_y =  pd.read_csv(f'{root}/train_labels.csv')
train = train.merge(train_y,how='left',on=id_name)

print(train.shape,test.shape)

lgb_config = {
    'lgb_params':{
                  'objective' : 'binary',
                  'metric' : 'binary_logloss',
                  'boosting': 'dart',
                  'max_depth' : -1,
                  'num_leaves' : 64,
                  'learning_rate' : 0.035,
                  'bagging_freq': 5,
                  'bagging_fraction' : 0.7,
                  'feature_fraction' : 0.7,
                  'min_data_in_leaf': 256,
                  'max_bin': 63,
                  'min_data_in_bin': 256,
                  # 'min_sum_heassian_in_leaf': 10,
                  'tree_learner': 'serial',
                  'boost_from_average': 'false',
                  'lambda_l1' : 0.1,
                  'lambda_l2' : 30,
                  'num_threads': 24,
                  'verbosity' : 1,
    },
    'feature_name':[col for col in train.columns if col not in [id_name,label_name,'S_2']],
    'rounds':4500,
    'early_stopping_rounds':100,
    'verbose_eval':50,
    'folds':5,
    'seed':seed
}


Lgb_train_and_predict(train,test,lgb_config,gkf=True,aug=None,run_id='LGB_with_series_feature')

NameError: name 'args' is not defined