In [3]:
!pip install lightgbm --install-option=--gpu
!pip install --upgrade category_encoders
!pip install xgboost catboost category-encoders sklearn
!pip install pytorch-lightning
!pip install einops

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, Pool

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import accuracy_score
from copy import deepcopy

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
from pytorch_lightning.metrics import Accuracy
from pytorch_lightning import loggers as pl_loggers


  cmdoptions.check_install_build_global(options)
Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/44/57/fcef41c248701ee62e8325026b90c432adea35555cbc870aff9cfba23727/category_encoders-2.2.2-py2.py3-none-any.whl (80kB)
[K     |████████████████████████████████| 81kB 5.9MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2
Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/47/80/8e9c57ec32dfed6ba2922bc5c96462cbf8596ce1a6f5de532ad1e43e53fe/catboost-0.25.1-cp37-none-manylinux1_x86_64.whl (67.3MB)
[K     |████████████████████████████████| 67.3MB 79kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.25.1
Collecting pytorch-lightning
[?25l  Downloading https://files.pythonhosted.org/packages/c2/a1/a991780873b5fd760fb99dfda01916fe9e5b186f0ba70a120e6b4f79cfaa/pytorch_lightning-1.3.1-py3-none-any.whl (805kB)
[K     |████████████████████████████████| 8

In [4]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

# Clear any logs from previous runs
!rm -rf ./logs/ 

In [5]:
!git clone https://github.com/wenkaicn/ML-000.git

seed = 42 # for the same data division

kf = KFold(n_splits=5, random_state=seed,shuffle=True)

loans = pd.read_csv('./ML-000/Week09/train_final.csv', engine='python')
test = pd.read_csv('./ML-000/Week09/test_final.csv', engine='python')

X_train_origin = loans.drop('loan_status', axis=1)
Y_train = loans['loan_status']
X_test_origin = test.drop('loan_status', axis=1)
Y_test = test['loan_status']

Cloning into 'ML-000'...
remote: Enumerating objects: 47, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 47 (delta 3), reused 10 (delta 1), pack-reused 35[K
Unpacking objects: 100% (47/47), done.


###数据处理

In [6]:
from copy import deepcopy
X_train = deepcopy(X_train_origin)
X_test = deepcopy(X_test_origin)
all_data = pd.concat([X_train, X_test], ignore_index=True)

# Try1
all_data['continuous_loan_income'] = all_data['continuous_installment'] / (all_data['continuous_annual_inc'] / 12)
# Try3
all_data['continuous_gap'] = all_data['continuous_funded_amnt'] - all_data['continuous_funded_amnt_inv']
# Try6
del all_data['continuous_funded_amnt']
del all_data['discrete_policy_code_1_one_hot']
del all_data['discrete_pymnt_plan_1_one_hot']
del all_data['discrete_application_type_1_one_hot']
del all_data['discrete_application_type_2_one_hot']
# Try10
all_data['open_acc_term1'] = all_data['discrete_term_1_one_hot'] * all_data['continuous_open_acc']
all_data['open_acc_term2'] = all_data['discrete_term_2_one_hot'] * all_data['continuous_open_acc']

# Try4
addr_state_features = [col for col in loans.columns if col.startswith('discrete_addr_state')]
emp_length_features = [col for col in loans.columns if col.startswith('discrete_emp_length')]
grade_features   = [col for col in loans.columns if col.startswith('discrete_grade')]
sub_grade_features = [col for col in loans.columns if col.startswith('discrete_sub_grade')]
purpose_features =  [col for col in loans.columns if col.startswith('discrete_purpose')]

home_ownership_features =  [col for col in loans.columns if col.startswith('discrete_home_ownership')]
term_features =  [col for col in loans.columns if col.startswith('discrete_term')]


new_features = {'discrete_addr_state': addr_state_features,
            'discrete_emp_length': emp_length_features,
            'discrete_grade': grade_features,
            'discrete_sub_grade': sub_grade_features,
            'discrete_purpose': purpose_features,
            'discrete_home_ownership': home_ownership_features,
            'discrete_term': term_features
            }

def one_hot_merge(row, features, encoding_map):
  for one_hot_num in features:
    if row[one_hot_num] == 1:
      return encoding_map[one_hot_num]

def count_encode(all_data, new_features):
  for key,features in new_features.items():
    encoding_map = all_data[features].sum()
    count_encoded = all_data.apply(one_hot_merge, axis=1, 
                     features=features, encoding_map=encoding_map)
    all_data[key] = count_encoded
    all_data = all_data.drop(labels=features, axis = 1)
  return all_data

all_data = count_encode(all_data, new_features)

X_train = all_data.iloc[:50000, :]
X_test = all_data.iloc[50000:, :]

In [7]:

#      结合业务逻辑填补缺失值
# 没有联合申请人，等价于联合申请人的年收入为0；
# continuous_dti的NaN是因为年收入为0造成的。先将dti改为收入/贷款，再将NaN改为0
# 原先的continuous_dti有0值，因此变为分母时加1处理
# continuous_dti_joint、continuous_loan_income按相同逻辑更改
# 剩下的空值，类似continuous_mths_since_last_delinq，是距不良记录以来的时间。将没有不良记录的统一设置为20年

all_data['continuous_annual_inc_joint'] = all_data.continuous_annual_inc_joint.fillna(0)
all_data['continuous_dti'] = 1/(all_data['continuous_dti']+1)
all_data['continuous_dti'] = all_data.continuous_dti.fillna(0)
all_data['continuous_dti_joint'] = 1/(all_data['continuous_dti_joint']+1)
all_data['continuous_dti_joint'] = all_data.continuous_dti_joint.fillna(0)
all_data['continuous_loan_income'] = 1/(all_data['continuous_loan_income']+1)
all_data = all_data.fillna(value=240)

X_train_dl = all_data.iloc[:50000, :]
X_test_dl = all_data.iloc[50000:, :]

In [8]:
# X_origin = pd.concat([X_train_origin, X_test_origin], ignore_index=True)
# Y_origin = pd.concat([Y_train, Y_test], ignore_index=True)

# for i in range(0,100000):
#   if X_origin.iloc[i,144] == 1:
#     Y_origin[i] = Y_origin[i] + 2
#     print(i)

# Y_train_t = Y_origin.iloc[:50000]
# Y_test_t = Y_origin.iloc[50000:]

In [9]:
# all_data['continuous_open_acc'] = X_origin['continuous_open_acc']
# del all_data['open_acc_term1']
# del all_data['open_acc_term2']
# X_train_t = all_data.iloc[:50000, :]
# X_test_t = all_data.iloc[50000:, :]

# 树模型


### LightGBM

In [10]:
def GridSearch(X_train, Y_train, param_grid):

  # split data for five fold
  five_fold_data = []
  for train_index, eval_index in kf.split(X_train):
      x_train, x_eval = X_train.loc[train_index], X_train.loc[eval_index]
      y_train, y_eval = Y_train.loc[train_index], Y_train.loc[eval_index]
      five_fold_data.append([(x_train, y_train), (x_eval, y_eval)])

  row_list = []
  models_list = []

  for param in list(ParameterGrid(param_grid)):
    model_list = []
    best_iterations = []
    train_errors = []
    val_errors = []
    for idx, [(x_train, y_train), (x_eval, y_eval)] in enumerate(five_fold_data):
        print('{}-th model is training:'.format(idx))
        train_data = lgb.Dataset(x_train, label=y_train)
        validation_data = lgb.Dataset(x_eval, label=y_eval)
        bst = lgb.train(param, train_data, valid_sets=[train_data,validation_data], early_stopping_rounds=200)
        train_error = bst.best_score['training']['binary_error']
        val_error  = bst.best_score['valid_1']['binary_error']

        model_list.append(bst)
        best_iterations.append(bst.best_iteration)
        train_errors.append(train_error)
        val_errors.append(val_error)

    row = dict(num_leaves = param['num_leaves'], learning_rate = param['learning_rate'], 
            val_error = np.mean(val_errors), train_error = np.mean(train_errors),
            iter_0 = best_iterations[0], iter_1 = best_iterations[1], 
            iter_2 = best_iterations[2], iter_3 = best_iterations[3], 
            iter_4 = best_iterations[4])
    row_list.append(row)
    models_list.append(model_list)

  df_cv = pd.DataFrame(row_list)

  return df_cv, models_list

def test_model(X_test, Y_test, model_list):
    data = X_test
    five_fold_pred = np.zeros((5, len(X_test)))
    for i, bst in enumerate(model_list):
        ypred = bst.predict(data, num_iteration=bst.best_iteration)
        five_fold_pred[i] = ypred
    ypred_mean = (five_fold_pred.mean(axis=-2)>0.5).astype(int)
    return accuracy_score(ypred_mean, Y_test)



In [11]:
param_grid = {'num_thread': [4], 'metric': ['binary_error'], 'objective': ['binary'], 'num_round': [2000],
          'num_leaves': [33], 'learning_rate': [0.03], 
          'feature_fraction': [0.8], 'bagging_fraction': [0.8]}

df_cv, models = GridSearch(X_train, Y_train, param_grid)

0-th model is training:
[1]	training's binary_error: 0.203725	valid_1's binary_error: 0.2063
Training until validation scores don't improve for 200 rounds.
[2]	training's binary_error: 0.203725	valid_1's binary_error: 0.2063
[3]	training's binary_error: 0.203725	valid_1's binary_error: 0.2063
[4]	training's binary_error: 0.203725	valid_1's binary_error: 0.2063




[5]	training's binary_error: 0.203725	valid_1's binary_error: 0.2063
[6]	training's binary_error: 0.203725	valid_1's binary_error: 0.2063
[7]	training's binary_error: 0.203725	valid_1's binary_error: 0.2063
[8]	training's binary_error: 0.203725	valid_1's binary_error: 0.2063
[9]	training's binary_error: 0.203725	valid_1's binary_error: 0.2063
[10]	training's binary_error: 0.203725	valid_1's binary_error: 0.2063
[11]	training's binary_error: 0.203725	valid_1's binary_error: 0.2063
[12]	training's binary_error: 0.203725	valid_1's binary_error: 0.2063
[13]	training's binary_error: 0.203725	valid_1's binary_error: 0.2063
[14]	training's binary_error: 0.203725	valid_1's binary_error: 0.2063
[15]	training's binary_error: 0.203725	valid_1's binary_error: 0.2063
[16]	training's binary_error: 0.203725	valid_1's binary_error: 0.2063
[17]	training's binary_error: 0.1504	valid_1's binary_error: 0.1549
[18]	training's binary_error: 0.14775	valid_1's binary_error: 0.1515
[19]	training's binary_error

In [12]:
df_cv.sort_values(['val_error'],ascending=True)

Unnamed: 0,num_leaves,learning_rate,val_error,train_error,iter_0,iter_1,iter_2,iter_3,iter_4
0,33,0.03,0.07934,0.071025,78,160,135,152,257


In [13]:
test_model(X_test, Y_test, models[0])


0.91848

In [14]:
lightgbm_clf = models[0]

In [15]:
# def GridSearch_t(X_train, Y_train, param_grid):

#   # split data for five fold
#   five_fold_data = []
#   for train_index, eval_index in kf.split(X_train):
#       x_train, x_eval = X_train.loc[train_index], X_train.loc[eval_index]
#       y_train, y_eval = Y_train.loc[train_index], Y_train.loc[eval_index]
#       five_fold_data.append([(x_train, y_train), (x_eval, y_eval)])

#   row_list = []
#   models_list = []

#   for param in list(ParameterGrid(param_grid)):
#     model_list = []
#     best_iterations = []
#     train_errors = []
#     val_errors = []
#     for idx, [(x_train, y_train), (x_eval, y_eval)] in enumerate(five_fold_data):
#         print('{}-th model is training:'.format(idx))
#         train_data = lgb.Dataset(x_train, label=y_train)
#         validation_data = lgb.Dataset(x_eval, label=y_eval)
#         bst = lgb.train(param, train_data, valid_sets=[train_data,validation_data], early_stopping_rounds=200)
#         train_error = bst.best_score['training']['multi_logloss']
#         val_error  = bst.best_score['valid_1']['multi_logloss']

#         model_list.append(bst)
#         best_iterations.append(bst.best_iteration)
#         train_errors.append(train_error)
#         val_errors.append(val_error)

#     row = dict(num_leaves = param['num_leaves'], learning_rate = param['learning_rate'], 
#             val_error = np.mean(val_errors), train_error = np.mean(train_errors),
#             iter_0 = best_iterations[0], iter_1 = best_iterations[1], 
#             iter_2 = best_iterations[2], iter_3 = best_iterations[3], 
#             iter_4 = best_iterations[4])
#     row_list.append(row)
#     models_list.append(model_list)

#   df_cv = pd.DataFrame(row_list)

#   return df_cv, models_list

# def test_model_t(X_test, Y_test, model_list):
#     data = X_test
#     five_fold_pred = np.zeros((5, len(X_test)))
#     for i, bst in enumerate(model_list):
#         ypred = bst.predict(data, num_iteration=bst.best_iteration)
#         five_fold_pred[i] = ypred
#     ypred_mean = five_fold_pred.mean(axis=-2)
#     #ypred_mean = (five_fold_pred.mean(axis=-2)>0.5).astype(int)
#     return ypred_mean


In [16]:
# param_grid = {'num_thread': [4], 'metric': ['multi_logloss'], 'objective': ['multiclass'], 'num_round': [2000],
#           'num_leaves': [14,16,18,30,32,33,34,35,36], 'learning_rate': [0.01,0.02,0.03], 
#           'feature_fraction': [0.8], 'bagging_fraction': [0.8], 'num_class': [4]}

# df_cv, models_t = GridSearch_t(X_train_t, Y_train_t, param_grid)

In [17]:
# df_cv.sort_values(['val_error'],ascending=True)

In [18]:
# bst = models_t[0][0]

# ypred = bst.predict(X_test_t, num_iteration=bst.best_iteration)
# predictions = []

# for x in ypred:
#     predictions.append(np.argmax(x))
# accuracy_score(predictions, Y_test_t)

### XGBoost/CatBoost

In [19]:
# 对XGBoost和CATBoost的API做包装

def BoostGridSearch(X_train, Y_train, param_grid, type="XG"):

  # split data for five fold
  five_fold_data = []
  for train_index, eval_index in kf.split(X_train):
      x_train, x_eval = X_train.loc[train_index], X_train.loc[eval_index]
      y_train, y_eval = Y_train.loc[train_index], Y_train.loc[eval_index]
      five_fold_data.append([(x_train, y_train), (x_eval, y_eval)])

  row_list = []
  models_list = []

  for param in list(ParameterGrid(param_grid)):
    model_list = []
    best_iterations = []
    train_errors = []
    val_errors = []

    for idx, [(x_train, y_train), (x_eval, y_eval)] in enumerate(five_fold_data):
        print('{}-th model is training:'.format(idx))

        if type == 'CAT':
          eval_pool = Pool(x_eval, y_eval)
          model = CatBoostClassifier(iterations=param['iterations'],
                          depth=param['depth'],
                          learning_rate=param['learning_rate'],
                          loss_function=param['loss_function'],
                          verbose=param['verbose'])

          bst = model.fit(x_train, y_train, eval_set=eval_pool, early_stopping_rounds=200)
          best_iteration = bst.get_best_iteration()
          train_error = bst.get_evals_result()['learn']['Logloss'][best_iteration]
          val_error  = bst.get_evals_result()['validation']['Logloss'][best_iteration]

        elif type == 'XG':
          train_data = xgb.DMatrix(x_train, label=y_train)
          validation_data = xgb.DMatrix(x_eval, label=y_eval)
          evallist = [(train_data, 'train'), (validation_data, 'eval')]
          num_round=2000
          bst = xgb.train(param, train_data, num_round, evallist, early_stopping_rounds=200)        
          best_iteration = bst.best_iteration
          train_error = float(bst.attributes()['best_msg'].split('\t')[1].split(':')[1])
          val_error =  float(bst.attributes()['best_msg'].split('\t')[2].split(':')[1])

        model_list.append(bst)
        best_iterations.append(best_iteration)
        train_errors.append(train_error)
        val_errors.append(val_error)

    if type == 'CAT':
      row = dict(depth = param['depth'], learning_rate = param['learning_rate'], 
              val_error = np.mean(val_errors), train_error = np.mean(train_errors),
              iter_0 = best_iterations[0], iter_1 = best_iterations[1],
              iter_2 = best_iterations[2], iter_3 = best_iterations[3], 
              iter_4 = best_iterations[4])
      
    elif type == 'XG':
      row = dict(max_depth = param['max_depth'], eta = param['eta'], 
              val_error = np.mean(val_errors), train_error = np.mean(train_errors),
              iter_0 = best_iterations[0], iter_1 = best_iterations[1],
              iter_2 = best_iterations[2], iter_3 = best_iterations[3], 
              iter_4 = best_iterations[4])
      
    row_list.append(row)
    models_list.append(model_list)

  df_cv = pd.DataFrame(row_list)

  return df_cv, models_list

def BoostTestModel(X_test, Y_test, model_list, type='XG'):
    data = X_test
    five_fold_pred = np.zeros((5, len(X_test)))
    for i, bst in enumerate(model_list):
      if type == 'CAT':
        ypred = bst.predict(data, ntree_start=0, ntree_end=bst.get_best_iteration())
      elif type == 'XG':
        ypred = bst.predict(xgb.DMatrix(data), ntree_limit=bst.best_iteration)
      five_fold_pred[i] = ypred

    ypred_mean = (five_fold_pred.mean(axis=-2)>0.5).astype(int)
    return accuracy_score(ypred_mean, Y_test)

In [20]:
param_grid = {'max_depth':[6], 
         'eta':[0.04], 'objective':['binary:logistic'] }

df_cv, models_xg = BoostGridSearch(X_train, Y_train, param_grid)


0-th model is training:
[0]	train-error:0.078275	eval-error:0.0847
Multiple eval metrics have been passed: 'eval-error' will be used for early stopping.

Will train until eval-error hasn't improved in 200 rounds.
[1]	train-error:0.077475	eval-error:0.083
[2]	train-error:0.078175	eval-error:0.0825
[3]	train-error:0.077025	eval-error:0.0811
[4]	train-error:0.07685	eval-error:0.0817
[5]	train-error:0.076375	eval-error:0.0823
[6]	train-error:0.07655	eval-error:0.0827
[7]	train-error:0.076225	eval-error:0.0819
[8]	train-error:0.076	eval-error:0.0827
[9]	train-error:0.0761	eval-error:0.0829
[10]	train-error:0.076275	eval-error:0.0827
[11]	train-error:0.075825	eval-error:0.083
[12]	train-error:0.0762	eval-error:0.0842
[13]	train-error:0.076	eval-error:0.0837
[14]	train-error:0.075975	eval-error:0.0828
[15]	train-error:0.07595	eval-error:0.0825
[16]	train-error:0.07585	eval-error:0.0828
[17]	train-error:0.076	eval-error:0.0828
[18]	train-error:0.075975	eval-error:0.0826
[19]	train-error:0.0759

In [21]:
df_cv.sort_values(['val_error'],ascending=True)

Unnamed: 0,max_depth,eta,val_error,train_error,iter_0,iter_1,iter_2,iter_3,iter_4
0,6,0.04,0.07928,0.06956,59,120,71,391,191


In [22]:
BoostTestModel(X_test, Y_test, models_xg[0])

0.9178

In [23]:
xgboost_clf = models_xg[0] 

In [24]:
param_grid = {'depth':[7], 
         'learning_rate':[0.01], 'iterations':[2000],
         'loss_function':['Logloss'], 'verbose':[True]}

df_cv, models_cat = BoostGridSearch(X_train, Y_train, param_grid, type='CAT')

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
740:	learn: 0.1832119	test: 0.1969529	best: 0.1969511 (735)	total: 16s	remaining: 27.2s
741:	learn: 0.1831956	test: 0.1969531	best: 0.1969511 (735)	total: 16s	remaining: 27.2s
742:	learn: 0.1831658	test: 0.1969443	best: 0.1969443 (742)	total: 16.1s	remaining: 27.2s
743:	learn: 0.1831516	test: 0.1969423	best: 0.1969423 (743)	total: 16.1s	remaining: 27.2s
744:	learn: 0.1831245	test: 0.1969480	best: 0.1969423 (743)	total: 16.1s	remaining: 27.1s
745:	learn: 0.1831036	test: 0.1969508	best: 0.1969423 (743)	total: 16.1s	remaining: 27.1s
746:	learn: 0.1830872	test: 0.1969468	best: 0.1969423 (743)	total: 16.1s	remaining: 27.1s
747:	learn: 0.1830671	test: 0.1969375	best: 0.1969375 (747)	total: 16.2s	remaining: 27.1s
748:	learn: 0.1830523	test: 0.1969273	best: 0.1969273 (748)	total: 16.2s	remaining: 27.1s
749:	learn: 0.1830304	test: 0.1969309	best: 0.1969273 (748)	total: 16.2s	remaining: 27s
750:	learn: 0.1830111	test: 0.1969351	best: 0.1969273 (748)	total

In [25]:
df_cv.sort_values(['val_error'],ascending=True)

Unnamed: 0,depth,learning_rate,val_error,train_error,iter_0,iter_1,iter_2,iter_3,iter_4
0,7,0.01,0.196372,0.174423,1502,1091,1035,1260,1523


In [26]:
BoostTestModel(X_test, Y_test, models_cat[0],type='CAT')


0.91858

In [27]:
catboost_clf = models_cat[0]

# 神经网络

In [58]:
from torch.utils.data import Dataset, DataLoader

class MyDataSet(Dataset):
    def __init__(self, x, y):
        super().__init__()
        self.x = x
        self.y = y
        self.len = x.shape[0]
        
    def __getitem__(self, idx):
        return self.x[idx, :], self.y[idx]
    
    def __len__(self):
        return self.len

from sklearn.metrics import accuracy_score
def test_model(X_test, Y_test, model_list):
    data = DataLoader(TestDataSet(X_test), batch_size = 50000, num_workers=4)
    five_fold_pred = np.zeros((5, len(X_test)))
    for i, bst in enumerate(model_list):
        ypred = bst.predict(dataloaders = data)
        #print(ypred)
        y_hat = ypred[0].cpu()
        #print(y_hat)
        five_fold_pred[i] = y_hat.numpy()
    ypred_mean = (five_fold_pred.mean(axis=-2)>0.5).astype(int)
    return accuracy_score(ypred_mean, Y_test)

In [29]:
import torch.nn as nn
import torch
import torch.nn.functional as F


def mish(input):

    return input * torch.tanh(F.softplus(input))

class Mish(nn.Module):

    def __init__(self):
        '''
        Init method.
        '''
        super().__init__()

    def forward(self, input):
        '''
        Forward pass of the function.
        '''
        return mish(input)

class MLPLayer(nn.Module):
    def __init__(self, dim_in, dim_out, res_coef = 0, dropout_p = 0.1):
        super().__init__()
        self.linear  = nn.Linear(dim_in, dim_out)
        self.res_coef = res_coef
        self.activation = Mish()
        self.dropout = nn.Dropout(dropout_p)
        self.ln = nn.LayerNorm(dim_out)
    
    def forward(self, x):
        y = self.linear(x)
        y = self.activation(y)
        y = self.dropout(y)
        if self.res_coef == 0:
            return self.ln(y)
        else:
            return self.ln(self.res_coef*x +y )

       
class MyNetwork(nn.Module):
    def __init__(self, dim_in, dim, res_coef=0.5, dropout_p = 0.1, n_layers = 10):
        super().__init__()
        self.mlp = nn.ModuleList()
        self.first_linear = MLPLayer(dim_in, dim)
        self.n_layers = n_layers
        for i in range(n_layers):
            self.mlp.append(MLPLayer(dim, dim, res_coef, dropout_p))
        self.final = nn.Linear(dim, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = self.first_linear(x)
        for layer in self.mlp:
            x = layer(x)
        x= self.sigmoid(self.final(x))
        return x.squeeze()

In [30]:
# coding = 'utf-8'
import numpy as np
import pandas as pd
import tqdm

def encode_label(x):
    unique=sorted(list(set([str(item) for item in np.unique(x)])))
    kv = {unique[i]: i for i in range(len(unique))}
    vfunc = np.vectorize(lambda x: kv[str(x)])
    return vfunc(x)

def encode_label_mat(x):
    _, ncol = x.shape
    result = np.empty_like(x, dtype=int)
    for col in range(ncol):
        result[:,col] = encode_label(x[:, col])
    return result

def impute_nan(x, method='median'):
    _, ncol = x.shape
    result = np.empty_like(x)

    for col in range(ncol):
        if method == 'median':
            data = x[:, col]
            impute_value = np.median(data[~pd.isnull(data) & (data != np.inf) & (data != -np.inf)])
        else:
            raise NotImplementedError()

        func = np.vectorize(lambda x: impute_value if pd.isnull(x) else x)
        result[:, col] = func(x[:, col])
    return result


def get_uniform_interval(minimum, maximum, nbins):
    result = [minimum]
    step_size = (float(maximum - minimum)) / nbins
    for index in range(nbins - 1):
        result.append(minimum + step_size * (index + 1))
    result.append(maximum)
    return result


def get_interval_v2(x, sorted_intervals):
    if pd.isnull(x):
        return -1
    if x == np.inf:
        return -2
    if x == -np.inf:
        return -3
    interval = 0
    found = False
    sorted_intervals.append(np.inf)
    while not found and interval < len(sorted_intervals) - 1:
        if sorted_intervals[interval] <= x < sorted_intervals[interval + 1]:
            return interval
        else:
            interval += 1


def get_quantile_interval(data, nbins):
    quantiles = get_uniform_interval(0, 1, nbins)
    return list(np.quantile(data[(~pd.isnull(data)) & (data != np.inf) & (data != -np.inf)], quantiles))


def discretize(x, nbins=20):
    nrow, ncol = x.shape
    result = np.empty_like(x)
    interval_list = list()
    for col in range(ncol):
        intervals = sorted(list(set(get_quantile_interval(x[:, col], nbins))))
        interval_centroid = list()

        for i in range(len(intervals) - 1):
            interval_centroid.append(0.5 * (intervals[i] + intervals[i + 1]))
        func = np.vectorize(lambda x: get_interval_v2(x, intervals))
        result[:, col] = encode_label(func(x[:, col]))
        interval_list.append(interval_centroid)
    return result.astype(np.int64), interval_list

def get_var_type(df):
    columns = df.columns
    continuous_vars = [x for x in columns if x.startswith('continuous_')]
    discrete_vars = [x for x in columns if x.startswith('discrete_')]
    other_vars = list()
    for column in columns:
        if column not in continuous_vars and column not in discrete_vars:
            other_vars.append(column)
    return {'continuous': continuous_vars,
            'discrete': discrete_vars,
            'other': other_vars}


def get_cont_var(df):
    var_types = get_var_type(df)
    return var_types['continuous']


def get_dis_var(df):
    var_types = get_var_type(df)
    return var_types['discrete']

def drop_const_var(data):
    result = data.copy(deep=True)
    for col in data.columns:
        if len(data.loc[~pd.isnull(data[col]), col].unique()) <= 1:
            result.drop(columns=col, inplace=True)
    return result

In [31]:
# 对输入神经网络的数据做归一化处理.(注意是对训练集fit，对测试集transform？)
from sklearn.preprocessing import MinMaxScaler
trans = MinMaxScaler()
trans.fit(X_train_dl)
X_train_scaled = trans.transform(X_train_dl)
X_test_scaled = trans.transform(X_test_dl)

X_train_scaled = X_train_scaled.astype(np.float32)
X_test_scaled = X_test_scaled.astype(np.float32)

x = np.concatenate([X_train_scaled, X_test_scaled])

x_dis, centroids = discretize(x)
x_dis_train = x_dis[:50000, :]
x_dis_test = x_dis[50000:,:]


In [32]:
class EmbeddingFactory(nn.Module):
    def __init__(self, x, dim_out):
        super().__init__()
        print('This is EmbeddingFactory')
        self.dim_out = dim_out
        self.module_list = nn.ModuleList(
            [nn.Embedding(len(set(np.unique(x[:, col]))), dim_out) for col in range(x.shape[1])])

    def forward(self, x):
        result = [self.module_list[col](x[:, col]).unsqueeze(2) for col in range(x.shape[1])]
        return torch.cat(result, dim=2)

In [33]:
from sklearn.model_selection import KFold
seed = 42 # for the same data division
kf = KFold(n_splits=5, random_state=seed,shuffle=True)

five_fold_data = []
for train_index, eval_index in kf.split(x_dis_train):
    x_train, x_eval = x_dis_train[train_index], x_dis_train[eval_index]
    y_train, y_eval = Y_train.values[train_index], Y_train.values[eval_index]
    five_fold_data.append([(x_train, y_train), (x_eval, y_eval)])


In [34]:
from einops import rearrange, reduce, repeat

class TrainingModule(pl.LightningModule):
    def __init__(self, embedding, x, dim_emb, dim_mlp, res_coef=0, dropout_p=0, n_layers=10, learning_rate=1e-4):
        super().__init__()
        self.embedding = embedding(x, dim_emb) #通过参数选择不同的embeddingFactory
        self.backbone = MyNetwork(x.shape[1]*dim_emb, dim_mlp, res_coef, dropout_p, n_layers)
        self.loss = nn.BCELoss()
        self.accuracy = Accuracy()
        self.learning_rate = learning_rate
        
    def forward(self, x):
        x = self.embedding(x)
        x = rearrange(x, "b h e -> b (h e)")  ##
        return self.backbone(x)

    # def test_step(self, batch, batch_idx):
    #     # metrics = self.validation_step(batch, batch_idx)
    #     # metrics = {'test_acc': metrics[1], 'test_loss': metrics[0]}
    #     # self.log_dict(metrics)

    #     x, y = batch
    #     x = self.embedding(x)
    #     x = rearrange(x, "b h e -> b (h e)")
    #     x = self.backbone(x)
    #     return x

    def validation_step(self, batch, batch_idx):
        x, y = batch
        x = self.embedding(x)
        x = rearrange(x, "b h e -> b (h e)")
        x = self.backbone(x)
        loss = self.loss(x, y.type(torch.float32))
        acc = self.accuracy(x, y)
        # self.log("Validation loss", loss)
        # self.log("Validation acc", acc)

        self.log('validation_loss', loss, on_epoch=True, logger=True)
        self.log('validation_acc', acc, on_epoch=True, logger=True)

        return loss, acc

    def training_step(self, batch, batch_idx):
        x, y = batch
        x = self.embedding(x)
        x = rearrange(x, "b h e -> b (h e)")
        x = self.backbone(x)
        loss = self.loss(x, y.type(torch.float32))
        acc = self.accuracy(x, y)
        # self.log("Training loss", loss)
        # self.log("Training acc", acc)

        # logs metrics for each training_step,
        # and the average across the epoch, to the progress bar and logger
        self.log('train_loss', loss, on_epoch=True, logger=True)
        self.log('train_acc', acc, on_epoch=True, logger=True)

        return loss
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-4)
        return optimizer

    # learning rate warm-up
    def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx,
                      optimizer_closure, on_tpu, using_native_amp, using_lbfgs):
        # warm up lr
        if self.trainer.global_step < 500:
            lr_scale = min(1., float(self.trainer.global_step + 1) / 500.)
            for pg in optimizer.param_groups:
                pg['lr'] = lr_scale * self.learning_rate

        # update params
        optimizer.step(closure=optimizer_closure)

In [35]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change b type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Wed May 19 12:40:38 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    32W / 250W |    291MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [40]:
model_list = []
#val_results = []
for idx, [(x_train, y_train), (x_eval, y_eval)] in enumerate(five_fold_data):
    print('{}-th model is training:'.format(idx))
    train_dataloader = DataLoader(MyDataSet(x_train, y_train), batch_size = 100, num_workers=4)
    test_dataloader = DataLoader(MyDataSet(x_eval, y_eval), batch_size = 100, num_workers=4)

    tb_logger = pl_loggers.TensorBoardLogger('logs/')
    training_module = TrainingModule(EmbeddingFactory,x_dis, 16, 48, 0.5, 0.1, 12) 
    trainer = pl.Trainer(max_epochs=20, gpus=1, progress_bar_refresh_rate=100, val_check_interval=400, logger=tb_logger)
    trainer.fit(training_module, train_dataloader, test_dataloader)
    #val_result = trainer.test(test_dataloaders=test_dataloader)

    model_list.append(trainer)
    #val_results.append(val_result)



  cpuset_checked))
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | EmbeddingFactory | 5.8 K 
1 | backbone  | MyNetwork        | 52.6 K
2 | loss      | BCELoss          | 0     
3 | accuracy  | Accuracy         | 0     
-----------------------------------------------
58.4 K    Trainable params
0         Non-trainable params
58.4 K    Total params
0.234     Total estimated model params size (MB)


0-th model is training:
This is EmbeddingFactory


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | EmbeddingFactory | 5.8 K 
1 | backbone  | MyNetwork        | 52.6 K
2 | loss      | BCELoss          | 0     
3 | accuracy  | Accuracy         | 0     
-----------------------------------------------
58.4 K    Trainable params
0         Non-trainable params
58.4 K    Total params
0.234     Total estimated model params size (MB)



1-th model is training:
This is EmbeddingFactory


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | EmbeddingFactory | 5.8 K 
1 | backbone  | MyNetwork        | 52.6 K
2 | loss      | BCELoss          | 0     
3 | accuracy  | Accuracy         | 0     
-----------------------------------------------
58.4 K    Trainable params
0         Non-trainable params
58.4 K    Total params
0.234     Total estimated model params size (MB)



2-th model is training:
This is EmbeddingFactory


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | EmbeddingFactory | 5.8 K 
1 | backbone  | MyNetwork        | 52.6 K
2 | loss      | BCELoss          | 0     
3 | accuracy  | Accuracy         | 0     
-----------------------------------------------
58.4 K    Trainable params
0         Non-trainable params
58.4 K    Total params
0.234     Total estimated model params size (MB)



3-th model is training:
This is EmbeddingFactory


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | EmbeddingFactory | 5.8 K 
1 | backbone  | MyNetwork        | 52.6 K
2 | loss      | BCELoss          | 0     
3 | accuracy  | Accuracy         | 0     
-----------------------------------------------
58.4 K    Trainable params
0         Non-trainable params
58.4 K    Total params
0.234     Total estimated model params size (MB)



4-th model is training:
This is EmbeddingFactory


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




In [59]:
test_model(x_dis_test,Y_test.values,model_list)

  cpuset_checked))
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Predicting', layout=Layout(flex='2'), m…

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]





HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Predicting', layout=Layout(flex='2'), m…

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]





HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Predicting', layout=Layout(flex='2'), m…

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]





HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Predicting', layout=Layout(flex='2'), m…

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]





HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Predicting', layout=Layout(flex='2'), m…




0.91612

In [41]:
mlp_clf = model_list

# 模型Stacking

In [42]:
estimators = {"GBM": lightgbm_clf,
        "XG": xgboost_clf,
        "CAT": catboost_clf,
        "MLP": mlp_clf}


In [86]:
from torch.utils.data import Dataset, DataLoader

class TestDataSet(Dataset):
    def __init__(self, x):
        super().__init__()
        self.x = x
        self.len = x.shape[0]
        
    def __getitem__(self, idx):
        return self.x[idx, :]
    
    def __len__(self):
        return self.len

#生成Stacking训练特征
def MetaFeaturesPrepare(X_train, Y_train, estimators):

    # split data for five fold
    five_fold_data = []
    for train_index, eval_index in kf.split(X_train):
        x_train, x_eval = X_train.loc[train_index], X_train.loc[eval_index]
        y_train, y_eval = Y_train.loc[train_index], Y_train.loc[eval_index]
        five_fold_data.append((x_eval, y_eval))

    # meta_feature = []
    # meta_features = np.empty((len(X_train), len(estimators)), dtype=np.float32)
    meta_features = pd.DataFrame()
    meta_feature = pd.DataFrame()
    # 
    meta_features = {'CAT':[],'XG':[],'GBM':[],'MLP':[],'Y':[]}

    for idx, (x_eval, y_eval) in enumerate(five_fold_data):

        for key, model_list in estimators.items():
            model = model_list[idx]

            if key == 'CAT':
                prob = model.predict_proba(x_eval, ntree_start=0, ntree_end=model.get_best_iteration())
                meta_features[key].extend(prob[::, 1::].flatten())
            elif key == 'XG':
                meta_features[key].extend(model.predict(xgb.DMatrix(x_eval), ntree_limit=model.best_iteration))
            elif key == 'GBM':
                meta_features[key].extend(model.predict(x_eval, num_iteration=model.best_iteration))
            
        meta_features['Y'].extend(y_eval.values)

    # 临时增加对MLP的处理
    five_fold_data = []
    for train_index, eval_index in kf.split(x_dis_train):
        x_train, x_eval = x_dis_train[train_index], x_dis_train[eval_index]
        y_train, y_eval = Y_train.loc[train_index], Y_train.loc[eval_index]
        five_fold_data.append((x_eval, y_eval))
    
    model_list = estimators['MLP']
    for idx, (x_eval, y_eval) in enumerate(five_fold_data):
        model = model_list[idx]
        data = DataLoader(TestDataSet(x_eval), batch_size = 10000, num_workers=4)
        ypred = model.predict(dataloaders=data)
        y_hat = ypred[0].cpu()
        ypred = y_hat.numpy()
        ypred = ypred.tolist()
        print(ypred)
        meta_features['MLP'].extend(ypred)

    return pd.DataFrame(meta_features)

#测试数据准备
def TestPredictions(X_test, estimators):

    X_test_predictions = pd.DataFrame()

    for key, model_list in estimators.items():
      
        five_fold_pred = np.zeros((5, len(X_test)))
        for i, bst in enumerate(model_list):
            if key == 'CAT':
                prob = bst.predict_proba(X_test, ntree_start=0, ntree_end=bst.get_best_iteration())
                ypred = prob[::, 1::].flatten()

                #ypred = bst.predict(X_test, ntree_start=0, ntree_end=bst.get_best_iteration())
            elif key == 'XG':
                ypred = bst.predict(xgb.DMatrix(X_test), ntree_limit=bst.best_iteration)
            elif key == 'GBM':
                ypred = bst.predict(X_test, num_iteration=bst.best_iteration)
            elif key == 'MLP':
                data = DataLoader(TestDataSet(x_dis_test), batch_size = 50000, num_workers=4)
                ypred = bst.predict(dataloaders=data)
                y_hat = ypred[0].cpu()
                ypred = y_hat.numpy()

            five_fold_pred[i] = ypred

        X_test_predictions[key] = five_fold_pred.mean(axis=-2)
    return X_test_predictions

In [87]:
# blender训练数据
meta_features = MetaFeaturesPrepare(X_train, Y_train, estimators)

X_val_predictions = meta_features.drop('Y', axis=1)
Y_val = meta_features['Y']


  cpuset_checked))
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Predicting', layout=Layout(flex='2'), m…

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



[0.9959561228752136, 0.9868292212486267, 0.9958305954933167, 0.9961305856704712, 0.995598554611206, 0.038052212446928024, 0.9961368441581726, 0.02488199807703495, 0.9959635734558105, 0.10352528095245361, 0.9958717226982117, 0.14600393176078796, 0.9872827529907227, 0.9950738549232483, 0.9958129525184631, 0.9960616230964661, 0.9931383728981018, 0.9959083795547485, 0.9957922697067261, 0.9953142404556274, 0.995753288269043, 0.9952817559242249, 0.8486951589584351, 0.9956791996955872, 0.9678351879119873, 0.9956091046333313, 0.9959905743598938, 0.9939815402030945, 0.9959575533866882, 0.9959043860435486, 0.3863725960254669, 0.9933258295059204, 0.9939921498298645, 0.9959841966629028, 0.07920142263174057, 0.994795024394989, 0.995969295501709, 0.8634770512580872, 0.9956350922584534, 0.9953761100769043, 0.8827940225601196, 0.9956207871437073, 0.9952859282493591, 0.9951381087303162, 0.9961295127868652, 0.9952935576438904, 0.9826198816299438, 0.9958482980728149, 0.9959506988525391, 0.98923605680465

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Predicting', layout=Layout(flex='2'), m…

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



[0.5207613110542297, 0.9947704672813416, 0.9958710074424744, 0.9955149292945862, 0.9916175007820129, 0.9760541915893555, 0.9957364797592163, 0.9958808422088623, 0.9959194660186768, 0.9958471655845642, 0.9947903156280518, 0.9587787985801697, 0.04235854372382164, 0.6554511189460754, 0.9666710495948792, 0.08178991824388504, 0.4189154803752899, 0.9955162405967712, 0.10281693190336227, 0.6462779641151428, 0.989847719669342, 0.9956451654434204, 0.9861900806427002, 0.9954409599304199, 0.9947072267532349, 0.9811298251152039, 0.08458629995584488, 0.9952846169471741, 0.9956347346305847, 0.986823320388794, 0.2464544028043747, 0.6560363173484802, 0.13992397487163544, 0.9907830953598022, 0.9955821633338928, 0.9950739741325378, 0.9958006739616394, 0.9957485795021057, 0.19038964807987213, 0.9951359629631042, 0.9910699129104614, 0.995733916759491, 0.9957793951034546, 0.9955150485038757, 0.9957237243652344, 0.03185650333762169, 0.992637038230896, 0.3443087041378021, 0.5041424632072449, 0.9958718419075

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Predicting', layout=Layout(flex='2'), m…

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



[0.9947445392608643, 0.9931821227073669, 0.9957450032234192, 0.015967952087521553, 0.9590134620666504, 0.9957616925239563, 0.9963405132293701, 0.4198015034198761, 0.02481645718216896, 0.9943524599075317, 0.021020108833909035, 0.9960337281227112, 0.9945259094238281, 0.996273398399353, 0.9944567680358887, 0.9501832127571106, 0.9962524771690369, 0.9962561130523682, 0.9942954182624817, 0.13871650397777557, 0.9957077503204346, 0.9962028861045837, 0.9944685697555542, 0.9945726990699768, 0.996141254901886, 0.47040286660194397, 0.9960059523582458, 0.9962737560272217, 0.9875677824020386, 0.9958838820457458, 0.6821042895317078, 0.9962080717086792, 0.23812563717365265, 0.5991763472557068, 0.9959384202957153, 0.01383023802191019, 0.9963332414627075, 0.04052167758345604, 0.9937082529067993, 0.9934746623039246, 0.996250331401825, 0.9948215484619141, 0.995784342288971, 0.9946046471595764, 0.9430616497993469, 0.9931455850601196, 0.9924964904785156, 0.8891053199768066, 0.030612662434577942, 0.71116805

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Predicting', layout=Layout(flex='2'), m…

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



[0.9899685978889465, 0.7446544766426086, 0.9967225193977356, 0.9964320659637451, 0.9967401623725891, 0.9967482089996338, 0.9966939687728882, 0.9964674711227417, 0.7862773537635803, 0.018541080877184868, 0.9958553910255432, 0.9967018961906433, 0.027832448482513428, 0.9965188503265381, 0.9962975382804871, 0.9965304732322693, 0.9967407584190369, 0.9962771534919739, 0.9967404007911682, 0.9961354732513428, 0.9964715242385864, 0.996217668056488, 0.996541440486908, 0.1315377652645111, 0.9965698719024658, 0.9966323971748352, 0.9967448711395264, 0.9959256649017334, 0.9935562014579773, 0.9967367053031921, 0.9940767288208008, 0.9850409626960754, 0.9958397150039673, 0.9966928958892822, 0.9966703057289124, 0.9964427351951599, 0.4171600341796875, 0.05019843950867653, 0.995996356010437, 0.6622216701507568, 0.9967548251152039, 0.9965032339096069, 0.9966976642608643, 0.996729850769043, 0.9967554211616516, 0.9965624213218689, 0.26088517904281616, 0.9967619180679321, 0.9965883493423462, 0.99674773216247

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Predicting', layout=Layout(flex='2'), m…


[0.993871808052063, 0.995403528213501, 0.1902964562177658, 0.995355486869812, 0.995093822479248, 0.19781379401683807, 0.9952459931373596, 0.9957236051559448, 0.2973160445690155, 0.9956176280975342, 0.9958211183547974, 0.10482586920261383, 0.6034235954284668, 0.9943618774414062, 0.18878203630447388, 0.9962343573570251, 0.9959747195243835, 0.9928487539291382, 0.9953881502151489, 0.9925011992454529, 0.9879176020622253, 0.9943130016326904, 0.9917750358581543, 0.9946491122245789, 0.1096874549984932, 0.08438332378864288, 0.9958831071853638, 0.9935266971588135, 0.9954012632369995, 0.08687574416399002, 0.9935564398765564, 0.9962887763977051, 0.9949111342430115, 0.11885388195514679, 0.996179461479187, 0.9960164427757263, 0.9950413107872009, 0.17957530915737152, 0.9826743006706238, 0.9936047196388245, 0.9949302077293396, 0.8138213157653809, 0.5748879313468933, 0.9951983094215393, 0.9122006297111511, 0.9958429932594299, 0.9872655868530273, 0.856569766998291, 0.3487856090068817, 0.994646847248077

In [None]:
# blender测试数据
X_test_predictions = TestPredictions(X_test, estimators)

In [88]:
meta_features

Unnamed: 0,CAT,XG,GBM,MLP,Y
0,0.994220,0.948977,0.974143,0.995956,1
1,0.970828,0.917923,0.946057,0.986829,1
2,0.994877,0.948977,0.974723,0.995831,1
3,0.997162,0.948977,0.974723,0.996131,1
4,0.992318,0.944791,0.969591,0.995599,1
...,...,...,...,...,...
49995,0.994277,0.997947,0.997596,0.995837,1
49996,0.418561,0.558302,0.545451,0.112587,0
49997,0.993532,0.994110,0.989523,0.995766,1
49998,0.992120,0.995521,0.993827,0.991255,1


In [89]:
#这里blender用了LightGBM，也可以尝试其他如逻辑回归。。。

param_grid = {'num_thread': [4], 'metric': ['binary_error'], 'objective': ['binary'], 'num_round': [1000],
          'num_leaves': [16,17,32,33], 'learning_rate': [0.01,0.02,0.03], 
          'feature_fraction': [0.8], 'bagging_fraction': [0.8]}

df_cv, models = GridSearch(X_val_predictions, Y_val, param_grid)

0-th model is training:
[1]	training's binary_error: 0.20365	valid_1's binary_error: 0.2066
Training until validation scores don't improve for 200 rounds.
[2]	training's binary_error: 0.20365	valid_1's binary_error: 0.2066
[3]	training's binary_error: 0.20365	valid_1's binary_error: 0.2066
[4]	training's binary_error: 0.20365	valid_1's binary_error: 0.2066
[5]	training's binary_error: 0.20365	valid_1's binary_error: 0.2066
[6]	training's binary_error: 0.20365	valid_1's binary_error: 0.2066
[7]	training's binary_error: 0.20365	valid_1's binary_error: 0.2066
[8]	training's binary_error: 0.20365	valid_1's binary_error: 0.2066
[9]	training's binary_error: 0.20365	valid_1's binary_error: 0.2066
[10]	training's binary_error: 0.20365	valid_1's binary_error: 0.2066
[11]	training's binary_error: 0.20365	valid_1's binary_error: 0.2066
[12]	training's binary_error: 0.20365	valid_1's binary_error: 0.2066
[13]	training's binary_error: 0.20365	valid_1's binary_error: 0.2066
[14]	training's binary_er



[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
[36]	training's binary_error: 0.0869	valid_1's binary_error: 0.0882
[37]	training's binary_error: 0.0861	valid_1's binary_error: 0.0879
[38]	training's binary_error: 0.08485	valid_1's binary_error: 0.0847
[39]	training's binary_error: 0.0839	valid_1's binary_error: 0.0836
[40]	training's binary_error: 0.0833	valid_1's binary_error: 0.0817
[41]	training's binary_error: 0.082325	valid_1's binary_error: 0.0808
[42]	training's binary_error: 0.082125	valid_1's binary_error: 0.0802
[43]	training's binary_error: 0.0819	valid_1's binary_error: 0.0801
[44]	training's binary_error: 0.08185	valid_1's binary_error: 0.0801
[45]	training's binary_error: 0.081675	valid_1's binary_error: 0.08
[46]	training's binary_error: 0.0812	valid_1's binary_error: 0.0796
[47]	training's binary_error: 0.080725	valid_1's binary_error: 0.0794
[48]	training's binary_error: 0.080725	valid_1's binary_error: 0.0792
[49]	training's binary_error: 0.0807	valid_1's binary_error: 0.07

In [90]:
df_cv.sort_values(['val_error'],ascending=True)

Unnamed: 0,num_leaves,learning_rate,val_error,train_error,iter_0,iter_1,iter_2,iter_3,iter_4
6,32,0.02,0.07902,0.077445,145,220,103,97,97
11,33,0.03,0.07926,0.07757,80,102,55,62,70
10,32,0.03,0.0793,0.07766,100,54,55,65,71
3,33,0.01,0.07944,0.07728,186,194,199,192,279
8,16,0.03,0.07944,0.078745,67,209,65,52,73
4,16,0.02,0.07948,0.07867,101,102,82,83,116
2,32,0.01,0.07954,0.07731,183,187,198,190,294
7,33,0.02,0.0796,0.077525,86,93,98,97,103
9,17,0.03,0.07964,0.078735,61,220,58,50,72
0,16,0.01,0.07964,0.07865,190,157,168,203,180


In [92]:
def test_model(X_test, Y_test, model_list):
    data = X_test
    five_fold_pred = np.zeros((5, len(X_test)))
    for i, bst in enumerate(model_list):
        ypred = bst.predict(data, num_iteration=bst.best_iteration)
        five_fold_pred[i] = ypred
    ypred_mean = (five_fold_pred.mean(axis=-2)>0.5).astype(int)
    return accuracy_score(ypred_mean, Y_test)

test_model(X_test_predictions, Y_test, models[6])


0.91828