# For Google Drive

In [265]:
# from google.colab import drive
# drive.mount('/content/gdrive')

# Import Packages :

In [266]:
# basic stuffs
import csv
import time
import sys
import os
import math
import random as rand
from typing import Dict

# other library
import numpy as np
import pandas as pd

# visualization tools
import tqdm
import matplotlib.pyplot as plt

# PyTorch library
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils import data 
from torch.utils.data import Dataset, DataLoader

# Fix Randomization Seed :

In [267]:
SEED = 42 # Do not modify
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
rand.seed(SEED)
np.random.seed(SEED)

#Parameters :

In [268]:
TIME_FRAME_SIZE = 5

# #setting
# pd.set_option('precision', 4)
# pd.set_option("display.max_columns",100)

# load data from google cloud :

In [269]:
# Youchen datapath

# ccba = pd.read_csv('/content/gdrive/MyDrive/Fintech_final/ccba.csv')
# custinfo = pd.read_csv('/content/gdrive/MyDrive/Fintech_final/custinfo.csv')

# cdtx = pd.read_csv('/content/gdrive/MyDrive/Fintech_final/cdtx.csv')
# dp = pd.read_csv('/content/gdrive/MyDrive/Fintech_final/dp.csv')
# remit1 = pd.read_csv('/content/gdrive/MyDrive/Fintech_final/remit.csv')

train_alert_date = pd.read_csv('./train_x_alert_date.csv')
y = pd.read_csv('./train_y_answer.csv')

training_data = pd.read_csv('./1215/training_data_complete_5.csv')
training_label = pd.read_csv('./1215/training_data_labels_5.csv')

testing_data = pd.read_csv('./1215/testing_data_complete_5.csv')
testing_key = pd.read_csv('./1215/testing_alert_key_5.csv')

sample_output = pd.read_csv('./sample_output.csv')

# Perform Onehot Pooling 

In [270]:
# Drop columns
drop_cols = ['Unnamed: 0', 
      'remit_transtime_diff_1',	
      'remit_transtime_diff_2',	
      'remit_transtime_diff_3',	
      'remit_transtime_diff_4',	
      'remit_transtime_diff_5',	
      'remit_transtime_avg'
      ]
      
train = training_data.drop(columns=drop_cols)
test = testing_data.drop(columns=drop_cols)

train_label = training_label.drop(columns='Unnamed: 0')
test_keys = testing_key.drop(columns='Unnamed: 0')

In [271]:
# Determine categorical column names
cat_cols = ['occupation_code',
       'country_1', 'cur_type_1',
       'country_2', 'cur_type_2',
       'country_3', 'cur_type_3',
       'country_4', 'cur_type_4',
       'country_5', 'cur_type_5',
       'debit_credit_1', 'tx_type_1', 'info_asset_code_1', 'fiscTxId_1', 'txbranch_1', 'cross_bank_1', 'ATM_1',
       'debit_credit_2', 'tx_type_2', 'info_asset_code_2', 'fiscTxId_2', 'txbranch_2', 'cross_bank_2', 'ATM_2',
       'debit_credit_3', 'tx_type_3', 'info_asset_code_3', 'fiscTxId_3', 'txbranch_3', 'cross_bank_3', 'ATM_3',
       'debit_credit_4', 'tx_type_4', 'info_asset_code_4', 'fiscTxId_4', 'txbranch_4', 'cross_bank_4', 'ATM_4',
       'debit_credit_5', 'tx_type_5', 'info_asset_code_5', 'fiscTxId_5', 'txbranch_5', 'cross_bank_5', 'ATM_5',
       'trans_no_1', 'trans_no_2', 'trans_no_3', 'trans_no_4', 'trans_no_5'
       ]

# Determine pooling column names
pool_cols = ['country', 'cur_type', 'debit_credit', 'tx_type', 'info_asset_code',
        'fiscTxId', 'txbranch', 'cross_bank', 'ATM', 'trans_no']

In [272]:
# Get train/test onehot
train[cat_cols] = train[cat_cols].astype(int)
train_onehot = pd.get_dummies(train, columns=cat_cols)

test[cat_cols] = test[cat_cols].astype(int)
test_onehot = pd.get_dummies(test, columns=cat_cols)

In [273]:
# Get existing onehot keys -> count
#  'debit_credit': [0, 1],
#  'tx_type': [1, 2, 3]
count = {}
for pool in pool_cols:
  first = True
  for c in train.columns:
    if pool in c:
      index = train[c].value_counts().index
      if first:
        count[pool] = []
        for i in range(len(index)):
          count[pool].append(index[i])
        first = False
      else:
        for i in range(len(index)):
          if index[i] not in count[pool]:
            count[pool].append(index[i])
  count[pool] = sorted(count[pool])
# count

In [274]:
# Training set Pooling
train_col_names = []
train_col_datas = []
for key in count.keys():
  for value in count[key]:
    temp = None
    temp_name = key+'_'+str(value)
    for i in range(1, 6):
      col_name = key+'_'+str(i)+'_'+str(value)
      # if col_name not in train_onehot.columns:
      #   continue
      try:
        if temp is None:
          temp = train_onehot[col_name].copy()
        else:
          temp += train_onehot[col_name].copy()
      except:
        pass
    if temp is not None:
      train_col_datas.append(temp.to_numpy())
    else:
      train_col_datas.append(np.zeros(len(test)))
    train_col_names.append(temp_name)

# temp.to_numpy()
# temp_name

In [275]:
# Testing set Pooling
test_col_names = []
test_col_datas = []
for key in count.keys():
  for value in count[key]:
    temp = None
    temp_name = key+'_'+str(value)
    for i in range(1, 6):
      col_name = key+'_'+str(i)+'_'+str(value)
      # if col_name not in train_onehot.columns:
      #   continue
      try:
        if temp is None:
          temp = test_onehot[col_name].copy()
        else:
          temp += test_onehot[col_name].copy()
      except:
        pass
    if temp is not None:
      test_col_datas.append(temp.to_numpy())
    else:
      test_col_datas.append(np.zeros(len(test)).astype(int))
    test_col_names.append(temp_name)

In [276]:
# Categorical features
all_cat = pd.DataFrame(np.array(train_col_datas).T, columns=train_col_names)
test_cat = pd.DataFrame(np.array(test_col_datas).T, columns=test_col_names)

# Numerical features
all_num = train.drop(columns=cat_cols)
test_num = test.drop(columns=cat_cols)

# To ensure same dim when rerunning
all_label = training_label.drop(columns='Unnamed: 0')

# Train/val split
from sklearn.model_selection import train_test_split
train_cat, val_cat, train_num, val_num, train_label, val_label \
      = train_test_split(all_cat, all_num, all_label, test_size=0.2, random_state=42)

In [277]:
# Normalization of numerical feats
def normalize(X, preMax=None, preMin=None, is_train=True):
  if is_train:
    Max = X.max()
    Min = X.min()
    X_norm = (X - Min) / (Max - Min)
    
    return X_norm, Max, Min
  else:
    X_norm = (X - preMin) / (preMax - preMin)
    
    return X_norm, preMax, preMin

train_num_norm, train_max, train_min = normalize(train_num, is_train=True)
val_num_norm, _, _ = normalize(val_num, preMax=train_max, preMin = train_min, is_train=False)
test_num_norm, _, _ = normalize(test_num, preMax=train_max, preMin = train_min, is_train=False)

all_num_norm, _, _ = normalize(all_num, is_train=True)

In [278]:
train_set = pd.merge(train_cat.reset_index(), train_num_norm.reset_index()).drop(columns='index')
val_set = pd.merge(val_cat.reset_index(), val_num_norm.reset_index()).drop(columns='index')
test_set = pd.merge(test_cat.reset_index(), test_num_norm.reset_index()).drop(columns='index')

all_set = pd.merge(all_cat.reset_index(), all_num_norm.reset_index()).drop(columns='index')

In [279]:
train_set

Unnamed: 0,country_1,country_4,country_5,country_6,country_7,country_8,country_10,country_13,country_15,country_16,...,dp_transtime_diff_4,dp_transtime_diff_5,dp_transtime_avg,dp_trans_num,trade_amount_usd_1,trade_amount_usd_2,trade_amount_usd_3,trade_amount_usd_4,trade_amount_usd_5,remit_trans_num
0,0,0,0,0,0,0,0,0,0,0,...,0.000359,0.000499,0.001449,1.0,0.016800,0.018054,0.026234,0.028641,0.027772,0.0
1,0,0,0,0,0,0,0,0,0,0,...,0.000000,0.000166,0.003986,0.0,0.016800,0.018054,0.026234,0.028641,0.027772,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0.000000,0.000832,0.001691,1.0,0.033643,0.033644,0.047690,0.022863,0.024825,1.0
3,0,0,0,0,0,0,0,0,0,0,...,0.003054,0.001829,0.015461,1.0,0.016800,0.018054,0.026234,0.028641,0.027772,0.0
4,0,0,0,0,0,0,0,0,0,0,...,0.000000,0.000000,0.001087,1.0,0.034697,0.025981,0.064444,0.093803,0.070404,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19119,0,0,0,0,0,0,0,0,0,0,...,0.000000,0.000000,0.001449,1.0,0.016800,0.018054,0.026234,0.028641,0.027772,0.0
19120,0,0,0,0,0,0,0,0,0,0,...,0.000180,0.000499,0.002899,1.0,0.016800,0.018054,0.026234,0.028641,0.027772,0.0
19121,0,0,0,0,0,0,0,0,0,0,...,0.000000,0.000166,0.003986,0.0,0.016800,0.018054,0.026234,0.028641,0.027772,0.0
19122,0,0,0,0,0,0,0,0,0,0,...,0.000000,0.000000,0.007006,1.0,0.016800,0.018054,0.026234,0.028641,0.027772,0.0


# Resampling

In [280]:
# train_x_y = pd.concat([train, train_label], axis=1)
train_x_y = pd.concat([train_set, train_label], axis=1)
train_x_y['labels'].value_counts()

0.0    18948
1.0      176
Name: labels, dtype: int64

In [281]:
from sklearn.utils import resample

def resampling(data, y_col, ratio='100_100'):
  ##################################################
  #  data: target DataFrame          #
  #  y_col: the name of y column     　#
  #  ratio: expected ratio of two classes .#
  ##################################################
  # String process
  [ratio_0, ratio_1] = ratio.split('_')
  ratio_0, ratio_1 = int(ratio_0), int(ratio_1)
  num_0, num_1 = data[y_col].value_counts()
  
  # Initialization
  n_samples = [0, 0]
  group = {}

  # Num of samples of class 0/1
  # (Switch n_samples values if minor class is 0)
  max_n = max(num_0, num_1)    # Bigger number of sample
  seg = max_n // 100       # Cut this number into 100 pieces
  n_samples[0] = num_0 if ratio_0 == 100 \
              else seg * ratio_0  # Use all samples if ratio_0 == 100, downsample if not
  n_samples[1] = seg * ratio_1  # Oversampling of minor class
  
  # Resample
  for i in [0, 1]:
    g = data[data[y_col] == i]
    group[str(i)] = resample(g, replace=True, n_samples=n_samples[i])

  # Concat two class into a DataFrame
  # (Shuffle as you wish)
  up = pd.concat(group.values())

  return up

## Data

In [282]:
upsampled_data = resampling(train_x_y, 'labels', ratio='70_50')
upsampled_data['labels'].value_counts()

0.0    13230
1.0     9450
Name: labels, dtype: int64

In [283]:
# !pip install xgboost==1.7.2
import xgboost as xgb

In [284]:
# Reassign if resampled
train_set = upsampled_data

In [285]:
# train_X, train_y = train_set, train_label
train_X, train_y = train_set.drop(columns=['labels']), train_set['labels']
# val_X, val_y = val_set, val_label

# all_X, all_y = all_set, all_label

## Training

In [286]:
from sklearn.metrics import recall_score

In [287]:
def recall_n(output, target):
    comb = list(zip(output, target))
    comb.sort(key=lambda x:x[0])
    flag = False
    for i, (out, gt) in enumerate(comb):
        if gt == 1:
            if flag:
                break
            flag = True
    
    return (sum(target)-1) / (len(target)-i)

In [288]:
# XGBoost

xgbModel = xgb.XGBClassifier(max_delta_step=1, random_state=0)
xgbModel.fit(train_X, train_y)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=1, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0, ...)

In [289]:
# XGBoost predict

# test = xgbModel.predict(val_X)
# val_pred = xgbModel.predict_proba(val_X)
train_pred = xgbModel.predict_proba(train_X)

In [290]:
# print(recall_n(val_pred[:, 1].reshape(-1, 1), val_y.to_numpy()))
# val_pred[:, 1].reshape(-1, 1)
# val_y.shape
# print(val_y.to_numpy().shape)

In [291]:
# XGBoost grid search
params ={
        'min_child_weight':range(1,9,1),
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample':np.linspace(0.7,0.9,20),
        'colsample_bytree':np.linspace(0.5,0.98,10),
        'max_depth':range(2,10,1),
        'learning_rate':np.linspace(0.01,2,20),
        'n_estimators':range(400,1000,4),
        'max_delta_step':range(1, 10, 1)
        }

In [292]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold

xgbModel = xgb.XGBClassifier(learning_rate=0.02, n_estimators=600, 
                objective='binary:logistic', silent=True, nthread=1)

param_comb = 30
folds = 5
skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 42)
random_search = RandomizedSearchCV(xgbModel, param_distributions=params, n_iter=param_comb, scoring='recall',
                #   n_jobs=-1, cv=skf.split(all_X, all_y), verbose=3, random_state=42)
                n_jobs=-1, cv=skf.split(train_X, train_y), verbose=3, random_state=42)
random_search.fit(train_X, train_y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Parameters: { "silent" } are not used.



RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x0000028EF3346F20>,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           callbacks=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, feature_types=None,
                                           gamma=None, gpu_id=None,
                                           grow_policy=None,
                                           importance_...
                                        'max_depth': range(2, 10),
                                        'min_child_weight': range(1, 9),
                                       

In [293]:
random_search.best_score_

0.9573544973544973

In [294]:
test_keys['alert_key'].to_numpy()

array([352342, 352866, 352696, ..., 364673, 364626, 364986], dtype=int64)

In [295]:
test_set

Unnamed: 0,country_1,country_4,country_5,country_6,country_7,country_8,country_10,country_13,country_15,country_16,...,dp_transtime_diff_4,dp_transtime_diff_5,dp_transtime_avg,dp_trans_num,trade_amount_usd_1,trade_amount_usd_2,trade_amount_usd_3,trade_amount_usd_4,trade_amount_usd_5,remit_trans_num
0,0,0,0,0,0,0,0,0,0,0,...,0.000000,0.000000,0.000966,1.0,0.015602,0.015619,0.028630,0.022102,0.021981,1.0
1,0,0,0,0,0,0,0,0,0,0,...,0.120711,0.000000,0.124290,1.0,0.016800,0.018054,0.026234,0.028641,0.027772,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0.000000,0.000000,0.001087,1.0,0.031389,0.031424,0.043878,0.043515,0.043794,1.0
3,0,0,0,0,0,0,0,0,0,0,...,0.029819,0.008981,0.030318,1.0,0.016800,0.018054,0.026234,0.028641,0.027772,0.0
4,0,0,0,0,0,0,0,0,0,0,...,0.000000,0.000000,0.000966,1.0,0.022429,0.022515,0.062776,0.063918,0.063926,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1840,0,0,0,0,0,0,0,0,0,0,...,0.000000,0.000000,0.001449,1.0,0.005457,0.005503,0.007695,0.007946,0.007942,1.0
1841,0,0,0,0,0,0,0,0,0,0,...,0.000000,0.002162,0.002778,1.0,0.033627,0.001750,0.026234,0.028641,0.027772,0.4
1842,0,0,0,0,0,0,0,0,0,0,...,0.000000,0.000000,0.000966,1.0,0.024041,0.038803,0.043662,0.015159,0.044147,1.0
1843,0,0,0,0,0,0,0,0,0,0,...,0.000000,0.000000,0.001087,1.0,0.016800,0.018054,0.026234,0.028641,0.027772,0.0


In [297]:
# Predict probability
output = []

for i, _x in enumerate(random_search.predict_proba(test_set)):
# for i, _x in enumerate(xgbModel.predict_proba(test_set)):
    output.append([test_keys.iloc[i].item(), _x[1]])
output = sorted(output, reverse=True, key= lambda s: s[1])
print(output)

# 考慮private alert key部分，滿足上傳條件
public_private_alert_key = sample_output['alert_key'].values
# print(len(public_private_alert_key))

# For alert key not in public, add zeros
for key in public_private_alert_key:
  # print(key)
  if key not in test_keys['alert_key'].to_numpy():
    output.append([key, 0])

print(len(output))

predict_alert_key, predict_probability = [], []
for key, prob in output:
  predict_alert_key.append(key)
  predict_probability.append(prob)

df_predicted = pd.DataFrame({
    "alert_key": predict_alert_key,
    "probability": predict_probability
})

df_predicted.to_csv('tree-basedmodel with resample methods.csv', index=False)

[[354234, 0.9995567], [360271, 0.99773943], [352857, 0.99720556], [356951, 0.9961398], [361028, 0.99442506], [358276, 0.9917679], [358479, 0.99024874], [360645, 0.98194593], [362131, 0.9789929], [359860, 0.9762437], [361011, 0.97457314], [358452, 0.9603984], [355375, 0.9404244], [354447, 0.87987494], [354428, 0.82953346], [358462, 0.8216325], [358466, 0.80478793], [354044, 0.7991337], [364184, 0.74344605], [364995, 0.7306533], [354402, 0.71334314], [353198, 0.7127275], [361313, 0.69881], [360029, 0.5928765], [358242, 0.57965356], [355134, 0.44588494], [360090, 0.38485026], [357107, 0.38039356], [359656, 0.36934057], [354049, 0.31378505], [363682, 0.31033504], [356632, 0.30933553], [357676, 0.3062448], [356384, 0.3062448], [360037, 0.3062448], [358207, 0.3062448], [360827, 0.3062448], [364729, 0.3062448], [362458, 0.27500322], [362225, 0.2652083], [355559, 0.26433173], [353865, 0.2615748], [360278, 0.25419444], [352522, 0.25253427], [365008, 0.25015876], [354679, 0.209499], [354619, 0.2