# For Google Drive

In [None]:
# Colab only: uncomment to mount Google Drive. The CSV paths read further down
# in this notebook all live under /content/gdrive.
# from google.colab import drive
# drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Import Packages

In [None]:
# basic stuffs
import csv
import time
import sys
import os
import math
import random as rand
from typing import Dict

# other library
import numpy as np
import pandas as pd

# visualization tools
import tqdm
import matplotlib.pyplot as plt

# PyTorch library
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils import data 
from torch.utils.data import Dataset, DataLoader

# Fix Randomization Seed

In [None]:
# Single seed shared by every random number generator used in this notebook.
SEED = 42 # Do not modify

# Seed Python's `random`, NumPy and PyTorch (CPU generator and all CUDA devices).
rand.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Ask cuDNN for deterministic kernels and switch off its benchmark auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Pick the compute device: CUDA when available, CPU otherwise.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda") if use_gpu else torch.device("cpu")

# Parameters

In [None]:
# Presumably the number of per-alert time slots; it matches the `_1` … `_5`
# column suffixes used below — TODO confirm.
TIME_FRAME_SIZE = 5

# Display settings
# NOTE: the short alias 'precision' is ambiguous on recent pandas (it also matches
# 'styler.format.precision') and raises OptionError, so spell out the full option name.
pd.set_option('display.precision', 4)
pd.set_option("display.max_columns", 100)

# Load data from Google Drive

In [None]:
# Youchen datapath

# ccba = pd.read_csv('/content/gdrive/MyDrive/Fintech_final/ccba.csv')
# custinfo = pd.read_csv('/content/gdrive/MyDrive/Fintech_final/custinfo.csv')

# cdtx = pd.read_csv('/content/gdrive/MyDrive/Fintech_final/cdtx.csv')
# dp = pd.read_csv('/content/gdrive/MyDrive/Fintech_final/dp.csv')
# remit1 = pd.read_csv('/content/gdrive/MyDrive/Fintech_final/remit.csv')

# Alert dates and answers for the training alerts.
train_alert_date, y = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/gdrive/MyDrive/Fintech_final/train_x_alert_date.csv',
        '/content/gdrive/MyDrive/Fintech_final/train_y_answer.csv',
    )
)

# Pre-built training features and their labels.
training_data, training_label = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/gdrive/MyDrive/Fintech_final/1215/training_data_complete_5.csv',
        '/content/gdrive/MyDrive/Fintech_final/1215/training_data_labels_5.csv',
    )
)

# Pre-built testing features and the alert key of every testing row.
testing_data, testing_key = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/gdrive/MyDrive/Fintech_final/1215/testing_data_complete_5.csv',
        '/content/gdrive/MyDrive/Fintech_final/1215/testing_alert_key_5.csv',
    )
)

# Submission template; its `alert_key` column is read when the submission is built.
sample_output = pd.read_csv('/content/gdrive/MyDrive/Fintech_final/sample_output.csv')

# Perform One-hot Pooling

In [None]:
# Columns removed from both feature tables: 'Unnamed: 0' (presumably a saved
# index column — TODO confirm) and the remit time-difference features.
drop_cols = [
    'Unnamed: 0',
    'remit_transtime_diff_1',
    'remit_transtime_diff_2',
    'remit_transtime_diff_3',
    'remit_transtime_diff_4',
    'remit_transtime_diff_5',
    'remit_transtime_avg',
]

# Same column drop applied to the training and testing feature tables.
train, test = (
    frame.drop(columns=drop_cols) for frame in (training_data, testing_data)
)

# Labels and alert keys only lose the 'Unnamed: 0' column.
train_label, test_keys = (
    frame.drop(columns='Unnamed: 0') for frame in (training_label, testing_key)
)

In [None]:
# Categorical column names, generated in the exact order used downstream.
_slots = range(1, 6)  # column suffixes _1 … _5
_pair_fields = ('country', 'cur_type')
_tx_fields = ('debit_credit', 'tx_type', 'info_asset_code', 'fiscTxId',
              'txbranch', 'cross_bank', 'ATM')

cat_cols = ['occupation_code']
cat_cols += [f'{field}_{slot}' for slot in _slots for field in _pair_fields]
cat_cols += [f'{field}_{slot}' for slot in _slots for field in _tx_fields]
cat_cols += [f'trans_no_{slot}' for slot in _slots]

# Feature families whose per-slot one-hot columns are pooled into one column per value.
pool_cols = [*_pair_fields, *_tx_fields, 'trans_no']

In [None]:
# Cast the categorical columns to int in place, so the one-hot column names
# carry integer suffixes.
train[cat_cols] = train[cat_cols].astype(int)
test[cat_cols] = test[cat_cols].astype(int)

# One-hot encode the categorical columns of each table independently.
train_onehot, test_onehot = (
    pd.get_dummies(frame, columns=cat_cols) for frame in (train, test)
)

In [None]:
# For every pooled feature family, collect the sorted category values observed
# in the training columns of that family, e.g.
#  'debit_credit': [0, 1],
#  'tx_type': [1, 2, 3]
count = {}
for pool in pool_cols:
  seen = set()  # set membership replaces the former quadratic list scan
  for c in train.columns:
    # NOTE(review): substring match, as before — assumes no unrelated column
    # name contains `pool`.
    if pool in c:
      # Same values as value_counts().index: the distinct non-null entries.
      seen.update(train[c].dropna().unique())
  # A family with no matching column now maps to [] instead of raising KeyError.
  count[pool] = sorted(seen)

In [None]:
# Training set pooling: for every (family, value) pair in `count`, sum the
# one-hot columns `<family>_<slot>_<value>` over all slots into a single column
# named `<family>_<value>`.
train_col_names = []
train_col_datas = []
for key, values in count.items():
  for value in values:
    slot_cols = [f"{key}_{i}_{value}" for i in range(1, TIME_FRAME_SIZE + 1)]
    # Explicit membership check instead of a bare `except: pass`, so genuine
    # errors are no longer swallowed.
    present = [c for c in slot_cols if c in train_onehot.columns]
    if present:
      # Cast to int first: boolean dummies would otherwise OR instead of count.
      pooled = train_onehot[present].astype(int).sum(axis=1).to_numpy()
    else:
      # Fallback sized by the training table (was len(test)), with the same
      # int dtype as the pooled columns.
      pooled = np.zeros(len(train_onehot), dtype=int)
    train_col_datas.append(pooled)
    train_col_names.append(f"{key}_{value}")

# temp.to_numpy()
# temp_name

In [None]:
# Testing set pooling: same scheme as the training set. The (family, value)
# pairs come from `count`, so both tables end up with identical pooled columns.
test_col_names = []
test_col_datas = []
for key, values in count.items():
  for value in values:
    slot_cols = [f"{key}_{i}_{value}" for i in range(1, TIME_FRAME_SIZE + 1)]
    # Explicit membership check instead of a bare `except: pass`, so genuine
    # errors are no longer swallowed.
    present = [c for c in slot_cols if c in test_onehot.columns]
    if present:
      # Cast to int first: boolean dummies would otherwise OR instead of count.
      pooled = test_onehot[present].astype(int).sum(axis=1).to_numpy()
    else:
      # Value never occurs in the testing table: all-zero column.
      pooled = np.zeros(len(test_onehot), dtype=int)
    test_col_datas.append(pooled)
    test_col_names.append(f"{key}_{value}")

In [None]:
# Train/val split helper
from sklearn.model_selection import train_test_split

# Pooled categorical features: one column per pooled name, one row per sample.
all_cat = pd.DataFrame(np.transpose(np.array(train_col_datas)), columns=train_col_names)
test_cat = pd.DataFrame(np.transpose(np.array(test_col_datas)), columns=test_col_names)

# Numerical features: everything that is not a categorical column.
all_num, test_num = (frame.drop(columns=cat_cols) for frame in (train, test))

# Rebuilt from the raw labels so dimensions stay consistent when re-running.
all_label = training_label.drop(columns='Unnamed: 0')

# 80/20 split applied jointly to the categorical, numerical and label frames.
split_parts = train_test_split(all_cat, all_num, all_label, test_size=0.2, random_state=42)
train_cat, val_cat, train_num, val_num, train_label, val_label = split_parts

In [None]:
# Normalization of numerical feats
def normalize(X, preMax=None, preMin=None, is_train=True):
  """Min-max scale `X`.

  Args:
    X: data to scale (the notebook passes a DataFrame of numerical features).
    preMax: maximum to reuse when `is_train` is False.
    preMin: minimum to reuse when `is_train` is False.
    is_train: when True, compute the max/min from `X` itself; otherwise scale
      with `preMax` / `preMin`.

  Returns:
    (X_norm, Max, Min): the scaled data and the max/min that were used.

  Raises:
    ValueError: if `is_train` is False and `preMax` or `preMin` is missing.
  """
  if is_train:
    Max = X.max()
    Min = X.min()
  else:
    if preMax is None or preMin is None:
      raise ValueError("preMax and preMin are required when is_train=False")
    Max, Min = preMax, preMin

  # A constant column has max == min. Dividing by that zero range yields NaN
  # (0/0) or +/-inf, so use 1 as the divisor for such columns instead.
  span = Max - Min
  if isinstance(span, (pd.Series, pd.DataFrame)):
    span = span.where(span != 0, 1)
  elif np.ndim(span) == 0:
    span = 1 if span == 0 else span
  else:
    span = np.where(span == 0, 1, span)

  X_norm = (X - Min) / span

  return X_norm, Max, Min

# Compute the min/max on the training split only ...
train_num_norm, train_max, train_min = normalize(train_num, is_train=True)

# ... and reuse those statistics for the validation and testing tables.
_train_stats = dict(preMax=train_max, preMin=train_min, is_train=False)
val_num_norm, _, _ = normalize(val_num, **_train_stats)
test_num_norm, _, _ = normalize(test_num, **_train_stats)

# The full training table is scaled with its own min/max.
all_num_norm, _, _ = normalize(all_num, is_train=True)

In [None]:
def _join_features(cat_frame, num_frame):
  """Merge two frames on the columns they share after reset_index(), then drop 'index'."""
  joined = pd.merge(cat_frame.reset_index(), num_frame.reset_index())
  return joined.drop(columns='index')

# Categorical + normalized numerical features for each data split.
train_set = _join_features(train_cat, train_num_norm)
val_set = _join_features(val_cat, val_num_norm)
test_set = _join_features(test_cat, test_num_norm)

# Same join for the full (unsplit) training table.
all_set = _join_features(all_cat, all_num_norm)

# Resampling

In [None]:
# Class balance of the full training table.
# `train_label` is rebound to the 80% training split by the train/val split cell,
# so concatenating it with the full `train` frame leaves NaN labels for every
# validation row on a top-to-bottom run. `all_label` holds the labels of all rows.
train_x_y = pd.concat([train, all_label], axis=1)
train_x_y['labels'].value_counts()

0.0    18948
1.0      176
Name: labels, dtype: int64

In [None]:
from sklearn.utils import resample

def resampling(data, y_col, ratio='100_100', random_state=None):
  """Resample a binary-labelled DataFrame to a requested class ratio.

  The larger class size is cut into 100 pieces (`seg = max_n // 100`). Class 1
  is drawn `seg * ratio_1` times. Class 0 is drawn `seg * ratio_0` times, or
  as many times as it has rows when `ratio_0 == 100`. Every draw is with
  replacement, so even the `ratio_0 == 100` case is a bootstrap of class 0.

  Args:
    data: target DataFrame.
    y_col: name of the label column; rows are grouped by label == 0 / == 1.
    ratio: expected ratio of the two classes as '<ratio_0>_<ratio_1>'.
    random_state: optional seed forwarded to `DataFrame.sample`. The default
      None draws from NumPy's global random state.

  Returns:
    DataFrame with the resampled class-0 rows followed by the resampled
    class-1 rows (not shuffled).
  """
  # String process
  ratio_0, ratio_1 = (int(part) for part in ratio.split('_'))

  # Count each class by its label. value_counts() is ordered by frequency, so
  # unpacking it positionally mislabels the counts when class 0 is the minority.
  labels = data[y_col]
  num_0 = int((labels == 0).sum())
  num_1 = int((labels == 1).sum())

  # Num of samples of class 0/1
  max_n = max(num_0, num_1)    # Bigger number of sample
  seg = max_n // 100           # Cut this number into 100 pieces
  n_samples = [
      num_0 if ratio_0 == 100 else seg * ratio_0,  # keep the size if ratio_0 == 100, downsample if not
      seg * ratio_1,                               # resampled size of class 1
  ]

  # Resample each class with replacement
  group = [
      data[labels == i].sample(n=n_samples[i], replace=True, random_state=random_state)
      for i in (0, 1)
  ]

  # Concat two class into a DataFrame
  # (Shuffle as you wish)
  return pd.concat(group)

# Youchen Part

## Data

In [None]:
# upsampled_data = resampling(train_set, 'labels', ratio='70_50')
# upsampled_data['labels'].value_counts()

In [None]:
!pip install xgboost==1.7.2
import xgboost as xgb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Reassign if resampled
# train_set = upsampled_data

In [None]:
# Features / labels of the training and validation splits
train_X = train_set
train_y = train_label
val_X = val_set
val_y = val_label

# Features / labels of the full (unsplit) training table
all_X = all_set
all_y = all_label

## Training

In [None]:
from sklearn.metrics import recall_score

In [None]:
def recall_n(output, target):
    """Precision at the cut-off that captures all but one of the positives.

    Samples are ranked by `output`. The cut-off is placed at the positive with
    the second-lowest score, so every positive except the lowest-scored one
    lies at or above it. The result is
    `(n_positives - 1) / (number of samples at or above the cut-off)`.

    Args:
        output: predicted scores, any array-like; flattened to 1-D, so column
            vectors of shape (n, 1) are accepted.
        target: binary labels (1 = positive) with as many elements as `output`.

    Returns:
        The score as a Python float. 0.0 when there are fewer than two
        positives, where the metric is undefined (this includes empty input).

    Raises:
        ValueError: if `output` and `target` differ in number of elements.
    """
    scores = np.asarray(output, dtype=float).ravel()
    labels = np.asarray(target).ravel()
    if scores.size != labels.size:
        raise ValueError("output and target must have the same number of elements")

    is_pos = labels == 1
    n_pos = int(is_pos.sum())
    if n_pos < 2:
        # No second positive to anchor the cut-off on.
        return 0.0

    # Stable ascending sort keeps the input order among tied scores.
    order = np.argsort(scores, kind="stable")
    pos_ranks = np.flatnonzero(is_pos[order])
    cutoff = int(pos_ranks[1])  # rank of the second-lowest-scored positive

    return float((n_pos - 1) / (labels.size - cutoff))

In [None]:
# XGBoost baseline classifier, fitted on the training split

baseline_params = dict(max_delta_step=1, random_state=0)
xgbModel = xgb.XGBClassifier(**baseline_params)
xgbModel.fit(train_X, train_y)

In [None]:
# XGBoost predict
# predict_proba gives per-class probabilities for the validation features;
# column 1 is the one passed to recall_n later in this notebook.

# test = xgbModel.predict(val_X)
val_pred = xgbModel.predict_proba(val_X)
# train_pred = xgbModel.predict_proba(train_X)

In [None]:
# Score the validation predictions: column 1 of the probabilities as a column
# vector, against the validation labels as an array.
val_scores = val_pred[:, 1].reshape(-1, 1)
val_targets = val_y.to_numpy()
print(recall_n(val_scores, val_targets))

[0.01536388]


In [None]:
# Hyper-parameter candidates for the XGBoost search
params = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5],
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold

# Base estimator for the search. It has its own name so the fitted baseline
# `xgbModel` is not replaced by an unfitted estimator: RandomizedSearchCV fits
# clones, never the object passed in, and the prediction cell later calls
# `xgbModel.predict_proba`.
# `verbosity=0` replaces the obsolete `silent=True`; `n_jobs` replaces `nthread`.
xgb_search_base = xgb.XGBClassifier(learning_rate=0.02, n_estimators=600,
                objective='binary:logistic', verbosity=0, n_jobs=1)

param_comb = 5   # number of sampled parameter combinations
folds = 5        # number of stratified CV folds
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
# Pass the splitter itself rather than `skf.split(...)`: that generator is
# exhausted after one use, so re-running `fit` on it would fail.
random_search = RandomizedSearchCV(xgb_search_base, param_distributions=params, n_iter=param_comb,
                  scoring='recall', n_jobs=4, cv=skf, verbose=3, random_state=42)
random_search.fit(all_X, all_y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


KeyboardInterrupt: ignored

In [None]:
# Best cross-validated score found by the search (it was configured with scoring='recall').
random_search.best_score_

0.021367521367521364

In [None]:
# Alert keys of the testing rows as a NumPy array.
test_keys['alert_key'].to_numpy()

array([352342, 352866, 352696, ..., 364673, 364626, 364986])

In [None]:
# Predict probability
# Pair every testing alert key with the predicted probability of class 1 and
# order the pairs from the highest to the lowest probability.
test_alert_keys = test_keys['alert_key'].tolist()
test_proba = xgbModel.predict_proba(test_set)[:, 1]
output = [[key, prob] for key, prob in zip(test_alert_keys, test_proba)]
output = sorted(output, reverse=True, key=lambda s: s[1])
print(output[:10])  # preview only; the full list floods the cell output

# Account for the private alert keys so that the file satisfies the upload requirements
public_private_alert_key = sample_output['alert_key'].values
print(len(public_private_alert_key))

# For alert key not in public, add zeros
# (a set makes each lookup O(1) instead of scanning the key array every time)
known_keys = set(test_alert_keys)
output.extend([key, 0] for key in public_private_alert_key if key not in known_keys)

print(len(output))

predict_alert_key = [key for key, _ in output]
predict_probability = [prob for _, prob in output]

df_predicted = pd.DataFrame({
    "alert_key": predict_alert_key,
    "probability": predict_probability
})

df_predicted.to_csv('prediction_baseline.csv', index=False)

[[358457, 0.39968827], [356628, 0.32680205], [361303, 0.28409693], [358988, 0.24368204], [358229, 0.24151605], [358005, 0.20350923], [361145, 0.15852714], [355403, 0.12102158], [355801, 0.1100383], [364628, 0.100587085], [356249, 0.09423577], [359785, 0.09423577], [362488, 0.09267317], [364673, 0.09059842], [355198, 0.088532686], [358721, 0.087355345], [362127, 0.0866188], [359499, 0.08656027], [363738, 0.08269125], [360601, 0.08065418], [355795, 0.07054985], [363033, 0.06937326], [364699, 0.067987196], [364223, 0.06753303], [353413, 0.06701272], [364033, 0.06602663], [357108, 0.063868135], [358252, 0.06047292], [361118, 0.060414225], [355436, 0.05628564], [353084, 0.056184895], [360534, 0.053050563], [355810, 0.052848704], [364698, 0.052553874], [353566, 0.051309414], [357098, 0.051035628], [359384, 0.050216094], [364626, 0.048046365], [355633, 0.047313035], [359370, 0.04520736], [364986, 0.044975065], [363771, 0.04371042], [364926, 0.043430302], [361836, 0.042960532], [353550, 0.0429