# Install TransBoost

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@ Code for Paper: "TransBoost: A Boosting-Tree Kernel Transfer Learning Algorithm for Improving Financial Inclusion" submitted to AAAI'22

1. Environment Requirements:
    Python: 3.6/3.7
    XGBoost: 1.4.2
    Pandas: 1.2.4
    
2. How to Install TransBoost?
    The TransBoost algorithm is implemented on top of the source code of XGBoost. You can find the original XGBoost code at https://github.com/dmlc/xgboost.
    To install the algorithm:
     `python3 install.py`
3. How to Uninstall TransBoost?
    The experimental code of TransBoost temporarily overwrites files of the installed XGBoost package. We are working to solve this problem.
    If you want to switch back to the original XGBoost, simply uninstall TransBoost by reinstalling XGBoost:
    `pip3 install --force-reinstall xgboost`

Note:
    Please note that the following commands work as written on Linux and macOS.
    If you are using Windows or another OS, some simple modifications may be required.
"""

# Step1: Install XGBoost(ver.1.4.2),Pandas
# Please make sure the XGB version is 1.4.2 before installing TransBoost
!pip3 uninstall -y xgboost
!pip3 install xgboost==1.4.2
!pip3 install --force-reinstall pandas==1.2.4
# Step2: Install TransBoost
import glob
import importlib.util
import os
import shutil

# Locate the installed xgboost package directory without importing the package.
# `imp.find_module` is not used: `imp` has been deprecated since Python 3.4 and
# was removed in Python 3.12; `importlib.util.find_spec` is its replacement.
xgb_spec = importlib.util.find_spec("xgboost")
if xgb_spec is None or xgb_spec.origin is None:
    raise ImportError("xgboost is not installed; run Step1 before installing TransBoost.")
xgb_path = os.path.dirname(xgb_spec.origin)

# Backup files of XGBoost.
# shutil.copy is used instead of shelling out to `cp` so that the step is
# portable and a failed copy raises instead of being silently ignored.
for module_file in ('__init__.py', 'core.py', 'sklearn.py', 'training.py'):
    original_file = os.path.join(xgb_path, module_file)
    shutil.copy(original_file, original_file + '.bak')

# Install TransBoost: copy every ./TransBoost/*.py into the xgboost package directory
transboost_files = sorted(glob.glob(os.path.join('.', 'TransBoost', '*.py')))
if not transboost_files:
    raise FileNotFoundError("No TransBoost source files found under ./TransBoost/; nothing was installed.")
for transboost_file in transboost_files:
    shutil.copy(transboost_file, xgb_path)

# Only reached when every copy above succeeded
print('Installation succeeded.')

#Step3: Uninstall
#!pip3 install --force-reinstall xgboost


## Experiment 1: Public Benchmark - Lending Club

### Data Preparation

In [None]:
'''
[1]Lending Club dataset:
    URL: https://www.kaggle.com/ethon0426/lending-club-20072020q1
'''
import numpy as np
import pandas as pd
import pickle
## Load data
club = pd.read_csv('./data/club.csv')

# Domain split by loan purpose: source domain = 'medical', target domain = 'car'.
# NOTE(review): this cell previously filtered by year first (source 2015, target 2016)
# and then re-filtered the *full* `club` frame by purpose, which discarded the year
# filter entirely. The dead year filter is removed so the code states what actually
# runs. If both filters were intended, chain them instead, e.g.
#   club[(club.year == 2015) & (club.purpose == 'medical')]
# -- confirm against the reported experiment before changing, as it alters the data.
source_club = club[club.purpose == 'medical']
target_club = club[club.purpose == 'car']

# Drop the columns that only identify the domain
source_club = source_club.drop(['purpose', 'year'], axis=1)
target_club = target_club.drop(['purpose', 'year'], axis=1)

## Data
source = source_club
target = target_club

## parameter
size_tt = range(100,1100,100) # Sample sizes - target domain for training (100, 200, ..., 1000)
size_st = 5000 # Sample size - source domain for training (not referenced by the evaluation cell below)
size_test = 3000 # Sample size - target domain for testing
source_test_size = 0.2 # Not referenced by the evaluation cell below
target_test_size = 0.75 # Fraction of the target domain held out by train_test_split

### Model Evaluation

In [None]:
# Experiment: Evaluate the model given different sample sizes of the target domain for training

from xgboost import TransBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings("ignore")

# Imported once here rather than on every loop iteration
from sklearn.preprocessing import MinMaxScaler

# Everything in this section is independent of the training-set size, so it is
# computed once. All steps use fixed seeds, so the data seen by each iteration
# is the same as when the split was repeated inside the loop.
target_train_all, target_test = train_test_split(target, test_size=target_test_size, random_state=952)
source_train = source
# Fixed pool of 1000 target rows from which each training subset is drawn
target_pool = target_train_all.sample(n=1000, random_state=123)
target_test = target_test.sample(n=size_test, random_state=123)

X_source_raw = source_train.reset_index(drop=True).drop(['label'], axis=1)
Y_source_train = source_train.reset_index(drop=True).label
X_target_test_raw = target_test.reset_index(drop=True).drop(['label'], axis=1)
Y_target_test = target_test.reset_index(drop=True).label

result = []
for size in size_tt:
    # Draw `size` target-domain training rows from the fixed pool
    target_train = target_pool.sample(n=size, random_state=456)
    X_target_train = target_train.reset_index(drop=True).drop(['label'], axis=1)
    Y_target_train = target_train.reset_index(drop=True).label

    # Data normalization: the scaler is fitted on the target training subset only
    # and then applied unchanged to the target test set and the source domain.
    min_max_scaler = MinMaxScaler()
    X_target_train = min_max_scaler.fit_transform(X_target_train)
    X_target_test = min_max_scaler.transform(X_target_test_raw)
    X_source_train = min_max_scaler.transform(X_source_raw)

    # TransBoostClassifier
    clf = TransBoostClassifier(
        max_depth=4,
        learning_rate=0.1,
        n_estimators=40,
        min_child_weight=0,
        reg_alpha=0.,
        reg_lambda=1.,
        objective='binary:logistic',
        seed=1440,
        transfer_decay_ratio=2.,
        transfer_velocity=1.,
        transfer_rebalance=False,
        transfer_min_leaf_size=10,
        transfer_prior_margin='mirror',
        transfer_margin_estimation='firstorder',
        verbosity=0,
        nthread=64,
    )
    # Fit the model on source + target training data, then score the target test set
    tb_model = clf.fit(X_source_train, Y_source_train, X_target_train, Y_target_train)
    y_pred_target_test = tb_model.predict_proba(X_target_test)[:, 1]
    result.append([size, roc_auc_score(Y_target_test, y_pred_target_test)])

# One row per training-set size; columns are given at construction so an empty
# `size_tt` yields an empty table instead of a column-length error.
result = pd.DataFrame(result, columns=['TargetDomianSize', 'AUC'])
print(result)

## Experiment 2: Public Benchmark - Wine Quality

### Data Preparation

In [None]:
'''
[2]UCI wine quality dataset:
    URL: https://archive.ics.uci.edu/ml/datasets/wine+quality
    Source: Paulo Cortez, University of Minho, Guimarães, Portugal, http://www3.dsi.uminho.pt/pcortez
    A. Cerdeira, F. Almeida, T. Matos and J. Reis, Viticulture Commission of the Vinho Verde Region(CVRVV), Porto, Portugal
    @2009
'''
import numpy as np
import pandas as pd

## Columns kept for modelling: the eleven physico-chemical features plus the binary label
column_selected = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol', 'label',
]


def _load_wine(csv_path):
    """Read one semicolon-separated wine CSV and return the selected columns.

    The binary ``label`` column is 0 when ``quality`` is <= 5 and 1 otherwise.
    """
    frame = pd.read_csv(csv_path, sep=';')
    # Negating the "<= 5" test reproduces the "0 if t <= 5 else 1" rule exactly
    frame['label'] = (~(frame['quality'] <= 5)).astype('int64')
    return frame[column_selected]


## Load data: red wine is the target domain, white wine is the source domain
target_wine = _load_wine('./data/winequality-red.csv')
source_wine = _load_wine('./data/winequality-white.csv')

## Data
source, target = source_wine, target_wine

## parameter
size_tt = range(50, 550, 50)  # Sample sizes - target domain for training (50, 100, ..., 500)
source_test_size = 0.2
target_test_size = 0.6

### Model Evaluation

In [None]:
# Experiment: Evaluate the model given different sample sizes of the target domain for training

import xgboost
# `imp` is deprecated and was removed in Python 3.12; importlib.reload is the replacement
from importlib import reload
# Reload the already-imported package -- presumably so the TransBoost files copied
# over the xgboost package are picked up in a running kernel
reload(xgboost)
from xgboost import TransBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
# Imported once here rather than on every loop iteration
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings("ignore")

# Train-Test split.
# Both splits use a fixed seed and do not depend on the training-set size, so
# they are computed once instead of being repeated on every iteration.
source_train, source_test = train_test_split(source, test_size=source_test_size, random_state=952)
target_train_all, target_test = train_test_split(target, test_size=target_test_size, random_state=952)

X_source_raw = source_train.reset_index(drop=True).drop(['label'], axis=1)
Y_source_train = source_train.reset_index(drop=True).label
X_source_test_raw = source_test.reset_index(drop=True).drop(['label'], axis=1)
Y_source_test = source_test.reset_index(drop=True).label
X_target_test_raw = target_test.reset_index(drop=True).drop(['label'], axis=1)
Y_target_test = target_test.reset_index(drop=True).label

result = []

for size in size_tt:
    # Draw `size` target-domain training rows from the target training split
    target_train = target_train_all.sample(n=size, random_state=123)
    X_target_train = target_train.reset_index(drop=True).drop(['label'], axis=1)
    Y_target_train = target_train.reset_index(drop=True).label

    # Data normalization: the scaler is fitted on the target training subset only
    # and then applied unchanged to every other split.
    min_max_scaler = MinMaxScaler()
    X_target_train = min_max_scaler.fit_transform(X_target_train)
    X_target_test = min_max_scaler.transform(X_target_test_raw)
    X_source_train = min_max_scaler.transform(X_source_raw)
    # The source test split is not used by the evaluation below; it is still
    # produced so this cell keeps defining the same variables as before.
    X_source_test = min_max_scaler.transform(X_source_test_raw)

    # TransBoostClassifier
    clf = TransBoostClassifier(
        max_depth=4,
        learning_rate=0.1,
        n_estimators=40,
        min_child_weight=0,
        reg_alpha=0.,
        reg_lambda=1.,
        objective='binary:logistic',
        seed=1440,
        transfer_decay_ratio=2.,
        transfer_velocity=1.,
        transfer_rebalance=False,
        transfer_min_leaf_size=10,
        transfer_prior_margin='mirror',
        transfer_margin_estimation='firstorder',
        verbosity=0,
        nthread=64,
    )
    # Fit the model on source + target training data, then score the target test set
    tb_model = clf.fit(X_source_train, Y_source_train, X_target_train, Y_target_train)
    y_pred_target_test = tb_model.predict_proba(X_target_test)[:, 1]
    result.append([size, roc_auc_score(Y_target_test, y_pred_target_test)])

# One row per training-set size; columns are given at construction so an empty
# `size_tt` yields an empty table instead of a column-length error.
result = pd.DataFrame(result, columns=['TargetDomianSize', 'AUC'])
print(result)

# Restore your python environment

In [None]:
#Step3: Uninstall TransBoost
!pip3 install --force-reinstall xgboost
