# LightGBM Implementation

## Download Requirements

In [18]:
!pip install numerapi catboost xgboost lightgbm catboost
!git clone --recursive https://github.com/Microsoft/LightGBM
%cd /content/LightGBM
!mkdir build
!make -j$(nproc)
!sudo apt-get -y install python-pip
!sudo -H pip install setuptools pandas numpy scipy scikit-learn -U
%cd /content/LightGBM/python-package
!sudo python setup.py install --precompile

Collecting catboost
  Downloading catboost-0.25.1-cp38-none-win_amd64.whl (66.9 MB)
Collecting lightgbm
  Downloading lightgbm-3.2.1-py3-none-win_amd64.whl (1.0 MB)
Collecting graphviz
  Downloading graphviz-0.16-py2.py3-none-any.whl (19 kB)
Collecting plotly
  Downloading plotly-4.14.3-py2.py3-none-any.whl (13.2 MB)
Collecting retrying>=1.3.3
  Downloading retrying-1.3.3.tar.gz (10 kB)
Building wheels for collected packages: retrying
  Building wheel for retrying (setup.py): started
  Building wheel for retrying (setup.py): finished with status 'done'
  Created wheel for retrying: filename=retrying-1.3.3-py3-none-any.whl size=11429 sha256=b3ca07c5b6c77be89da82f5ed413a02a93dda0fcd27c920a4ac3405872ad81e5
  Stored in directory: c:\users\edward tang\appdata\local\pip\cache\wheels\c4\a7\48\0a434133f6d56e878ca511c0e6c38326907c0792f67b476e56
Successfully built retrying
Installing collected packages: retrying, plotly, graphviz, lightgbm, catboost
Successfully installed catboost-0.25.1 graphvi

## Import Requirements

In [1]:
import time
import numpy as np
import pandas as pd
import catboost as cb
import lightgbm as lgb
import numerapi, warnings
from sklearn import preprocessing
from sklearn.model_selection import KFold

# Numerapi Setup

In [2]:
public_id = 'QPRBQANL6KUXWTXPQLDHHJIZ2OYGEBNA'
secret_key = ':)'
napi = numerapi.NumerAPI(public_id=public_id, secret_key=secret_key)
warnings.filterwarnings('ignore')

leaderboard = napi.get_leaderboard()
# check if a new round has started

try:
  if napi.check_new_round():
    print("Ready.")
  else:
    print("In progress.")
except:
  print("Not ready.")

In progress.


## Download Training and Tournament Data

In [3]:
# Download Training Data From Numerai
start = time.time()
print(f"[{time.asctime()}] Downloading the lastest training data set. Current round is: {numerapi.NumerAPI(verbosity='info').get_current_round()}...\n")
training_data = pd.read_csv("https://numerai-public-datasets.s3-us-west-2.amazonaws.com/latest_numerai_training_data.csv.xz", header=0)
end = time.time()
print(f"[{time.asctime()}] Training dataset has been loaded. It took {end - start:0.2f} seconds")

# Download Tournament Data From Numerai
start = time.time()
print(f"[{time.asctime()}] Downloading the lastest tournament data set. Current round is: {numerapi.NumerAPI(verbosity='info').get_current_round()}...\n")
tournament_data = pd.read_csv("https://numerai-public-datasets.s3-us-west-2.amazonaws.com/latest_numerai_tournament_data.csv.xz", header=0)
end = time.time()
print(f"[{time.asctime()}] Tournament dataset has been loaded. It took {end - start:0.2f} seconds") 

[Wed Apr 21 21:17:57 2021] Downloading the lastest training data set. Current round is: 260...

[Wed Apr 21 21:18:36 2021] Training dataset has been loaded. It took 39.02 seconds
[Wed Apr 21 21:18:36 2021] Downloading the lastest tournament data set. Current round is: 260...

[Wed Apr 21 21:20:37 2021] Tournament dataset has been loaded. It took 120.50 seconds


In [40]:
print(training_data)

                      id     era data_type  feature_intelligence1  \
0       n000315175b67977    era1     train                   0.00   
1       n0014af834a96cdd    era1     train                   0.00   
2       n001c93979ac41d4    era1     train                   0.25   
3       n0034e4143f22a13    era1     train                   1.00   
4       n00679d1a636062f    era1     train                   0.25   
...                  ...     ...       ...                    ...   
501803  nff6a8a8feaeeb52  era120     train                   0.50   
501804  nff6af62a0996372  era120     train                   1.00   
501805  nff9288983b8c040  era120     train                   0.75   
501806  nffaab4e1cacc4b1  era120     train                   0.25   
501807  nffba5460b572cfa  era120     train                   0.75   

        feature_intelligence2  feature_intelligence3  feature_intelligence4  \
0                        0.50                   0.25                   0.00   
1            

## Defining Preprocessing Functions

In [4]:
# label encode strings to id values
def labelencode(col, df_train, df_test):
  le = preprocessing.LabelEncoder()
  le.fit(list(df_train[col].values) + list(df_test[col].values))
  df_train[col] = le.transform(df_train[col])
  df_test[col] = le.transform(df_test[col])

# preprocess non-categorical columns to integers
def preprocess(df_orig):
  df = df_orig.copy()
  c = list(set(df.columns) - set(['era', 'data_type']))
  df[c] = (df[c] * 4).astype(np.int32)
  df['era'] = df['era'].astype('category')
  df['data_type'] = df['data_type'].astype('category')
  return df

# scaling for making output values between 0 and 1
def scale(arr, minv, maxv):
  return np.interp(arr, (np.min(arr), np.max(arr)), (minv, maxv))


## Preprocess Data

In [5]:
labelencode('era', training_data, tournament_data)
labelencode('data_type', training_data, tournament_data)

# store and scale target values
y_tr = training_data['target']
y_tr_int = (training_data['target'] * 4).astype(np.int32)

# store id for submission
id_val = tournament_data['id']

# isolate the features from the dataset
training_data.drop(['target', 'id'], axis=1, inplace=True)
df_tr_processed = preprocess(training_data)

tournament_data.drop(['target', 'id'], axis=1, inplace=True)
df_te_processed = preprocess(tournament_data)

# store data for catboost
training_data2 = training_data
tournament_data2 = tournament_data

## Define LightGBM Model

In [7]:
# lightGBM parameters
params_l = {
    'objective':'mse',
    'boosting_type':'gbrt',
    'metric':'mse',
    'device_type':'gpu',
    'max_depth': 10
}

N_EPOCH = 15
N_FOLD = 15
FEATURE_DROPOUT = 0.03

f_c = list(df_tr_processed.columns[df_tr_processed.columns.str.startswith('feature')])
f_l = list(df_tr_processed.columns[df_tr_processed.columns.str.startswith('feature')])
preds = []
for i in range(N_EPOCH):
  pred_l = np.zeros(len(tournament_data))
  folds = KFold(n_splits=N_FOLD)
  for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_tr_processed[f_l], y_tr_int)):
    tr_x, tr_y = df_tr_processed[f_l].iloc[trn_idx,:], y_tr_int[trn_idx]
    vl_x, vl_y = df_tr_processed[f_l].iloc[val_idx,:], y_tr_int[val_idx]

    print('EPOCH {}/{} | LGBM FOLD {}/{}'.format(i+1, N_EPOCH, fold_+1, N_FOLD))
    tr_data = lgb.Dataset(tr_x, label=tr_y)
    vl_data = lgb.Dataset(vl_x, label=vl_y)  
    m_l = lgb.train(
        params_l,
        tr_data,
        valid_sets = [tr_data, vl_data],
        verbose_eval = 200,
    )

    # prediction within the fold
    pred_l += m_l.predict(df_te_processed[f_l])/N_FOLD

    # reversing the integer transformation
    pred_l *= 0.25

    # reduce features by importance
    f_imp = pd.DataFrame(sorted(zip(m_l.feature_importance(),
                                    df_tr_processed[f_l].columns)),
                        columns=['Value', 'Feature'])
    col_drop = int(len(f_imp) * FEATURE_DROPOUT)
    f_l = list(f_imp[col_drop:]['Feature'].values)

  # store predictions for epoch
  preds.append(pred_l)

# average and scale predictions
preds = np.mean(preds, axis=0)
preds = scale(preds, 0, 1)

EPOCH 1/10 | LGBM FOLD 1/10
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1550
[LightGBM] [Info] Number of data points in the train set: 451627, number of used features: 310
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics 620, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 308 dense feature groups (67.19 MB) transferred to GPU in 0.080490 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 2.000027
EPOCH 1/10 | LGBM FOLD 2/10
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1505
[LightGBM] [Info] Number of data points in the train set: 451627, number of used features: 301
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics 620, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs hav

In [8]:
print(preds)

[0.54508255 0.57272139 0.76138928 ... 0.66742899 0.57047033 0.6628438 ]


In [None]:
## Create Submission Dataframe

In [10]:
# submission datafram
column = ['id','prediction']

df = pd.DataFrame(columns = column)
df['prediction'] = preds
df['id'] = id_val
df.head()

Unnamed: 0,id,prediction
0,n0003aa52cab36c2,0.545083
1,n000920ed083903f,0.572721
2,n0038e640522c4a6,0.761389
3,n004ac94a87dc54b,0.580801
4,n0052fe97ea0c05f,0.610046


## Submission and File Creation

In [11]:
df.to_csv('./predictions.csv', index=False)
submission_id = napi.upload_predictions('./predictions.csv')
print(submission_id)

2021-04-21 21:56:07,776 INFO numerapi.base_api: uploading predictions...
c3313104-83c9-411a-8b3d-03c7bcc111e5
