In [20]:
%%capture
# Environment setup cell; %%capture silences the (long) installer/build output.

# Install the XGBoost and CatBoost Python packages with pip (versions are not pinned).
! pip install xgboost
! pip install catboost
# Fetch the LightGBM sources (including submodules) and build them with CMake
# using the GPU option (-DUSE_GPU=1) and four parallel make jobs.
! git clone --recursive https://github.com/Microsoft/LightGBM
# NOTE(review): the command below ends at `python3 setup.` in this view and
# looks truncated — confirm the full python-package install step.
! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.


In [21]:
import numpy as np 
import pandas as pd 
import time
import lightgbm as lgb
import xgboost as xgb
import tensorflow as tf
import catboost as cat
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import os
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

In [22]:
# Make only CUDA device 0 visible to libraries that honour CUDA_VISIBLE_DEVICES.
os.environ.update(CUDA_VISIBLE_DEVICES="0")

# Load the training and testing tables from the relative ../input directory.
train_df = pd.read_csv(
    filepath_or_buffer="../input/ai-guild/AI Fluency - Competition Dataset/AI Fluency - Training Data.csv"
)
test_df = pd.read_csv(
    filepath_or_buffer="../input/ai-guild/AI Fluency - Competition Dataset/AI Fluency - Testing Data.csv"
)

In [23]:
def log_diff(col, df, period=4):
    """Add a ``'log_' + col`` column with lagged differences of ``log(df[col])``.

    For every row position ``k >= period`` the new column holds
    ``log(df[col][k] + eps) - log(df[col][k - period] + eps)``; the first
    ``period`` rows have no predecessor at that lag and are set to ``0.0``.
    With the default ``period=4`` this is the same as differencing the four
    interleaved row sequences ``i, i + 4, i + 8, ...`` (``i`` in 0..3)
    independently.

    Parameters
    ----------
    col : str
        Name of the numeric column to transform.
    df : pandas.DataFrame
        Frame that contains ``col``. It is modified in place.
    period : int, default 4
        Positional lag (in rows) between the two values being differenced.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``'log_' + col`` column added or
        overwritten.

    Raises
    ------
    ValueError
        If ``period`` is smaller than 1.
    """
    if period < 1:
        raise ValueError("period must be a positive integer")

    # Small offset so that zero values do not produce log(0) = -inf.
    eps = 1e-8
    log_values = np.log(df[col].to_numpy(dtype=float) + eps)

    # Build the result in a float buffer and assign the whole column at once.
    # Writing through chained indexing (df[name][slice] = ...) goes via a
    # temporary Series, which raises SettingWithCopyWarning and does not write
    # back at all when pandas Copy-on-Write is active.
    lagged = np.zeros(len(log_values), dtype=float)
    lagged[period:] = log_values[period:] - log_values[:-period]
    df['log_' + col] = lagged
    return df

In [24]:

def log_features(df):
    """Append natural-log versions of the price/volume columns to ``df``.

    For each of ``Open``, ``High``, ``Low``, ``Close``, ``Volume`` and
    ``VWAP`` a column named ``log_<name>`` is written containing
    ``log(value + 1e-8)``. The frame is modified in place and also returned.
    """
    offset = 1e-8  # keeps log() finite when a value is exactly zero
    for name in ('Open', 'High', 'Low', 'Close', 'Volume', 'VWAP'):
        df['log_' + name] = np.log(df[name] + offset)
    return df
# Add the log_* columns to both the training and the testing frame.
train_df, test_df = log_features(train_df), log_features(test_df)


In [25]:
# Columns selected from both frames, in the order they are kept.
_shared_columns = [
    'timestamp', 'Asset_ID', 'Count',
    'log_Open', 'log_High', 'log_Low', 'log_Close',
    'log_Volume', 'log_VWAP',
]

# Training selection additionally carries 'Target'; 'Index' stays the last column in both.
data = train_df[_shared_columns + ['Target', 'Index']]

test_data = test_df[_shared_columns + ['Index']]


In [8]:
# Positional hold-out: the first 95% of rows train the model, the last 5% validate it.
max_length = data.shape[0]
training_cutoff = int(0.95 * max_length)
train_data, valid_data = data.iloc[:training_cutoff], data.iloc[training_cutoff:]

In [26]:
# LightGBM regressor hyperparameters, gathered in one mapping and unpacked below.
lgb_params = dict(
    boosting='gbdt',
    num_leaves=47,
    max_depth=7,
    learning_rate=0.006985117638031729,
    feature_fraction=0.95,
    bagging_fraction=0.2,
    n_estimators=100,
    bagging_freq=10,
    verbose=0,
)
lgb_model = lgb.LGBMRegressor(**lgb_params)

# Separate the 'Target' column (label) from the remaining columns (features).
train_X, train_y = train_data.drop(columns=['Target']), train_data['Target']
valid_X, valid_y = valid_data.drop(columns=['Target']), valid_data['Target']

# The test selection has no 'Target' column, so it is used as-is.
test_X = test_data

results = dict()

In [29]:
# Fit on the training slice while monitoring the validation slice.
# Early stopping is requested through the callback API: passing
# `early_stopping_rounds` directly to `fit` was removed in LightGBM 4.0,
# whereas the callback works on both older and newer releases.
lgb_model.fit(
    train_X,
    train_y,
    eval_set=[(valid_X, valid_y)],
    callbacks=[lgb.early_stopping(stopping_rounds=100)],
)

# Predictions for every split; test predictions feed the submission file.
train_preds = lgb_model.predict(train_X)
valid_preds = lgb_model.predict(valid_X)
test_preds = lgb_model.predict(test_X)

# RMSE computed as sqrt(MSE): the `squared=False` flag of mean_squared_error
# was deprecated in scikit-learn 1.4 and later removed.
train_rmse = np.sqrt(mean_squared_error(train_y, train_preds))
valid_rmse = np.sqrt(mean_squared_error(valid_y, valid_preds))
print('train rmse: {}, valid rmse: {}'.format(train_rmse, valid_rmse))




You can set `force_col_wise=true` to remove the overhead.
[1]	valid_0's l2: 0.000142348
[2]	valid_0's l2: 0.000142347
[3]	valid_0's l2: 0.000142346
[4]	valid_0's l2: 0.000142345
[5]	valid_0's l2: 0.000142345
[6]	valid_0's l2: 0.000142344
[7]	valid_0's l2: 0.000142344
[8]	valid_0's l2: 0.000142343
[9]	valid_0's l2: 0.000142342
[10]	valid_0's l2: 0.000142341
[11]	valid_0's l2: 0.000142325
[12]	valid_0's l2: 0.000142311
[13]	valid_0's l2: 0.000142299
[14]	valid_0's l2: 0.000142289
[15]	valid_0's l2: 0.00014228
[16]	valid_0's l2: 0.000142273
[17]	valid_0's l2: 0.000142267
[18]	valid_0's l2: 0.000142264
[19]	valid_0's l2: 0.000142261
[20]	valid_0's l2: 0.000142261
[21]	valid_0's l2: 0.000142259
[22]	valid_0's l2: 0.000142258
[23]	valid_0's l2: 0.000142257
[24]	valid_0's l2: 0.000142254
[25]	valid_0's l2: 0.000142253
[26]	valid_0's l2: 0.000142252
[27]	valid_0's l2: 0.00014225
[28]	valid_0's l2: 0.000142249
[29]	valid_0's l2: 0.000142247
[30]	valid_0's l2: 0.000142246
[31]	valid_0's l2: 0.00

In [31]:
from datetime import datetime

# Submission frame: the test 'Index' column plus the model's predictions.
# (The former rename of 'Index' to 'Index' was a no-op and has been dropped.)
sub_df = test_df[['Index']].copy()
sub_df['Target_PRED'] = test_preds

# Timestamp in the file name keeps successive runs from overwriting each other.
stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
sub_df.to_csv('sub{}.csv'.format(stamp), index=False)