In this python script, I have done:
- EDA
    - Data collection
    - Checking null and inf in the data
    - Drop all the null data

- Visualization
    - Plot data distribution
    - Plot candlestick chart
    - Plot volume

- Model training
    - XGBoost
    - Loop over all assets (training)
    
- Model Prediction
    - Score: 0.3550
    - Ranking: 385/830


In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import time
import xgboost as xgb
import gresearch_crypto
import traceback
import keras

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, RepeatedKFold, KFold
from sklearn.metrics import mean_squared_error
from matplotlib.colors import to_rgba

## <center>**EDA**</center>

### **train.csv - Column Description**
- **timestamp**: All timestamps are returned as second Unix timestamps (the number of seconds elapsed since 1970-01-01 00:00:00.000 UTC). Timestamps in this dataset are multiple of 60, indicating minute-by-minute data.
- **Asset_ID**: The asset ID corresponding to one of the crypto currencies (e.g. Asset_ID = 1 for Bitcoin). The mapping from Asset_ID to crypto asset is contained in asset_details.csv.
- **Count**: Total number of trades in the time interval (last minute).
- **Open**: Opening price of the time interval (in USD).
- **High**: Highest price reached during time interval (in USD).
- **Low**: Lowest price reached during time interval (in USD).
- **Close**: Closing price of the time interval (in USD).
- **Volume**: Quantity of asset bought or sold, displayed in base currency USD.
- **VWAP**: The average price of the asset over the time interval, weighted by volume. VWAP is an aggregated form of trade data.
- **Target**: Residual log-returns for the asset over a 15 minute horizon.

In [6]:
# Read Data

def read_data(nrows=None):
    """Load the competition training table and the asset lookup table.

    Args:
        nrows: Forwarded to ``pd.read_csv`` when reading ``train.csv``.
            The default ``None`` applies no row limit.

    Returns:
        tuple: ``(data, asset_details)`` — the DataFrames read from
        ``train.csv`` and ``asset_details.csv`` respectively.
    """
    train_frame = pd.read_csv(
        '../input/g-research-crypto-forecasting/train.csv',
        nrows=nrows,
    )
    details_frame = pd.read_csv(
        '../input/g-research-crypto-forecasting/asset_details.csv'
    )
    return train_frame, details_frame

### **Checking Nulls And Inf**

In [7]:
def check_null_and_inf(data):
    """Print a null/inf report for ``data``.

    Two sections are written to stdout: the per-column count of null
    values, then the row positions at which any column holds +/-inf
    (a row position appears once per infinite cell in that row).

    Args:
        data: Numeric DataFrame to inspect (``np.isinf`` is applied to it
            directly).

    Returns:
        None. The report is only printed.
    """
    print("Numbers of Nulls in Data:")
    null_counts = data.isnull().sum()
    print(null_counts, end='\n\n')

    print("Inf in Data:")
    inf_mask = np.isinf(data)
    # np.where on a 2-D mask yields (row_positions, col_positions); only rows are shown.
    inf_rows = np.where(inf_mask)[0]
    print(inf_rows)

In [8]:
# Drop Infinite and Nan
def drop_inf_and_nan(data):
    """Remove every row that contains NaN or +/-inf, in place.

    Infinite values are first rewritten to NaN so that a single
    ``dropna`` removes both kinds of bad rows.

    Args:
        data: DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object that was passed in, after cleaning.
    """
    data.replace([np.inf, -np.inf], np.nan, inplace=True)
    data.dropna(axis=0, inplace=True)
    return data

## <center>**DataSet**</center>

### Hyperparameters

In [9]:
# Raw train.csv columns used as model inputs (get_features adds two derived columns on top).
FEATURES = ['Count', 'Close','High', 'Low', 'Open', 'VWAP', 'Volume']
# Hyperparameter search space handed to RandomizedSearchCV in get_xgb_model_cv.
PARAMS = {
    'colsample_bytree': [0.5, 0.7],
    # range(520, 600, 40) yields 520 and 560
    'n_estimators': range(520, 600, 40),
    'learning_rate': [0.01, 0.03, 0.05],
    # range(11, 14, 1) yields 11, 12 and 13
    'max_depth': range(11, 14, 1),
}

In [10]:
def crypto_df(asset_id, data):
    """Return the rows of ``data`` belonging to one asset, indexed by timestamp.

    Args:
        asset_id: Value compared against the ``Asset_ID`` column.
        data: DataFrame with at least ``Asset_ID`` and ``timestamp`` columns.

    Returns:
        A new DataFrame holding only the matching rows, with ``timestamp``
        moved from a column to the index.
    """
    belongs_to_asset = data["Asset_ID"] == asset_id
    return data.loc[belongs_to_asset].set_index("timestamp")

In [11]:
# Two new features from the competition tutorial
def upper_shadow(df):
    """Return ``High`` minus the larger of ``Close`` and ``Open``.

    Args:
        df: Object indexable by the keys ``'High'``, ``'Close'`` and ``'Open'``.

    Returns:
        The element-wise difference, in whatever container type the
        indexed values produce.
    """
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top

def lower_shadow(df):
    """Return the smaller of ``Close`` and ``Open`` minus ``Low``.

    Args:
        df: Object indexable by the keys ``'Low'``, ``'Close'`` and ``'Open'``.

    Returns:
        The element-wise difference, in whatever container type the
        indexed values produce.
    """
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']

In [12]:
def get_features(df):
    """Build the model input: the ``FEATURES`` columns plus two shadow features.

    Args:
        df: Object from which ``df[FEATURES]`` can be selected. In this file
            it is called with both a DataFrame (training) and a single row
            (prediction).

    Returns:
        A copy of the selected ``FEATURES`` entries with ``Upper_Shadow`` and
        ``Lower_Shadow`` added. The input is not modified.
    """
    # Work on a copy so the new entries never leak back into the caller's data.
    selected = df[FEATURES].copy()
    selected['Upper_Shadow'] = upper_shadow(selected)
    selected['Lower_Shadow'] = lower_shadow(selected)
    return selected

## <center>**Visualization**</center>

### **DATA DISTRIBUTION**

Training data distribution among different assets (cryptocurrencies)

In [13]:
def plot_dis(data):
    """Draw a bar chart of the number of rows per ``Asset_ID``.

    Args:
        data: DataFrame with an ``Asset_ID`` column.

    Returns:
        None. The chart is drawn on the current matplotlib figure.
    """
    # seaborn does the per-asset counting itself, so no manual tally is needed.
    ax = sns.countplot(x="Asset_ID", data=data)
    ax.ticklabel_format(style='sci', axis='y')
    # NOTE: tick labels are the numeric Asset_ID values; asset names could be
    # substituted from asset_details if readable labels are wanted.
    ax.set(xlabel='Assets', ylabel='Number of Rows')

### **CANDLESTICK CHARTS**

In [14]:
def candelstick_chart(data, title):
    """Build a Plotly candlestick figure from the OHLC columns of ``data``.

    Args:
        data: DataFrame with ``Open``, ``High``, ``Low`` and ``Close``
            columns; its index is used as the x axis.
        title: Name inserted at the front of the chart title.

    Returns:
        The configured ``go.Figure``. It is not shown here; the caller
        decides when to call ``.show()``.
    """
    ohlc_trace = go.Candlestick(
        x=data.index,
        open=data['Open'],
        high=data['High'],
        low=data['Low'],
        close=data['Close'],
    )
    figure = go.Figure(data=[ohlc_trace])

    figure.update_xaxes(title_text='Minutes', rangeslider_visible=True)

    # Title centred horizontally near the top of the figure.
    title_layout = {
        'text': '{:} Candelstick Chart'.format(title),
        'y': 0.90,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    }
    figure.update_layout(title=title_layout)

    figure.update_yaxes(title_text='Price in USD', ticksuffix='$')

    return figure

### **AREA PLOT**

In [15]:
# Volume traded

def vol_traded(data):
    """Draw an area plot of ``data.Volume`` against the index of ``data``.

    Args:
        data: DataFrame with a ``Volume`` column.

    Returns:
        Whatever ``plt.stackplot`` returns for the drawn area.
    """
    return plt.stackplot(data.index, data.Volume, color='thistle')

## <center>**XG-Boosting**</center>

In [16]:
def xgb_cv(X, Y, params):
    """Run 3-fold XGBoost cross-validation and time it.

    Args:
        X: Feature matrix accepted by ``xgb.DMatrix``.
        Y: Labels for ``X``.
        params: Booster parameter dict forwarded to ``xgb.cv``.

    Returns:
        tuple: ``(cv_results, run_time)`` — the pandas result table from
        ``xgb.cv`` (RMSE metric) and the wall-clock duration in seconds.
    """
    dtrain = xgb.DMatrix(data=X, label=Y)

    started_at = time.time()
    cv_results = xgb.cv(
        dtrain=dtrain,
        params=params,
        nfold=3,
        num_boost_round=50,
        early_stopping_rounds=10,
        metrics="rmse",
        as_pandas=True,
        seed=123,
    )
    run_time = time.time() - started_at

    return cv_results, run_time

## <center>**Pipeline**</center>

### Data (Training, Testing)

In [17]:
# Load the full training table (no row limit) and the asset lookup table.
data, asset_details = read_data()

In [18]:
# Notebook display: preview the first rows of the training table.
data.head()

In [19]:
# Notebook display: (rows, columns) of the training table.
data.shape

In [20]:
# Notebook display: the asset lookup table.
asset_details

In [21]:
# Report nulls and infinities in the raw data, before cleaning.
check_null_and_inf(data)

In [22]:
# Drop every row containing NaN or +/-inf (the frame is also modified in place).
data = drop_inf_and_nan(data)

In [23]:
# Re-run the report to confirm the cleaning removed all nulls and infinities.
check_null_and_inf(data)

In [24]:
# Notebook display: the distinct Asset_ID values left after cleaning.
data['Asset_ID'].unique()

### Visualization

In [25]:
# Plot the data distribution: number of training rows per asset.
plot_dis(data)

In [26]:
# Per-asset views indexed by timestamp. Asset_ID 1 is Bitcoin (see the column description above).
btc = crypto_df(1, data)
# Presumably Asset_ID 6 is Ethereum, going by the variable name; verify against asset_details.
eth = crypto_df (6, data)

In [27]:
# Plot a candlestick chart of the last 100 Bitcoin rows.

btc_plot = candelstick_chart(btc[-100:], "Bitcoin")
btc_plot.show()

In [28]:
# Plot the traded volume of the last 50 Bitcoin rows.
vol_traded(btc[-50:])

### Model Training

In [57]:
def get_xgb_model(X, Y):
    """Fit one XGBoost regressor with fixed hyperparameters and time the fit.

    The data is split 80/20 without shuffling, so row order is preserved
    between the training and validation parts. Both parts are passed as
    evaluation sets, so their per-iteration metrics are available afterwards
    through ``evals_result()``.

    Args:
        X: Feature table.
        Y: Target values aligned with ``X``.

    Returns:
        tuple: ``(estimator, time_elapsed)`` — the fitted ``XGBRegressor``
        and the fit duration rounded to whole seconds.
    """
    X_train, X_val, Y_train, Y_val = train_test_split(
        X, Y, test_size=0.2, shuffle=False
    )

    hyperparams = dict(
        n_estimators=500,
        max_depth=11,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.7,
        missing=-999,
        random_state=2020,
        tree_method='gpu_hist',
    )
    estimator = xgb.XGBRegressor(**hyperparams)

    # Order matters for the loss plots: index 0 is training, index 1 is validation.
    eval_pairs = [(X_train, Y_train), (X_val, Y_val)]

    started_at = time.time()
    estimator.fit(X_train, Y_train, eval_set=eval_pairs, early_stopping_rounds=20)
    time_elapsed = round(time.time() - started_at)

    return estimator, time_elapsed

In [30]:
def get_xgb_model_cv(X_train, Y_train):
    """Tune an XGBoost regressor with a randomized search, then refit the winner.

    The search samples from the module-level ``PARAMS`` space using 10-fold
    cross-validation. The best sampled values are then combined with the
    fixed settings below into a fresh regressor trained on all of
    ``X_train``.

    Args:
        X_train: Feature table.
        Y_train: Target values aligned with ``X_train``.

    Returns:
        tuple: ``(xgb_best, time_elapsed)`` — the final fitted regressor and
        the duration of the search only, rounded to whole seconds.
    """
    # The search fits clones of this estimator, so it is deliberately not
    # fitted here — a fit at this point would be a discarded training run.
    base_estimator = xgb.XGBRegressor(
        objective="reg:squarederror",
        nthread=4,
    )

    cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=1)

    random_search = RandomizedSearchCV(
        estimator=base_estimator,
        param_distributions=PARAMS,
        n_jobs=-1,
        cv=cv,
        random_state=1,
    )

    start_time = time.time()
    random_search.fit(X_train, Y_train)
    time_elapsed = round(time.time() - start_time)

    # Only the values actually sampled from PARAMS are needed below.
    best_params = random_search.best_params_

    xgb_best = xgb.XGBRegressor(
        objective="reg:squarederror",
        nthread=4,
        colsample_bytree=best_params['colsample_bytree'],
        n_estimators=best_params['n_estimators'],
        learning_rate=best_params['learning_rate'],
        max_depth=best_params['max_depth'],
        subsample=0.9,
        random_state=1,
        missing=-999,
        tree_method='gpu_hist',
    )

    xgb_best.fit(X_train, Y_train)

    return xgb_best, time_elapsed

In [49]:
# Loop over all assets

def loop_over():
    """Train one XGBoost model per asset listed in ``asset_details``.

    Reads the module-level ``data`` and ``asset_details`` frames. For each
    asset it builds the feature table with ``get_features``, prints the
    asset name and ID as a progress marker, and fits a model with
    ``get_xgb_model``.

    Returns:
        tuple: ``(Xs_train, ys_train, models)`` — three dicts keyed by
        ``Asset_ID`` holding the feature table (index reset), the target
        series (index reset) and the fitted model.
    """
    Xs_train = {}
    ys_train = {}
    models = {}

    for asset_id, asset_name in zip(asset_details['Asset_ID'], asset_details['Asset_Name']):
        # Filter the full table once per asset and reuse the slice for X and Y.
        asset_rows = data[data["Asset_ID"] == asset_id]
        # get_features selects the FEATURES columns itself and returns a copy.
        X_train = get_features(asset_rows)
        Y_train = asset_rows['Target']

        Xs_train[asset_id] = X_train.reset_index(drop=True)
        ys_train[asset_id] = Y_train.reset_index(drop=True)

        print(asset_name, asset_id)
        # The fit duration is returned too but is not needed here.
        models[asset_id], _ = get_xgb_model(X_train, Y_train)

    return Xs_train, ys_train, models


In [50]:
# Train one model per asset; each returned dict is keyed by Asset_ID.
Xs_train, ys_train, models = loop_over()

In [56]:
# Plot training vs. validation RMSE per boosting iteration, one figure per asset.
# Iterate over the dict's own keys (sorted for a stable order) instead of
# assuming the asset IDs are exactly 0..len(models)-1.
for asset_id in sorted(models):
    results = models[asset_id].evals_result()
    plt.figure(figsize=(10,7))
    # validation_0 / validation_1 follow the eval_set order used when fitting:
    # training split first, validation split second.
    plt.plot(results["validation_0"]["rmse"], label="Training loss")
    plt.plot(results["validation_1"]["rmse"], label="Validation loss")
    plt.title("Asset_ID {}".format(asset_id))
    plt.xlabel("Iter")
    plt.ylabel("Loss")
    plt.legend()

fitting model

In [34]:
## Save the models

# import pickle

# for index, asset_id in enumerate(asset_details['Asset_ID']):
#     filename = str(asset_id) + '.pkl'
#     with open(filename, 'wb') as file:
#         pickle.dump(models[index], file)

In [35]:
# # Load the models

# load_models = {}

# for index, asset_id in enumerate(asset_details['Asset_ID']):
#     filename = str(asset_id) + '.pkl'
#     with open(filename, 'rb') as file:
#         load_models[index] = pickle.load(file)
        
# load_models

In [36]:
# for i in range(len(asset_details)):
#     predicted = models[i].predict(Xs_test[i])
#     print(mean_squared_error(ys_test[i], predicted))

In [37]:
### Submit

In [38]:
env = gresearch_crypto.make_env()
iter_test = env.iter_test()

# For every test batch, predict each row with the model trained for that
# row's asset, write the value into df_pred, then submit the batch.
for df_test, df_pred in iter_test:
    for _, row in df_test.iterrows():
        row_mask = df_pred['row_id'] == row['row_id']

        # .get() makes the "no model for this asset" fallback reachable;
        # plain indexing would raise KeyError and abort the submission.
        model = models.get(row['Asset_ID'])
        if model is None:
            df_pred.loc[row_mask, 'Target'] = 0
            continue

        try:
            x_test = get_features(row)
            y_pred = model.predict(pd.DataFrame([x_test]))[0]
        except Exception:
            # Best effort: one bad row must not abort the whole submission,
            # so log the traceback and fall back to a neutral prediction.
            traceback.print_exc()
            y_pred = 0

        df_pred.loc[row_mask, 'Target'] = y_pred

    env.predict(df_pred)
