In [47]:
import pandas as pd
import numpy as np
import plotly.express as px
import talib as ta

# Reading train data
BTCUSDT_train = pd.read_csv('data/BTCUSDT.csv')
USDIRT_train = pd.read_csv('data/USDIRT.csv')
Wallex_USDIRT_train = pd.read_csv('data/Wallex_USDIRT.csv')

In [48]:
# Reading test data
BTCUSDT_test = pd.read_csv('./data/BTC_TEST.csv')
USDIRT_test = pd.read_csv('./data/DOLLAR_TEST.csv')
Wallex_USDIRT_test = pd.read_csv('./data/TETHER_TEST.csv')
USDIRT_test.rename(columns={'OPEN':'Open', 'HIGH':'High', 'CLOSE':'Close', 'LOW':'Low'}, inplace=True)

In [49]:
# Cleaning train data
USDIRT_train.rename(columns={'OPEN':'Open', 'HIGH':'High', 'CLOSE':'Close', 'LOW':'Low'}, inplace=True)
BTCUSDT_train.drop(columns='Unnamed: 0', inplace=True)
USDIRT_train.drop(columns='Unnamed: 0', inplace=True)
Wallex_USDIRT_train.drop(columns='Unnamed: 0', inplace=True)
BTCUSDT_train['Close'].fillna(BTCUSDT_train['Close'].mean(), inplace=True)
USDIRT_train.drop(columns='VOL', inplace=True)
USDIRT_train['Close'].fillna(USDIRT_train['Close'].mean(), inplace=True)
Wallex_USDIRT_train['Volume'].replace({'?': np.NAN}, inplace=True)
Wallex_USDIRT_train['Volume'] = Wallex_USDIRT_train['Volume'].astype(float)
Wallex_USDIRT_train['Volume'].fillna(Wallex_USDIRT_train['Volume'].mean(), inplace=True)

Creating features for train and test

In [50]:
features_train = pd.DataFrame(columns=['BTCUSDT_CCI', 'USDIRT_CCI', 'Wallex_USDIRT_CCI','BTCUSDT_RSI', 'USDIRT_RSI',
                            'Wallex_USDIRT_RSI','BTCUSDT_MACD', 'USDIRT_MACD', 'Wallex_USDIRT_MACD',
                            'Wallex_USDIRT_Avg_Volume_hour', 'Wallex_USDIRT_return','Tether/Dollar_close','Tether-Dollar/Tether_close','label' ])

In [51]:
features_test = pd.DataFrame(columns=['BTCUSDT_CCI', 'USDIRT_CCI', 'Wallex_USDIRT_CCI','BTCUSDT_RSI', 'USDIRT_RSI',
                            'Wallex_USDIRT_RSI','BTCUSDT_MACD', 'USDIRT_MACD', 'Wallex_USDIRT_MACD',
                            'Wallex_USDIRT_Avg_Volume_hour', 'Wallex_USDIRT_return','Tether/Dollar_close','Tether-Dollar/Tether_close','label' ])

In [52]:
def CCI(table, column, features):
    df = pd.DataFrame(columns=['TP', 'sma', 'mad'])
    df['TP'] = (table['High'] + table['Low'] + table['Close']) / 3 
    df['sma'] = df['TP'].rolling(20).mean()
    df['mad'] = df['TP'].rolling(20).apply(lambda x: pd.Series(x).mad())
    features[column] = (df['TP'] - df['sma']) / (0.015 * df['mad'])
    features[column].fillna(features[column].mean(), inplace=True)

def RSI(table, column, features):
    features[column] = ta.RSI(table['Close'], timeperiod=20)
    features[column].fillna(features[column].mean(), inplace=True)
    
def MACD(table, column, features):
    k = table['Close'].ewm(span=12, adjust=False, min_periods=12).mean()
    d = table['Close'].ewm(span=26, adjust=False, min_periods=26).mean()
    macd = k - d
    features[column] = table.index.map(macd)
    features[column].fillna(features[column].mean(), inplace=True)

In [53]:
CCI(BTCUSDT_train, 'BTCUSDT_CCI', features_train)
CCI(USDIRT_train, 'USDIRT_CCI', features_train)
CCI(Wallex_USDIRT_train, 'Wallex_USDIRT_CCI', features_train)


RSI(BTCUSDT_train, 'BTCUSDT_RSI', features_train)
RSI(USDIRT_train, 'USDIRT_RSI', features_train)
RSI(Wallex_USDIRT_train, 'Wallex_USDIRT_RSI', features_train)


MACD(BTCUSDT_train, 'BTCUSDT_MACD', features_train)
MACD(USDIRT_train, 'USDIRT_MACD', features_train)
MACD(Wallex_USDIRT_train, 'Wallex_USDIRT_MACD', features_train)

In [54]:
CCI(BTCUSDT_test, 'BTCUSDT_CCI', features_test)
CCI(USDIRT_test, 'USDIRT_CCI', features_test)
CCI(Wallex_USDIRT_test, 'Wallex_USDIRT_CCI', features_test)

RSI(BTCUSDT_test, 'BTCUSDT_RSI', features_test)
RSI(USDIRT_test, 'USDIRT_RSI', features_test)
RSI(Wallex_USDIRT_test, 'Wallex_USDIRT_RSI', features_test)

MACD(BTCUSDT_test, 'BTCUSDT_MACD', features_test)
MACD(USDIRT_test, 'USDIRT_MACD', features_test)
MACD(Wallex_USDIRT_test, 'Wallex_USDIRT_MACD', features_test)

In [55]:
# The average volume of Tether transactions in the past hour
features_train['Wallex_USDIRT_Avg_Volume_hour'] = Wallex_USDIRT_train['Volume'].rolling(61).mean()
features_train['Wallex_USDIRT_Avg_Volume_hour'].fillna(features_train['Wallex_USDIRT_Avg_Volume_hour'].mean(), inplace=True)

In [56]:
features_test['Wallex_USDIRT_Avg_Volume_hour'] = Wallex_USDIRT_test['Volume'].rolling(61).mean()
features_test['Wallex_USDIRT_Avg_Volume_hour'].fillna(features_test['Wallex_USDIRT_Avg_Volume_hour'].mean(), inplace=True)

In [57]:
# Tether's return in a min
features_train['Wallex_USDIRT_return'] = Wallex_USDIRT_train['Close'] - Wallex_USDIRT_train['Open']

In [58]:
features_test['Wallex_USDIRT_return'] = Wallex_USDIRT_test['Close'] - Wallex_USDIRT_test['Open']

In [59]:
# Tether/Dollar_close
features_train['Tether/Dollar_close'] = Wallex_USDIRT_train['Close'] / USDIRT_train['Close']

In [60]:
features_test['Tether/Dollar_close'] = Wallex_USDIRT_test['Close'] / USDIRT_test['Close']

In [61]:
# Tether-Dollar/Tether_close
features_train['Tether-Dollar/Tether_close'] = ( Wallex_USDIRT_train['Close'] - USDIRT_train['Close'] ) / Wallex_USDIRT_train['Close']

In [62]:
features_test['Tether-Dollar/Tether_close'] = ( Wallex_USDIRT_test['Close'] - USDIRT_test['Close'] ) / Wallex_USDIRT_test['Close']

Making labels

In [63]:
# for train
label = []
for i in range(0, Wallex_USDIRT_train.shape[0] - 1):
    if Wallex_USDIRT_train.loc[i, 'Close'] < Wallex_USDIRT_train.loc[i + 1, 'Close']:
        label.append('1')
    else:
        label.append('0')
label.append('0')
features_train['label'] = label

In [64]:
# for test
label = []
for i in range(0, Wallex_USDIRT_test.shape[0] - 1):
    if Wallex_USDIRT_test.loc[i, 'Close'] < Wallex_USDIRT_test.loc[i + 1, 'Close']:
        label.append('1')
    else:
        label.append('0')
label.append('0')
features_test['label'] = label

resampling train data

In [65]:
label_z = features_train[features_train['label'] == '0']
label_o = features_train[features_train['label'] == '1']
label_o_upsampled = label_o.sample(n=len(label_z), replace=True ,  random_state=42)
data_upsampled = pd.concat([label_z, label_o_upsampled])
data_upsampled.reset_index(drop=True, inplace=True)

In [66]:
ndata = data_upsampled.sample(frac=1 , random_state=42)
ndata.reset_index(drop=True, inplace=True)
ndata

Unnamed: 0,BTCUSDT_CCI,USDIRT_CCI,Wallex_USDIRT_CCI,BTCUSDT_RSI,USDIRT_RSI,Wallex_USDIRT_RSI,BTCUSDT_MACD,USDIRT_MACD,Wallex_USDIRT_MACD,Wallex_USDIRT_Avg_Volume_hour,Wallex_USDIRT_return,Tether/Dollar_close,Tether-Dollar/Tether_close,label
0,-30.433935,33.470833,30.839002,48.829822,51.149557,52.587936,-224.750504,61.730238,-1.914280,848.537358,87.0,1.012176,0.012030,0
1,-37.785282,72.332326,94.316443,48.257480,55.712026,65.639597,-467.995076,77.416644,53.604659,754.979490,0.0,1.010890,0.010773,0
2,105.275459,-42.553191,72.124756,51.340059,48.784788,49.511595,40.167613,-32.022492,1.077380,644.965710,-11.0,0.996354,-0.003660,0
3,-90.680869,44.581742,213.668061,48.400133,51.456617,59.528241,-38.966598,-87.078546,3.438470,748.582750,1.0,1.001430,0.001428,1
4,-115.376381,-319.334125,75.357050,47.950879,39.038687,51.965857,-125.060754,-44.042447,10.162833,3987.114033,7.0,1.084381,0.077815,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88115,38.202101,6.469238,19.223431,51.367495,49.461434,51.283619,-250.366735,-4.171027,-2.648667,1155.302614,0.0,0.994508,-0.005522,0
88116,27.384000,-137.131614,-31.731709,51.018146,47.954546,49.048525,107.220036,-26.770554,-0.312086,1249.525391,0.0,0.999275,-0.000725,1
88117,63.562755,363.540011,-66.921050,51.760455,52.933400,42.640841,320.128616,2.923626,-19.502695,665.034881,0.0,0.995485,-0.004536,1
88118,-43.102028,-36.430484,58.041838,48.726482,48.003338,54.389610,-32.699030,61.903864,37.380920,1753.221766,-16.0,1.020464,0.020054,0


In [67]:
X_train = ndata.drop(columns=['label'])
y_train = ndata['label']

In [68]:
X_test = features_test.drop(columns=['label'])
y_test = features_test['label']

Scaling data

In [75]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [76]:
X_test_scaled = scaler.transform(X_test)

In [77]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
scores = cross_val_score(RandomForestClassifier(), X_train_scaled, y_train, cv=5)
print("CV score of Random forest Classifier: " , scores.mean())

CV score of Random forest Classifier:  0.8361892873354517


In [78]:
model = RandomForestClassifier(max_depth= 40, 
                               min_samples_leaf= 4, 
                               n_estimators= 1000, 
                               min_samples_split=6,
                               bootstrap=False)
model.fit(X_train_scaled, y_train)

RandomForestClassifier(bootstrap=False, max_depth=40, min_samples_leaf=4,
                       min_samples_split=6, n_estimators=1000)

In [79]:
from sklearn.metrics import classification_report
pred = model.predict(X_test_scaled)
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

           0       0.87      0.51      0.64      1169
           1       0.15      0.52      0.23       193

    accuracy                           0.51      1362
   macro avg       0.51      0.51      0.44      1362
weighted avg       0.76      0.51      0.58      1362



In [None]:
output = pd.Series(pred, index=Wallex_USDIRT_test.loc['Datetime'])
output