# DRW - Crypto Market Prediction : Test Raw Data Preprocessing Notebook

### Dependencies

In [1]:
import os 
import json
import random
import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt

In [2]:
test_df = pd.read_parquet(os.path.join("Data","test.parquet"))
test_df.drop(columns = ["bid_qty",
                        "ask_qty",
                        "sell_qty",
                        "volume",
                        "buy_qty",
                        "label"],
                        inplace = True)
test_df.head()

Unnamed: 0_level_0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X881,X882,X883,X884,X885,X886,X887,X888,X889,X890
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.732818,0.512331,-0.041982,-0.59826,-0.517646,-0.720098,-0.76901,-0.780178,-0.3365,0.249788,...,0.43329,1.014336,1.367567,1.584126,1.584126,1.584126,1.584125,1.567979,1.426368,2.192682
2,-0.337995,-0.412176,-0.259468,-0.334809,-0.247443,-0.243987,-0.214849,-0.224255,-0.539625,-0.315144,...,0.725448,1.748939,1.848177,0.000131,0.000323,0.001746,0.007319,0.050925,0.142206,0.739759
3,0.111249,0.458221,0.466916,0.574081,0.324722,0.291298,0.295451,0.324332,-0.007327,0.351338,...,0.743693,1.70468,1.772028,0.00055,0.003597,0.009265,0.019948,0.072535,0.169672,0.807862
4,-0.149399,-0.640638,-0.873778,-1.026144,-0.508816,-0.318499,-0.27099,-0.26999,-0.206264,-0.491395,...,0.451041,0.930946,1.037839,1.382037,1.382037,1.382037,1.382037,1.381752,1.363445,2.465509
5,-0.694662,0.611254,0.067671,-0.531632,-0.58145,-0.670998,-0.658519,-0.641033,-0.709044,0.522476,...,0.602724,1.080267,1.141878,0.000363,0.012718,0.137533,0.464112,1.050577,1.268567,2.499015


### Dimensional Reduction

In [3]:
reg = joblib.load(os.path.join("Models","dim-rd1.bin"))
reg

In [4]:
dr_features = reg.predict(test_df.values)
dr_features

array([-0.23247643,  0.45780745,  0.05881966, ..., -0.24483939,
       -0.4647001 ,  3.9344232 ], dtype=float32)

In [5]:
test_df = pd.read_parquet(os.path.join("Data","test.parquet"))

new_test_df = pd.DataFrame({"dr_features":dr_features,
                             "label":test_df["label"],
                             "bid_qty":test_df["bid_qty"],
                             "ask_qty":test_df["ask_qty"],
                             "sell_qty":test_df["sell_qty"],
                             "volume":test_df["volume"],
                             "buy_qty":test_df["buy_qty"]})
new_test_df.head()

Unnamed: 0_level_0,dr_features,label,bid_qty,ask_qty,sell_qty,volume,buy_qty
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,-0.232476,0.0,0.114,12.121,10.971,21.558,10.587
2,0.457807,0.0,2.426,2.962,12.304,148.545,136.241
3,0.05882,0.0,1.085,2.343,57.171,80.561,23.39
4,-0.325016,0.0,14.793,1.117,13.082,129.6,116.518
5,-0.607323,0.0,0.033,14.178,49.836,93.636,43.8


### Preparing the Data

In [16]:
records = []
for _,row in new_test_df.iterrows():
    
    _instruction_ = "You are a Bitcoin Market Expert.Predict the anonymized market price movement using the given info."
    _input_ = f'Asset: BTC\nSignal: {row["dr_features"].astype("str")},\nBid Quantity: {row["bid_qty"].astype("str")},\nAsk Quantity: {row["ask_qty"].astype("str")},\nSell Quantity:{row["sell_qty"].astype("str")},\nVolume: {row["volume"].astype("str")},\nBuy Quantity: {row["buy_qty"].astype("str")}'
    _output_ = row["label"].astype("str")
    
    records.append({"text":f"{_instruction_},Input: {_input_}"})
    
random.shuffle(records)
records[:5]

[{'text': 'You are a Bitcoin Market Expert.Predict the anonymized market price movement using the given info.,Input: Asset: BTC\nSignal: -0.09824385493993759,\nBid Quantity: 0.033,\nAsk Quantity: 1.69,\nSell Quantity:258.487,\nVolume: 399.839,\nBuy Quantity: 141.352'},
 {'text': 'You are a Bitcoin Market Expert.Predict the anonymized market price movement using the given info.,Input: Asset: BTC\nSignal: -0.0637592077255249,\nBid Quantity: 5.874,\nAsk Quantity: 12.794,\nSell Quantity:102.785,\nVolume: 206.788,\nBuy Quantity: 104.003'},
 {'text': 'You are a Bitcoin Market Expert.Predict the anonymized market price movement using the given info.,Input: Asset: BTC\nSignal: -0.03254402428865433,\nBid Quantity: 28.462,\nAsk Quantity: 1.373,\nSell Quantity:35.832,\nVolume: 148.018,\nBuy Quantity: 112.186'},
 {'text': 'You are a Bitcoin Market Expert.Predict the anonymized market price movement using the given info.,Input: Asset: BTC\nSignal: 1.2404043674468994,\nBid Quantity: 2.511,\nAsk Quan

In [17]:
len(records)

538150

In [18]:
with open(os.path.join("Finegrained","test.jsonl"),"w") as f:
    for record in records:
        f.write(json.dumps(record) + "\n")
