# DRW - Crypto Market Prediction: Raw Data Preprocessing Notebook

### Dependencies

In [1]:
import os 
import json
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Preparing the Data

In [2]:
# Read the preprocessed training table and discard its timestamp column in a
# single expression, so re-running this cell always rebuilds train_df from disk.
train_csv_path = os.path.join("Data", "preprocessed_train_df.csv")
train_df = pd.read_csv(train_csv_path).drop(columns=["timestamp"])
train_df.head()

Unnamed: 0,dr_features,label,bid_qty,ask_qty,sell_qty,volume,buy_qty
0,1.067519,0.562539,15.283,8.425,44.984,221.389,176.405
1,1.131456,0.533686,38.59,2.336,321.95,847.796,525.846
2,0.991776,0.546505,0.442,60.25,136.369,295.596,159.227
3,0.836962,0.357703,4.865,21.016,124.963,460.705,335.742
4,0.785671,0.362452,27.158,3.451,44.407,142.818,98.411


In [3]:
# Pairwise Pearson correlation between every column of train_df.
# In the output shown with this cell, dr_features and label correlate at ~0.96,
# while the order-book / volume columns are almost uncorrelated with label.
train_df.corr(method="pearson")

Unnamed: 0,dr_features,label,bid_qty,ask_qty,sell_qty,volume,buy_qty
dr_features,1.0,0.961725,-0.012643,-0.019452,0.011547,0.010001,0.007516
label,0.961725,1.0,-0.01322,-0.015762,0.011166,0.008809,0.005618
bid_qty,-0.012643,-0.01322,1.0,0.015471,-0.064913,-0.06591,-0.060813
ask_qty,-0.019452,-0.015762,0.015471,1.0,-0.05643,-0.062442,-0.062721
sell_qty,0.011547,0.011166,-0.064913,-0.05643,1.0,0.954279,0.819819
volume,0.010001,0.008809,-0.06591,-0.062442,0.954279,1.0,0.953503
buy_qty,0.007516,0.005618,-0.060813,-0.062721,0.819819,0.953503,1.0


In [4]:
# Turn every row of train_df into one instruction-tuning record with the keys
# "instruction", "input" and "output", then shuffle the records so the
# train/validation slices taken from this list are randomly composed.

# Fixed seed: without it the shuffle (and therefore the split written to disk)
# changes on every kernel restart, making results impossible to reproduce.
SHUFFLE_SEED = 42

# The instruction text is identical for every record, so build it once
# instead of on each loop iteration.
_instruction_ = "You are a Bitcoin Market Expert.Predict the anonymized market price movement using the given info."

records = []
for _, row in train_df.iterrows():
    # NOTE(review): "Sell Quantity:" has no space after the colon, unlike the
    # other fields. Left unchanged because the prompt text is part of the
    # dataset format — confirm before normalising.
    _input_ = f'Asset: BTC\nSignal: {row["dr_features"]},\nBid Quantity: {row["bid_qty"]},\nAsk Quantity: {row["ask_qty"]},\nSell Quantity:{row["sell_qty"]},\nVolume: {row["volume"]},\nBuy Quantity: {row["buy_qty"]}'

    # str() accepts both NumPy and plain Python scalars and always yields a
    # built-in str; .astype() exists only on NumPy scalars and returns np.str_.
    _output_ = str(row["label"])

    records.append({"instruction": _instruction_,
                    "input": _input_,
                    "output": _output_})

# A dedicated seeded generator gives a stable order across runs without
# disturbing the global `random` module state.
random.Random(SHUFFLE_SEED).shuffle(records)
records[:5]

[{'instruction': 'You are a Bitcoin Market Expert.Predict the anonymized market price movement using the given info.',
  'input': 'Asset: BTC\nSignal: -0.075171955,\nBid Quantity: 29.991,\nAsk Quantity: 36.123,\nSell Quantity:40.925,\nVolume: 67.006,\nBuy Quantity: 26.081',
  'output': '0.0730768678187458'},
 {'instruction': 'You are a Bitcoin Market Expert.Predict the anonymized market price movement using the given info.',
  'input': 'Asset: BTC\nSignal: 0.1587267,\nBid Quantity: 31.043,\nAsk Quantity: 25.036,\nSell Quantity:5.972,\nVolume: 11.995,\nBuy Quantity: 6.023',
  'output': '0.2189587749863983'},
 {'instruction': 'You are a Bitcoin Market Expert.Predict the anonymized market price movement using the given info.',
  'input': 'Asset: BTC\nSignal: 0.25785226,\nBid Quantity: 2.707,\nAsk Quantity: 9.176,\nSell Quantity:21.457,\nVolume: 36.234,\nBuy Quantity: 14.777',
  'output': '0.5090507766130441'},
 {'instruction': 'You are a Bitcoin Market Expert.Predict the anonymized market

In [5]:
# Sanity check: total number of records built (one is appended per train_df row).
len(records)

525887

In [6]:
# Hold out the first VAL_SIZE entries of the shuffled record list for
# validation; everything after them becomes the training split.
VAL_SIZE = 100000
val_split, train_split = records[:VAL_SIZE], records[VAL_SIZE:]

# Report the size of each split.
for split_name, subset in (("train", train_split), ("val", val_split)):
    print(f"{split_name}: {len(subset)}")

train: 425887
val: 100000


In [8]:
# Serialise each split as JSON Lines (one JSON object per line) under Data/.
for file_name, split_records in (("train.jsonl", train_split),
                                 ("valid.jsonl", val_split)):
    with open(os.path.join("Data", file_name), "w") as f:
        f.writelines(json.dumps(record) + "\n" for record in split_records)
