In [None]:
%pylab inline
%matplotlib inline

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pylab

In [None]:
# Directory that holds the challenge snapshot CSV files.
# The MMP_DATA_DIR environment variable, when set, overrides the default path so
# the notebook can run on another machine without editing this cell.
file_dir = os.environ.get("MMP_DATA_DIR", "/l1/data/FBDQA2021A_MMP_Challenge_ver0.2/data")

## 读入单个文件并观察

In [None]:
# Identifiers that select which snapshot file to inspect; both are
# substituted into the file name below.
sym, date = 5, 10

file_name = "snapshot_sym{}_date{}_am.csv".format(sym, date)

In [None]:
# Read the selected snapshot CSV into a DataFrame and display it.
csv_path = os.path.join(file_dir, file_name)
df = pd.read_csv(csv_path)
df

## 检查异常值/缺失值

In [None]:
# Check whether the frame contains any NA value at all. If it does, the missing
# values must be handled first (is filling with 0 always a valid choice?).
df.isna().to_numpy().any()

In [None]:
# Print the name of every non-object column that holds a non-finite value.
# np.isfinite is False for NaN as well as for +/-inf, so a column containing
# NaN is reported here too.
cols = df.columns
for col in cols:
    series = df[col]
    if series.dtypes == 'object':
        # Object columns cannot be tested with np.isfinite; skip them.
        continue
    if not np.isfinite(series.values).all():
        print(col)

In [None]:
# Summary statistics of the loaded snapshot (pandas describes the numeric
# columns by default when the frame has mixed dtypes).
df.describe()

In [None]:
# When operands differ too much in magnitude, the smaller one can be lost in
# floating-point addition, which may distort later computations.
for tiny in (1e-10, 1e-20):
    print(tiny + 1.0 == 1.0)
# See, for example, amount_delta in the output above.

## 画图观察

In [None]:
# Plot n_bid1, n_ask1 and n_midprice against the row index.
df.loc[:, ['n_bid1', 'n_ask1', 'n_midprice']].plot(figsize=(16, 4))


In [None]:
# Plot the n_bid2/n_bid3 and n_ask2/n_ask3 columns against the row index.
df.loc[:, ['n_bid2', 'n_bid3', 'n_ask2', 'n_ask3']].plot(figsize=(16, 4))

In [None]:
# n_bid1 / n_ask1 on the left y-axis, label_5 on a twin right y-axis.
_, ax1 = plt.subplots(figsize=(16, 5))
df.loc[:, ['n_bid1', 'n_ask1']].plot(ax=ax1, alpha=0.8)
ax1.set_yticks(np.arange(-0.05, 0.05, 0.01))
ax1.set_ylabel('price')
ax1.legend(loc='upper left')

# Second y-axis that shares the x-axis with ax1.
ax2 = ax1.twinx()
df.loc[:, ['label_5']].plot(ax=ax2, grid=True, alpha=0.4)
ax2.set_yticks(np.arange(0, 5, 1))
ax2.set_ylabel('label')
ax2.legend(loc='upper right')

ax2.set_title('price_label')

In [None]:
# Add a 'spread' column (n_ask1 minus n_bid1) and plot it against the row index.
df['spread'] = df['n_ask1'].sub(df['n_bid1'])
df['spread'].plot(figsize=(16, 4))

In [None]:
# Spread on the left y-axis, label_5 on a twin right y-axis.
_, ax1 = plt.subplots(figsize=(16, 5))
df.loc[:, ['spread']].plot(ax=ax1, alpha=0.8)
ax1.set_yticks(np.arange(-0.01, 0.01, 0.001))
ax1.set_ylabel('spread')
ax1.legend(loc='upper left')

# Second y-axis that shares the x-axis with ax1.
ax2 = ax1.twinx()
df.loc[:, ['label_5']].plot(ax=ax2, grid=True, alpha=0.4)
ax2.set_yticks(np.arange(0, 5, 1))
ax2.set_ylabel('label')
ax2.legend(loc='upper right')

In [None]:
# Derive adj_label_5 from label_5: rows where label_5 equals 2 are set to 0,
# every other value is kept unchanged.
df['adj_label_5'] = df['label_5'].mask(df['label_5'] == 2, 0)

In [None]:
# Spread on the left y-axis, adj_label_5 on a twin right y-axis.
_, ax1 = plt.subplots(figsize=(16, 5))
df.loc[:, ['spread']].plot(ax=ax1, alpha=0.8)
ax1.set_yticks(np.arange(-0.01, 0.01, 0.001))
ax1.set_ylabel('spread')
ax1.legend(loc='upper left')

# Second y-axis that shares the x-axis with ax1.
ax2 = ax1.twinx()
df.loc[:, ['adj_label_5']].plot(ax=ax2, grid=True, alpha=0.4)
ax2.set_yticks(np.arange(0, 4, 1))
ax2.set_ylabel('label')
ax2.legend(loc='upper right')

## size 的数值范围变化过大，不便直接处理：取对数（log）

In [None]:
# Summary statistics of the n_bsize1 and n_asize1 columns.
df.loc[:, ['n_bsize1', 'n_asize1']].describe()

In [None]:
# Apply log1p(x) = log(1 + x) to the five bid/ask size columns and to
# amount_delta, storing each result in a new column without the 'n_' prefix
# ('amount' for amount_delta).
# Why log1p rather than log: log1p is finite at x == 0 (log(0) is -inf) and is
# more accurate for very small x.
for side in ('b', 'a'):
    for level in range(1, 6):
        # np.log1p is applied to the whole column at once (vectorised).
        df[f'{side}size{level}'] = np.log1p(df[f'n_{side}size{level}'])
df['amount'] = np.log1p(df['amount_delta'])

# Compare each transformed column with its raw counterpart. bsize2 is paired
# with n_bsize2 (the original list repeated n_bsize1 by mistake).
df[['bsize1', 'n_bsize1', 'bsize2', 'n_bsize2',
    'asize1', 'n_asize1', 'asize2', 'n_asize2',
    'amount', 'amount_delta']].describe()


## 查看取对数后的数据分布

In [None]:
# n_bid1 / n_ask1 / n_close on the left y-axis, the log1p-transformed
# bsize1 / asize1 columns on a twin right y-axis.
_, ax1 = plt.subplots(figsize=(16, 5))
df.loc[:, ['n_bid1', 'n_ask1', 'n_close']].plot(ax=ax1, alpha=0.8)
ax1.set_yticks(np.arange(-0.05, 0.05, 0.01))
ax1.set_ylabel('price')
ax1.legend(loc='upper left')

# Second y-axis that shares the x-axis with ax1.
ax2 = ax1.twinx()
df.loc[:, ['bsize1', 'asize1']].plot(ax=ax2, grid=True, alpha=0.4)
# NOTE(review): log1p of a non-negative input is >= 0, so ticks on [-16, 0)
# may fall outside the data range — confirm against the actual column values.
ax2.set_yticks(np.arange(-16, 0, 1))
ax2.set_ylabel('vol')
ax2.legend(loc='upper right')

ax2.set_title('price_vol')
# plt.savefig('price_vol.png', dpi=400, bbox_inches='tight')