In [None]:
import pandas as pd
import numpy as np
import torch
import os


In [None]:
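# One-off renaming step (kept commented out): for every file in ./CSI, a name whose
# stem ends in "SS" becomes "SH" + its first six characters, and any other name
# becomes "SZ" + its first six characters.
# NOTE: the original file is removed before the renamed copy is written.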

# tic_list = [tic[:-4] for tic in os.listdir("./CSI")]
# for i in range(len(tic_list)):
#     data = pd.read_csv(f"./CSI/{tic_list[i]}.csv", index_col=0)
#     os.remove(f"./CSI/{tic_list[i]}.csv")
#     if tic_list[i][-2:] == "SS":
#         mark = "SH"
#     else:
#         mark = "SZ"
#     tic_list[i] = mark + tic_list[i][:6]
#     data.to_csv(f"./CSI/{tic_list[i]}.csv")
   

In [None]:
import qlib
from qlib.config import REG_CN
from qlib.contrib.data.handler import Alpha158

# Initialise qlib against the local CSI data directory with region=REG_CN.
qlib.init(provider_uri="~/.qlib/CSI_data", region=REG_CN)

# Keyword arguments for the Alpha158 handler: every instrument, 2010-01-01 to 2022-06-01.
data_handler_config = dict(
    start_time="2010-01-01",
    end_time="2022-06-01",
    instruments="all",
)
h = Alpha158(**data_handler_config)

# Fetch the handler's data using its default arguments.
data_df = h.fetch()


In [None]:
# Turn the index levels into ordinary columns, discard columns that are NaN in
# every row, and rename the key columns to "date" and "tic".
feature_df = (
    data_df
    .reset_index()
    .dropna(axis=1, how="all")
    .rename(columns={"datetime": "date", "instrument": "tic"})
)
feature_df


In [None]:
# Features whose NaN counts differ between stocks.
# A column is flagged when its total NaN count is not a multiple of 100.
# NOTE(review): 100 is presumably the number of stocks in the universe -- confirm.

nan_per_column = feature_df.isna().sum()
irregular_nan = nan_per_column[nan_per_column % 100 != 0]
for col_name, nan_count in irregular_nan.items():
    print(col_name, nan_count)
drop_col = irregular_nan.index.to_list()
            

In [None]:
# Remove the columns collected in drop_col.
# Note: re-running this cell on its own raises KeyError, because the columns are
# already gone from feature_df after the first run.
feature_df = feature_df.drop(columns=drop_col)

In [None]:
# Keep only the rows that have no NaN in any remaining column.
feature_df = feature_df.dropna()

In [None]:
# Print how many rows each ticker has, in order of first appearance.
tic_column = feature_df['tic']
for symbol in tic_column.unique():
    print((tic_column == symbol).sum())

In [None]:
# Name of the prediction target: the return over the next `target_return_span` rows.
target_return_span = 5
target = f"return+{target_return_span}"

# Raw price/volume column names.
basic_feature = ["open", "close", "high", "low", "volume"]

# Every column after the first two of feature_df is treated as a model feature.
# NOTE(review): assumes the first two columns are "date" and "tic", and that no
# label column coming from the handler is left among the rest -- confirm.
alpha158 = list(feature_df.columns[2:])

In [None]:
# Length of the look-back window, in rows, used when slicing feature windows.
time_span = 60

# Split boundaries, read pairwise as (start, end) for train, validation and test.
# NOTE(review): the ranges overlap -- validation starts (20180102) before training
# ends (20181228), and test starts (20190402) before validation ends (20201231).
# Confirm this is intended; otherwise the splits share samples.
CSI_date = [
    '20110419', '20181228',
    '20180102', '20201231',
    '20190402', '20211231',
]

In [None]:
# Build the per-ticker price table together with its forward-return target.
# For row t of a ticker the target is close[t + N] / close[t] - 1, where
# N = target_return_span; the last N rows of each ticker have no target and are
# removed by the dropna() below.
tic_df_list = []
for tic_path in sorted(os.listdir("./CSI/")):
    # Only CSV files are price tables; skip anything else in the folder
    # (for example the .ipynb_checkpoints directory Jupyter may create),
    # which would otherwise make read_csv fail.
    if not tic_path.lower().endswith(".csv"):
        continue
    tic_df = pd.read_csv(f"./CSI/{tic_path}", index_col=0)[["date", "open", "close", "high", "low", "volume"]]
    # pct_change/shift operate on row positions, so the rows must be in
    # chronological order before the target is computed.
    tic_df = tic_df.assign(date=pd.to_datetime(tic_df["date"])).sort_values(by="date")
    # The ticker symbol is the first 8 characters of the file name.
    tic_df["tic"] = tic_path[:8]
    tic_df[target] = tic_df.close.pct_change(target_return_span).shift(-1 * target_return_span)
    tic_df_list.append(tic_df)

tic_target_df = pd.concat(tic_df_list).sort_values(by="date").dropna()
# The dates are already parsed per file; this keeps the column dtype explicit.
tic_target_df.date = pd.DatetimeIndex(tic_target_df.date)


In [None]:
# Display the combined price / forward-return table for inspection.
tic_target_df

In [None]:
# Inner-join the feature rows with the price/target rows on (date, tic);
# only keys present in both frames are kept.
dataset_df = feature_df.merge(
    tic_target_df,
    on=["date", "tic"],
    how='inner',
)
dataset_df

In [None]:
# Print the merged frame's column dtypes and non-null counts.
dataset_df.info()

In [None]:
# Slice the merged frame into train / validation / test by date.
# Both ends of every range are inclusive; CSI_date holds (start, end) pairs.
train_df = dataset_df[dataset_df.date.between(CSI_date[0], CSI_date[1])]
val_df = dataset_df[dataset_df.date.between(CSI_date[2], CSI_date[3])]
test_df = dataset_df[dataset_df.date.between(CSI_date[4], CSI_date[5])]

In [None]:
# tic_list = dataset_df.tic.unique()

# with open("tic_list.txt", "w") as f:
#     for tic in tic_list:
#         f.write(tic + "\n")

In [None]:
# Load the ticker universe, one symbol per line.
# NOTE: no active cell creates tic_list.txt; the commented-out cell above shows
# how it was generated from dataset_df.
with open("tic_list.txt", "r") as f:
    # Skip blank lines (e.g. a trailing empty line); they would otherwise become
    # an empty ticker that matches no rows downstream.
    tic_list = [line.strip() for line in f if line.strip()]

In [None]:
import torch

In [None]:
def df_2_array(dataset_df, feat_col, target, type, tickers=None, window=None, save_root="./dataset/alpha"):
    """Build sliding-window arrays for one data split and save them as tensors.

    Rows of each ticker are used in the order they appear in ``dataset_df``.
    For a ticker with ``n`` rows, sample ``k`` (``0 <= k < n - window``) pairs
    the feature rows ``k .. k + window - 1`` with the target and the close
    price of row ``k + window``, i.e. the row right after the window.

    Parameters
    ----------
    dataset_df : pandas.DataFrame
        Frame with a ``tic`` column, a ``close`` column, the ``feat_col``
        columns and the ``target`` column.
    feat_col : list of str
        Feature columns to window.
    target : str
        Name of the target column.
    type : str
        Split name; selects the output sub-directory (the notebook uses
        "train", "val" and "test").
    tickers : sequence of str, optional
        Tickers to include, in output order. Defaults to the notebook-level
        ``tic_list``.
    window : int, optional
        Number of rows per window. Defaults to the notebook-level ``time_span``.
    save_root : str, optional
        Directory under which ``{type}/feat.pt``, ``{type}/ret.pt`` and
        ``{type}/price.pt`` are written. Created if it does not exist.

    Returns
    -------
    tuple of numpy.ndarray
        ``(feat, ret, price)`` with shapes
        ``(samples, window, tickers, features)``, ``(samples, tickers)`` and
        ``(samples, tickers)``.

    Raises
    ------
    ValueError
        If no ticker is given, a ticker has no more than ``window`` rows, or
        the tickers do not all have the same number of rows.
    """
    # Fall back to the notebook-level settings when not given explicitly.
    tickers = list(tic_list if tickers is None else tickers)
    window = time_span if window is None else window
    if not tickers:
        raise ValueError("df_2_array needs at least one ticker")

    feat_per_tic = []
    ret_per_tic = []
    price_per_tic = []
    reference_rows = None
    for tic in tickers:
        df = dataset_df[dataset_df.tic == tic]
        n_rows = len(df)

        # The per-ticker arrays are stacked along a new axis below, which only
        # works when every ticker yields the same, non-zero number of samples.
        if n_rows <= window:
            raise ValueError(
                f"ticker {tic!r} has {n_rows} rows in split {type!r}; "
                f"more than {window} are required"
            )
        if reference_rows is None:
            reference_rows = n_rows
        elif n_rows != reference_rows:
            raise ValueError(
                f"ticker {tic!r} has {n_rows} rows in split {type!r} but "
                f"{reference_rows} were expected; all tickers must have the same number of rows"
            )

        feat = df[feat_col].to_numpy()
        ret = df[target].to_numpy()
        price = df['close'].to_numpy()

        # Window i covers rows [i - window, i); its label is taken from row i.
        feat_per_tic.append(np.stack([feat[i - window:i] for i in range(window, n_rows)]))
        ret_per_tic.append(ret[window:])
        price_per_tic.append(price[window:])

    # Put the ticker axis after the time axis: (samples, window, tickers, features).
    dataset_feat = np.stack(feat_per_tic, axis=2)
    dataset_ret = np.stack(ret_per_tic, axis=1)
    dataset_price = np.stack(price_per_tic, axis=1)

    dataset_feat_tensor = torch.tensor(dataset_feat, dtype=torch.float)
    dataset_ret_tensor = torch.tensor(dataset_ret, dtype=torch.float)
    dataset_price_tensor = torch.tensor(dataset_price, dtype=torch.float)

    # torch.save does not create missing directories, so make sure the target exists.
    os.makedirs(f"{save_root}/{type}", exist_ok=True)
    torch.save(dataset_feat_tensor, f"{save_root}/{type}/feat.pt")
    torch.save(dataset_ret_tensor, f"{save_root}/{type}/ret.pt")
    torch.save(dataset_price_tensor, f"{save_root}/{type}/price.pt")

    return dataset_feat, dataset_ret, dataset_price

In [None]:
# Export the train and validation splits; their return values are discarded.
for split_name, split_df in (("train", train_df), ("val", val_df)):
    df_2_array(split_df, alpha158, target, split_name)

# Export the test split and keep its arrays for the shape checks that follow.
dataset_feat, dataset_ret, dataset_price = df_2_array(test_df, alpha158, target, "test")


In [None]:
# Shape of the test feature array: (samples, time_span, tickers, features).
dataset_feat.shape

In [None]:
# Shape of the test target array: (samples, tickers).
dataset_ret.shape