In [None]:
!pip install tushare # -i https://opentuna.cn/pypi/web/simple

In [None]:
import datetime
import json
import os
import pickle
from datetime import date

import boto3
import numpy as np
import pandas as pd
import sagemaker
import tushare as ts
from sklearn.preprocessing import LabelEncoder

In [None]:
# The Tushare API token is a credential, so it is read from the environment
# instead of being stored in the notebook.
tushare_token = os.environ.get('TUSHARE_TOKEN')
if not tushare_token:
    raise RuntimeError('Set the TUSHARE_TOKEN environment variable to your Tushare API token.')
ts.set_token(tushare_token)

pro = ts.pro_api()

In [None]:
# Query the list of all stocks that are currently listed and trading normally.
data = pro.stock_basic(
    exchange='',
    list_status='L',
    fields='ts_code,symbol,name,area,industry,list_date',
)

In [None]:
# Parse the YYYYMMDD values of `list_date` into pandas datetimes.
data['list_date'] = pd.to_datetime(data['list_date'], format='%Y%m%d')

In [None]:
data.shape

In [None]:
# Keep an independent copy of the listing table as it is at this point.
origin_data = data.copy()

In [None]:
# Save that copy to stock_basic.csv without the index column.
origin_data.to_csv('stock_basic.csv', index=False)

In [None]:
# Keep only the first 10 rows of the listing table. The cells that read `data`
# afterwards therefore only cover these stocks.
# NOTE(review): presumably a shortcut for quick runs - confirm before a full run.
data = data[:10]

In [None]:
data.head()

In [None]:
# Date window as YYYYMMDD strings: a fixed start and today's date as the end.
start_time = '20200101'
end_time = date.today().strftime("%Y%m%d")

In [None]:
# Accumulator for the downloaded bars; stays None until the first download.
alldata = None


def get_daily(ts_code, start_date, end_date):
    """Fetch the daily bars of one stock and append them to the global ``alldata``.

    Args:
        ts_code: Stock code forwarded to ``pro.daily``.
        start_date: Value forwarded to ``pro.daily`` as ``start_date``.
        end_date: Value forwarded to ``pro.daily`` as ``end_date``.

    Returns:
        None. The fetched frame is stacked under the rows already in ``alldata``.
    """
    global alldata
    daily_bars = pro.daily(ts_code=ts_code, start_date=start_date, end_date=end_date)
    alldata = daily_bars if alldata is None else pd.concat((alldata, daily_bars), axis=0)


# Fetch every stock code in `data`; the Series of None results is discarded.
_ = data['ts_code'].apply(get_daily, args=(start_time, end_time))

In [None]:
# Parse the YYYYMMDD values of `trade_date` into pandas datetimes.
alldata['trade_date'] = pd.to_datetime(alldata['trade_date'], format='%Y%m%d')

In [None]:
def get_list_day(ts_code, trade_date):
    """Return the number of days between a stock's listing date and ``trade_date``.

    Args:
        ts_code: Stock code to look up in the module-level ``data`` table.
        trade_date: Datetime of the trade.

    Returns:
        ``(trade_date - list_date).days`` for the first row of ``data`` whose
        ``ts_code`` matches, or None when no row matches.
    """
    matches = data.loc[data['ts_code'] == ts_code, 'list_date']
    if matches.empty:
        return None
    return (trade_date - matches.iloc[0]).days


# Vectorised form of calling get_list_day on every row: look up each row's
# listing date once through a ts_code -> list_date mapping (first occurrence
# wins, as in get_list_day) instead of rescanning `data` for every row.
# Codes that are missing from `data` yield NaN.
alldata['list_day'] = (
    alldata['trade_date']
    - alldata['ts_code'].map(data.drop_duplicates('ts_code').set_index('ts_code')['list_date'])
).dt.days

In [None]:
alldata.shape

In [None]:
alldata.head()

In [None]:
alldata.describe()

In [None]:
# Save the combined daily bars without the index column. The file name is built
# by string concatenation, so start_time and end_time must still be strings here.
alldata.to_csv(start_time+'_'+end_time+'.csv', index=False)

In [None]:
# Time-series settings. prediction_length is used further down as a number of
# days; context_length is not read in the cells visible here
# (presumably a training-side setting - confirm).
freq = '1D'
prediction_length = 7
context_length = 365

# Column roles: series id, target column, timestamp column, categorical
# columns and time-varying numeric columns.
id_feature = 'ts_code'
label_feature = 'close'
time_feature = 'trade_date'
sparse_features = ['area', 'industry']
dynamic_dense_features = ['list_day']

# NOTE: this rebinds start_time / end_time from the YYYYMMDD strings assigned
# earlier to the smallest and largest trade_date present in `alldata`.
start_time = alldata[time_feature].min()
end_time = alldata[time_feature].max()
print('start_time:', start_time)
print('end_time:', end_time)

In [None]:
# For each categorical column print: number of distinct values, the first five
# of them, and the number of missing entries.
for sparse_feature in sparse_features:
    column = data[sparse_feature]
    distinct_values = column.unique()
    print(sparse_feature+':', len(distinct_values), distinct_values[:5], '... na:', sum(column.isna()))

In [None]:
%%time

ids = []
data_group = alldata.groupby(id_feature)
cnt = 0
for name, group in data_group:
    if cnt % 1000 == 0:
        print('cnt:', cnt)
    cnt += 1
    # print(name)
    # print(group)
    new_name = str(name)
    # print(new_name)
    ids.append(new_name)

num_timeseries = len(ids)
print('num_timeseries:', num_timeseries)

In [None]:
ids

In [None]:
def get_timeseries(df, dense_feature):
    """Build one regularly sampled series per id for a single numeric column.

    Relies on the module-level ``id_feature``, ``time_feature``,
    ``label_feature``, ``freq``, ``start_time`` and ``end_time``.

    Args:
        df: Long-format frame with ``id_feature``, ``time_feature`` and
            ``dense_feature`` columns.
        dense_feature: Name of the column to turn into series.

    Returns:
        A list of pandas Series, one per group of ``df`` in groupby order, all
        indexed by the same ``freq`` grid from ``start_time`` to ``end_time``.
        For ``label_feature`` each bin holds the sum of its rows. For any other
        column it holds the mean, with infinities and gaps forward-filled, then
        back-filled, then set to 0.
    """
    is_label = dense_feature == label_feature

    # Column-less frame whose index is the grid every series is aligned to.
    dense_df = pd.DataFrame({time_feature: [start_time, end_time]})
    dense_df = dense_df.set_index(time_feature).resample(freq).asfreq()

    columns = []
    for name, group in df.groupby(id_feature):
        column = pd.DataFrame({name: group[dense_feature], time_feature: group[time_feature]})
        resampler = column.set_index(time_feature).resample(freq)
        columns.append(resampler.sum() if is_label else resampler.mean())
    if columns:
        # Attach all columns in one left join; joining them one at a time
        # copies the growing frame on every iteration.
        dense_df = dense_df.join(pd.concat(columns, axis=1))

    if is_label:
        # sum() returns 0 for bins without rows, so dates that have no record
        # end up with a target of 0 rather than NaN.
        # NOTE(review): confirm this is intended for a price-like label.
        dense_df = dense_df.resample(freq).sum()
    else:
        dense_df = dense_df.resample(freq).mean()
        dense_df = dense_df.replace([np.inf, -np.inf], np.nan)
        # ffill()/bfill() replace the deprecated fillna(method=...) spelling.
        dense_df = dense_df.ffill().bfill().fillna(0)
    print('dense_df.shape:', dense_df.shape)

    # One Series per column actually built, rather than trusting a separately
    # maintained global count to match the frame.
    return [dense_df.iloc[:, i] for i in range(dense_df.shape[1])]

In [None]:
# Target series: one per id, built from the label column of `alldata`.
timeseries = get_timeseries(alldata, label_feature)

In [None]:
# One list of per-id series for each dynamic feature, in the order of
# dynamic_dense_features.
dynamic_dense_timeseries = []
for dense_feature in dynamic_dense_features:
    print(dense_feature)
    # `dense_timeseries` keeps the last feature's list after the loop ends.
    dense_timeseries = get_timeseries(alldata, dense_feature)
    dynamic_dense_timeseries.append(dense_timeseries)

In [None]:
# property_cats[k][i] is the integer code of sparse_features[k] for ids[i].
property_cats = []

ids_df = pd.DataFrame({id_feature: ids})
# Align the static attributes with `ids`. The merge does not depend on the
# feature being encoded, so it is done once instead of once per iteration.
new_data = ids_df.merge(data, how='left', on=id_feature)

for sparse_feature in sparse_features:
    le = LabelEncoder()
    features_arr = le.fit_transform(new_data[sparse_feature])
    property_cats.append(features_arr.tolist())
    le_classes = le.classes_.tolist()
    print(sparse_feature, 'features_arr:', len(le_classes))
    # Save the fitted encoder next to the notebook; the context manager makes
    # sure the file handle is flushed and closed.
    with open((sparse_feature+'_le.pickle').replace('/', '_'), 'wb') as encoder_file:
        pickle.dump(le, encoder_file)

In [None]:
property_cats

In [None]:
# Split boundaries, derived backwards from the day after the last observation.
# The test and predict windows are each prediction_length days long, and each
# window starts where the previous one ends.
DATETIME_END_OF_PREDICT = end_time + datetime.timedelta(days=1)
DATETIME_END_OF_TEST = DATETIME_END_OF_PREDICT - datetime.timedelta(days=prediction_length)
DATETIME_END_OF_TRAIN = DATETIME_END_OF_PREDICT - datetime.timedelta(days=2*prediction_length)
DATETIME_START_OF_TRAIN = start_time
DATETIME_START_OF_TEST = DATETIME_END_OF_TRAIN
DATETIME_START_OF_PREDICT = DATETIME_END_OF_TEST

In [None]:
# Convert the split boundaries to pandas Timestamps for label-based slicing.
# The `freq` argument of pd.Timestamp was deprecated and later removed from
# pandas, and it plays no part in slicing, so it is not passed.
start_dataset = pd.Timestamp(DATETIME_START_OF_TRAIN)
end_training = pd.Timestamp(DATETIME_END_OF_TRAIN)
start_test = pd.Timestamp(DATETIME_START_OF_TEST)
end_test = pd.Timestamp(DATETIME_END_OF_TEST)
start_predict = pd.Timestamp(DATETIME_START_OF_PREDICT)
end_predict = pd.Timestamp(DATETIME_END_OF_PREDICT)
print('start_dataset:', start_dataset)
print('end_training:', end_training)
print('start_test:', start_test)
print('end_test:', end_test)
print('start_predict:', start_predict)
print('end_predict:', end_predict)

In [None]:
# Training records: every series cut to [start_dataset, end_training).
# Label-based slicing includes end_training itself, hence the trailing [:-1].
training_data = [
    {
        "start": str(timeseries[i].index[0]),
        "target": timeseries[i][start_dataset:end_training][:-1].tolist(),
        "dynamic_feat": [
            feature_series[i][start_dataset:end_training][:-1].tolist()
            for feature_series in dynamic_dense_timeseries
        ],
        "cat": [property_cat[i] for property_cat in property_cats],
        "id": ids[i],
    }
    for i in range(num_timeseries)
]
# Summary: record count, then target length and last dynamic-feature length of
# the first record. The lengths are read from the record itself so the cell
# does not rely on a loop variable left over from another cell and does not
# fail when there are no records or no dynamic features.
if training_data:
    first_record = training_data[0]
    print(len(training_data), len(first_record["target"]), *(len(feat) for feat in first_record["dynamic_feat"][-1:]))
else:
    print(0)

In [None]:
# Test records: every series cut to [start_dataset, end_test).
# Label-based slicing includes end_test itself, hence the trailing [:-1].
test_data = [
    {
        "start": str(timeseries[i].index[0]),
        "target": timeseries[i][start_dataset:end_test][:-1].tolist(),
        "dynamic_feat": [
            feature_series[i][start_dataset:end_test][:-1].tolist()
            for feature_series in dynamic_dense_timeseries
        ],
        "cat": [property_cat[i] for property_cat in property_cats],
        "id": ids[i],
    }
    for i in range(num_timeseries)
]
# Summary: record count, then target length and last dynamic-feature length of
# the first record. The lengths are read from the record itself so the cell
# does not rely on a loop variable left over from another cell and does not
# fail when there are no records or no dynamic features.
if test_data:
    first_record = test_data[0]
    print(len(test_data), len(first_record["target"]), *(len(feat) for feat in first_record["dynamic_feat"][-1:]))
else:
    print(0)

In [None]:
# Prediction records: every series from start_dataset up to and including
# end_predict (no trailing element is dropped here).
predict_data = [
    {
        "start": str(timeseries[i].index[0]),
        "target": timeseries[i][start_dataset:end_predict].tolist(),
        "dynamic_feat": [
            feature_series[i][start_dataset:end_predict].tolist()
            for feature_series in dynamic_dense_timeseries
        ],
        "cat": [property_cat[i] for property_cat in property_cats],
        "id": ids[i],
    }
    for i in range(num_timeseries)
]
# Summary: record count, then target length and last dynamic-feature length of
# the first record. The lengths are read from the record itself so the cell
# does not rely on a loop variable left over from another cell and does not
# fail when there are no records or no dynamic features.
if predict_data:
    first_record = predict_data[0]
    print(len(predict_data), len(first_record["target"]), *(len(feat) for feat in first_record["dynamic_feat"][-1:]))
else:
    print(0)

In [None]:
def write_dicts_to_file(path, data):
    """Write ``data`` to ``path`` as UTF-8 JSON Lines (one object per line).

    Float NaN values are written as the JSON string ``"NaN"``. The substitution
    is done on the values before serialisation, so strings that merely contain
    the letters "NaN" (for example an id) are written unchanged.

    Args:
        path: Destination file path; an existing file is overwritten.
        data: Iterable of JSON-serialisable objects.
    """
    def _nan_to_str(value):
        """Return ``value`` with NaN floats replaced by "NaN", recursing into containers."""
        if isinstance(value, float) and value != value:  # NaN is the only float unequal to itself
            return "NaN"
        if isinstance(value, dict):
            return {key: _nan_to_str(item) for key, item in value.items()}
        if isinstance(value, (list, tuple)):
            return [_nan_to_str(item) for item in value]
        return value

    with open(path, 'wb') as fp:
        for d in data:
            fp.write(json.dumps(_nan_to_str(d)).encode("utf-8"))
            fp.write("\n".encode('utf-8'))

In [None]:
%%time
# Write the three record lists to JSON Lines files named after `freq`.
write_dicts_to_file("train_"+freq+".json", training_data)
write_dicts_to_file("test_"+freq+".json", test_data)
write_dicts_to_file("predict_"+freq+".json", predict_data)

In [None]:
s3 = boto3.resource('s3')


def copy_to_s3(local_file, s3_path, override=False):
    """Upload a local file to an S3 location.

    Args:
        local_file: Path of the file to upload.
        s3_path: Destination in the form ``s3://<bucket>/<key>``.
        override: When False (the default), nothing is uploaded if an object
            with exactly this key already exists.

    Raises:
        AssertionError: If ``s3_path`` does not start with ``s3://``.
    """
    assert s3_path.startswith('s3://')
    bucket, _, path = s3_path[len('s3://'):].partition('/')
    buk = s3.Bucket(bucket)

    # A prefix listing also returns longer keys that merely start with `path`,
    # so compare keys exactly to decide whether the target already exists.
    if any(obj.key == path for obj in buk.objects.filter(Prefix=path)):
        if not override:
            # Report the bucket and key parsed from s3_path itself rather than
            # relying on a global that is only defined in a later cell.
            print('File s3://{}/{} already exists.\nSet override to upload anyway.\n'.format(bucket, path))
            return
        else:
            print('Overwriting existing file')
    with open(local_file, 'rb') as body:
        print('Uploading file to {}'.format(s3_path))
        buk.put_object(Key=path, Body=body)

In [None]:
# SageMaker session, execution role and region of the current environment.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()             # IAM role to use by SageMaker
region = sagemaker_session.boto_region_name

# Destination of the datasets: s3://<bucket>/<prefix>/data
s3_bucket = sagemaker_session.default_bucket()  # replace with an existing bucket if needed
s3_prefix = 'time_series_forecast'    # prefix used for all data stored within the bucket
s3_data_path = "s3://{}/{}/data".format(s3_bucket, s3_prefix)

In [None]:
%%time
# Upload each dataset file to its own sub-folder of s3_data_path, replacing
# any object that is already there (override=True).
copy_to_s3("train_"+freq+".json", s3_data_path + "/train/train_"+freq+".json", override=True)
copy_to_s3("test_"+freq+".json", s3_data_path + "/test/test_"+freq+".json", override=True)
copy_to_s3("predict_"+freq+".json", s3_data_path + "/predict/predict_"+freq+".json", override=True)