Study the relationship between: <br>
input: the percentage deviation of the current price from its x-day simple moving average<br>
output: the percentage change in price y days later, relative to the current price

In [14]:
import pandas as pd
import talib
import os
import logging
import time

In [13]:
# Single source for the record layout used by both the console and the file log.
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
LOG_FILE = 'study_trail_1.log'

logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logger = logging.getLogger("study_trail_1")
formatter = logging.Formatter(LOG_FORMAT)

# logging.getLogger returns the same logger object on every call, so re-running
# this cell would otherwise stack another FileHandler on it and every message
# would be written to the log file once per run of the cell.
if not any(isinstance(existing, logging.FileHandler) for existing in logger.handlers):
    handler = logging.FileHandler(LOG_FILE)
    handler.setFormatter(formatter)
    logger.addHandler(handler)

Read data

In [2]:
# Open read-only: the notebook only ever calls .select() on this store, and the
# default mode ('a') would create an empty file if the path were wrong and keep
# the data file open for writing for the whole session.
stock_data_store = pd.HDFStore('stock_day.h5', mode='r')

In [3]:
# keep_default_na=False: the "symbols" column holds ticker text, and pandas'
# default NA markers (e.g. "NA", "NULL", "None") would otherwise be parsed as
# missing values instead of being kept as symbols.
symbols_frame = pd.read_csv("stock_symbols.csv", keep_default_na=False)

In [4]:
# Preview the first rows to check the CSV loaded and exposes a "symbols" column.
symbols_frame.head()

Unnamed: 0,symbols
0,A
1,AA
2,AAAP
3,AAAU
4,AABA


In [5]:
# Plain Python list of every symbol in the CSV, in file order.
symbol_list = symbols_frame["symbols"].tolist()

In [6]:
# Spot-check the first five entries of the symbol list.
symbol_list[:5]

['A', 'AA', 'AAAP', 'AAAU', 'AABA']

Filter symbols to only process stocks whose most recent adjusted volume is above 3,000,000

In [8]:
# Minimum adj_volume (strictly greater than) on a symbol's last stored row.
MIN_LAST_VOLUME = 3000000

filter_symbol_list = []
finished_number = 0  # stays 0 when symbol_list is empty
for finished_number, symbol in enumerate(symbol_list, start=1):
    symbol_frame = stock_data_store.select('day', where=['symbol=="{}"'.format(symbol)])
    # A symbol with no rows in the store would make iloc[-1] raise IndexError
    # and abort the whole scan; treat it as "does not pass the filter" instead.
    if not symbol_frame.empty and symbol_frame["adj_volume"].iloc[-1] > MIN_LAST_VOLUME:
        filter_symbol_list.append(symbol)
    # Progress report every 500 symbols.
    if finished_number % 500 == 0:
        print("process: {}/{}".format(finished_number, len(symbol_list)))
        

process: 500/14370
process: 1000/14370
process: 1500/14370
process: 2000/14370
process: 2500/14370
process: 3000/14370
process: 3500/14370
process: 4000/14370
process: 4500/14370
process: 5000/14370
process: 5500/14370
process: 6000/14370
process: 6500/14370
process: 7000/14370
process: 7500/14370
process: 8000/14370
process: 8500/14370
process: 9000/14370
process: 9500/14370
process: 10000/14370
process: 10500/14370
process: 11000/14370
process: 11500/14370
process: 12000/14370
process: 12500/14370
process: 13000/14370
process: 13500/14370
process: 14000/14370


In [9]:
# Number of symbols that passed the volume filter.
len(filter_symbol_list)

1724

In [None]:
# Write the filtered symbols to disk as a single-column CSV ("symbols"),
# without the row index.
filter_symbols_frame = pd.DataFrame(filter_symbol_list, columns=["symbols"])
filter_symbols_frame.to_csv("filter_stock_symbols.csv", index=False)

In [8]:
# Load the filtered symbols back from the CSV.
# keep_default_na=False keeps ticker text such as "NA" as a string instead of
# letting pandas' default NA markers turn it into a missing value.
filter_symbol_list = (
    pd.read_csv("filter_stock_symbols.csv", keep_default_na=False)
    .loc[:, "symbols"]
    .tolist()
)

Main function for evaluation

In [15]:
def get_symbol_test_data(param_set, symbol):
    """Build the (input, output) sample table for one symbol.

    Reads the symbol's rows from the module-level ``stock_data_store`` and
    derives two percentage columns from ``adj_close``:

    * ``input``  - (close - SMA) / close * 100, with the SMA window taken
      from ``param_set["sma_period"]``.
    * ``output`` - (shifted close - close) / close * 100, where the close is
      shifted by ``param_set["out_shift_period"]`` rows via ``Series.shift``.

    Rows with a NaN in either column are dropped. Of the remaining rows, only
    those whose absolute ``input`` exceeds half of the mean absolute
    row-over-row change of ``adj_close`` (relative to the current close, in
    percent) are returned.

    Args:
        param_set: dict providing "sma_period" and "out_shift_period".
        symbol: value substituted into the ``symbol=="..."`` store query.

    Returns:
        DataFrame with the columns ``["input", "output"]``.
    """
    where_clause = ['symbol=="{}"'.format(symbol)]
    prices = stock_data_store.select('day', where=where_clause)
    adj_close = prices["adj_close"]

    # Deviation of the close from its moving average, in percent of the close.
    moving_average = talib.SMA(adj_close, timeperiod=param_set["sma_period"])
    deviation_pct = (adj_close - moving_average) / adj_close * 100

    # Change between the shifted close and the current close, in percent.
    shifted_close = adj_close.shift(param_set["out_shift_period"])
    change_pct = (shifted_close - adj_close) / adj_close * 100

    samples = pd.concat([deviation_pct, change_pct], axis=1)
    samples.columns = ["input", "output"]
    samples = samples.dropna()

    # Mean absolute one-row change of the close, as a fraction of the close.
    one_row_change = ((adj_close - adj_close.shift(1)) / adj_close).abs().dropna()
    threshold = one_row_change.mean() * 100 * 0.5

    # Keep only samples whose deviation is large relative to that typical move.
    is_significant = samples["input"].abs() > threshold
    return samples[is_significant]

Define the parameter sets, then process all the data

In [18]:
# Parameter grid. out_shift_period is handed to Series.shift inside
# get_symbol_test_data, so a negative value pairs each row with a later row.
SMA_PERIODS = range(2, 22)
OUT_SHIFT_PERIODS = range(-10, 0)
TEST_DATA_DIR = "test_data"

# Derive the total from the grid itself so the progress log cannot drift out
# of sync with the ranges above.
total_param_sets = len(SMA_PERIODS) * len(OUT_SHIFT_PERIODS)

# DataFrame.to_csv does not create missing directories.
os.makedirs(TEST_DATA_DIR, exist_ok=True)

final_result_dict = {"sma_period": [], "out_shift_period": [], "correlation": [], "accurate_ratio": []}
param_set_number = 0
for sma_period in SMA_PERIODS:
    for out_shift_period in OUT_SHIFT_PERIODS:
        start_time = time.time()
        param_set_number += 1
        param_set = {"sma_period": sma_period, "out_shift_period": out_shift_period}

        # Pool the samples of every filtered symbol for this parameter set.
        symbol_test_data_list = [
            get_symbol_test_data(param_set, symbol) for symbol in filter_symbol_list
        ]
        test_data = pd.concat(symbol_test_data_list)

        # Pearson correlation between the two columns over the pooled samples.
        correlation = test_data["input"].corr(test_data["output"])

        # True where input and output have the same sign (a product of exactly
        # zero counts as a miss). The ratio is NaN rather than a
        # ZeroDivisionError when no sample survived the filtering.
        product = (test_data["input"] * test_data["output"]) > 0
        accurate_ratio = float(product.sum() / len(product)) if len(product) else float("nan")

        # Record all four values together, only after every metric has been
        # computed, so the lists keep equal lengths (required by pd.DataFrame)
        # even if an iteration fails part-way through.
        final_result_dict["sma_period"].append(sma_period)
        final_result_dict["out_shift_period"].append(out_shift_period)
        final_result_dict["correlation"].append(correlation)
        final_result_dict["accurate_ratio"].append(accurate_ratio)

        # Save the pooled samples for this parameter set.
        csv_name = "sma_period_{}_out_shift_period_{}.csv".format(sma_period, out_shift_period)
        test_data.to_csv(os.path.join(TEST_DATA_DIR, csv_name))

        end_time = time.time()
        # The precision argument belongs to round(), not to str.format().
        logger.info("process time: {} sec".format(round(end_time - start_time, 2)))
        logger.info("process: {}/{}".format(param_set_number, total_param_sets))

2020-11-23 01:05:45,678 - study_trail_1 - INFO - process time: 88 sec
2020-11-23 01:05:45,679 - study_trail_1 - INFO - process: 1/200
2020-11-23 01:07:07,061 - study_trail_1 - INFO - process time: 81 sec
2020-11-23 01:07:07,063 - study_trail_1 - INFO - process: 2/200
2020-11-23 01:08:28,717 - study_trail_1 - INFO - process time: 82 sec
2020-11-23 01:08:28,719 - study_trail_1 - INFO - process: 3/200
2020-11-23 01:09:50,581 - study_trail_1 - INFO - process time: 82 sec
2020-11-23 01:09:50,585 - study_trail_1 - INFO - process: 4/200
2020-11-23 01:11:18,835 - study_trail_1 - INFO - process time: 88 sec
2020-11-23 01:11:18,837 - study_trail_1 - INFO - process: 5/200
2020-11-23 01:12:42,403 - study_trail_1 - INFO - process time: 84 sec
2020-11-23 01:12:42,404 - study_trail_1 - INFO - process: 6/200
2020-11-23 01:14:07,109 - study_trail_1 - INFO - process time: 85 sec
2020-11-23 01:14:07,113 - study_trail_1 - INFO - process: 7/200
2020-11-23 01:15:39,103 - study_trail_1 - INFO - process time:

2020-11-23 02:54:37,009 - study_trail_1 - INFO - process: 61/200
2020-11-23 02:57:00,683 - study_trail_1 - INFO - process time: 144 sec
2020-11-23 02:57:00,685 - study_trail_1 - INFO - process: 62/200
2020-11-23 02:59:22,494 - study_trail_1 - INFO - process time: 142 sec
2020-11-23 02:59:22,496 - study_trail_1 - INFO - process: 63/200
2020-11-23 03:01:44,856 - study_trail_1 - INFO - process time: 142 sec
2020-11-23 03:01:44,858 - study_trail_1 - INFO - process: 64/200
2020-11-23 03:03:57,810 - study_trail_1 - INFO - process time: 133 sec
2020-11-23 03:03:57,812 - study_trail_1 - INFO - process: 65/200
2020-11-23 03:06:19,740 - study_trail_1 - INFO - process time: 142 sec
2020-11-23 03:06:19,742 - study_trail_1 - INFO - process: 66/200
2020-11-23 03:08:42,244 - study_trail_1 - INFO - process time: 143 sec
2020-11-23 03:08:42,246 - study_trail_1 - INFO - process: 67/200
2020-11-23 03:11:04,106 - study_trail_1 - INFO - process time: 142 sec
2020-11-23 03:11:04,108 - study_trail_1 - INFO -

2020-11-23 05:23:23,747 - study_trail_1 - INFO - process time: 149 sec
2020-11-23 05:23:23,749 - study_trail_1 - INFO - process: 122/200
2020-11-23 05:25:56,030 - study_trail_1 - INFO - process time: 152 sec
2020-11-23 05:25:56,032 - study_trail_1 - INFO - process: 123/200
2020-11-23 05:28:25,918 - study_trail_1 - INFO - process time: 150 sec
2020-11-23 05:28:25,920 - study_trail_1 - INFO - process: 124/200
2020-11-23 05:30:55,929 - study_trail_1 - INFO - process time: 150 sec
2020-11-23 05:30:55,931 - study_trail_1 - INFO - process: 125/200
2020-11-23 05:33:25,787 - study_trail_1 - INFO - process time: 150 sec
2020-11-23 05:33:25,789 - study_trail_1 - INFO - process: 126/200
2020-11-23 05:35:55,884 - study_trail_1 - INFO - process time: 150 sec
2020-11-23 05:35:55,885 - study_trail_1 - INFO - process: 127/200
2020-11-23 05:38:27,482 - study_trail_1 - INFO - process time: 152 sec
2020-11-23 05:38:27,484 - study_trail_1 - INFO - process: 128/200
2020-11-23 05:40:58,519 - study_trail_1 -

2020-11-23 07:56:14,267 - study_trail_1 - INFO - process time: 158 sec
2020-11-23 07:56:14,269 - study_trail_1 - INFO - process: 182/200
2020-11-23 07:58:51,179 - study_trail_1 - INFO - process time: 157 sec
2020-11-23 07:58:51,181 - study_trail_1 - INFO - process: 183/200
2020-11-23 08:01:29,467 - study_trail_1 - INFO - process time: 158 sec
2020-11-23 08:01:29,469 - study_trail_1 - INFO - process: 184/200
2020-11-23 08:04:04,420 - study_trail_1 - INFO - process time: 155 sec
2020-11-23 08:04:04,424 - study_trail_1 - INFO - process: 185/200
2020-11-23 08:06:35,133 - study_trail_1 - INFO - process time: 151 sec
2020-11-23 08:06:35,135 - study_trail_1 - INFO - process: 186/200
2020-11-23 08:09:09,278 - study_trail_1 - INFO - process time: 154 sec
2020-11-23 08:09:09,281 - study_trail_1 - INFO - process: 187/200
2020-11-23 08:11:44,300 - study_trail_1 - INFO - process time: 155 sec
2020-11-23 08:11:44,302 - study_trail_1 - INFO - process: 188/200
2020-11-23 08:14:17,808 - study_trail_1 -

In [21]:
# Turn the dict of equal-length lists into a frame: one column per key,
# one row per evaluated parameter set.
final_result_frame = pd.DataFrame(final_result_dict)

In [22]:
# Persist the per-parameter-set metrics; index=False leaves the row index out of the CSV.
final_result_frame.to_csv("final_result.csv", index=False)

In [26]:
# Ascending sort, so head() shows the five parameter sets with the lowest accurate_ratio.
final_result_frame.sort_values(by="accurate_ratio").head()

Unnamed: 0,sma_period,out_shift_period,correlation,accurate_ratio
9,2,-1,-0.000498,0.463708
19,3,-1,-0.000516,0.465449
39,5,-1,-0.00047,0.465675
69,8,-1,-0.000424,0.465744
49,6,-1,-0.000457,0.465746


结论1：input 和 output 几乎没有相关度（上表中相关系数的绝对值均小于 0.001）

结论2：sma_period = 2、out_shift_period = -1 时，input 对 output 具有约 46% 的正向预测率（两者同号），即约 54% 的反向预测率（两者异号）