In [1]:
#import
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import talib
from pyspark.sql import Row
from pyspark.sql.functions import col, udf, lag, rank, lit
from pyspark.sql.window import Window

In [2]:
# Directory that holds the TBrain CSV files.
# Set the TBRAIN_DATA_DIR environment variable to point somewhere else;
# when it is not set, the original local path is used.
Path = os.environ.get("TBRAIN_DATA_DIR", "c:/D Drive/work/bigData/pySpark/TBrain_Round2_DataSet_20180518")

In [3]:
# Column aliases: map each Chinese CSV header to an ASCII column name.
# The ETF and stock mappings differ only in the prefix of the new name.
_HEADER_SUFFIXES = (
    ("代碼", "id"),
    ("日期", "date"),
    ("中文簡稱", "name"),
    ("開盤價(元)", "open"),
    ("最高價(元)", "high"),
    ("最低價(元)", "low"),
    ("收盤價(元)", "close"),
    ("成交張數(張)", "count"),
)
col_alias_etf = {header: "etf_" + suffix for header, suffix in _HEADER_SUFFIXES}
col_alias_stock = {header: "stock_" + suffix for header, suffix in _HEADER_SUFFIXES}
# udf
def to_double(str_val):
    """Parse a numeric string that may contain thousands separators.

    Args:
        str_val: text such as ``"16,487"``; may be ``None`` for a null cell.

    Returns:
        The parsed float, or ``None`` when the input is ``None`` or blank.

    Raises:
        ValueError: if the text is not a number once the commas are removed.
    """
    if str_val is None:
        return None
    cleaned = str_val.replace(",", "").strip()
    # A blank cell carries no value: return None instead of raising.
    return float(cleaned) if cleaned else None
# Rebind ``to_double`` to its Spark UDF wrapper; from here on the name refers to the UDF.
# NOTE(review): no return type is passed to udf(), so the declared type is presumably
# Spark's default (string) — read_data casts the result to double. Confirm.
to_double=udf(to_double)

In [4]:
# Shared loader (every input file has the same format)
def read_data(file_name, col_alias):
    """Read one Big5-encoded CSV under ``Path`` and return it with renamed, typed columns.

    The three text columns are only renamed; every other column is run
    through ``to_double`` and cast to double. As a side effect the row
    count, the resulting schema and the first five rows are printed.

    Args:
        file_name: CSV file name, joined onto the module-level ``Path``.
        col_alias: dict mapping each original header to its new column
            name; it needs an entry for every column in the file.

    Returns:
        The Spark DataFrame holding the renamed columns.
    """
    text_cols = ["代碼","日期", "中文簡稱"]
    csv_path = Path + "/" + file_name
    raw_data = spark.read.option("encoding", "Big5").csv(csv_path, header=True, sep=",")
    print("Total " + file_name + " count: " + str(raw_data.count()))

    # Text columns first, in their fixed order ...
    selected = [col(name).alias(col_alias[name]) for name in text_cols]
    # ... then every remaining column, parsed and cast to double.
    for name in raw_data.columns:
        if name not in text_cols:
            selected.append(to_double(col(name)).cast("double").alias(col_alias[name]))

    final_data = raw_data.select(selected)
    final_data.printSchema()
    final_data.show(5)
    return final_data

In [5]:
# Load the ETF price file, renaming its columns with the ETF aliases.
print("starting import tetfp.csv(台灣18檔ETF股價資料)...")
tetfp_dt=read_data("tetfp.csv", col_alias_etf)

starting import tetfp.csv(台灣18檔ETF股價資料)...
Total tetfp.csv count: 19305
root
 |-- etf_id: string (nullable = true)
 |-- etf_date: string (nullable = true)
 |-- etf_name: string (nullable = true)
 |-- etf_open: double (nullable = true)
 |-- etf_high: double (nullable = true)
 |-- etf_low: double (nullable = true)
 |-- etf_close: double (nullable = true)
 |-- etf_count: double (nullable = true)

+-------+--------+----------------+--------+--------+-------+---------+---------+
| etf_id|etf_date|        etf_name|etf_open|etf_high|etf_low|etf_close|etf_count|
+-------+--------+----------------+--------+--------+-------+---------+---------+
|0050   |20130102|元大台灣50          |    54.0|   54.65|   53.9|     54.4|  16487.0|
|0050   |20130103|元大台灣50          |    54.9|   55.05|  54.65|    54.85|  29020.0|
|0050   |20130104|元大台灣50          |   54.85|   54.85|   54.4|     54.5|   9837.0|
|0050   |20130107|元大台灣50          |   54.55|   54.55|   53.9|    54.25|   8910.0|
|0050   |20130108|元大台灣50     

In [6]:
# Collect the distinct ETF ids, sorted by id, into a Python list
distinct_id_rows = tetfp_dt.select("etf_id").distinct().orderBy("etf_id").collect()
etf_ids = [id_row["etf_id"] for id_row in distinct_id_rows]
etf_ids

['0050   ',
 '0051   ',
 '0052   ',
 '0053   ',
 '0054   ',
 '0055   ',
 '0056   ',
 '0057   ',
 '0058   ',
 '0059   ',
 '006201 ',
 '006203 ',
 '006204 ',
 '006208 ',
 '00690  ',
 '00692  ',
 '00701  ',
 '00713  ']

In [8]:
# One pandas DataFrame per ETF, keyed by the id with surrounding whitespace removed.
etf_dic = {}
for etfid in etf_ids:
    # Compare through a Column expression rather than splicing the id into a
    # SQL string, so an id containing a quote cannot break the filter.
    export_dt = tetfp_dt.filter(col("etf_id") == etfid).orderBy("etf_id", "etf_date", ascending=True)
    etf_dic[etfid.strip()] = export_dt.toPandas()

In [9]:
# Print the names of all functions TA-Lib exposes
print(talib.get_functions())

['HT_DCPERIOD', 'HT_DCPHASE', 'HT_PHASOR', 'HT_SINE', 'HT_TRENDMODE', 'ADD', 'DIV', 'MAX', 'MAXINDEX', 'MIN', 'MININDEX', 'MINMAX', 'MINMAXINDEX', 'MULT', 'SUB', 'SUM', 'ACOS', 'ASIN', 'ATAN', 'CEIL', 'COS', 'COSH', 'EXP', 'FLOOR', 'LN', 'LOG10', 'SIN', 'SINH', 'SQRT', 'TAN', 'TANH', 'ADX', 'ADXR', 'APO', 'AROON', 'AROONOSC', 'BOP', 'CCI', 'CMO', 'DX', 'MACD', 'MACDEXT', 'MACDFIX', 'MFI', 'MINUS_DI', 'MINUS_DM', 'MOM', 'PLUS_DI', 'PLUS_DM', 'PPO', 'ROC', 'ROCP', 'ROCR', 'ROCR100', 'RSI', 'STOCH', 'STOCHF', 'STOCHRSI', 'TRIX', 'ULTOSC', 'WILLR', 'BBANDS', 'DEMA', 'EMA', 'HT_TRENDLINE', 'KAMA', 'MA', 'MAMA', 'MAVP', 'MIDPOINT', 'MIDPRICE', 'SAR', 'SAREXT', 'SMA', 'T3', 'TEMA', 'TRIMA', 'WMA', 'CDL2CROWS', 'CDL3BLACKCROWS', 'CDL3INSIDE', 'CDL3LINESTRIKE', 'CDL3OUTSIDE', 'CDL3STARSINSOUTH', 'CDL3WHITESOLDIERS', 'CDLABANDONEDBABY', 'CDLADVANCEBLOCK', 'CDLBELTHOLD', 'CDLBREAKAWAY', 'CDLCLOSINGMARUBOZU', 'CDLCONCEALBABYSWALL', 'CDLCOUNTERATTACK', 'CDLDARKCLOUDCOVER', 'CDLDOJI', 'CDLDOJISTAR',

In [31]:
# Number of non-null closing prices held for ETF 0050
etf_dic["0050"]["etf_close"].count()

1317

In [112]:
import math
def get_feature_pre(series):
    """Shift a feature down by one position.

    Places ``None`` in front of the values and drops the final one, so the
    returned list has the same length as the input.

    Args:
        series: any object with a ``tolist()`` method.

    Returns:
        list: ``[None, v0, v1, ...]`` without the last input value.
    """
    shifted = [None]
    shifted.extend(series.tolist())
    shifted.pop()
    return shifted

def checkNan(num):
    """Return True when ``num`` is missing, i.e. ``None`` or NaN.

    Args:
        num: a number or ``None``.

    Returns:
        bool: True for ``None`` or NaN, False for any other number.

    Raises:
        TypeError: if ``num`` is neither ``None`` nor a value
            ``math.isnan`` accepts.
    """
    # Identity test: ``== None`` defers to the operand's ``__eq__``, which
    # some types (NumPy arrays, for instance) evaluate element-wise.
    return num is None or math.isnan(num)
    
def calculate_nMa(close_price, ma):
    """Normalised distance of the close from its moving average.

    Computes ``(close_price - ma) / ma``.

    Returns:
        The ratio, or ``None`` when either input is missing according to
        ``checkNan``.
    """
    inputs_present = not (checkNan(close_price) or checkNan(ma))
    return (close_price - ma) / ma if inputs_present else None

def create_feature(etf_pd):
    """Build the feature frame for one ETF.

    Each indicator is shifted down one row with ``get_feature_pre``, so the
    value stored on row i is the indicator as of row i-1.

    Args:
        etf_pd: pandas DataFrame with ``etf_id``, ``etf_date`` and
            ``etf_close`` columns. Presumably one ETF's rows in date
            order — verify against the caller.

    Returns:
        pandas DataFrame with ``etf_id``, ``etf_date``, ``etf_close`` and the
        shifted ``ema5``, ``ema10``, ``ema20`` and ``ema30`` columns.
    """
    close_ser = etf_pd["etf_close"]

    columns = {
        "etf_id": etf_pd["etf_id"],
        "etf_date": etf_pd["etf_date"],
        "etf_close": close_ser,
    }
    # EMA over 5/10/20/30 rows, each one lagged by a row.
    for period in (5, 10, 20, 30):
        columns["ema" + str(period)] = get_feature_pre(talib.EMA(close_ser, timeperiod=period))

    # TODO: add the normalised BIAS features, (close - MA) / MA, for the
    # 3-, 6- and 20-row moving averages (the paper suggests the 20-day MA).
    # They are not part of the returned frame yet.
    return pd.DataFrame(columns)

In [113]:
# Scratch check of the normalised BIAS, (close - MA3) / MA3, for ETF 0050.
# Both the close and the 3-row MA are shifted down one row before being combined.
close_ser = etf_dic["0050"]["etf_close"]
ma3 = get_feature_pre(talib.MA(np.array(close_ser), timeperiod=3))
bias = [calculate_nMa(cprice, ma) for cprice, ma in zip(get_feature_pre(close_ser), ma3)]
bias

[None,
 None,
 None,
 -0.0015267175572519517,
 -0.005195599022004856,
 -0.005840762373194036,
 0.00030816640986130756,
 0.006153846153846198,
 0.0018399264029439082,
 0.00030590394616075765,
 -0.005830009205277753,
 -0.006781750924784225,
 -0.006832298136645972,
 0.00559179869524705,
 0.0034129692832765212,
 0.0015441630636195622,
 0.00030873726458794987,
 -0.004018547140649053,
 -0.004342431761786486,
 0.0018639328984156836,
 0.010819165378670833,
 0.005833589192508505,
 0.0006108735491754162,
 0.0024390243902438938,
 0.006075334143377798,
 -1.291895883200182e-16,
 0.0036231884057968946,
 0.004819277108433589,
 0.0018001800180015696,
 0.0056767254257543386,
 -0.0020901761719919016,
 -0.004781829049611713,
 0.0023952095808379313,
 0.0005988023952092913,
 -0.007803121248499721,
 -0.002409638554217115,
 0.0030175015087504533,
 -0.006652555185969421,
 0.0015119443604473203,
 0.005743651753325073,
 0.0009033423667566925,
 0.004500450045004244,
 0.00359066427289028,
 -0.0035874439461883916,

In [95]:
# Build and display the feature frame (close plus shifted EMAs) for ETF 0050
create_feature(etf_dic["0050"])

Unnamed: 0,ema10,ema20,ema30,ema5,etf_close,etf_date,etf_id
0,,,,,54.40,20130102,0050
1,,,,,54.85,20130103,0050
2,,,,,54.50,20130104,0050
3,,,,,54.25,20130107,0050
4,,,,,53.90,20130108,0050
5,,,,54.380000,54.10,20130109,0050
6,,,,54.286667,54.50,20130110,0050
7,,,,54.357778,54.45,20130111,0050
8,,,,54.388519,54.50,20130114,0050
9,,,,54.425679,54.00,20130115,0050
