Url: https://tbrain.trendmicro.com.tw/Competitions/Details/2

In [1]:
#import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql import Row
from pyspark.sql.functions import col, udf, lag, rank, lit
from pyspark.sql.window import Window

In [2]:
global Path
if sc.master[0:5]=="local":
    #Path = "file:/c:/D Drive/work/bigData/pySpark/TBrain_Round2_DataSet_20180427"
    #Path = "file:/Users/yungchuanlee/Documents/learn/AI競賽/ETF預測/TBrain_Round2_DataSet_20180427"
    Path = "file:/home/hduser/app/bigdata/competition/etf/TBrain_Round2_DataSet_20180427"
else:
    Path = "hdfs://master:9000/user/hduser"

In [3]:
sc.master

'local[*]'

In [4]:
float("    46.57")+float("     29,020".replace(",",""))

29066.57

In [5]:
#define alias of columns
col_alias_etf= {"代碼":"etf_id", "日期": "etf_date", "中文簡稱": "etf_name", "開盤價(元)":"etf_open", 
            "最高價(元)":"etf_high", "最低價(元)":"etf_low", "收盤價(元)":"etf_close", "成交張數(張)":"etf_count"}
col_alias_stock= {"代碼":"stock_id", "日期": "stock_date", "中文簡稱": "stock_name", "開盤價(元)":"stock_open", 
            "最高價(元)":"stock_high", "最低價(元)":"stock_low", "收盤價(元)":"stock_close", "成交張數(張)":"stock_count"}

In [6]:
#udf
def to_double(str_val):
    return float(str_val.replace(",",""))
to_double=udf(to_double)

In [7]:
#def function to read data (因檔案格式都相同)
def read_data(file_name, col_alias):
    str_cols = ["代碼","日期", "中文簡稱"]
    raw_data = spark.read.option("encoding", "Big5").csv(Path + "/" + file_name, header=True, sep=",")
    print("Total " + file_name + " count: " + str(raw_data.count()))
    #rename cols and correct type 
    num_cols = [col_name for col_name in raw_data.columns if col_name not in str_cols]
    final_data=raw_data.select( [col(str_col_name).alias(col_alias[str_col_name]) for str_col_name in str_cols] + 
                                  [to_double(col(num_col_name)).cast("double").alias(col_alias[num_col_name]) for num_col_name in num_cols] )
    final_data.printSchema()
    final_data.show(5)
    return final_data

In [8]:
print("starting import tetfp.csv(台灣18檔ETF股價資料)...")
tetfp_dt=read_data("tetfp.csv", col_alias_etf)

starting import tetfp.csv(台灣18檔ETF股價資料)...
Total tetfp.csv count: 19053
root
 |-- etf_id: string (nullable = true)
 |-- etf_date: string (nullable = true)
 |-- etf_name: string (nullable = true)
 |-- etf_open: double (nullable = true)
 |-- etf_high: double (nullable = true)
 |-- etf_low: double (nullable = true)
 |-- etf_close: double (nullable = true)
 |-- etf_count: double (nullable = true)

+-------+--------+----------------+--------+--------+-------+---------+---------+
| etf_id|etf_date|        etf_name|etf_open|etf_high|etf_low|etf_close|etf_count|
+-------+--------+----------------+--------+--------+-------+---------+---------+
|0050   |20130102|元大台灣50          |    54.0|   54.65|   53.9|     54.4|  16487.0|
|0050   |20130103|元大台灣50          |    54.9|   55.05|  54.65|    54.85|  29020.0|
|0050   |20130104|元大台灣50          |   54.85|   54.85|   54.4|     54.5|   9837.0|
|0050   |20130107|元大台灣50          |   54.55|   54.55|   53.9|    54.25|   8910.0|
|0050   |20130108|元大台灣50     

In [9]:
#EDA
#range of date
tetfp_dt.describe('etf_date').show()

+-------+-------------------+
|summary|           etf_date|
+-------+-------------------+
|  count|              19053|
|   mean|2.015329328116307E7|
| stddev| 15717.456119401833|
|    min|           20130102|
|    max|           20180427|
+-------+-------------------+



In [10]:
# print("starting import taetfp.csv(台灣18檔ETF調整後股價資料)...")
# taetfp_dt=read_data("taetfp.csv", col_alias_etf)

In [11]:
# print("starting import tsharep.csv(台灣個股股價資料)...")
# tsharep_dt=read_data("tsharep.csv", col_alias_stock)

In [12]:
# print("starting import tasharep.csv(台灣個股調整後股價資料)...")
# tasharep_dt=read_data("tasharep.csv", col_alias_stock)

In [13]:
import sys
from pyspark.sql.functions import lag, col, avg,collect_list, lit
from pyspark.sql.window import Window
from pyspark.sql.types import ArrayType, DoubleType, IntegerType
#declare previous row windows
wsSpec_etf = Window.partitionBy('etf_id').orderBy('etf_date') #time window for normal case
wsSpec_etf_close_price_raw = Window.partitionBy('etf_id').orderBy('row_idx').rangeBetween(-sys.maxsize, -1)
wsSpec_etf_dif_raw = Window.partitionBy('etf_id').orderBy('row_idx').rangeBetween(-sys.maxsize, 0)
def avg_list(p_list):
    #計算數字list的平均值
    return sum(p_list)/len(p_list)
#計算EMA的udf
def calculate_ema_native(close_p_list, window_len):
    #透過歷史收盤價計算
    if len(close_p_list) < window_len:
        return None
    elif len(close_p_list) == window_len:
        #if len of list = win_len then return avg, 
        return avg_list(close_p_list)
    else:
        #else EMA[t] =(EMA[t-1]*(win_len-1)+close[t]*2)/(win_len+1)
        ema = avg_list(close_p_list[:window_len])
        for price in close_p_list[window_len:]:
            ema = (ema*(window_len-1)+price*2)/(window_len+1)
        return ema
calculate_ema=udf(calculate_ema_native, DoubleType())
#計算BIAS的udf
def calculate_bias(close_p_list):
    #計算前日收盤價與N日均線之差比: (close price - MA)/MA   ,Paper 建議用20日MA
    #因要預測今日的收盤價，故計算前日收盤價與前20日均線
    if len(close_p_list) < 21:
        return None
    else:
        list_len = len(close_p_list)
        p_close = close_p_list[-1]
        cal_list = close_p_list[list_len-21: list_len-1]
        return p_close - avg_list(cal_list)
calculate_bias=udf(calculate_bias, DoubleType())

def get_min_max_last(p_list):
    #找出list中最大最小和最後一個值, 回傳(min, max, last)
    return (min(p_list), max(p_list), p_list[-1])
def calculate_raw_rsv(p_list):
    #RSV = (收盤價-9日低值)/(9日高值-9日低值)
    p_min, p_max, p_last = get_min_max_last(p_list)
    rsv = (p_last - p_min)/(p_max - p_min)
    return rsv
def calculate_rsv(p_9_list, k_prev, d_prev):
    #計算加權後的RSV，p_9_list=>9日收盤價
    rrsv = calculate_raw_rsv(p_9_list)
    k_curr = (1/3)*rrsv + (2/3)*k_prev
    d_curr = (1/3)*k_curr + (2/3)*d_prev
    return [k_curr, d_curr]
#計算隨機指標（Stochastic Oscillator，KD），原名%K&%D
def calculate_KD(close_p_list):
    win_len = 9 #看過去 9 日值
    #RSV = (收盤價-9日低值)/(9日高值-9日低值)
    #K_curr = 1/3*RSV + 2/3*K_prev
    #D_curr = 1/3*K_curr + 2/3*D_prev
    if len(close_p_list) < win_len:
        return None
    elif len(close_p_list) == win_len:
        #無前日K, D時，以0.5帶入
        return calculate_rsv(close_p_list, 0.5, 0.5)
    else:
        kds = calculate_rsv(close_p_list[0:9], 0.5, 0.5)
        for idx in range(1, (len(close_p_list)+1-9)):
            p_9_list = close_p_list[idx: idx+9]
            kds = calculate_rsv(p_9_list, kds[0], kds[1])
        return kds
calculate_KD=udf(calculate_KD, ArrayType(DoubleType()))

#計算差離值DIF = 12日EMA - 26日EMA
def calculate_DIF(close_p_list):
    if len(close_p_list) < 26:
        return None
    else:
        ema12 = calculate_ema_native(close_p_list, 12)
        ema26 = calculate_ema_native(close_p_list, 26)
        return ema12 - ema26
calculate_DIF=udf(calculate_DIF, DoubleType())

#計算MACD=(前一日MACD × (9 - 1) + 今日DIF × 2) ÷ (9 + 1)
def calculate_MACD(dif_list, dif_curr):
    win_len = 9
    if len(dif_list) < win_len:
        return None
    elif len(dif_list) == win_len:
        #if len of list = win_len then return avg, 
        return avg_list(dif_list)
    else:
        #MACD=(前一日MACD × (9 - 1) + 今日DIF × 2) ÷ (9 + 1)
        macd = avg_list(dif_list[:win_len])
        for price in dif_list[win_len:]:
            macd = (macd*(win_len-1)+dif_curr*2)/(win_len+1)
        return macd
calculate_MACD=udf(calculate_MACD, DoubleType())

#計算相對強弱指數(RSI)
def calculate_RSI(close_p_list):
    win_len = 9
    if len(close_p_list) < (win_len + 1):
        return None
    else:
        cur_list = close_p_list[1:]
        prv_list = close_p_list[0:-1]
        p_dif_list = list(map(lambda x,y : x - y, cur_list, prv_list)) #dif list
        u_list = []
        d_list = []
        for dif in p_dif_list:
            if dif == 0:
                #若兩天價格相同，則U及D皆等於零
                u_list.append(0)
                d_list.append(0)
            elif dif > 0:
                #在價格上升的日子, U = diff, D = 0
                u_list.append(dif)
                d_list.append(0)
            else:
                #在價格下跌的日子, U = 0, D = abs(diff)
                u_list.append(0)
                d_list.append(abs(dif))
        #RSI = ema(u,9)/(ema(u,9)+ema(d,9))
        ema_u = calculate_ema_native(u_list, win_len)
        ema_d = calculate_ema_native(d_list, win_len)
        return ema_u/(ema_u + ema_d)
calculate_RSI=udf(calculate_RSI, DoubleType())

#計算威廉指標（Williams %R）
def calculate_WR(close_p_list):
    win_len = 9
    if len(close_p_list) < win_len:
        return None
    else:
        p_list = close_p_list[len(close_p_list) - win_len :]
        return 1.0 - calculate_raw_rsv(p_list)
calculate_WR=udf(calculate_WR, DoubleType())


In [14]:
#calculate ema [5,10,20] #cannot remove row_idx, row_idx for next window usage
tetfp_dt2=tetfp_dt.withColumn("row_idx", rank().over(wsSpec_etf)) \
    .withColumn("close_price_raw", collect_list(col('etf_close')).over(wsSpec_etf_close_price_raw)) \
    .withColumn("EMA5", calculate_ema(col("close_price_raw"), lit(5))) \
    .withColumn("EMA10", calculate_ema(col("close_price_raw"), lit(10))) \
    .withColumn("EMA20", calculate_ema(col("close_price_raw"), lit(20))) \
    .withColumn("BIAS", calculate_bias(col("close_price_raw"))) \
    .withColumn("KD", calculate_KD(col("close_price_raw"))) \
    .withColumn("K", col("KD")[0]).withColumn("D", col("KD")[1]) \
    .withColumn("DIF", calculate_DIF(col("close_price_raw"))) \
    .withColumn("dif_list", collect_list(col('DIF')).over(wsSpec_etf_dif_raw)) \
    .withColumn("MACD", calculate_MACD(col("dif_list"), col("DIF"))) \
    .withColumn("RSI", calculate_RSI(col("close_price_raw")))\
    .withColumn("WR", calculate_WR(col("close_price_raw")))

tetfp_dt2.cache()
tetfp_dt2.printSchema()

root
 |-- etf_id: string (nullable = true)
 |-- etf_date: string (nullable = true)
 |-- etf_name: string (nullable = true)
 |-- etf_open: double (nullable = true)
 |-- etf_high: double (nullable = true)
 |-- etf_low: double (nullable = true)
 |-- etf_close: double (nullable = true)
 |-- etf_count: double (nullable = true)
 |-- row_idx: integer (nullable = true)
 |-- close_price_raw: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- EMA5: double (nullable = true)
 |-- EMA10: double (nullable = true)
 |-- EMA20: double (nullable = true)
 |-- BIAS: double (nullable = true)
 |-- KD: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- K: double (nullable = true)
 |-- D: double (nullable = true)
 |-- DIF: double (nullable = true)
 |-- dif_list: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- MACD: double (nullable = true)
 |-- RSI: double (nullable = true)
 |-- WR: double (nullable = true)



In [15]:
# tetfp_dt2.filter("etf_id='0050   '").select("row_idx", "etf_close", "MACD", "RSI", "WR") \
#             .show(45)
#          .toPandas().to_csv(Path.replace("file:","") + "/taetfp_BIAS.csv",index=False)

In [16]:
#匯出成pandas
# etf_pd_50 = tetfp_dt2.filter("etf_id='0050   '").select("row_idx", "etf_date" ,"etf_close", "EMA5", "EMA10", "EMA20", "BIAS") \
#                 .toPandas()

In [17]:
#圖形化顯示
#aetf_pd_50.set_index('etf_date') #set_index 後配合 loc select 出區段資料來看
# etf_pd_50_part = etf_pd_50.loc[1200:]
# etf_pd_50_part.etf_close.plot(x='row_idx', y='etf_close', style='b--', label="etf_close")
# etf_pd_50_part.EMA5.plot(x='row_idx', y='EMA5', label="EMA5", style='r-')
# etf_pd_50_part.EMA10.plot(x='row_idx', y='EMA10', label="EMA10", style='g-')
# etf_pd_50_part.EMA20.plot(x='row_idx', y='EMA20', label="EMA20", style='y-')
# plt.legend()
# plt.show()

In [18]:
#計算各欄位與收盤價之相關性
# corr_cols = ['EMA5','EMA10','EMA20','BIAS','K','D']
# for col in corr_cols:
#     print('corr between ', col , ' and etf_close: ', str(tetfp_dt2.corr(col, 'etf_close')))

In [22]:
from pyspark.ml.feature import MinMaxScaler, StandardScaler
from pyspark.ml.linalg import Vectors
tot_dt = tetfp_dt2.filter("MACD is not null") \
    .select("etf_id", "etf_date", "EMA5", "EMA10", "EMA20", "BIAS", "K", "D", "DIF", "MACD", "RSI", "WR", "etf_close") \
    .orderBy("etf_id", "etf_date", ascending=True)
    
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
#將Feature合併為Vector 並作標準化
assembler = VectorAssembler(
    inputCols=["EMA5", "EMA10", "EMA20", "BIAS", "K", "D", "DIF", "MACD", "RSI", "WR"],
    outputCol="features")
tot_dt_1 = assembler.transform(tot_dt)
#minmax_scaler = MinMaxScaler(inputCol="features", outputCol="stdFeatures")
#scaler_model = minmax_scaler.fit(tot_dt_1)
#std_scaler = StandardScaler(inputCol="features", outputCol="stdFeatures")
#scaler_model = std_scaler.fit(tot_dt_1)
#tot_dt_scale = scaler_model.transform(tot_dt_1)
tot_dt_scale=tot_dt_1.withColumn("stdFeatures", col("features")) #測試不作標準化

In [23]:
#取出4/16~4/27 (共兩週資料作為測試集)
train_dt = tot_dt_scale.filter("etf_date < '20180416' and MACD is not null") \
    .select("etf_id", "etf_date", "stdFeatures", "etf_close") \
    .orderBy("etf_id", "etf_date", ascending=True)
test_dt = tot_dt_scale.filter("etf_date >= '20180416'") \
    .select("etf_id", "etf_date", "stdFeatures", "etf_close") \
    .orderBy("etf_id", "etf_date", ascending=True)
print('train count: ', str(train_dt.count()), ', test count: ', str(test_dt.count()))
train_dt.show(10)
test_dt.show(10)

train count:  18261 , test count:  180
+-------+--------+--------------------+---------+
| etf_id|etf_date|         stdFeatures|etf_close|
+-------+--------+--------------------+---------+
|0050   |20130227|[55.4823388864543...|     55.2|
|0050   |20130301|[55.3882259243028...|     55.4|
|0050   |20130304|[55.3921506162019...|    54.75|
|0050   |20130305|[55.1781004108012...|     55.2|
|0050   |20130306|[55.1854002738675...|    55.45|
|0050   |20130307|[55.2736001825783...|     55.4|
|0050   |20130308|[55.3157334550522...|     55.8|
|0050   |20130311|[55.4771556367014...|     55.9|
|0050   |20130312|[55.6181037578009...|    55.55|
|0050   |20130313|[55.5954025052006...|    55.65|
+-------+--------+--------------------+---------+
only showing top 10 rows

+-------+--------+--------------------+---------+
| etf_id|etf_date|         stdFeatures|etf_close|
+-------+--------+--------------------+---------+
|0050   |20180416|[82.4956392327525...|     82.3|
|0050   |20180417|[82.4304261551683

In [24]:
train_dt.cache()
test_dt.cache()

DataFrame[etf_id: string, etf_date: string, stdFeatures: vector, etf_close: double]

In [25]:
#取出etf的distinct id
etf_ids = []
for row in test_dt.select("etf_id").distinct().collect():
    etf_ids.append(row["etf_id"])
etf_ids

['00701  ',
 '0051   ',
 '0057   ',
 '006203 ',
 '0052   ',
 '0050   ',
 '0055   ',
 '0054   ',
 '0059   ',
 '00690  ',
 '00713  ',
 '006204 ',
 '006208 ',
 '0053   ',
 '006201 ',
 '0056   ',
 '00692  ',
 '0058   ']

In [26]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
#計算上或下的值
def judge_up_down_native(curr_price, prev_price):
    if prev_price is None:
        return 0.0
    elif curr_price == prev_price:
        return 0.0
    elif curr_price > prev_price:
        return 1.0
    else:
        return 2.0
judge_up_down=udf(judge_up_down_native, DoubleType())

In [27]:
#訓練Model及評估(RandomForestRegressor - etf_id wise) 
#-- accuracy:0.48, (RMSE) on test data = 0.938023 --> using stdScaler
#-- accuracy:0.48, (RMSE) on test data = 0.87064 --> no Scaler
#-- accuracy:0.56, (RMSE) on test data = 0.92546 --> use minmaxScaler
#-- accuracy:0.513812 , (RMSE) on test data = 0.597397 --> use minmaxScaler and no Scaler(full data)
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
rf = RandomForestRegressor(featuresCol="stdFeatures",labelCol="etf_close")
predit_res = None
for etfid in etf_ids:
    train_data = train_dt.filter("etf_id='" + etfid + "'")
    test_data = test_dt.filter("etf_id='" + etfid + "'")
    rf_model = rf.fit(train_data)
    predicts = rf_model.transform(test_data)
    if predit_res is None:
        predit_res = predicts
    else:
        predit_res = predit_res.unionAll(predicts)
predit_res.show(10)

predit_res_2 = predit_res.withColumn("prev_close", lag("etf_close").over(wsSpec_etf)) \
    .withColumn("prev_pred_close", lag("prediction").over(wsSpec_etf)) \
    .withColumn("act_ud", judge_up_down(col("etf_close"), col("prev_close"))) \
    .withColumn("pred_ud", judge_up_down(col("prediction"), col("prev_pred_close")))
predit_res_2.printSchema()
predit_res_2.select("etf_id", "etf_date", "etf_close", "prev_close", "prediction", "act_ud", "pred_ud").show(10)

#評估RMES
evaluator = RegressionEvaluator(
    labelCol="etf_close", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predit_res)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
evaluator = MulticlassClassificationEvaluator(
    labelCol="act_ud", predictionCol="pred_ud", metricName="accuracy")
accuracy = evaluator.evaluate(predit_res_2)
print("accuracy = %g " % accuracy)

+-------+--------+--------------------+---------+------------------+
| etf_id|etf_date|         stdFeatures|etf_close|        prediction|
+-------+--------+--------------------+---------+------------------+
|00701  |20180416|[21.2097304703123...|    21.21|21.206789696986018|
|00701  |20180417|[21.2098203135415...|    20.95|21.174685121822623|
|00701  |20180418|[21.1232135423610...|    21.04|21.008363428238425|
|00701  |20180419|[21.0954756949073...|    21.32|21.025511047286045|
|00701  |20180420|[21.1703171299382...|     21.0|21.207956363652684|
|00701  |20180423|[21.1135447532921...|    20.92|20.974676198801195|
|00701  |20180424|[21.0490298355281...|    20.88|20.973593406593405|
|00701  |20180425|[20.9926865570187...|    20.81|20.819361263736265|
|00701  |20180426|[20.9317910380124...|    20.75|20.701266025641026|
|00701  |20180427|[20.8711940253416...|    20.86|20.530458333333335|
+-------+--------+--------------------+---------+------------------+
only showing top 10 rows

root
 |-

In [39]:
#find last records of all etf to be the base of next record
predict_start_date = "20180416"
tetf_dt_prod = tetfp_dt2.filter("etf_date < '" + predict_start_date + "'")
tetf_max_idx = tetf_dt_prod.groupBy("etf_id").max("row_idx")
tetf_max = tetf_max_idx.select(col("etf_id"), col("max(row_idx)").cast("Double").alias("row_idx")) \
    .join(tetf_dt_prod, ["etf_id", "row_idx"], "inner") \
    .select("etf_id", "etf_date", "row_idx", "etf_close", "close_price_raw") \
    .orderBy('etf_id')

tetf_max.show()

+-------+--------+-------+---------+--------------------+
| etf_id|etf_date|row_idx|etf_close|     close_price_raw|
+-------+--------+-------+---------+--------------------+
|0050   |20180413| 1293.0|     82.5|[54.4, 54.85, 54....|
|0051   |20180413| 1293.0|    32.93|[26.09, 26.12, 26...|
|0052   |20180413| 1293.0|    57.05|[32.72, 32.12, 32...|
|0053   |20180413| 1293.0|    36.46|[23.26, 23.11, 22...|
|0054   |20180413| 1293.0|    24.05|[19.4, 19.37, 19....|
|0055   |20180413| 1293.0|    16.93|[11.47, 11.49, 11...|
|0056   |20180413| 1293.0|    25.99|[22.95, 23.06, 22...|
|0057   |20180413| 1293.0|     51.2|[31.94, 31.93, 31...|
|0058   |20180413| 1293.0|    46.49|[32.12, 31.96, 32...|
|0059   |20180413| 1293.0|     41.9|[24.65, 24.67, 24...|
|006201 |20180413| 1293.0|    14.69|[10.24, 10.28, 10...|
|006203 |20180413| 1293.0|    38.75|[26.11, 26.5, 26....|
|006204 |20180413| 1293.0|     54.4|[38.82, 39.06, 38...|
|006208 |20180413| 1293.0|    48.29|[31.02, 31.28, 31...|
|00690  |20180

In [46]:
schema = tetfp_dt2.select("etf_id", "etf_date", "etf_name", "etf_open", "etf_high", "etf_low", 
                          "etf_close", "etf_count", col("row_idx").cast("Double"), 
                          "close_price_raw").schema
next_date_range = ["20180416", "20180417", "20180418","20180419", "20180420"]
for date in next_date_range:
    next_rows = []
    for row in tetf_max.collect():
        close_price_raw = row["close_price_raw"]
        close_price_raw.append(row["etf_close"])
        next_row = (row["etf_id"], date, "test name", 0.0, 0.0, 0.0, 
                   0.0, 0.0, row["row_idx"]+1.0, close_price_raw)
        next_rows.append(next_row)
    test_dt = spark.createDataFrame(next_rows, schema)
    test_dt.show()

+-------+--------+---------+--------+--------+-------+---------+---------+-------+--------------------+
| etf_id|etf_date| etf_name|etf_open|etf_high|etf_low|etf_close|etf_count|row_idx|     close_price_raw|
+-------+--------+---------+--------+--------+-------+---------+---------+-------+--------------------+
|0050   |20180416|test name|     0.0|     0.0|    0.0|      0.0|      0.0| 1294.0|[54.4, 54.85, 54....|
|0051   |20180416|test name|     0.0|     0.0|    0.0|      0.0|      0.0| 1294.0|[26.09, 26.12, 26...|
|0052   |20180416|test name|     0.0|     0.0|    0.0|      0.0|      0.0| 1294.0|[32.72, 32.12, 32...|
|0053   |20180416|test name|     0.0|     0.0|    0.0|      0.0|      0.0| 1294.0|[23.26, 23.11, 22...|
|0054   |20180416|test name|     0.0|     0.0|    0.0|      0.0|      0.0| 1294.0|[19.4, 19.37, 19....|
|0055   |20180416|test name|     0.0|     0.0|    0.0|      0.0|      0.0| 1294.0|[11.47, 11.49, 11...|
|0056   |20180416|test name|     0.0|     0.0|    0.0|      0.0|

+-------+--------+---------+--------+--------+-------+---------+---------+-------+--------------------+
| etf_id|etf_date| etf_name|etf_open|etf_high|etf_low|etf_close|etf_count|row_idx|     close_price_raw|
+-------+--------+---------+--------+--------+-------+---------+---------+-------+--------------------+
|0050   |20180420|test name|     0.0|     0.0|    0.0|      0.0|      0.0| 1294.0|[54.4, 54.85, 54....|
|0051   |20180420|test name|     0.0|     0.0|    0.0|      0.0|      0.0| 1294.0|[26.09, 26.12, 26...|
|0052   |20180420|test name|     0.0|     0.0|    0.0|      0.0|      0.0| 1294.0|[32.72, 32.12, 32...|
|0053   |20180420|test name|     0.0|     0.0|    0.0|      0.0|      0.0| 1294.0|[23.26, 23.11, 22...|
|0054   |20180420|test name|     0.0|     0.0|    0.0|      0.0|      0.0| 1294.0|[19.4, 19.37, 19....|
|0055   |20180420|test name|     0.0|     0.0|    0.0|      0.0|      0.0| 1294.0|[11.47, 11.49, 11...|
|0056   |20180420|test name|     0.0|     0.0|    0.0|      0.0|

In [None]:
#ll = [46.92, 47.31, 47.0, 46.79, 46.49, 46.66, 47.0, 46.96, 47.0]
ll = [46.92, 47.31, 47.0, 46.79, 46.49, 46.66]
win_len=5
print(ll[1:])
print(ll[0: -1])
print(list(map(lambda x,y : x - y, ll[1:], ll[0: -1])))
for x in ll[win_len:]:
    print(x)
ema = sum(ll[:win_len])/len(ll[:win_len])
print(ema)
for price in ll[win_len:]:
    ema = (ema*(win_len-1)+price*2)/(win_len+1)
tup1, tup2 = (1,2)
print(tup1, ' ', tup2)
tup = (3,4)
print(tup[0], ' ', tup[1])
list(range(0,2))