## ETF Compete:  
## Feature engineering: CandleStick (K棒) 

## 以連續的 n 根 K 棒來預測後續漲跌(名嘴說： 這是技術分析的基礎，魔法K線)

## 
### Date: 18425_L

# 使用變數

$$$$
# K_body = K 棒實體 (>0, 紅K) 
# up_shadow = 上引線
# low_shadow = 下引線
# open_gap = 開盤位置 (今天開盤 - 昨日收盤)




###  參數: 
### df: dataframe 
### n: n days window (坊間書籍用 3，這裡預設 5；目前 CandleStick 內尚未使用此參數)


###  return df 
### 新增欄位 (added columns) = ['prev_close', 'open_gap', 'K_body', 'up_shadow', 'low_shadow']
### note： Row 0 的 prev_close 與 open_gap 為 NaN (第一列沒有前一日收盤價)

In [1]:
import numpy as np
import pandas as pd
from collections import OrderedDict
from time import time

In [2]:
# English column names for the raw quote table, in file order.
# Only the keys are used in this cell; the dtype values are not applied here.
col_dtypes = OrderedDict(
    code=str,
    date=str,
    name=str,
    open=float,
    high=float,
    low=float,
    close=float,
    volume=int,
)

# Load the ETF quotes and overwrite the header with the English names.
etf = pd.read_csv('/Users/LarryGuo/Desktop/nano_degree/Capstone_Talk/ETF_compete/twetf_utf8/tetfp.csv')
etf.columns = col_dtypes.keys()

# Preview the first rows.
etf.head()

Unnamed: 0,code,date,name,open,high,low,close,volume
0,50,20130102,元大台灣50,54.0,54.65,53.9,54.4,16487
1,50,20130103,元大台灣50,54.9,55.05,54.65,54.85,29020
2,50,20130104,元大台灣50,54.85,54.85,54.4,54.5,9837
3,50,20130107,元大台灣50,54.55,54.55,53.9,54.25,8910
4,50,20130108,元大台灣50,54.0,54.2,53.65,53.9,12507


In [3]:

# Keep the first 1286 rows; per the original note these are the
# Taiwan 50 ETF (元大台灣50) quotes.
# .copy() makes the selection an independent frame, so columns added to it
# later do not write into a slice of the full table (which pandas reports
# as SettingWithCopyWarning).
etf = etf[:1286].copy()
etf.tail()

Unnamed: 0,code,date,name,open,high,low,close,volume
1281,50,20180327,元大台灣50,83.0,83.4,82.95,83.4,3277
1282,50,20180328,元大台灣50,82.9,82.9,82.2,82.25,4161
1283,50,20180329,元大台灣50,82.25,82.35,81.8,82.1,4099
1284,50,20180330,元大台灣50,82.65,83.05,82.65,82.85,4994
1285,50,20180331,元大台灣50,82.85,83.05,82.75,82.95,878


In [4]:
# Show the column labels after the rename.
etf.columns

Index(['code', 'date', 'name', 'open', 'high', 'low', 'close', 'volume'], dtype='object')

In [5]:



def CandleStick(df, n=5):
    """Add candlestick (K-bar) shape features to a daily OHLC price table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'open', 'high', 'low' and 'close'.
        Rows are used in their existing order: the "previous close" of a
        row is the 'close' of the row directly above it.
    n : int, default 5
        Kept for interface compatibility; the calculation does not use it.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with these columns appended, in this order:

        * ``prev_close`` - 'close' shifted down by one row.
        * ``open_gap``   - open minus prev_close.
        * ``K_body``     - close minus open (>= 0 is treated as a red bar).
        * ``up_shadow``  - high minus the top of the body.
        * ``low_shadow`` - bottom of the body minus low.

        The first row has NaN in ``prev_close`` and ``open_gap`` because
        there is no earlier row to take a close from.

    Notes
    -----
    The input frame is not modified. Working on a copy also avoids pandas'
    chained-assignment warning when ``df`` is itself a slice of another frame.
    """
    out = df.copy()

    # Opening position relative to the previous row's close.
    out['prev_close'] = out['close'].shift(1)
    out['open_gap'] = out['open'] - out['prev_close']

    # Signed body size: positive when the close is above the open.
    out['K_body'] = out['close'] - out['open']

    # A bar with K_body >= 0 counts as red; its body runs from open (bottom)
    # to close (top). Otherwise the body runs from close (bottom) to open (top).
    is_red = out['K_body'] >= 0
    body_top = np.where(is_red, out['close'], out['open'])
    body_bottom = np.where(is_red, out['open'], out['close'])

    # Shadows are the parts of the high-low range outside the body.
    out['up_shadow'] = out['high'] - body_top
    out['low_shadow'] = body_bottom - out['low']

    return out
 
   

In [6]:
# Compute the candlestick feature columns for the selected rows (n left at its default of 5).
df= CandleStick(etf)

In [7]:
# Show the column labels of the returned frame.
df.columns

Index(['code', 'date', 'name', 'open', 'high', 'low', 'close', 'volume',
       'prev_close', 'open_gap', 'K_body', 'up_shadow', 'low_shadow'],
      dtype='object')

In [8]:
# Display the frame returned by CandleStick.
df

Unnamed: 0,code,date,name,open,high,low,close,volume,prev_close,open_gap,K_body,up_shadow,low_shadow
0,50,20130102,元大台灣50,54.00,54.65,53.90,54.40,16487,,,0.40,0.25,0.10
1,50,20130103,元大台灣50,54.90,55.05,54.65,54.85,29020,54.40,0.50,-0.05,0.15,0.20
2,50,20130104,元大台灣50,54.85,54.85,54.40,54.50,9837,54.85,0.00,-0.35,0.00,0.10
3,50,20130107,元大台灣50,54.55,54.55,53.90,54.25,8910,54.50,0.05,-0.30,0.00,0.35
4,50,20130108,元大台灣50,54.00,54.20,53.65,53.90,12507,54.25,-0.25,-0.10,0.20,0.25
5,50,20130109,元大台灣50,53.75,54.30,53.75,54.10,7529,53.90,-0.15,0.35,0.20,0.00
6,50,20130110,元大台灣50,54.30,54.65,54.15,54.50,13953,54.10,0.20,0.20,0.15,0.15
7,50,20130111,元大台灣50,54.70,54.80,54.35,54.45,11837,54.50,0.20,-0.25,0.10,0.10
8,50,20130114,元大台灣50,54.00,54.50,53.80,54.50,7282,54.45,-0.45,0.50,0.00,0.20
9,50,20130115,元大台灣50,54.20,54.45,53.90,54.00,6609,54.50,-0.30,-0.20,0.25,0.10
