In [1]:
import pandas as pd
import numpy as np

## 기술적 지표를 위한 주식 가격 데이터 프레임

전처리의 목적은 기술적 지표와 기본적 지표를 미리 세팅해두고 다음날 값을 예측하기 위함이다.  
따라서 기술적 지표는 다음과 같이 설정한다.(목표 : 오늘의 가격을 토대로 내일의 종가 예측)    
즉, 예를 들면 2020년 2월 20일에 2월 21일 가격을 예측한다고 가정하면,   
- 1번, 4번,5번 은 2월20일과 19일을 이용하여 구한다.  
- 2번, 3번은 2월20일 종가와 시가를 이용한다.
- 6번~8번은 평균을 20일로 잡았으므로 지금 데이터셋이 2017년 1월부터 train set이므로 2017년1월1일을 기준으로 20일전평균을 구하면서 시작한다. 나머지는 하루전의 데이터만 추가로 활용한다.
- 여기서 1~8번 변수는 각각 A,B,C,D,E,F,G,H 라 설정

In [2]:
data1 = pd.read_csv('data/samsung2.csv')
data1 = data1[['Date','Open','High','Low','Close','Volume']]
data1 = data1.set_index('Date')
data1 = data1.loc['2016-12-01':'2019-12-31']

In [3]:
data_145=data1[20:] #1,4,5번은 이를 이용
data_23=data1[21:] # 2,3은 이를 이용
data_678=data1[1:] # 6,7,8은 이를 이용

In [4]:
data_145.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-12-29,35420,36040,35400,36040,7516450
2017-01-02,35980,36240,35880,36100,4650600
2017-01-03,36280,36620,36020,36480,7357650
2017-01-04,36500,36520,36100,36160,7971750
2017-01-05,36060,36060,35540,35560,10967450


In [5]:
data_23.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-01-02,35980,36240,35880,36100,4650600
2017-01-03,36280,36620,36020,36480,7357650
2017-01-04,36500,36520,36100,36160,7971750
2017-01-05,36060,36060,35540,35560,10967450
2017-01-06,36180,36440,36040,36200,8880950


In [6]:
data_678.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-12-02,34480,34760,34140,34540,15352950
2016-12-05,34340,34680,34220,34360,8484600
2016-12-06,34440,35200,34400,34960,14001800
2016-12-07,35040,35480,35040,35440,9603650
2016-12-08,35980,36020,35520,35800,16462550


#### 1. 시가, 전날 종가 비율

(open - last close) / last close

In [7]:
A=[]
for i in range(len(data_145)-1):    
    A.append(round((data_145['Open'][i+1] - data_145['Close'][i]) / data_145['Close'][i],3))

In [8]:
A=pd.DataFrame(A)
A.columns=['A']

#### 2. 고가, 종가 비율

(high - close) / close

In [9]:
B=[]
for i in range(len(data_23)):    
    B.append(round((data_23['High'][i] - data_23['Close'][i]) / data_23['Close'][i],3))

In [10]:
B=pd.DataFrame(B)
B.columns=['B']

#### 3. 저가, 종가 비율

(low - close) / close

In [11]:
C=[]
for i in range(len(data_23)):    
    C.append(round((data_23['Low'][i] - data_23['Close'][i]) / data_23['Close'][i],3))

In [12]:
C=pd.DataFrame(C)
C.columns=['C']

#### 4. 종가, 전날 종가 비율

(close - last close) / close

In [13]:
D=[]
for i in range(len(data_145)-1):    
    D.append(round((data_145['Close'][i+1] - data_145['Close'][i]) / data_145['Close'][i+1],3))

In [14]:
D=pd.DataFrame(D)
D.columns=['D']

#### 5. 거래량, 전날 거래랑 비율

(volume - last volume) / last volume

In [15]:
E=[]
for i in range(len(data_145)-1):    
    E.append(round((data_145['Volume'][i+1] - data_145['Volume'][i]) / data_145['Volume'][i],3))

In [16]:
E=pd.DataFrame(E)
E.columns=['E']

#### 6. 종가, 종가 20일 이동평균선 비율

(close - close ma20) / close ma20

In [17]:
def moving_average(data, w=5):
        
    ma_w=data.ewm(span=w).mean()
    ma_5=data.ewm(span=5).mean()
    ma_10=data.ewm(span=10).mean()
    ma_15=data.ewm(span=15).mean()
    ma_20=data.ewm(span=20).mean()
    ma_30=data.ewm(span=30).mean()
    ma_60=data.ewm(span=60).mean()
    
    result = np.array([data,ma_w,ma_5,ma_10,ma_15,ma_20,ma_30,ma_60])

    df = pd.DataFrame(result).T
    df.columns = ["close", "user_ma_{}".format(w), "ma_5", "ma_10","ma_15","ma_20","ma_30","ma_60" ]
    
    if w == 5:
        del df['user_ma_5']
    
    
    return df

In [18]:
close_ma20 = moving_average(data_678['Close'])['ma_20']

In [19]:
data_678['Close'][20:]

Date
2017-01-02    36100
2017-01-03    36480
2017-01-04    36160
2017-01-05    35560
2017-01-06    36200
              ...  
2019-12-23    55500
2019-12-24    55000
2019-12-26    55400
2019-12-27    56500
2019-12-30    55800
Name: Close, Length: 731, dtype: int64

In [20]:
F=[]
for i in range(len(data_678)-20):    
    F.append(round((data_678['Close'][i+20] - close_ma20[i]) / close_ma20[i] , 3))

In [21]:
F=pd.DataFrame(F)
F.columns=['F']

#### 7. 거래량, 거래량 20일 이동평균선 비율

(volume - volume ma20) / volume ma20

In [22]:
volume_ma20 = moving_average(data_678['Volume'])['ma_20']

In [23]:
G=[]
for i in range(len(data_678)-20):    
    G.append(round((data_678['Volume'][i+20] - volume_ma20[i]) / volume_ma20[i] , 3))

In [24]:
G=pd.DataFrame(G)
G.columns=['G']

#### 8. 볼린저 밴드 너비 전달 대비 비율

(close ma20 std *4 ) / last close ma20 std*4

In [25]:
def bollinger_band(data, w=20, k=2):
        
    mbb = data.rolling(w).mean()
    
    ubb = mbb + k * data.rolling(w).std()
    
    lbb = mbb - k * data.rolling(w).std()
    
    result = np.array([data,mbb,ubb,lbb])
    
    df = pd.DataFrame(result).T
    df.columns = ["close", "mbb", "ubb", "lbb"]
    
    return df

In [26]:
BB = bollinger_band(data_678['Close'])
BB['H']=BB['ubb']-BB['lbb']

In [27]:
H=BB[20:]['H'].dropna().reset_index()[['H']]

## 7개의 변수를 dataframe set으로 만들기

In [28]:
data_tech=pd.concat([A,B,C,D,E,F,G,H],axis=1)

In [29]:
data_tech.head()

Unnamed: 0,A,B,C,D,E,F,G,H
0,-0.002,0.004,-0.006,0.002,-0.381,0.045,-0.697,1932.154511
1,0.005,0.004,-0.013,0.01,0.582,0.059,-0.374,1650.079424
2,0.001,0.01,-0.002,-0.009,0.083,0.044,-0.366,1505.325284
3,-0.003,0.014,-0.001,-0.017,0.376,0.02,-0.064,1483.756257
4,0.017,0.007,-0.004,0.018,-0.19,0.032,-0.31,1521.417621


In [30]:
data_tech.to_csv('data/data_tech.csv')