# 03_ 주가 관련 보조지표를 활용하여 등락 예측_카카오

In [1]:
# 기본적으로 필요한 라이브러리
import numpy as np
import pandas as pd


#시각화 라이브러리
import matplotlib.pyplot as plt 
import seaborn as sns

#datetime 형식으로 바꿔주는 라이브러리
from datetime import datetime

In [2]:
stock = pd.read_csv('kakao.csv')
stock.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2020-09-01,82700,83200,80000,80300,5503680
1,2020-09-02,81100,82500,80300,82400,4608275
2,2020-09-03,83100,83200,80800,82000,3696075
3,2020-09-04,78000,80900,77900,80400,5842645
4,2020-09-07,80300,80300,78200,78400,4644695


In [3]:
stock = stock.set_index('Date') # date를 index로 설정
stock = stock.loc['2020-09-01' : '2021-06-30']

## 1.1 머신러닝 준비

In [4]:
# 종가 기준 fluctuation 추가
def up_down(x):
    if x >= 0:
        return 1
    else :
        return 0

stock['fluctuation'] = (stock['Close'].shift(-1)-stock['Close']).apply(up_down)
stock.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,fluctuation
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-09-01,82700,83200,80000,80300,5503680,1
2020-09-02,81100,82500,80300,82400,4608275,0
2020-09-03,83100,83200,80800,82000,3696075,0
2020-09-04,78000,80900,77900,80400,5842645,0
2020-09-07,80300,80300,78200,78400,4644695,0


In [5]:
stock_train = stock.iloc[:-1]
target = stock_train['fluctuation']
stock_train = stock_train.drop('fluctuation', axis = 1)

from sklearn.model_selection import train_test_split
train, test, train_target, test_target = train_test_split(
    stock_train, target, test_size=0.3, shuffle=False)

## 1.2 모델링

In [6]:
# Importing Classifier Modules
import warnings
warnings.filterwarnings(action='ignore')

from sklearn.linear_model import LogisticRegression   # logistic
from sklearn.neighbors import KNeighborsClassifier   # knn
from sklearn.naive_bayes import GaussianNB   # naive
from lightgbm import LGBMClassifier   # lightGBM
from sklearn import datasets
from sklearn import metrics

In [7]:
# 모형 생성
import lightgbm as lgb
logistic = LogisticRegression()
knn = KNeighborsClassifier()
naive = GaussianNB()
lgb = lgb.LGBMClassifier()

models = [{'name' : 'Logistic', 'model' : logistic}, {'name' : 'KNN', 'model' : knn},
          {'name' : 'lgb.LGBM', 'model' : lgb}, {'name' : 'NaiveBayes', 'model' : naive}]

for m in models : 
    model = m['model']
    model.fit(train, train_target)
    
    predicted = model.predict(test)

#Accuracy : 전체 샘플 중 맞게 예측한 샘플수의 비율
#Precision(정밀도) : postive라고 예측한 것 중에서 실제 postive인 것
#Recall(재현율) : 실제 postive중에 예측한 postive 비율
    print ('model name : {}'.format(m['name']))
    print (metrics.classification_report(test_target, predicted))

#confusion_matrix에서
#행은 실제값, 열은 예측한 값으로 0 1 순서대로 임
    print('Confusion Matrix') 
    print (metrics.confusion_matrix (test_target, predicted))

    print ('Accuracy Score : {:.4f}\n'.format(metrics.accuracy_score(test_target, predicted)))

model name : Logistic
              precision    recall  f1-score   support

           0       0.19      0.30      0.24        20
           1       0.55      0.40      0.47        42

    accuracy                           0.37        62
   macro avg       0.37      0.35      0.35        62
weighted avg       0.43      0.37      0.39        62

Confusion Matrix
[[ 6 14]
 [25 17]]
Accuracy Score : 0.3710

model name : KNN
              precision    recall  f1-score   support

           0       0.16      0.20      0.18        20
           1       0.57      0.50      0.53        42

    accuracy                           0.40        62
   macro avg       0.36      0.35      0.35        62
weighted avg       0.44      0.40      0.42        62

Confusion Matrix
[[ 4 16]
 [21 21]]
Accuracy Score : 0.4032

model name : lgb.LGBM
              precision    recall  f1-score   support

           0       0.40      0.20      0.27        20
           1       0.69      0.86      0.77        42


In [8]:
def rate_of_return():
    df['percent'] = round((df.Close-df.Close.shift(1))/df.Close.shift(1)*100, 2)
    for i in range(len(df)-1):
        if (df.loc[i, 'predicted'] == 0):
            df.loc[i+1, 'percent'] = df.loc[i+1, 'percent']

In [9]:
for m in models : 
    model = m['model']
    model.fit(train, train_target)
    
    predicted = model.predict(test)
    
    df = pd.concat([test.reset_index().drop('Date', axis=1), pd.DataFrame(predicted, columns = ['predicted'])], axis=1)
    
    rate_of_return()
    
    df.dropna(inplace = True)

    print('model name : {}'.format(m['name']))
    print('첫날을 제외한 거래일수 : {}'.format(len(df)))
    print('누적 수익률 : {}'.format(round(df['percent'].sum(), 2)))
    print('1일 평균 수익률 : {}\n'.format(round(df['percent'].sum()/(len(df)-1),2)))

model name : Logistic
첫날을 제외한 거래일수 : 61
누적 수익률 : 48.11
1일 평균 수익률 : 0.8

model name : KNN
첫날을 제외한 거래일수 : 61
누적 수익률 : 48.11
1일 평균 수익률 : 0.8

model name : lgb.LGBM
첫날을 제외한 거래일수 : 61
누적 수익률 : 48.11
1일 평균 수익률 : 0.8

model name : NaiveBayes
첫날을 제외한 거래일수 : 61
누적 수익률 : 48.11
1일 평균 수익률 : 0.8



## 1.3 주가 관련 보조지표를 활용하여 등락 예측

In [10]:
# StockInsider 패키지를 사용해 여러가지 주가 관련 보조지표 산출
# 하지만 중국에서 개발된 패키지라 한국에 상장된 종목 코드는 등록되어 있지 않다.
# from_external_csv_data 함수를 이용해 코스피 종목의 주식데이터를 받아서 StockInsider 패키지에 넣어줘야 한다.

def get_stock(code):
    import FinanceDataReader as fdr
    from insider import StockInsider
    import os

    df=fdr.DataReader("code")
    df["day"]=df.index
    df.columns=['open', 'close','high', 'low', 'volumn', "price_change",'day']

    df["ma5"]=df["close"].rolling(5).mean()
    df["ma10"]=df["close"].rolling(10).mean()
    df["ma20"]=df["close"].rolling(20).mean()
    df["v_ma5"]=df["volumn"].rolling(5).mean()
    df["v_ma10"]=df["volumn"].rolling(10).mean()
    df["v_ma20"]=df["volumn"].rolling(20).mean()
    df["percent_change"]=df["close"].pct_change()

    
    if os.path.exists("./data")==False:

        os.mkdir("./data")

    df.to_csv("./data/{}.csv".format(code))
    fpath = "./data/{}.csv".format(code)
    si = StockInsider.from_external_csv_data(fpath=fpath, code=code)

    return si
    
# 주사 데이터는 FinanceDataReader 패키지로 가져오고, 전처리를 후에 csv파일로 저장한다.
# 이 파일을 이용해서 StockInsider 객체를 만든다

In [11]:
import FinanceDataReader as fdr
si=get_stock("03572")
si.plot()

In [12]:
# 이동평균선
def ma(df):
    #이동평균 칼럼 추가 당일 포함 3일 7일 15일 30일
    df['MA_3'] = df['Close'].rolling(window=3).mean() 
    df['MA_7'] = df['Close'].rolling(window=7).mean()
    df['MA_15'] = df['Close'].rolling(window=15).mean() 
    df['MA_30'] = df['Close'].rolling(window=30).mean()

In [13]:
# 지수이동평균선
def ema(df, day):
    df['EWM_{}'.format(day)] = df['Close'].ewm(span=day).mean()
    df['EWM_{}'.format(day)] = df['Close'].ewm(span=day).mean()

In [14]:
ema(stock, 15)

In [15]:
# 이격도 (PPO : Percentage Price oscillator)
def ppo(df, day):
    df['PPO_{}'.format(day)] = (df['Close']/df['MA_{}'.format(day)])*100

In [16]:
# RSI
def U(x):
    if x >= 0:
        return x
    else :
        return 0
    
def D(x):
    if x <= 0:
        return x*(-1)
    else :
        return 0

def rsi(df):
    df['diff_rsi'] = (df['Close'].shift(1)-df['Close'])
    df['AU'] = df['diff_rsi'].apply(U).rolling(window=14).mean() 
    df['AD'] = df['diff_rsi'].apply(D).rolling(window=14).mean() 
    df['RSI'] = df['AU']/(df['AU']+df['AD'])

In [17]:
# rsi(stock)
# stock.drop(['diff_rsi', 'AU', 'AD'], axis=1, inplace = True)

In [18]:
# 모멘텀 스토캐스틱 %K, %D, Fast, Slow
def high_low(day):
    global stock
    stock = stock.reset_index()
    for i in range(len(stock)-day+1):
        stock.loc[i, 'High_st']= stock[i:i+day]['High'].max()
        stock.loc[i, 'Low_st']= stock[i:i+day]['Low'].min()
        
    stock['High_st_4'] = stock['High_st'].shift(4)
    stock['Low_st_4'] = stock['Low_st'].shift(4)
    
    stock['fast_K'] = (stock['Close']-stock['Low_st_4'])/(stock['High_st_4']-stock['Low_st_4'])
    stock['fast_D'] = stock['fast_K'].rolling(3).mean()
    stock['slow_K'] = stock['fast_D']
    stock['slow_D'] = stock['slow_K'].rolling(3).mean()
    stock = stock.set_index('Date')

In [19]:
stock['High_st'] = np.nan
stock['Low_st'] = np.nan
high_low(5)

In [20]:
# CCI
def CCI(df):
    #CCI = (M-N) / (0.015*D)
    # M=특정일의 고가,저가, 종가의 평균
    # N = 일정기간동안의 단순이동평균 통상적으로 20일로 사용
    # D = M-N의 일정기간동안의 단순이동평균
    M = ((df.High)+(df.Low)+(df.Close)) / 3
    N = M.rolling(20).mean()
    D = (M-N).rolling(20).mean()
    CCI = (M - N)/ (0.015 * D)
    stock['CCI'] = CCI

In [21]:
# mackd
def macd(df, short=12, long=26, t=9):
    
    ma_12=df.Close.ewm(span=12).mean()
    ma_26 = df.Close.ewm(span=26).mean() # 장기(26) EMA
    macd = ma_12 - ma_26 # MACD
    macdSignal = macd.ewm(span=9).mean() # Signal
    macdOscillator = macd - macdSignal # Oscillator
    stock['macd'] = macdOscillator

In [22]:
macd(stock)

In [23]:
stock = stock.dropna()

## 1.4 적정 보조지표 찾기

In [24]:
from sklearn import datasets
from sklearn import metrics

# 분류모형
import lightgbm as lgb
logistic = LogisticRegression()
knn = KNeighborsClassifier()
naive = GaussianNB()
lgb = lgb.LGBMClassifier()

models = [{'name' : 'Logistic', 'model' : logistic}, {'name' : 'KNN', 'model' : knn},
          {'name' : 'lgb.LGBM', 'model' : lgb}, {'name' : 'NaiveBayes', 'model' : naive}]

In [25]:
while (True):
    index = []
    i = 0
    
    if i == 0:
        stock_train = stock.iloc[:-1]
        target = stock_train['fluctuation']
        stock_train = stock_train.drop('fluctuation', axis = 1)

        train, test, train_target, test_target = train_test_split(
            stock_train, target, test_size=0.3, shuffle=False)
        
        accuracy = []
        
        for m in models:
            model = m['model']
            model.fit(train, train_target)
    
            predicted = model.predict(test)
    
            score = metrics.accuracy_score(test_target, predicted)
            accuracy.append(score)
        
        init_score = np.mean(accuracy)
        
        print(i, ' iteration, initial score : {}'.format(round(init_score, 4)))
        
        i += 1    
        
    if i == 1:
        stock_2 = stock.copy()
        
        index2 = [ma, ema, ppo]
        
        for dex in index2:
            if dex == ma:
                dex(stock_2)
            if dex == ema:
                dex(stock_2, 15)
                dex(stock_2, 30)
            if dex == ppo:
                dex(stock_2, 15)
                dex(stock_2, 30)
        
        stock_2 = stock_2.dropna()
        stock_2 = stock_2.iloc[len(stock_2)-160:]
        
        today = stock_2.iloc[-1]
        stock_2_df = stock_2.iloc[:-1]
        target = stock_2_df['fluctuation']
        stock_2_df = stock_2_df.drop('fluctuation', axis = 1)

        train, test, train_target, test_target = train_test_split(stock_2_df, target, test_size = 0.3, shuffle=False )
        
        accuracy = []
        
        for m in models:
            
            model = m['model']
            model.fit(train, train_target)
    
            predicted = model.predict(test)
    
            score = metrics.accuracy_score(test_target, predicted)
            accuracy.append(score)
        
        accuracy_score = np.mean(accuracy)
        
        print(i, ' iteration, Accuracy score : {}'.format(round(accuracy_score, 4)))
        
        if init_score <= accuracy_score:
            index = index2
            
        i += 1
        
    
    if i == 2:
        stock_2 = stock.copy()
        
        rsi(stock_2)
        stock_2.drop(['diff_rsi', 'AU', 'AD'], axis=1, inplace = True)
        
        stock_2 = stock_2.dropna()
        stock_2 = stock_2.iloc[len(stock_2)-160:]
        
        today = stock_2.iloc[-1]
        stock_2_df = stock_2.iloc[:-1]
        target = stock_2_df['fluctuation']
        stock_2_df = stock_2_df.drop('fluctuation', axis = 1)

        train, test, train_target, test_target = train_test_split(
            stock_2_df, target, test_size = 0.3, shuffle=False )
        
        accuracy = []
        
        for m in models:
            
            model = m['model']
            model.fit(train, train_target)
    
            predicted = model.predict(test)
    
            score = metrics.accuracy_score(test_target, predicted)
            accuracy.append(score)
        
        accuracy_score = np.mean(accuracy)
        
        print(i, ' iteration, Accuracy score : {}'.format(round(accuracy_score, 4)))
        
        if init_score <= accuracy_score:
            index.append(rsi)
        
        i += 1
        
    if i == 3:
        stock_2 = stock.copy()
        
        stock_2['High_st'] = np.nan
        stock_2['Low_st'] = np.nan
        high_low(5)
        stock_2 = stock_2.drop(['High_st', 'Low_st', 'High_st_4', 'Low_st_4', 'fast_K', 'fast_D'], axis = 1)
        
        stock_2 = stock_2.dropna()
        stock_2 = stock_2.iloc[len(stock_2)-160:]
        
        today = stock_2.iloc[-1]
        stock_2_df = stock_2.iloc[:-1]
        target = stock_2_df['fluctuation']
        stock_2_df = stock_2_df.drop('fluctuation', axis = 1)

        train, test, train_target, test_target = train_test_split(
            stock_2_df, target, test_size = 0.3, shuffle=False )
        
        accuracy = []
        
        for m in models:
            
            model = m['model']
            model.fit(train, train_target)
    
            predicted = model.predict(test)
    
            score = metrics.accuracy_score(test_target, predicted)
            accuracy.append(score)
        
        accuracy_score = np.mean(accuracy)
        
        print(i, ' iteration, Accuracy score : {}'.format(round(accuracy_score, 4)))
        
        if init_score <= accuracy_score:
            index.append(high_low)
           
        i += 1
        
        
    if i == 4:
        stock_2 = stock.copy()
        
        CCI(stock_2)
        
        stock_2 = stock_2.dropna()
        stock_2 = stock_2.iloc[len(stock_2)-160:]
        
        today = stock_2.iloc[-1]
        stock_2_df = stock_2.iloc[:-1]
        target = stock_2_df['fluctuation']
        stock_2_df = stock_2_df.drop('fluctuation', axis = 1)

        train, test, train_target, test_target = train_test_split(
            stock_2_df, target, test_size = 0.3, shuffle=False )
        
        accuracy = []
        
        for m in models:
            
            model = m['model']
            model.fit(train, train_target)
    
            predicted = model.predict(test)
    
            score = metrics.accuracy_score(test_target, predicted)
            accuracy.append(score)
        
        accuracy_score = np.mean(accuracy)
        
        print(i, ' iteration, Accuracy score : {}'.format(round(accuracy_score, 4)))
        
        if init_score <= accuracy_score:
            index.append(CCI)

        break

0  iteration, initial score : 0.5905
1  iteration, Accuracy score : 0.6406
2  iteration, Accuracy score : 0.6146
3  iteration, Accuracy score : 0.5938
4  iteration, Accuracy score : 0.6198


In [None]:
# # stock_2 = stock.copy()
        
# #     for dex in index:
# #         if dex == ma:
# #             dex(stock_2)
# #         if dex == ema:
# #             dex(stock_2, 15)
# #             dex(stock_2, 30)
# #         if dex == ppo:
# #             dex(stock_2, 15)
# #             dex(stock_2, 30)
# #         if dex == rsi:
# #             dex(stock_2)
# #             stock_2.drop(['diff_rsi', 'AU', 'AD'], axis=1, inplace = True)
# #         if dex == high_low:
# #             stock_2['high_st'] = np.nan
# #             stock_2['low_st'] = np.nan
# #             dex(5)
# #             stock_2.drop(['high_st', 'low_st', 'high_st_4', 'low_st_4', 'fast_K', 'fast_D'], axis = 1, inplace = True)
# #         if dex == CCI:
# #             dex(stock_2)
    
        
# #     stock_2 = stock_2.dropna()
# #     stock_2 = stock_2.iloc[len(stock_2)-160:]
        
# #     today = stock_2.iloc[-1]
# #     stock_2_df = stock_2.iloc[:-1]
# #     target = stock_2_df['fluctuation']
# #     stock_2_df = stock_2_df.drop('fluctuation', axis = 1)

# #     train, test, train_target, test_target = train_test_split(stock_2_df, target, test_size = 0.3, shuffle=False )
        
# #     accuracy = []
        
# #     for m in models:
            
# #         model = m['model']
# #         model.fit(train, train_target)
    
# #         predicted = model.predict(test)
    
# #         score = metrics.accuracy_score(test_target, predicted)
# #         accuracy.append(score)
        
# #     accuracy_score = np.mean(accuracy)
        
# #     print('Final iteration, score : {}'.format(round(accuracy_score, 4)))
# #     break

In [27]:
# 분류모형
import lightgbm as lgb
logistic = LogisticRegression()
knn = KNeighborsClassifier()
naive = GaussianNB()
lgb = lgb.LGBMClassifier()

models = [{'name' : 'Logistic', 'model' : logistic}, {'name' : 'KNN', 'model' : knn},
          {'name' : 'lgb.LGBM', 'model' : lgb}, {'name' : 'NaiveBayes', 'model' : naive}]


for m in models : 
    model = m['model']
    model.fit(train, train_target)
    
    predicted = model.predict(test)

#Accuracy : 전체 샘플 중 맞게 예측한 샘플수의 비율
#Precision(정밀도) : postive라고 예측한 것 중에서 실제 postive인 것
#Recall(재현율) : 실제 postive중에 예측한 postive 비율
    print ('model name : {}'.format(m['name']))
    print (metrics.classification_report(test_target, predicted))

#confusion_matrix에서
#행은 실제값, 열은 예측한 값으로 0 1 순서대로 임
    print('Confusion Matrix') 
    print (metrics.confusion_matrix (test_target, predicted))

    print ('Accuracy Score : {:.4f}\n'.format(metrics.accuracy_score(test_target, predicted)))

model name : Logistic
              precision    recall  f1-score   support

           0       0.62      0.72      0.67        18
           1       0.81      0.73      0.77        30

    accuracy                           0.73        48
   macro avg       0.72      0.73      0.72        48
weighted avg       0.74      0.73      0.73        48

Confusion Matrix
[[13  5]
 [ 8 22]]
Accuracy Score : 0.7292

model name : KNN
              precision    recall  f1-score   support

           0       0.40      0.33      0.36        18
           1       0.64      0.70      0.67        30

    accuracy                           0.56        48
   macro avg       0.52      0.52      0.52        48
weighted avg       0.55      0.56      0.55        48

Confusion Matrix
[[ 6 12]
 [ 9 21]]
Accuracy Score : 0.5625

model name : lgb.LGBM
              precision    recall  f1-score   support

           0       0.36      0.22      0.28        18
           1       0.62      0.77      0.69        30


In [28]:
for m in models : 
    model = m['model']
    model.fit(train, train_target)
    
    predicted = model.predict(test)
    
    df = pd.concat([test.reset_index().drop('Date', axis=1), pd.DataFrame(predicted, columns = ['predicted'])], axis=1)
    
    rate_of_return()
    
    df.dropna(inplace = True)

    print('model name : {}'.format(m['name']))
    print('첫날을 제외한 거래일수 : {}'.format(len(df)))
    print('누적 수익률 : {}'.format(round(df['percent'].sum(), 2)))
    print('1일 평균 수익률 : {}\n'.format(round(df['percent'].sum()/(len(df)-1),2)))

model name : Logistic
첫날을 제외한 거래일수 : 47
누적 수익률 : 35.31
1일 평균 수익률 : 0.77

model name : KNN
첫날을 제외한 거래일수 : 47
누적 수익률 : 35.31
1일 평균 수익률 : 0.77

model name : lgb.LGBM
첫날을 제외한 거래일수 : 47
누적 수익률 : 35.31
1일 평균 수익률 : 0.77

model name : NaiveBayes
첫날을 제외한 거래일수 : 47
누적 수익률 : 35.31
1일 평균 수익률 : 0.77



## 3. 감성분석 데이터를 포함하여 등락 예측