# 기본 패키지 설정

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import collections
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier # 앙상블 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report # 정오분류표
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score 
from sklearn.metrics import roc_curve, auc, roc_auc_score,classification_report,confusion_matrix  # ROC곡선 그리기
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.model_selection import learning_curve, validation_curve # 학습곡선, 검증곡선
from sklearn.model_selection import GridSearchCV, cross_val_score  # 하이퍼파라미터 튜닝, 교차타당도
from sklearn.model_selection import KFold, StratifiedKFold
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# 데이터 읽기

In [2]:
naver_df = pd.read_csv("../../data/baseline_data.csv", 
                       index_col=['Date'], parse_dates=True)
hyundai_df = pd.read_csv("../../data/HyundaiMtr.csv", 
                       index_col=['Date'], parse_dates=True)
LG_df = pd.read_csv("../../data/LGCHEM.csv", 
                       index_col=['Date'], parse_dates=True)
Samsung_df = pd.read_csv("../../data/samsungElec.csv", 
                       index_col=['Date'], parse_dates=True)
amore_df = pd.read_csv("../../data/AmorePacific.csv", 
                       index_col=['Date'], parse_dates=True)

In [3]:
def labeling(origin_df):
    n_list = [1,3,5,7,10,20,30,60,90]
    for n in n_list:
        col_name = 'next_price'+str(n)
        origin_df[col_name] = origin_df['Close'].shift(-n)
        diffs = origin_df[col_name]-origin_df['Close']
        label_name = 'label'+str(n)
        origin_df[label_name] = np.where(diffs>0,1,0)
        
        df = origin_df['2011-01-01':'2020-12-31']
        df.dropna(inplace=True)
    return df

In [4]:
naver_df = labeling(naver_df)
hyundai_df = labeling(hyundai_df)
LG_df = labeling(LG_df)
Samsung_df = labeling(Samsung_df)
amore_df = labeling(amore_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


# Prediction

## Column List

In [5]:
feature1_list = ['Open','High','Low','Adj Close','Volume','log_return','Close','next_rtn']
feature2_list = ['RASD5','RASD10','ub','lb','CCI','ATR','MACD','MA5','MA10','MTM1','MTM3','ROC','WPR','middle']
feature3_list = ['S&P500', 'SOX', 'VIX','KOSPI']
feature4_list = ['next_price']
all_x_feature = feature1_list+feature2_list+feature3_list

## K-Fold

In [6]:
# 스태킹 함수
def get_stacking_datasets(model, Xtrain, ytrain, Xtest, n_folds=5):
  # cv하기 위해 K-fold 설정
  kfold = KFold(n_splits=n_folds, shuffle=True, random_state=1234)

  # 최종 메타 모델이 사용할 학습 데이터 반환을 위해서 넘파이 배열을 0으로 만들어서 초기화
  train_fold_pred = np.zeros((Xtrain.shape[0], 1)) # 2차원
  test_pred = np.zeros((Xtest.shape[0], n_folds))
  #print('model: ', model.__class__.__name__)

  for cnt, (train_index, valid_index) in enumerate(kfold.split(Xtrain)):
    # 개별 모델 내부에서 학습하고 1개의 fold로 예측할 데이터셋 추출
    #print(f" Fold 횟수 : {cnt+1}")
    X_tr = Xtrain[train_index]
    y_tr = ytrain[train_index]
    X_validation = Xtrain[valid_index]

    # 학습
    model.fit(X_tr, y_tr)
    # 1개의 fold데이터셋으로 예측값 반환 후 최종 메타모델이 학습할 데이터셋에 첨가
    train_fold_pred[valid_index, :] = model.predict(X_validation).reshape(-1, 1)
    # 해당 폴드에서 생성된 모델에게 원본 테스트 데이터(Xtest)를 이용해서 예측을 수행하고 저장
    test_pred[:, cnt] = model.predict(Xtest)

  # 개별 모델 안에서 테스트 데이터셋을 기반으로 예측한 결과값들 mean 취해주고 2차원으로 바꾸기
  test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)

  return train_fold_pred, test_pred_mean

## Stratified Fold

In [7]:
#스태킹 함수
def get_stacking_datasets2(model, X_train, y_train, X_test, n_folds=5):
  skf = StratifiedKFold(n_splits=n_folds)

  # 최종 메타 모델이 사용할 학습 데이터 반환을 위해서 넘파이 배열을 0으로 만들어서 초기화
  train_fold_pred2 = np.zeros((X_train.shape[0], 1))

  test_pred2 = np.zeros((X_test.shape[0], n_folds))
  #print('model: ', model.__class__.__name__)

  for cnt, (train_index, valid_index) in enumerate(skf.split(X_train, y_train)):
    # 개별 모델 내부에서 학습하고 1개의 fold로 예측할 데이터셋 추출
    #print(f" Fold 횟수 : {cnt+1}")
    X_tr2 = X_train[train_index]
    y_tr2 = y_train[train_index] 
    X_validation = X_train[valid_index]

    # 학습
    model.fit(X_tr2, y_tr2)
    # 1개의 fold데이터셋으로 예측값 반환 후 최종 메타모델이 학습할 데이터셋에 첨가
    train_fold_pred2[valid_index,:] = model.predict(X_validation).reshape(-1, 1)
     # 해당 폴드에서 생성된 모델에게 원본 테스트 데이터(X_test)를 이용해서 예측을 수행하고 저장
    test_pred2[:, cnt] = model.predict(X_test)

  test_pred_mean2 = np.mean(test_pred2, axis=1).reshape(-1, 1)
  return train_fold_pred2, test_pred_mean2

# Prediction

In [8]:
def predict_trend(label_list,df_list,stock_list):
    result = pd.DataFrame(
                      index=['ADA', 'LGBM','XGB','RF',
                               'Staking','KFold-Staking','StratifiedKFold'])

    for df,stock_name in zip(df_list,stock_list):
        print('-----',stock_name,'시작합니다.-----')
        for n in label_list:

            if set(df.columns[:27]) == set(all_x_feature + feature4_list):
                X = df[all_x_feature]

                # Min Max Scaler를 사용해 스케일링
                scaler = MinMaxScaler()
                scaler.fit(X)
                X_scaled = scaler.transform(X)

                label_name = 'label'+str(n)
                y = df[label_name]

                # train test split
                X_train, X_test, y_train, y_test = train_test_split(X_scaled,y, 
                                                                    test_size=0.2, 
                                                                    random_state=1234, 
                                                                    stratify=y)

                print('Lable'+str(n)+' 양성비율','train : %.3f' %(sum(y_train)/len(y_train)),end=' ')
                print('test : %.3f' %(sum(y_test)/len(y_test)))

                #tree
                tree = DecisionTreeClassifier(max_depth=4,
                                              criterion='entropy',
                                              random_state=1234)
                # Ada
                adaboost = AdaBoostClassifier(base_estimator=tree, 
                                        n_estimators=100,
                                        learning_rate = 0.01, 
                                        random_state=1234)
                # lgb
                lgb = LGBMClassifier(random_state=1234)

                # xgb
                xgb = XGBClassifier(tree_method='hist',random_state=41)

                # rf
                forest = RandomForestClassifier(criterion='entropy',
                                                n_estimators=700,
                                                random_state=1234)
                # stacking


                all_clf = [adaboost,lgb,xgb,forest]
                clf_label = ['ADA', 'LGBM','XGB','RF']

                pred = []
                stack_final_x_train = []
                stack_final_x_train2 = []
                stack_final_x_test = []
                stack_final_x_test2 = []
                acc = []

                for clf,label in zip(all_clf,clf_label):
                    clf.fit(X_train,y_train)
                    y_pred = clf.predict(X_test)

                    pred.append(y_pred)
                    acc.append(round(accuracy_score(y_test, y_pred),2))
                    clf_train, clf_test = get_stacking_datasets(clf,X_train,y_train, X_test)
                    stack_final_x_train.append(clf_train)
                    stack_final_x_test.append(clf_test)

                    clf_train2, clf_test2 = get_stacking_datasets2(clf,X_train,y_train, X_test)
                    stack_final_x_train2.append(clf_train2)
                    stack_final_x_test2.append(clf_test2)


                # Staking
                pred = np.transpose(pred)
                lr_final = LogisticRegression(C=10)
                lr_final.fit(pred, y_test)
                final = lr_final.predict(pred)

                acc.append(round(accuracy_score(final, y_test),2))


                # KFold
                stack_final_x_train = np.concatenate(tuple(stack_final_x_train), axis=1)
                stack_final_x_test = np.concatenate(tuple(stack_final_x_test), axis=1)
                lr_final = LogisticRegression(C=10)
                lr_final.fit(stack_final_x_train, y_train)
                stack_final_pred = lr_final.predict(stack_final_x_test)
                acc.append(round(accuracy_score(stack_final_pred, y_test),2))

                # Stratified KFold
                stack_final_x_train2 = np.concatenate(tuple(stack_final_x_train2), axis=1)
                stack_final_x_test2 = np.concatenate(tuple(stack_final_x_test2), axis=1)
                lr_final = LogisticRegression(C=10)
                lr_final.fit(stack_final_x_train2, y_train)
                stack_final_pred2 = lr_final.predict(stack_final_x_test2)
                acc.append(round(accuracy_score(stack_final_pred2, y_test),2))

                col_name = stock_name + str(n)
                result[col_name] = acc

    return result

In [9]:
label_list= [1,7,30,90]
df_list = [naver_df,hyundai_df,LG_df,Samsung_df,amore_df]
stock_list = ['naver','hyundai','LG','Samsung','amore']

In [10]:

acc_df = predict_trend(label_list,df_list,stock_list)

----- naver 시작합니다.-----
Lable1 양성비율 train : 0.480 test : 0.480


KeyboardInterrupt: 

In [None]:
acc_df

In [None]:
col_list = [stock + str(label) for label in label_list for stock in stock_list]
acc_df = acc_df[col_list].T

In [None]:
acc_df

In [None]:
from datetime import datetime
day = datetime.today().day
Path = "../../data/stacking_result"+str(day)+".xlsx" 
acc_df.to_excel(Path)