> TOC
```
Step 1. 데이터 불러오기
Step 2. 예측값 정의
Step 3. 모델 인스턴스 생성 및 파이프라인 정의
Step 4. 모델링 및 평가
     4-1. 기본 데이터 모델링
     4-2. 차분 데이터 모델링
     4-3. 로그차분 데이터 모델링
```

## Step 1. 데이터 불러오기

In [1]:
import pandas as pd
import numpy as np

import yfinance as yf

import matplotlib.pyplot as plt
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 (renamed to
# 'seaborn-v0_8') and removed in later releases. Prefer the new name and fall
# back to the old one only when running on an older matplotlib.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

import warnings
warnings.filterwarnings(action='ignore')


In [2]:
# yfinance handle for the 'TSLA' symbol; the price history is pulled from it in the next cell.
stocks = yf.Ticker('TSLA')

In [3]:
# Download the price history for the requested window. The first row returned
# is 2010-06-29 (see the output below), i.e. later than the requested start date.
# NOTE(review): this is a live network call, so the rows returned can change
# between runs — consider caching the frame to disk for reproducibility.
df = stocks.history(start='2010-01-01', end='2022-07-08')
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
2010-06-30,5.158000,6.084000,4.660000,4.766000,85935500,0,0.0
2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000,0,0.0
2010-07-02,4.600000,4.620000,3.742000,3.840000,25699000,0,0.0
2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500,0,0.0
...,...,...,...,...,...,...,...
2022-06-30,673.530029,688.369995,656.590027,673.419983,31533500,0,0.0
2022-07-01,681.000000,690.690002,666.359985,681.789978,24781500,0,0.0
2022-07-05,669.000000,699.440002,648.500000,699.200012,28193700,0,0.0
2022-07-06,692.340027,703.690002,681.559998,695.200012,23951200,0,0.0


## Step 2. 예측값 정의

- `up` : 전일 대비 종가 상승 여부 (전일 대비 종가 변동이 0 이상이면 1, 그 외에는 0. 전일 값이 없는 첫 번째 행은 0으로 처리됨)

In [4]:
# Label a row 1 when the close did not fall versus the previous row
# (change >= 0, so an unchanged close also counts as 1), otherwise 0.
# The first row has no previous close: its change is NaN, the comparison is
# False, and it is therefore labelled 0.
df['up'] = df['Close'].diff().ge(0).astype('int64')
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,up
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0,0
2010-06-30,5.158000,6.084000,4.660000,4.766000,85935500,0,0.0,0
2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000,0,0.0,0
2010-07-02,4.600000,4.620000,3.742000,3.840000,25699000,0,0.0,0
2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500,0,0.0,0
...,...,...,...,...,...,...,...,...
2022-06-30,673.530029,688.369995,656.590027,673.419983,31533500,0,0.0,0
2022-07-01,681.000000,690.690002,666.359985,681.789978,24781500,0,0.0,1
2022-07-05,669.000000,699.440002,648.500000,699.200012,28193700,0,0.0,1
2022-07-06,692.340027,703.690002,681.559998,695.200012,23951200,0,0.0,0


In [5]:
df['Close'].diff()

Date
2010-06-29          NaN
2010-06-30    -0.012000
2010-07-01    -0.374000
2010-07-02    -0.552000
2010-07-06    -0.618000
                ...    
2022-06-30   -12.049988
2022-07-01     8.369995
2022-07-05    17.410034
2022-07-06    -4.000000
2022-07-07    38.429993
Name: Close, Length: 3027, dtype: float64

In [6]:
# Keep only the OHLCV columns plus the label, and lower-case the column names.
df = df[['Open', 'High', 'Low', 'Close', 'Volume', 'up']].rename(columns=str.lower)
df

Unnamed: 0_level_0,open,high,low,close,volume,up
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0
2010-06-30,5.158000,6.084000,4.660000,4.766000,85935500,0
2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000,0
2010-07-02,4.600000,4.620000,3.742000,3.840000,25699000,0
2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500,0
...,...,...,...,...,...,...
2022-06-30,673.530029,688.369995,656.590027,673.419983,31533500,0
2022-07-01,681.000000,690.690002,666.359985,681.789978,24781500,1
2022-07-05,669.000000,699.440002,648.500000,699.200012,28193700,1
2022-07-06,692.340027,703.690002,681.559998,695.200012,23951200,0


In [7]:
# Features are the five OHLCV columns; the target is the `up` label column.
X = df.loc[:, ['open', 'high', 'low', 'close', 'volume']]
y = df.loc[:, 'up']

In [8]:
X

Unnamed: 0_level_0,open,high,low,close,volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500
2010-06-30,5.158000,6.084000,4.660000,4.766000,85935500
2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000
2010-07-02,4.600000,4.620000,3.742000,3.840000,25699000
2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500
...,...,...,...,...,...
2022-06-30,673.530029,688.369995,656.590027,673.419983,31533500
2022-07-01,681.000000,690.690002,666.359985,681.789978,24781500
2022-07-05,669.000000,699.440002,648.500000,699.200012,28193700
2022-07-06,692.340027,703.690002,681.559998,695.200012,23951200


In [9]:
y

Date
2010-06-29    0
2010-06-30    0
2010-07-01    0
2010-07-02    0
2010-07-06    0
             ..
2022-06-30    0
2022-07-01    1
2022-07-05    1
2022-07-06    0
2022-07-07    1
Name: up, Length: 3027, dtype: int64

## Step 3. 모델 인스턴스 생성 및 파이프라인 정의

In [10]:
# model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# method
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [11]:
# One instance per classifier; the same objects are re-fitted for every
# experiment below. The tree ensembles get a fixed seed so their scores are
# identical on every Restart & Run All (RandomForestClassifier in particular
# draws random bootstrap samples / feature subsets when unseeded).
lr = LogisticRegression()
rfc = RandomForestClassifier(random_state=42)
xgb = XGBClassifier(random_state=42)
lgb = LGBMClassifier(random_state=42)

In [12]:
def show_ml_acc(X, y, model, diff=False, test_size=0.2, shuffle=True, random_state=42):
    """Fit ``model`` to predict the next row's label and report hold-out accuracy.

    Row ``t`` of ``X`` is paired with row ``t + 1`` of ``y``: the features of
    one row are used to predict the label of the following row. The paired
    data is split with ``train_test_split``, the model is fitted on the
    training part, and the accuracy plus a classification report for the test
    part are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows, positionally aligned with ``y`` (same length).
    y : pandas.Series
        One label per row of ``X``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so any previous fit of the same object is overwritten.
    diff : bool, default False
        Pass True when ``X`` is a differenced frame whose first row is NaN;
        that first row (and its paired label) is skipped.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    shuffle : bool, default True
        Forwarded to ``train_test_split``. With the default, test rows are
        interleaved in time with training rows; pass ``shuffle=False`` to hold
        out the most recent rows instead (chronological split).
    random_state : int or None, default 42
        Forwarded to ``train_test_split``.

    Returns
    -------
    float
        Fraction of test rows whose predicted label equals the true label.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not have the same number of rows.
    """
    if len(X) != len(y):
        raise ValueError(
            f'X and y must have the same number of rows, got {len(X)} and {len(y)}'
        )

    # Skip the leading all-NaN row of a differenced frame; otherwise start at row 0.
    start = 1 if diff else 0
    # Features of row t ...
    features = X.iloc[start:-1]
    # ... are paired with the label of row t + 1.
    target = y.iloc[start + 1:]

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        target,
                                                        test_size=test_size,
                                                        shuffle=shuffle,
                                                        random_state=random_state)

    print(f'Model : {model.__class__.__name__}')
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    # Compare positionally as plain arrays so the pandas index plays no role.
    acc = np.mean(np.asarray(y_pred) == np.asarray(y_test))
    print(f'Accuracy : {acc:.2f}')
    # zero_division=0 reports 0.0 for a class that is never predicted (the same
    # value the default produces) without relying on warnings being silenced.
    print(classification_report(y_test, y_pred, zero_division=0))
    return acc

## Step 4. 모델링 및 평가

### 4-1. 기본 데이터

In [13]:
# Score each classifier on the raw OHLCV features and collect the accuracies.
acc_li = []
for model in (lr, rfc, xgb, lgb):
    acc_li += [show_ml_acc(X, y, model, diff=False)]
# Separator line followed by the mean accuracy over the four models.
print('='*50, f'Mean of Acc : {np.mean(acc_li)}', sep='\n')

Model : LogisticRegression
Accuracy : 0.51
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       298
           1       0.51      1.00      0.67       308

    accuracy                           0.51       606
   macro avg       0.25      0.50      0.34       606
weighted avg       0.26      0.51      0.34       606

Model : RandomForestClassifier
Accuracy : 0.52
              precision    recall  f1-score   support

           0       0.52      0.48      0.50       298
           1       0.53      0.57      0.55       308

    accuracy                           0.52       606
   macro avg       0.52      0.52      0.52       606
weighted avg       0.52      0.52      0.52       606

Model : XGBClassifier
Accuracy : 0.50
              precision    recall  f1-score   support

           0       0.49      0.40      0.44       298
           1       0.51      0.59      0.54       308

    accuracy                           0.50       606


logistic regression처럼 precision과 recall이 극명히 차이 나는 경우, 해당 태스크에 적합하지 않은 모델일 수 있음. 예측값을 보면 전부 1(상승)으로 예측.  
그렇기 때문에 1(상승)에 대한 recall이 100%(1.00)가 되고, 반대로 0(하락)에 대한 recall은 0%가 되는 것.

In [14]:
# Rebuild the same split that show_ml_acc uses for diff=False (same slices,
# test size and seed) so the logistic regression's individual predictions can
# be inspected in the following cells.
X_train, X_test, y_train, y_test = train_test_split(
    X.iloc[:-1],   # features of row t
    y.iloc[1:],    # label of row t + 1
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
lr.fit(X_train, y_train)

LogisticRegression()

In [15]:
lr.predict_proba(X_test)

array([[0.49388944, 0.50611056],
       [0.49688264, 0.50311736],
       [0.43570036, 0.56429964],
       ...,
       [0.48676914, 0.51323086],
       [0.48193862, 0.51806138],
       [0.4525924 , 0.5474076 ]])

In [16]:
lr.predict(X_test)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [17]:
y_test.values

array([0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,

### 4-2. 차분 데이터

In [19]:
# First-order difference of every feature column (each row minus the previous row).
# The first row has no predecessor and is all-NaN (see the output below);
# show_ml_acc(..., diff=True) skips that row.
X_diff = X.diff()
X_diff

Unnamed: 0_level_0,open,high,low,close,volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-06-29,,,,,
2010-06-30,1.358000,1.084000,1.152000,-0.012000,-7896000.0
2010-07-01,-0.158000,-0.900000,-0.606000,-0.374000,-44841500.0
2010-07-02,-0.400000,-0.564000,-0.312000,-0.552000,-15395000.0
2010-07-06,-0.600000,-0.620000,-0.576000,-0.618000,8635500.0
...,...,...,...,...,...
2022-06-30,-17.969971,-5.150024,-10.229980,-12.049988,3901100.0
2022-07-01,7.469971,2.320007,9.769958,8.369995,-6752000.0
2022-07-05,-12.000000,8.750000,-17.859985,17.410034,3412200.0
2022-07-06,23.340027,4.250000,33.059998,-4.000000,-4242500.0


In [20]:
# Score each classifier on the differenced features and collect the accuracies.
acc_li = []
for model in (lr, rfc, xgb, lgb):
    acc_li += [show_ml_acc(X_diff, y, model, diff=True)]
# Separator line followed by the mean accuracy over the four models.
print('='*50, f'Mean of Acc : {np.mean(acc_li)}', sep='\n')

Model : LogisticRegression
Accuracy : 0.48
              precision    recall  f1-score   support

           0       0.44      0.49      0.46       276
           1       0.53      0.47      0.50       329

    accuracy                           0.48       605
   macro avg       0.48      0.48      0.48       605
weighted avg       0.49      0.48      0.48       605

Model : RandomForestClassifier
Accuracy : 0.52
              precision    recall  f1-score   support

           0       0.47      0.50      0.49       276
           1       0.56      0.53      0.54       329

    accuracy                           0.52       605
   macro avg       0.51      0.51      0.51       605
weighted avg       0.52      0.52      0.52       605

Model : XGBClassifier
Accuracy : 0.53
              precision    recall  f1-score   support

           0       0.49      0.53      0.51       276
           1       0.58      0.53      0.55       329

    accuracy                           0.53       605


### 4-3. 로그차분 데이터

In [21]:
# NOTE(review): numpy is already imported in the first cell; this re-import is
# redundant but harmless.
import numpy as np

# Log-difference of every feature column: log(row t) - log(row t-1).
# The first row has no predecessor and is all-NaN (see the output below).
# NOTE(review): np.log gives -inf for a zero value (e.g. a zero-volume row) —
# confirm no such rows exist in X.
X_log_diff = np.log(X).diff()
X_log_diff

Unnamed: 0_level_0,open,high,low,close,volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-06-29,,,,,
2010-06-30,0.305548,0.196224,0.283969,-0.002515,-0.087904
2010-07-01,-0.031111,-0.160085,-0.139311,-0.081723,-0.737735
2010-07-02,-0.083382,-0.115182,-0.080084,-0.134312,-0.469410
2010-07-06,-0.139762,-0.144100,-0.167151,-0.175470,0.289699
...,...,...,...,...,...
2022-06-30,-0.026331,-0.007454,-0.015460,-0.017736,0.132061
2022-07-01,0.011030,0.003365,0.014770,0.012352,-0.240953
2022-07-05,-0.017778,0.012589,-0.027168,0.025215,0.129001
2022-07-06,0.034293,0.006058,0.049722,-0.005737,-0.163080


In [22]:
# Score each classifier on the log-differenced features and collect the accuracies.
acc_li = []
for model in (lr, rfc, xgb, lgb):
    acc_li += [show_ml_acc(X_log_diff, y, model, diff=True)]
# Separator line followed by the mean accuracy over the four models.
print('='*50, f'Mean of Acc : {np.mean(acc_li)}', sep='\n')

Model : LogisticRegression
Accuracy : 0.55
              precision    recall  f1-score   support

           0       0.51      0.17      0.25       276
           1       0.55      0.86      0.67       329

    accuracy                           0.55       605
   macro avg       0.53      0.52      0.46       605
weighted avg       0.53      0.55      0.48       605

Model : RandomForestClassifier
Accuracy : 0.51
              precision    recall  f1-score   support

           0       0.46      0.49      0.47       276
           1       0.55      0.53      0.54       329

    accuracy                           0.51       605
   macro avg       0.51      0.51      0.51       605
weighted avg       0.51      0.51      0.51       605

Model : XGBClassifier
Accuracy : 0.52
              precision    recall  f1-score   support

           0       0.47      0.45      0.46       276
           1       0.56      0.58      0.57       329

    accuracy                           0.52       605
