> TOC
```
Step 1. 데이터 불러오기
Step 2. 예측값 정의
Step 3. 모델 인스턴스 생성 및 파이프라인 정의
Step 4. 모델링 및 평가
     4-1. 기본 데이터 모델링
     4-2. 차분 데이터 모델링
     4-3. 로그차분 데이터 모델링
```

## Step 1. 데이터 불러오기

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yfinance as yf

# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8*" and
# later releases dropped the old names, so use whichever this install provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')


In [2]:
# yfinance handle for the TSLA ticker; the price history is pulled from it below.
stocks = yf.Ticker("TSLA")

In [3]:
# Daily price history for the requested window. The displayed rows start at
# 2010-06-29 and stop at 2022-07-07, so the end date appears to be exclusive.
df = stocks.history(
    start="2010-01-01",
    end="2022-07-08",
)
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
2010-06-30,5.158000,6.084000,4.660000,4.766000,85935500,0,0.0
2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000,0,0.0
2010-07-02,4.600000,4.620000,3.742000,3.840000,25699000,0,0.0
2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500,0,0.0
...,...,...,...,...,...,...,...
2022-06-30,673.530029,688.369995,656.590027,673.419983,31533500,0,0.0
2022-07-01,681.000000,690.690002,666.359985,681.789978,24781500,0,0.0
2022-07-05,669.000000,699.440002,648.500000,699.200012,28193700,0,0.0
2022-07-06,692.340027,703.690002,681.559998,695.200012,23951200,0,0.0


In [7]:
# Day-over-day change in the closing price; the first row has no predecessor and is NaN.
df["Close"].diff(periods=1)

Date
2010-06-29          NaN
2010-06-30    -0.012000
2010-07-01    -0.374000
2010-07-02    -0.552000
2010-07-06    -0.618000
                ...    
2022-06-30   -12.049988
2022-07-01     8.369995
2022-07-05    17.410034
2022-07-06    -4.000000
2022-07-07    38.429993
Name: Close, Length: 3027, dtype: float64

## Step 2. 예측값 정의

- `up` : 전일 대비 종가 상승 여부 (종가 변화량이 0 이상이면 1, 아니면 0 — 보합인 날도 1로 처리)

In [11]:
# Label: 1 when the close is at or above the previous close, otherwise 0.
# Vectorized comparison instead of a per-element Python loop. The first row's
# diff is NaN and NaN >= 0 is False, so it is labelled 0 exactly as before.
df['up'] = (df['Close'].diff() >= 0).astype('int64')
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,up
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0,0
2010-06-30,5.158000,6.084000,4.660000,4.766000,85935500,0,0.0,0
2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000,0,0.0,0
2010-07-02,4.600000,4.620000,3.742000,3.840000,25699000,0,0.0,0
2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500,0,0.0,0
...,...,...,...,...,...,...,...,...
2022-06-30,673.530029,688.369995,656.590027,673.419983,31533500,0,0.0,0
2022-07-01,681.000000,690.690002,666.359985,681.789978,24781500,0,0.0,1
2022-07-05,669.000000,699.440002,648.500000,699.200012,28193700,0,0.0,1
2022-07-06,692.340027,703.690002,681.559998,695.200012,23951200,0,0.0,0


In [15]:
# Keep only the four price columns and the label, under lower-case names.
# Lower-casing through rename() (rather than selecting the capitalised names and
# then overwriting df.columns) keeps this cell re-runnable: once the names are
# already lower-case the rename is a no-op and the selection still succeeds.
df = df.rename(columns=str.lower)[['open', 'high', 'low', 'close', 'up']]
df

Unnamed: 0_level_0,open,high,low,close,up
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,0
2010-06-30,5.158000,6.084000,4.660000,4.766000,0
2010-07-01,5.000000,5.184000,4.054000,4.392000,0
2010-07-02,4.600000,4.620000,3.742000,3.840000,0
2010-07-06,4.000000,4.000000,3.166000,3.222000,0
...,...,...,...,...,...
2022-06-30,673.530029,688.369995,656.590027,673.419983,0
2022-07-01,681.000000,690.690002,666.359985,681.789978,1
2022-07-05,669.000000,699.440002,648.500000,699.200012,1
2022-07-06,692.340027,703.690002,681.559998,695.200012,0


In [16]:
# Features: the four price columns. Target: the binary `up` label.
X = df.loc[:, ['open', 'high', 'low', 'close']]
y = df.loc[:, 'up']

In [17]:
# Display the feature frame (open/high/low/close) for a visual check.
X

Unnamed: 0_level_0,open,high,low,close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000
2010-06-30,5.158000,6.084000,4.660000,4.766000
2010-07-01,5.000000,5.184000,4.054000,4.392000
2010-07-02,4.600000,4.620000,3.742000,3.840000
2010-07-06,4.000000,4.000000,3.166000,3.222000
...,...,...,...,...
2022-06-30,673.530029,688.369995,656.590027,673.419983
2022-07-01,681.000000,690.690002,666.359985,681.789978
2022-07-05,669.000000,699.440002,648.500000,699.200012
2022-07-06,692.340027,703.690002,681.559998,695.200012


In [100]:
# Display the target series (`up`) for a visual check.
y

Date
2010-06-29    0
2010-06-30    0
2010-07-01    0
2010-07-02    0
2010-07-06    0
             ..
2022-06-30    0
2022-07-01    1
2022-07-05    1
2022-07-06    0
2022-07-07    1
Name: up, Length: 3027, dtype: int64

## Step 3. 모델 인스턴스 생성 및 파이프라인 정의

In [101]:
# model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# method
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [84]:
# One instance of each classifier, all with library-default hyperparameters.
# The ensembles get a fixed seed (the same 42 used for the train/test split)
# so that repeated runs of the notebook report the same scores; an unseeded
# random forest draws different bootstrap samples on every fit.
lr = LogisticRegression()
rfc = RandomForestClassifier(random_state=42)
xgb = XGBClassifier(random_state=42)
lgb = LGBMClassifier(random_state=42)

In [93]:
def show_ml_acc(X, y, model, diff=False, test_size=0.2, shuffle=True, random_state=42):
    """Fit ``model`` to predict the next row's label and report hold-out accuracy.

    Row ``t`` of ``X`` is paired with row ``t + 1`` of ``y`` (the pairing is
    positional), so the model sees one day's features and predicts the
    following day's label. The last feature row is dropped because it has no
    following label.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows in chronological order.
    y : pandas.Series
        Labels with the same length and order as ``X``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    diff : bool, default False
        Set to True when ``X`` was produced by ``.diff()``: the first row is
        then skipped (together with its paired label) because it is all NaN.
    test_size : float, default 0.2
        Fraction of the paired rows held out for evaluation.
    shuffle : bool, default True
        Whether rows are shuffled before splitting.
        NOTE(review): with shuffling, test days are interleaved with training
        days. For time-ordered data, pass ``shuffle=False`` to evaluate on the
        most recent rows only.
    random_state : int, default 42
        Seed for the shuffled split.

    Returns
    -------
    float
        Fraction of test rows whose prediction equals the true label.
    """
    # Skip the leading NaN row of differenced features; labels start one row later.
    first = 1 if diff else 0
    features = X.iloc[first:-1]
    target = y.iloc[first + 1:]

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=test_size,
        shuffle=shuffle,
        random_state=random_state,
    )

    print(f'Model : {model.__class__.__name__}')
    model.fit(X_train, y_train)

    pred = model.predict(X_test)
    acc = (pred == y_test).sum() / len(pred)
    print(f'Accuracy : {acc:.2f}')
    print(classification_report(y_test, pred))
    return acc

## Step 4. 모델링 및 평가

### 4-1. 기본 데이터 모델링

In [94]:
# Evaluate each classifier on the raw price features (no differencing) and
# average the accuracies. np.mean needs numpy, which is imported in the
# notebook's first import cell so this cell also runs on a fresh kernel.
acc_li = []
for model in (lr, rfc, xgb, lgb):
    score = show_ml_acc(X, y, model, diff=False)
    acc_li.append(score)
print('=' * 50)
print(f'Mean of Acc : {np.mean(acc_li)}')

Model : LogisticRegression
Accuracy : 0.51
              precision    recall  f1-score   support

           0       0.50      0.03      0.05       298
           1       0.51      0.97      0.67       308

    accuracy                           0.51       606
   macro avg       0.50      0.50      0.36       606
weighted avg       0.50      0.51      0.36       606

Model : RandomForestClassifier
Accuracy : 0.50
              precision    recall  f1-score   support

           0       0.49      0.48      0.48       298
           1       0.50      0.51      0.51       308

    accuracy                           0.50       606
   macro avg       0.49      0.49      0.49       606
weighted avg       0.49      0.50      0.49       606

Model : XGBClassifier
Accuracy : 0.51
              precision    recall  f1-score   support

           0       0.50      0.44      0.47       298
           1       0.52      0.58      0.55       308

    accuracy                           0.51       606


### 4-2. 차분 데이터 모델링

In [95]:
import numpy as np

# First difference of every price column; the first row has no predecessor and is NaN.
X_diff = X.diff(periods=1)
X_diff

Unnamed: 0_level_0,open,high,low,close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-06-29,,,,
2010-06-30,1.358000,1.084000,1.152000,-0.012000
2010-07-01,-0.158000,-0.900000,-0.606000,-0.374000
2010-07-02,-0.400000,-0.564000,-0.312000,-0.552000
2010-07-06,-0.600000,-0.620000,-0.576000,-0.618000
...,...,...,...,...
2022-06-30,-17.969971,-5.150024,-10.229980,-12.049988
2022-07-01,7.469971,2.320007,9.769958,8.369995
2022-07-05,-12.000000,8.750000,-17.859985,17.410034
2022-07-06,23.340027,4.250000,33.059998,-4.000000


In [96]:
# Same evaluation on the first-differenced features; diff=True makes
# show_ml_acc skip the leading NaN row.
acc_li = []
for model in (lr, rfc, xgb, lgb):
    score = show_ml_acc(X_diff, y, model, diff=True)
    acc_li.append(score)
print('=' * 50)
print(f'Mean of Acc : {np.mean(acc_li)}')

Model : LogisticRegression
Accuracy : 0.55
              precision    recall  f1-score   support

           0       0.58      0.07      0.12       276
           1       0.55      0.96      0.70       329

    accuracy                           0.55       605
   macro avg       0.57      0.51      0.41       605
weighted avg       0.56      0.55      0.43       605

Model : RandomForestClassifier
Accuracy : 0.53
              precision    recall  f1-score   support

           0       0.48      0.50      0.49       276
           1       0.57      0.55      0.56       329

    accuracy                           0.53       605
   macro avg       0.52      0.52      0.52       605
weighted avg       0.53      0.53      0.53       605

Model : XGBClassifier
Accuracy : 0.52
              precision    recall  f1-score   support

           0       0.48      0.50      0.49       276
           1       0.56      0.54      0.55       329

    accuracy                           0.52       605


### 4-3. 로그차분 데이터 모델링

In [97]:
import numpy as np

# Difference of log prices (log returns) for every column; the first row is NaN.
X_log_diff = np.log(X).diff(periods=1)
X_log_diff

Unnamed: 0_level_0,open,high,low,close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-06-29,,,,
2010-06-30,0.305548,0.196224,0.283969,-0.002515
2010-07-01,-0.031111,-0.160085,-0.139311,-0.081723
2010-07-02,-0.083382,-0.115182,-0.080084,-0.134312
2010-07-06,-0.139762,-0.144100,-0.167151,-0.175470
...,...,...,...,...
2022-06-30,-0.026331,-0.007454,-0.015460,-0.017736
2022-07-01,0.011030,0.003365,0.014770,0.012352
2022-07-05,-0.017778,0.012589,-0.027168,0.025215
2022-07-06,0.034293,0.006058,0.049722,-0.005737


In [98]:
# Same evaluation on the log-differenced features; diff=True makes
# show_ml_acc skip the leading NaN row.
acc_li = []
for model in (lr, rfc, xgb, lgb):
    score = show_ml_acc(X_log_diff, y, model, diff=True)
    acc_li.append(score)
print('=' * 50)
print(f'Mean of Acc : {np.mean(acc_li)}')

Model : LogisticRegression
Accuracy : 0.54
              precision    recall  f1-score   support

           0       0.40      0.02      0.04       276
           1       0.54      0.97      0.70       329

    accuracy                           0.54       605
   macro avg       0.47      0.50      0.37       605
weighted avg       0.48      0.54      0.40       605

Model : RandomForestClassifier
Accuracy : 0.51
              precision    recall  f1-score   support

           0       0.47      0.51      0.49       276
           1       0.56      0.52      0.54       329

    accuracy                           0.51       605
   macro avg       0.51      0.51      0.51       605
weighted avg       0.52      0.51      0.51       605

Model : XGBClassifier
Accuracy : 0.50
              precision    recall  f1-score   support

           0       0.45      0.47      0.46       276
           1       0.54      0.52      0.53       329

    accuracy                           0.50       605
