In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os

%matplotlib inline
warnings.filterwarnings('ignore')

plt.rcParams['font.family'] = 'NanumGothic'

## 주식 데이터 전처리

In [2]:
# 주식 데이터 가져오기 (삼성)
stock = pd.read_csv('./day_stock_data.csv')

In [3]:
# 불필요한 컬럼 삭제
stock = stock.drop('Unnamed: 0', axis=1)
# 데이터 정렬 및 컬럼 결합을 위해 string 타입으로 변환
stock = stock.astype('string')

In [5]:
# 시간 데이터 정렬 (4자리로 통일)
for i in range(len(stock)):
    if len(stock['날짜'][i]) == 3:
        stock['날짜'][i] = '0' + stock['날짜'][i]
    else:
        pass

In [6]:
stock

Unnamed: 0,날짜,시가,고가,저가,종가,거래량,주가변동
0,19800104,42,43,41,41,528022,1
1,19800105,40,42,40,42,1580051,1
2,19800107,43,44,43,44,2425288,1
3,19800108,47,47,45,47,5396669,1
4,19800109,47,49,46,49,4993123,1
...,...,...,...,...,...,...,...
11333,20220512,65200,65500,64900,64900,16414188,1
11334,20220513,65300,66700,65200,66500,14551536,0
11335,20220516,67100,67400,66100,66300,11937555,1
11336,20220517,66600,67900,66600,67600,15680447,1


In [8]:
# 날짜, 시간 데이터 결합
#stock['날짜'] = stock['날짜'] + stock['시간']

# 시간 데이터 드롭, 결합한 날짜 데이터 인덱스화
#stock = stock.drop('시간', axis=1)
stock['날짜'] = pd.to_datetime(stock['날짜']).dt.strftime('%y-%m-%d %H:%M')
stock = stock.set_index('날짜')

In [9]:
stock_sorted = stock.sort_index()
stock_sorted

Unnamed: 0_level_0,시가,고가,저가,종가,거래량,주가변동
날짜,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00-01-04 00:00,6000,6110,5660,6110,74198350,0
00-01-05 00:00,5800,6060,5520,5580,74680200,1
00-01-06 00:00,5750,5780,5580,5620,54390500,0
00-01-07 00:00,5560,5670,5360,5540,40309750,1
00-01-10 00:00,5600,5770,5580,5770,46880750,0
...,...,...,...,...,...,...
99-12-22 00:00,5140,5350,5060,5280,41342900,1
99-12-23 00:00,5280,5700,5240,5580,52863100,0
99-12-24 00:00,5700,5720,5360,5460,35422300,0
99-12-27 00:00,5460,5490,5140,5220,46702850,1


In [10]:
from sklearn.model_selection import train_test_split

# 훈련 + 검증 세트, 테스트 세트로 분할
X_trainval, X_test, y_trainval, y_test = train_test_split(
      stock_sorted.drop('종가', axis=1)
    , stock_sorted['종가']
    , random_state=42
)

# 훈련, 검증 세트 분할
X_train, X_valid, y_train, y_valid = train_test_split(
      X_trainval
    , y_trainval
    , random_state=42
)

In [11]:
X_train.shape, y_train.shape

((6377, 5), (6377,))

In [12]:
# 훈련 세트 스케일링
from sklearn.preprocessing import MinMaxScaler

# MinMaxScaler 사용
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_valid_scaled = scaler.transform(X_valid)