In [70]:
import yfinance as yf
import numpy as np
import pandas as pd
import os

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Hide warning messages.
# NOTE(review): action='ignore' with no category filter silences every warning
# for the rest of the session, not just a specific noisy one.
import warnings
warnings.filterwarnings(action='ignore')

%matplotlib inline

# 관련 라이브러리 임포트 
import matplotlib.font_manager as fm

# Plot defaults, applied in one place.
#   font.family        - a Korean-capable font so Hangul labels render; swap in
#                        any installed alternative (e.g. 'batang') if needed
#   axes.unicode_minus - False, so minus signs on the axes do not render as
#                        broken glyphs with that font
#   figure.figsize     - default figure size
# plt.rcParams and matplotlib.rcParams are the same settings object.
plt.rcParams.update({
    'font.size': 11.0,
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
    'figure.figsize': [10, 6],
})

# 데이터 불러오기
- yfinance에서 삼성전자의 시가·고가·저가·종가, 거래량과 종목별(삼성전자, S&P500, DJIA, Nasdaq, 환율, 금리) 종가 데이터를 수집
- FinanceDataReader에서 Change, Dividends, Dollar, Dollar_rate를 추가 수집
- 미국 지수는 전일 종가가 금일 국내 증시에 영향을 주므로, 전일 종가를 금일 날짜에 맞춰 저장

In [71]:
# Load the pre-assembled feature table (path is relative to the working
# directory) and display it.
# NOTE(review): the loaded frame contains an 'Unnamed: 0' column (see the
# df.columns output) — presumably a row index written by an earlier to_csv;
# TODO confirm. It occupies column position 0, which matters for any
# positional (.iloc) column access later on.
df = pd.read_csv('data/real_feature.csv')
df

Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Adj Close,Volume,Dividends,Change,Dollar,Dollar_rate,S&P500,DJIA,Nasdaq,interest,SamSung_close
0,0,2020-01-02,55500.0,56000.0,55000.0,52058.13281,12993228.0,0,-1.08,1157.800000,0.00,3221.290039,28462.14063,8945.990234,1.895,55200
1,1,2020-01-03,56000.0,56600.0,54900.0,52341.05469,15422255.0,0,0.54,1156.900000,-0.08,3257.850098,28868.80078,9092.190430,1.882,55500
2,2,2020-01-06,54900.0,55600.0,54600.0,52341.05469,10278951.0,0,0.00,1162.700000,0.50,3234.850098,28634.88086,9020.769531,1.788,55500
3,3,2020-01-07,55700.0,56400.0,55600.0,52623.98438,10009778.0,0,0.54,1169.600000,0.59,3246.280029,28703.38086,9071.469727,1.811,55800
4,4,2020-01-08,56200.0,57400.0,55900.0,53567.05859,23501171.0,0,1.79,1165.300000,-0.37,3237.179932,28583.67969,9068.580078,1.827,56800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
479,479,2021-12-08,78300.0,78600.0,77100.0,77400.00000,21558340.0,0,0.00,1181.200000,-0.21,4686.750000,35719.42969,15686.919920,1.480,77400
480,480,2021-12-09,77400.0,78200.0,77000.0,78200.00000,21604528.0,0,1.03,1176.400000,-0.41,4701.209961,35754.75000,15786.990230,1.509,78200
481,481,2021-12-10,77400.0,77600.0,76800.0,76900.00000,9155219.0,0,-1.66,1173.800000,-0.22,4667.450195,35754.69141,15517.370120,1.487,76900
482,482,2021-12-13,,,,,,0,,1177.329956,,4712.020020,35970.98828,15630.599610,1.489,76800


# 전처리
- 결측치 확인 => S&P500, DJIA, Nasdaq 전일종가 / 배당 0 
- 분류 분석용 y라벨(상승, 하락, 보합) 생성

## S&P500, DJIA, Nasdaq 결측치 => 전일종가

In [72]:
# List the column labels; their order here is what positional (.iloc) column
# indices refer to.
df.columns

Index(['Unnamed: 0', 'Date', 'Open', 'High', 'Low', 'Adj Close', 'Volume',
       'Dividends', 'Change', 'Dollar', 'Dollar_rate', 'S&P500', 'DJIA',
       'Nasdaq', 'interest', 'SamSung_close'],
      dtype='object')

In [73]:
# Count missing values per column in the freshly loaded frame.
df.isna().sum()

Unnamed: 0        0
Date              0
Open              2
High              2
Low               2
Adj Close         2
Volume            2
Dividends         0
Change            2
Dollar            0
Dollar_rate       2
S&P500            0
DJIA              0
Nasdaq            0
interest         14
SamSung_close     0
dtype: int64

In [74]:
# Row labels where the S&P500 value is missing.
idx = df.index[df['S&P500'].isna()]
idx

Int64Index([], dtype='int64')

In [75]:
# Fill gaps in the US index columns with the previous row's value.
#
# The columns are addressed by name rather than by position: in the frame as
# loaded, 'Unnamed: 0' sits at position 0, so positions 10-12 are Dollar_rate,
# S&P500 and DJIA — positional .iloc writes would overwrite the wrong columns
# and never touch Nasdaq.
#
# ffill() propagates the last valid value downwards, so consecutive gaps are
# handled, and a gap in the very first row is left as NaN instead of being
# filled from the last row (which is what iloc[i-1] does when i == 0).
us_index_cols = ['S&P500', 'DJIA', 'Nasdaq']
df[us_index_cols] = df[us_index_cols].ffill()

In [76]:
# Re-count missing values per column after the index-column fill step.
df.isna().sum()

Unnamed: 0        0
Date              0
Open              2
High              2
Low               2
Adj Close         2
Volume            2
Dividends         0
Change            2
Dollar            0
Dollar_rate       2
S&P500            0
DJIA              0
Nasdaq            0
interest         14
SamSung_close     0
dtype: int64

In [77]:
# Show the rows at positions 11 and 12 (all columns) as a spot check.
df.iloc[11:13]

Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Adj Close,Volume,Dividends,Change,Dollar,Dollar_rate,S&P500,DJIA,Nasdaq,interest,SamSung_close
11,11,2020-01-17,61900.0,62000.0,61000.0,57810.92969,16025661.0,0,0.99,1159.6,0.07,3316.810059,29297.64063,9357.129883,1.809,61300
12,12,2020-01-20,62000.0,62800.0,61700.0,58848.32031,12528855.0,0,1.79,1158.8,-0.07,3329.620117,29348.09961,9388.94043,1.836,62400


## 배당 결측치 => 0

In [78]:
# Replace every remaining NaN in the frame with 0 (in place), then display it.
# NOTE(review): this is not limited to Dividends. The columns reported as
# having missing values by isnull().sum() — Open, High, Low, Adj Close, Volume,
# Change, Dollar_rate and interest — are zero-filled as well, so the last row
# ends up with prices of 0.0. A zero-filled Change is also indistinguishable
# from a genuinely flat day once labels are derived from its sign.
# TODO confirm this is intended, or restrict the fill to the Dividends column.
df.fillna(0, inplace=True)
df

Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Adj Close,Volume,Dividends,Change,Dollar,Dollar_rate,S&P500,DJIA,Nasdaq,interest,SamSung_close
0,0,2020-01-02,55500.0,56000.0,55000.0,52058.13281,12993228.0,0,-1.08,1157.800000,0.00,3221.290039,28462.14063,8945.990234,1.895,55200
1,1,2020-01-03,56000.0,56600.0,54900.0,52341.05469,15422255.0,0,0.54,1156.900000,-0.08,3257.850098,28868.80078,9092.190430,1.882,55500
2,2,2020-01-06,54900.0,55600.0,54600.0,52341.05469,10278951.0,0,0.00,1162.700000,0.50,3234.850098,28634.88086,9020.769531,1.788,55500
3,3,2020-01-07,55700.0,56400.0,55600.0,52623.98438,10009778.0,0,0.54,1169.600000,0.59,3246.280029,28703.38086,9071.469727,1.811,55800
4,4,2020-01-08,56200.0,57400.0,55900.0,53567.05859,23501171.0,0,1.79,1165.300000,-0.37,3237.179932,28583.67969,9068.580078,1.827,56800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
479,479,2021-12-08,78300.0,78600.0,77100.0,77400.00000,21558340.0,0,0.00,1181.200000,-0.21,4686.750000,35719.42969,15686.919920,1.480,77400
480,480,2021-12-09,77400.0,78200.0,77000.0,78200.00000,21604528.0,0,1.03,1176.400000,-0.41,4701.209961,35754.75000,15786.990230,1.509,78200
481,481,2021-12-10,77400.0,77600.0,76800.0,76900.00000,9155219.0,0,-1.66,1173.800000,-0.22,4667.450195,35754.69141,15517.370120,1.487,76900
482,482,2021-12-13,0.0,0.0,0.0,0.00000,0.0,0,0.00,1177.329956,0.00,4712.020020,35970.98828,15630.599610,1.489,76800


In [79]:
# Confirm that no column has missing values left after the zero fill.
df.isna().sum()

Unnamed: 0       0
Date             0
Open             0
High             0
Low              0
Adj Close        0
Volume           0
Dividends        0
Change           0
Dollar           0
Dollar_rate      0
S&P500           0
DJIA             0
Nasdaq           0
interest         0
SamSung_close    0
dtype: int64

## 분류 분석용 타겟(상승, 하락, 보합) 생성

In [80]:
# Preview the first five rows before deriving the label column.
df.head()

Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Adj Close,Volume,Dividends,Change,Dollar,Dollar_rate,S&P500,DJIA,Nasdaq,interest,SamSung_close
0,0,2020-01-02,55500.0,56000.0,55000.0,52058.13281,12993228.0,0,-1.08,1157.8,0.0,3221.290039,28462.14063,8945.990234,1.895,55200
1,1,2020-01-03,56000.0,56600.0,54900.0,52341.05469,15422255.0,0,0.54,1156.9,-0.08,3257.850098,28868.80078,9092.19043,1.882,55500
2,2,2020-01-06,54900.0,55600.0,54600.0,52341.05469,10278951.0,0,0.0,1162.7,0.5,3234.850098,28634.88086,9020.769531,1.788,55500
3,3,2020-01-07,55700.0,56400.0,55600.0,52623.98438,10009778.0,0,0.54,1169.6,0.59,3246.280029,28703.38086,9071.469727,1.811,55800
4,4,2020-01-08,56200.0,57400.0,55900.0,53567.05859,23501171.0,0,1.79,1165.3,-0.37,3237.179932,28583.67969,9068.580078,1.827,56800


In [81]:
# Row labels whose Change value is positive.
# Computed inline so this cell runs on a fresh kernel: the bare name `idx1` is
# not assigned until a later cell, so referencing it here raises NameError
# under Restart & Run All.
df.index[df['Change'] > 0]

Int64Index([  1,   3,   4,   5,   6,   7,  10,  11,  12,  14,
            ...
            461, 462, 466, 467, 468, 474, 475, 477, 478, 480],
           dtype='int64', length=230)

In [82]:
# Partition the row labels by the sign of Change:
#   idx1 -> positive, idx2 -> negative, idx3 -> exactly zero
idx1 = df.index[df['Change'].gt(0)]
idx2 = df.index[df['Change'].lt(0)]
idx3 = df.index[df['Change'].eq(0)]

In [83]:
# Add the classification label column 'Target':
#   1 for rows in idx1 (Change > 0)
#   2 for rows in idx2 (Change < 0)
#   3 for rows in idx3 (Change == 0)
# Rows in none of the three groups keep the initial value 0.
df['Target'] = 0
df.loc[idx1, 'Target'] = 1
df.loc[idx2, 'Target'] = 2
df.loc[idx3, 'Target'] = 3

df.head()

Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Adj Close,Volume,Dividends,Change,Dollar,Dollar_rate,S&P500,DJIA,Nasdaq,interest,SamSung_close,Target
0,0,2020-01-02,55500.0,56000.0,55000.0,52058.13281,12993228.0,0,-1.08,1157.8,0.0,3221.290039,28462.14063,8945.990234,1.895,55200,2
1,1,2020-01-03,56000.0,56600.0,54900.0,52341.05469,15422255.0,0,0.54,1156.9,-0.08,3257.850098,28868.80078,9092.19043,1.882,55500,1
2,2,2020-01-06,54900.0,55600.0,54600.0,52341.05469,10278951.0,0,0.0,1162.7,0.5,3234.850098,28634.88086,9020.769531,1.788,55500,3
3,3,2020-01-07,55700.0,56400.0,55600.0,52623.98438,10009778.0,0,0.54,1169.6,0.59,3246.280029,28703.38086,9071.469727,1.811,55800,1
4,4,2020-01-08,56200.0,57400.0,55900.0,53567.05859,23501171.0,0,1.79,1165.3,-0.37,3237.179932,28583.67969,9068.580078,1.827,56800,1


In [84]:
# Make 'Date' the index (in place), then preview the first rows.
# NOTE: this cell is not re-runnable on its own — once 'Date' has been moved
# into the index, running it again raises KeyError because the column is gone.
df.set_index('Date', inplace=True)
df.head()

Unnamed: 0_level_0,Unnamed: 0,Open,High,Low,Adj Close,Volume,Dividends,Change,Dollar,Dollar_rate,S&P500,DJIA,Nasdaq,interest,SamSung_close,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2020-01-02,0,55500.0,56000.0,55000.0,52058.13281,12993228.0,0,-1.08,1157.8,0.0,3221.290039,28462.14063,8945.990234,1.895,55200,2
2020-01-03,1,56000.0,56600.0,54900.0,52341.05469,15422255.0,0,0.54,1156.9,-0.08,3257.850098,28868.80078,9092.19043,1.882,55500,1
2020-01-06,2,54900.0,55600.0,54600.0,52341.05469,10278951.0,0,0.0,1162.7,0.5,3234.850098,28634.88086,9020.769531,1.788,55500,3
2020-01-07,3,55700.0,56400.0,55600.0,52623.98438,10009778.0,0,0.54,1169.6,0.59,3246.280029,28703.38086,9071.469727,1.811,55800,1
2020-01-08,4,56200.0,57400.0,55900.0,53567.05859,23501171.0,0,1.79,1165.3,-0.37,3237.179932,28583.67969,9068.580078,1.827,56800,1
