In [31]:
import FinanceDataReader as fdr
import numpy as np
import pandas as pd
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.utils.class_weight import compute_class_weight


# 분석할 주식 코드 목록
stock_codes = ['086520','041190', '015750','328130','073570','036220']

# 각 기업별 모델을 저장할 딕셔너리
models = {}

for code in stock_codes:
    try:
        df = fdr.DataReader(code).dropna()  # 데이터 불러오기 및 결측값 제거
        df.drop(['2024-08-22'])
        if not df.empty:
            # 가격 변화율 계산
            # 가격 변화율 계산
            df['Change'] = df['Close'].pct_change()
            df = df.dropna()
            
            # EMA 계산 및 MACD, Signal Line 생성
            df['ema_10'] = df['Close'].ewm(span=10, adjust=False).mean()
            df['ema_12'] = df['Close'].ewm(span=12, adjust=False).mean()
            df['ema_26'] = df['Close'].ewm(span=26, adjust=False).mean()
            df['ema_30'] = df['Close'].ewm(span=30, adjust=False).mean()
            df['MACD'] = df['ema_12'] - df['ema_26']
            df['Signal Line'] = df['MACD'].ewm(span=9, adjust=False).mean()
            
            # Signals 계산
            signals = [2]  # 첫 번째 신호는 비교할 이전 데이터가 없기 때문에 2로 설정
            for i in range(1, len(df)):
                if ((df['MACD'].iloc[i-1] < df['Signal Line'].iloc[i-1]) and (df['MACD'].iloc[i] > df['Signal Line'].iloc[i])) or \
                   ((df['ema_10'].iloc[i-1] < df['ema_30'].iloc[i-1]) and (df['ema_10'].iloc[i] > df['ema_30'].iloc[i])):
                    signals.append(1)  # 매수 신호
                elif ((df['MACD'].iloc[i-1] > df['Signal Line'].iloc[i-1]) and (df['MACD'].iloc[i] < df['Signal Line'].iloc[i])) or \
                     ((df['ema_10'].iloc[i-1] > df['ema_30'].iloc[i-1]) and (df['ema_10'].iloc[i] < df['ema_30'].iloc[i])):
                    signals.append(0)  # 매도 신호
                else:
                    signals.append(2)  # 교차 없음
            df['Signal'] = signals
            
            # Position 열 초기화
            df['Position'] = 0
            for i in range(0, len(df), 100):
                avg_volume = df.iloc[i:i+100]['Volume'].mean()
                df.iloc[i:i+100, df.columns.get_loc('Position')] = np.where(df.iloc[i:i+100]['Volume'] > avg_volume, 1, 0)
            
            # 매수/매도 신호 업데이트
            df['Signal'] = np.where((df['Position'] == 1) & (df['Change'] < 0), 1, df['Signal'])  # 매수
            df['Signal'] = np.where((df['Position'] == 1) & (df['Change'] > 0), 0, df['Signal'])  # 매도
            
            # # 볼린저 밴드 계산
            # window = 5  # 이동평균 기간
            # df['SMA'] = df['Close'].rolling(window=window).mean()  # 단순 이동평균
            # df['STD'] = df['Close'].rolling(window=window).std()   # 표준편차
            # df['Upper Band'] = df['SMA'] + (df['STD'] * 1.2)       # 상단 밴드
            # df['Lower Band'] = df['SMA'] - (df['STD'] * 1.2)      # 하단 밴드
            
            # # 결측값 제거
            # df = df.dropna()
            
            # # Position2 및 Position3 열 초기화
            # df[['Position2', 'Position3']] = 0
            
            # # 과매수 및 과매도 구간
            # df['Position2'] = np.where(df['Close'] >= df['Upper Band'], 1, df['Position2'])  # 과매수 구간
            # df['Position3'] = np.where(df['Close'] <= df['Lower Band'], 1, df['Position3'])  # 과매도 구간
            
            # # 매수/매도 신호 업데이트
            # df['Signal'] = np.where(df['Position2'] == 1, 0, df['Signal'])  # 매도 추천
            # df['Signal'] = np.where(df['Position3'] == 1, 1, df['Signal'])  # 매수 추천
            
            # 불필요한 열 제거
            columns_to_remove = ['ema_10', 'ema_12', 'ema_26', 'ema_30', 'MACD', 'Signal Line', 'Position']
            df = df.drop(columns=columns_to_remove)
            
            # 다시 NaN 값 제거
            df = df.dropna()

           #
            # 윈도우 슬라이싱을 통해 2차원 입력 데이터 생성
            # window = 10  # 5일 동안의 데이터를 사용
            # X, y = [], []
            # for i in range(window, len(df)):
            #     X.append(df[['Open', 'High', 'Low', 'Close', 'Volume', 'Change']].iloc[i-window:i].values.flatten())
            #     y.append(df['Signal'].iloc[i])

            # X = np.array(X)
            # y = np.array(y)

            # # 데이터 분할 (훈련 데이터와 테스트 데이터)
            # X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=42, stratify=y)
            #

            # 특성 벡터와 타겟 벡터 설정
            features = df[['Open', 'High', 'Low', 'Close', 'Volume', 'Change']].values
            targets = df['Signal'].values

            # 데이터 분할 (훈련 데이터와 테스트 데이터)
            X_train, X_test, Y_train, Y_test = train_test_split(features, targets, test_size=0.3, shuffle=True, random_state=42,stratify=targets)
            

            

            # 클래스 가중치 계산
            classes = np.unique(Y_train)
            class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=Y_train)
            class_weight_dict = dict(zip(classes, class_weights))
           
            # XGBoost 분류 모델 생성 및 학습
            model = xgb.XGBClassifier(
                objective='multi:softmax',
                eval_metric='mlogloss',
                learning_rate=0.1,
                n_estimators=100,
                max_depth=5,
                subsample=0.8,
                colsample_bytree=0.8,
                num_class=3,  # 클래스 수 (매수, 매도, 신호 없음)
                random_state=42,
                scale_pos_weight=class_weight_dict[1]  # 가중치 설정
            )
            model.fit(X_train, Y_train)

            # 테스트 데이터에 대해 예측 수행
            Y_pred = model.predict(X_test)

            # 모델 평가
            accuracy = accuracy_score(Y_test, Y_pred)
            conf_matrix = confusion_matrix(Y_test, Y_pred)
            class_report = classification_report(Y_test, Y_pred)

            # 모델 저장
            model_filename = f'model_{code}.joblib'
            joblib.dump(model, model_filename)
            models[code] = model_filename

            # 결과 출력
            print(f"Model for code {code} saved as {model_filename}")
            print(f"Accuracy for code {code}: {accuracy}")
            print(f"Confusion Matrix for code {code}:\n{conf_matrix}")
            print(f"Classification Report for code {code}:\n{class_report}")

    except Exception as e:
        print(f"Error processing code {code}: {e}")


Parameters: { "scale_pos_weight" } are not used.



Model for code 086520 saved as model_086520.joblib
Accuracy for code 086520: 0.45166402535657685
Confusion Matrix for code 086520:
[[ 70  50 223]
 [ 52  61 223]
 [ 67  77 439]]
Classification Report for code 086520:
              precision    recall  f1-score   support

           0       0.37      0.20      0.26       343
           1       0.32      0.18      0.23       336
           2       0.50      0.75      0.60       583

    accuracy                           0.45      1262
   macro avg       0.40      0.38      0.36      1262
weighted avg       0.42      0.45      0.41      1262



Parameters: { "scale_pos_weight" } are not used.



Model for code 041190 saved as model_041190.joblib
Accuracy for code 041190: 0.5064281721632197
Confusion Matrix for code 041190:
[[ 80  50 311]
 [ 54  87 318]
 [ 68  82 739]]
Classification Report for code 041190:
              precision    recall  f1-score   support

           0       0.40      0.18      0.25       441
           1       0.40      0.19      0.26       459
           2       0.54      0.83      0.65       889

    accuracy                           0.51      1789
   macro avg       0.44      0.40      0.39      1789
weighted avg       0.47      0.51      0.45      1789



Parameters: { "scale_pos_weight" } are not used.



Model for code 015750 saved as model_015750.joblib
Accuracy for code 015750: 0.47939866369710465
Confusion Matrix for code 015750:
[[ 74  43 344]
 [ 69  73 337]
 [ 75  67 714]]
Classification Report for code 015750:
              precision    recall  f1-score   support

           0       0.34      0.16      0.22       461
           1       0.40      0.15      0.22       479
           2       0.51      0.83      0.63       856

    accuracy                           0.48      1796
   macro avg       0.42      0.38      0.36      1796
weighted avg       0.44      0.48      0.42      1796



Parameters: { "scale_pos_weight" } are not used.



Model for code 328130 saved as model_328130.joblib
Accuracy for code 328130: 0.44666666666666666
Confusion Matrix for code 328130:
[[12 11 14]
 [ 7  8 23]
 [10 18 47]]
Classification Report for code 328130:
              precision    recall  f1-score   support

           0       0.41      0.32      0.36        37
           1       0.22      0.21      0.21        38
           2       0.56      0.63      0.59        75

    accuracy                           0.45       150
   macro avg       0.40      0.39      0.39       150
weighted avg       0.44      0.45      0.44       150



Parameters: { "scale_pos_weight" } are not used.



Model for code 073570 saved as model_073570.joblib
Accuracy for code 073570: 0.5013106159895151
Confusion Matrix for code 073570:
[[ 43  68 244]
 [ 41 133 274]
 [ 51  83 589]]
Classification Report for code 073570:
              precision    recall  f1-score   support

           0       0.32      0.12      0.18       355
           1       0.47      0.30      0.36       448
           2       0.53      0.81      0.64       723

    accuracy                           0.50      1526
   macro avg       0.44      0.41      0.39      1526
weighted avg       0.46      0.50      0.45      1526



Parameters: { "scale_pos_weight" } are not used.



Model for code 036220 saved as model_036220.joblib
Accuracy for code 036220: 0.5072254335260116
Confusion Matrix for code 036220:
[[ 33  25 114]
 [ 32  68  99]
 [ 31  40 250]]
Classification Report for code 036220:
              precision    recall  f1-score   support

           0       0.34      0.19      0.25       172
           1       0.51      0.34      0.41       199
           2       0.54      0.78      0.64       321

    accuracy                           0.51       692
   macro avg       0.46      0.44      0.43       692
weighted avg       0.48      0.51      0.47       692



In [33]:
import joblib  # For saving and loading models
import FinanceDataReader as fdr
import numpy as np

# 특정 기업 코드
company_code = '035890'  # 예측을 수행할 주식 코드

# 데이터 불러오기
df = fdr.DataReader(company_code).dropna()

# 최근 10일치 데이터 추출
window = 10
new_data = df[['Open', 'High', 'Low', 'Close', 'Volume', 'Change']].iloc[-window:].values.flatten()

# 데이터를 2D 형태로 변환 (모델 입력에 맞게)
new_data = new_data.reshape(1, -1)

# 기업 코드에 맞는 모델 파일명
model_filename = f'model_{company_code}.joblib'

try:
    # 모델 불러오기
    model = joblib.load(model_filename)
    
    # 예측 수행
    predicted_signal = model.predict(new_data)
    
    # 예측 결과 해석
    signal_map = {0: 'Sell', 1: 'Buy', 2: 'No Signal'}
    print(f'For company code {company_code}, the predicted signal is {signal_map[predicted_signal[0]]}')
    
except FileNotFoundError:
    print(f"Model file for code {company_code} not found.")
except Exception as e:
    print(f"Error during prediction: {e}")


Error during prediction: Feature shape mismatch, expected: 6, got 60
