### 코스닥 예측

### 모델링

In [37]:
import FinanceDataReader as fdr
import numpy as np
import pandas as pd
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.utils.class_weight import compute_class_weight

# CSV 파일에서 주식 코드 목록 불러오기
kosdak = pd.read_csv('KOSDAQ_result.csv')
# stock_codes = kosdak['Code'].astype(str).str.zfill(6).tolist()  # 주식 코드 목록 생성 (0으로 채우기)

# 주식 코드 전체 선택
stock_codes = kosdak['Code'].astype(str).str.zfill(6).tolist()

# 전체 주식 코드 목록 출력
print("Stock Codes:")
print(stock_codes)

# 각 기업별 모델을 저장할 딕셔너리
models = {}

for code in stock_codes:
    try:
        df = fdr.DataReader(code).dropna()  # 데이터 불러오기 및 결측값 제거
        if not df.empty:
            # 가격 변화율 계산
            # df['Change'] = df['Close'].pct_change()
            df = df.dropna()
            # EMA 계산 및 MACD, Signal Line 생성
            df['ema_10'] = df['Close'].ewm(span=10, adjust=False).mean()
            df['ema_12'] = df['Close'].ewm(span=12, adjust=False).mean()
            df['ema_26'] = df['Close'].ewm(span=26, adjust=False).mean()
            df['ema_30'] = df['Close'].ewm(span=30, adjust=False).mean()
            df['MACD'] = df['ema_12'] - df['ema_26']
            df['Signal Line'] = df['MACD'].ewm(span=9, adjust=False).mean()
            # Signals 계산
            signals = 2  # 첫 번째 신호는 비교할 이전 데이터가 없기 때문에 2로 설정
#             for i in range(1, len(df)):
#                 if ((df['MACD'].iloc[i-1] < df['Signal Line'].iloc[i-1]) and (df['MACD'].iloc[i] > df['Signal Line'].iloc[i])):# or \
#                #    ((df['ema_10'].iloc[i-1] < df['ema_30'].iloc[i-1]) and (df['ema_10'].iloc[i] > df['ema_30'].iloc[i])):
#                     signals.append(1)  # 매수 신호
#                 elif ((df['MACD'].iloc[i-1] > df['Signal Line'].iloc[i-1]) and (df['MACD'].iloc[i] < df['Signal Line'].iloc[i])):# or \
#               #       ((df['ema_10'].iloc[i-1] > df['ema_30'].iloc[i-1]) and (df['ema_10'].iloc[i] < df['ema_30'].iloc[i])):
#                     signals.append(0)  # 매도 신호
#                 else:
#                     signals.append(2)  # 교차 없음
            df['Signal'] = signals
            # Position 열 초기화
            df['Position'] = 0
            for i in range(0, len(df), 100):
                avg_volume = df.iloc[i:i+100]['Volume'].mean()
                df.iloc[i:i+100, df.columns.get_loc('Position')] = np.where(df.iloc[i:i+100]['Volume'] > avg_volume, 1, 0)
                
            # 매수/매도 신호 업데이트
            df['Signal'] = np.where((df['Position'] == 1) & (df['Change'] < 0) & (df['Signal'] == 2), 1, df['Signal'])  # 매수
            df['Signal'] = np.where((df['Position'] == 1) & (df['Change'] > 0) & (df['Signal'] == 2), 0, df['Signal'])  # 매도
            
            # 볼린저 밴드 계산
            window = 5  # 이동평균 기간
            df['SMA'] = df['Close'].rolling(window=window).mean()  # 단순 이동평균
            df['STD'] = df['Close'].rolling(window=window).std()   # 표준편차
            df['Upper Band'] = df['SMA'] + (df['STD'] * 2)       # 상단 밴드
            df['Lower Band'] = df['SMA'] - (df['STD'] * 2)      # 하단 밴드
            
            # 결측값 제거
            df = df.dropna()
            # Position2 및 Position3 열 초기화
            df[['Position2', 'Position3']] = 0
            # 과매수 및 과매도 구간
            df['Position2'] = np.where(df['Close'] >= df['Upper Band'], 1, df['Position2'])  # 과매수 구간
            df['Position3'] = np.where(df['Close'] <= df['Lower Band'], 1, df['Position3'])  # 과매도 구간
            # 매수/매도 신호 업데이트
            df['Signal'] = np.where((df['Position2'] == 1) & (df['Signal'] == 2), 0, df['Signal'])  # 매도 추천
            df['Signal'] = np.where((df['Position3'] == 1) & (df['Signal'] == 2), 1, df['Signal'])  # 매수 추천

            
            # 불필요한 열 제거
            columns_to_remove = ['ema_10', 'ema_12', 'ema_26', 'ema_30', 'MACD', 'Signal Line', 'Position', 'SMA', 'STD', 'Upper Band', 'Lower Band', 'Position2', 'Position3']
            df = df.drop(columns=columns_to_remove)

            df.drop(columns=['Change'],inplace=True) # Recall이 너무 높아 삭제하는 열 
            
            # 다시 NaN 값 제거
            df = df.dropna()
            X = df.iloc[:,:-1]
            y = df['Signal']
            df['Signal'] = df['Signal'].shift(-1)
            df = df.dropna(subset=['Signal'])
            
             df.drop(['2024-08-22'],inplace=True)

            
            # 데이터 분할 (훈련 데이터와 테스트 데이터)
            X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=42, stratify=y)
            # XGBoost 분류 모델 생성 및 학습
            model = xgb.XGBClassifier(
                objective='multi:softmax',
                eval_metric='mlogloss',
                learning_rate=0.1,
                n_estimators=100,
                max_depth=5,
                subsample=0.8,
                colsample_bytree=0.8,
                num_class=3,  # 클래스 수 (매수, 매도, 신호 없음)
                random_state=42
            )
            model.fit(X_train, Y_train)
            # 테스트 데이터에 대해 예측 수행
            Y_pred = model.predict(X_test)
            # 모델 평가
            accuracy = accuracy_score(Y_test, Y_pred)
            conf_matrix = confusion_matrix(Y_test, Y_pred)
            class_report = classification_report(Y_test, Y_pred)
            # 모델 저장
            model_filename = f'model_{code}.joblib'
            joblib.dump(model, model_filename)
            models[code] = model_filename
            # 결과 출력
            print(f"Model for code {code} saved as {model_filename}")
            print(f"Accuracy for code {code}: {accuracy}")
            print(f"Confusion Matrix for code {code}:\n{conf_matrix}")
            print(f"Classification Report for code {code}:\n{class_report}")
    except Exception as e:
        print(f"Error processing code {code}: {e}")


Stock Codes:
['041190', '015750', '215200', '025980', '095660', '215000', '290650', '099320', '006730', '101160', '023160', '013030', '043150', '243070', '297890', '230360', '035890', '045100', '041830', '083450', '018310', '267980', '060250', '265520', '126700', '050890', '035600', '051370', '054950', '299900', '211050', '148150', '118990', '078150', '108230', '950170', '054450', '051500', '052400', '236200', '083310', '190510', '194700', '377450', '023760', '067280', '046440', '092730', '136540', '123410', '214180', '036190', '023910', '091580', '298540', '079960', '009300', '418470', '023600', '452400', '950190', '203650', '259630', '285490', '445180', '038110', '264450', '033290', '099440', '060560', '353810', '092460', '066620', '100700', '382800', '067900', '066700', '069510', '013310', '124560', '137950', '332570', '021080', '215360', '066590', '005990', '058630', '071200', '053700', '035610', '011560', '089850', '142210', '352090', '036630', '019590', '042500', '043260', '41779

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model for code 440290 saved as model_440290.joblib
Accuracy for code 440290: 0.8571428571428571
Confusion Matrix for code 440290:
[[ 0  2  3]
 [ 0  1  1]
 [ 0  0 35]]
Classification Report for code 440290:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.33      0.50      0.40         2
           2       0.90      1.00      0.95        35

    accuracy                           0.86        42
   macro avg       0.41      0.50      0.45        42
weighted avg       0.76      0.86      0.81        42

Model for code 067920 saved as model_067920.joblib
Accuracy for code 067920: 0.8180943214629451
Confusion Matrix for code 067920:
[[ 58  10  66]
 [  8  38  73]
 [ 16  16 754]]
Classification Report for code 067920:
              precision    recall  f1-score   support

           0       0.71      0.43      0.54       134
           1       0.59      0.32      0.42       119
           2       0.84      0.96  

### 코스닥 기업 예측

In [None]:
035890

In [19]:
df = fdr.DataReader('035890')
df.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-08-19,1380,1380,1364,1368,329416,-0.006536
2024-08-20,1390,1407,1379,1405,738954,0.027047
2024-08-21,1411,1459,1409,1458,1237833,0.037722
2024-08-22,1448,1459,1405,1406,640682,-0.035665
2024-08-23,1415,1433,1415,1431,72332,0.017781


In [38]:
import joblib
import numpy as np
import pandas as pd
import FinanceDataReader as fdr
from collections import defaultdict

def load_model_and_predict(company_code, recent_data):
    # 기업 코드에 맞는 모델 파일명
    model_filename = f'model_{company_code}.joblib'
    
    try:
        # 모델 불러오기
        model = joblib.load(model_filename)
        
        # 예측 수행
        predicted_signal = model.predict(recent_data)
        
        # 예측 결과 해석
        signal_map = {0: 'Sell', 1: 'Buy', 2: 'No Signal'}
        return predicted_signal[0], signal_map[predicted_signal[0]]
    
    except FileNotFoundError:
        return None, f"Model file for code {company_code} not found."
    except Exception as e:
        return None, f"Error during prediction: {e}"

def display_stock_codes(codes):
    print("Available Stock Codes:")
    for i, code in enumerate(codes):
        if i > 0 and i % 10 == 0:
            print()  # 줄바꿈
        print(code, end=' ')
    print()  # 마지막 줄바꿈

def main():
    # CSV 파일에서 주식 코드 목록 불러오기
    kosdak = pd.read_csv('KOSDAQ_result.csv')
    
    # 주식 코드 목록 선택
    stock_codes = kosdak['Code'].astype(str).str.zfill(6).tolist()
    
    # 코스닥 주식 코드 목록 출력
    display_stock_codes(stock_codes)
    
    # 각 시그널의 개수를 저장할 딕셔너리
    signal_counts = defaultdict(int)
    
    # 모든 주식 코드에 대해 예측 수행
    for company_code in stock_codes:
        # 가장 최근 하루치 데이터 가져오기
        df = fdr.DataReader(company_code).dropna()
        recent_data = df.iloc[-1][['Open', 'High', 'Low', 'Close', 'Volume']].values.reshape(1, -1) # , 'Change'
        
        # 예측 수행
        signal, result = load_model_and_predict(company_code, recent_data)
        
        if signal is not None:
            signal_counts[signal] += 1
            print(f'For company code {company_code}, the predicted signal for the next day is: {result}')
        else:
            print(f'Error for company code {company_code}: {result}')
    
    # 각 시그널의 개수 출력
    print("\nSignal Counts Summary:")
    print(f"Sell (0): {signal_counts[0]}")
    print(f"Buy (1): {signal_counts[1]}")
    print(f"No Signal (2): {signal_counts[2]}")

if __name__ == "__main__":
    main()


Available Stock Codes:
041190 015750 215200 025980 095660 215000 290650 099320 006730 101160 
023160 013030 043150 243070 297890 230360 035890 045100 041830 083450 
018310 267980 060250 265520 126700 050890 035600 051370 054950 299900 
211050 148150 118990 078150 108230 950170 054450 051500 052400 236200 
083310 190510 194700 377450 023760 067280 046440 092730 136540 123410 
214180 036190 023910 091580 298540 079960 009300 418470 023600 452400 
950190 203650 259630 285490 445180 038110 264450 033290 099440 060560 
353810 092460 066620 100700 382800 067900 066700 069510 013310 124560 
137950 332570 021080 215360 066590 005990 058630 071200 053700 035610 
011560 089850 142210 352090 036630 019590 042500 043260 417790 302430 
111710 234300 065710 086670 059210 024880 054040 038460 012790 263690 
014200 009780 241520 122310 036640 052790 216050 208140 036670 339950 
038680 158430 048430 115440 007370 024800 155650 094970 130580 054930 
241790 224110 082210 212560 090410 250000 105760 09484