## Setting

In [288]:
# Parameter Setting
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
folder_name = 'Research-(D5) Synthesized input model'

config = {
    'learning_rate': 1e-4,
    'batch_size': 32,
    'seq_length': 5,

    'shuffle': False,
    'criterion': torch.nn.CrossEntropyLoss(),
    'seed': 42,
    'valid_ratio': 0.2,
    'test_ratio': 0.2,
    'max_length': 512,
    'n_epochs': 3000,
    'early_stop': 100,
}

feature = [
    # X_1
    # 'input_ids',
    # 'attention_mask',
    # 'section_dummy',
    # 'publication_dummy',

    # X_2
    # 1. tech indicator
    # 'Open',
    # 'High',
    # 'Low',
    # 'Close',
    # 'Volume',
    # 'Dividends',
    # 'Stock Splits',
    'today_return',
    # 'today_return_cate',
    # 'Sma',
    # 'Rsi',
    # 'Kd',
    # 'Ema_12',
    # 'Ema_26',
    # 'Macd',
    'sentiment',

    # 2. market index
    '^DJI',
    '^GSPC',
    '^NDX',
    '^IXIC',
    '^SOX',

    # y
    # '1_day_return',
    # '2_day_return',
    # '3_day_return',
    # '4_day_return',
    # '5_day_return',
    # '1_day_return_cate',
    # '2_day_return_cate',
    # '3_day_return_cate',
    # '4_day_return_cate',
    # '5_day_return_cate',
    # '^DJI', '^DJI_1_day_return', '^GSPC', '^GSPC_1_day_return',
    #    '^NDX', '^NDX_1_day_return', '^IXIC', '^IXIC_1_day_return', '^SOX',
    #    '^SOX_1_day_return',
    # 'excess_return_^DJI',
    # 'excess_return_^DJI_cate',
    # 'excess_return_^GSPC',
    'excess_return_^GSPC_cate',
    # 'excess_return_^NDX',
    # 'excess_return_^NDX_cate',
    # 'excess_return_^IXIC',
    # 'excess_return_^IXIC_cate',
    # 'excess_return_^SOX',
    # 'excess_return_^SOX_cate',


    # Do not mark the datetime, it's for operation
    'datetime',
    ]

# All the news dataset

time_start = '2016-01-01T00:00:00'
time_end = '2019-12-31T00:00:00'

time_start = pd.to_datetime('2016-01-01T00:00:00')
time_end = pd.to_datetime('2019-12-31T00:00:00')

print(len(feature)-2)

company_list = [
    "Information Technology",
    "Health Care",
    "Financials",
    "Industrials",
    "Consumer Discretionary",
    "Energy",
    "Materials",
    "Communication Services",
    "Utilities",
    "Real Estate",
    "Consumer Staples"
]

process_id = 0  #26

company_name = company_list[process_id]
config_2 = {'input_path': '/content/drive/MyDrive/Colab Notebooks/'+folder_name+'/data/2_'+company_name+'_for_model.csv',
            'save_path': '/content/drive/MyDrive/Colab Notebooks/'+folder_name+'/model_saved/model.ckpt',
            }

7


In [282]:
# Google
from google.colab import drive
drive.mount('/content/drive')

# pip installation
!pip install transformers
!pip install yfinance

import yfinance as yf

# Basic
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

# Sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# PyTorch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from transformers import XLNetModel, XLNetTokenizer, BertTokenizer, BertModel

# others
from datetime import datetime, timedelta
from tqdm import tqdm
from torchsummary import summary
import ast

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [283]:
def same_seed(seed):
    '''Fixes random number generator seeds for reproducibility.'''
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Set seed for reproducibility
same_seed(config['seed'])


## Benchmark: Mean of excess return

### (1) Get data

In [284]:
def datetime_func(df):
    # Convert the index to a datetime column
    df['datetime'] = df.index

    # Remove the time zone information
    df['datetime'] = pd.to_datetime(df['datetime']).dt.tz_localize(None).dt.date

    return df

# (1) Today Trend
def day_return(df, n, id):
  # Calculate Numerical
  df['today_return'] = (df['Close'] - df['Open']) / df['Open']

  # Calculate categorical
  df['today_return_cate'] = (df['Close'] - df['Open']).apply(lambda x: 1 if x >= 0 else 0)

  # Convert numerical trend to categorical
  df['today_return_cate'] = df['today_return_cate'].astype('category')

  for days in range(1, n+1):
    df[f'{id}_{days}_day_return'] = (df['Close'].shift(-days) / df['Open'].shift(-1) - 1)
  # for days in range(1, n+1):
  #   df[f'{id}_{days}_day_return_cate'] = df[f'{days}_day_return'].apply(lambda x: 1 if x >= 0 else 0)

  return df


In [285]:
# ids = ["^DJI", "^GSPC", "^NDX", "^IXIC", "^SOX", '^NYA', 'XLK']  # DJSI
ids = ["^GSPC", 'XLK']  # DJSI

df_list = []

for id in ids:
  dji = yf.Ticker(id)
  df = dji.history(period='Max')
  df = datetime_func(df)
  df = day_return(df, n=1, id=id)
  df[(str(id))] = (df['Close']-df['Open']) / df['Open']
  df = df[[(str(id)), f'{id}_1_day_return', 'datetime']]
  df_list.append(df)

# Merge dataframes
df_index = df_list[0]
for df in df_list[1:]:
  df_index = pd.merge(df_index, df, on='datetime', how='inner')

df_index

Unnamed: 0,^GSPC,^GSPC_1_day_return,datetime,XLK,XLK_1_day_return
0,0.000607,0.020747,1998-12-22,-0.011090,0.001908
1,0.020747,-0.001848,1998-12-23,0.001908,-0.003810
2,-0.001848,-0.000636,1998-12-24,-0.003810,-0.012241
3,-0.000636,0.013317,1998-12-28,-0.012241,-0.001898
4,0.013317,-0.007956,1998-12-29,-0.001898,0.001912
...,...,...,...,...,...
6243,0.007199,0.006437,2023-10-16,0.007933,0.008030
6244,0.006437,-0.009811,2023-10-17,0.008030,-0.003269
6245,-0.009811,-0.010034,2023-10-18,-0.003269,-0.009680
6246,-0.010034,-0.011627,2023-10-19,-0.009680,-0.016439


### (2) excess return

In [286]:
for id in ids:
  df_index[f'excess_return_{id}'] = df_index['XLK_1_day_return'] - df_index[f'{id}_1_day_return']
  df_index[f'excess_return_{id}_cate'] = df_index[f'excess_return_{id}'].apply(lambda x: 1 if x >= 0 else 0)

df = df_index
del df_index

### (3) SMA

In [289]:
# 计算5日SMA
sma_period = 150
df['SMA'] = df['excess_return_^GSPC'].rolling(window=sma_period).mean()
# df['SMA'] = 0
df['SMA']

0            NaN
1            NaN
2            NaN
3            NaN
4            NaN
          ...   
6243    0.000705
6244    0.000697
6245    0.000628
6246    0.000547
6247         NaN
Name: SMA, Length: 6248, dtype: float64

0            NaN
1            NaN
2            NaN
3            NaN
4            NaN
          ...   
6243    0.000705
6244    0.000697
6245    0.000628
6246    0.000547
6247         NaN
Name: SMA, Length: 6248, dtype: float64

In [290]:
df['pred'] = df['SMA'] - df['^GSPC_1_day_return']
df

Unnamed: 0,^GSPC,^GSPC_1_day_return,datetime,XLK,XLK_1_day_return,excess_return_^GSPC,excess_return_^GSPC_cate,excess_return_XLK,excess_return_XLK_cate,SMA,pred
0,0.000607,0.020747,1998-12-22,-0.011090,0.001908,-0.018838,0,0.0,1,,
1,0.020747,-0.001848,1998-12-23,0.001908,-0.003810,-0.001962,0,0.0,1,,
2,-0.001848,-0.000636,1998-12-24,-0.003810,-0.012241,-0.011605,0,0.0,1,,
3,-0.000636,0.013317,1998-12-28,-0.012241,-0.001898,-0.015215,0,0.0,1,,
4,0.013317,-0.007956,1998-12-29,-0.001898,0.001912,0.009868,1,0.0,1,,
...,...,...,...,...,...,...,...,...,...,...,...
6243,0.007199,0.006437,2023-10-16,0.007933,0.008030,0.001593,1,0.0,1,0.000705,-0.005732
6244,0.006437,-0.009811,2023-10-17,0.008030,-0.003269,0.006542,1,0.0,1,0.000697,0.010508
6245,-0.009811,-0.010034,2023-10-18,-0.003269,-0.009680,0.000354,1,0.0,1,0.000628,0.010662
6246,-0.010034,-0.011627,2023-10-19,-0.009680,-0.016439,-0.004812,0,0.0,1,0.000547,0.012174


### (4) Categorize

In [291]:
  df['pred_cate'] = df['pred'].apply(lambda x: 1 if x >= 0 else 0)
  df['pred_cate'] = df['pred_cate'].astype('category')

## train_test_split

In [292]:
# We use index to filter for time periods
df = df[(df['datetime']> time_start) & (df['datetime'] < time_end)]

# Drop datetime after using it
df.drop(columns=['datetime'], inplace=True)
df.shape

  df = df[(df['datetime']> time_start) & (df['datetime'] < time_end)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['datetime'], inplace=True)


(1005, 11)

In [293]:
# 1. Set up X, y
X = df['pred_cate']
y = df['excess_return_^GSPC_cate']

In [294]:
# 2. train_test_split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=config['seed'], shuffle=config['shuffle'])
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=config['seed'], shuffle=config['shuffle'])

## Check data

In [295]:
def calculate_class_stats(y):
    class_counts = y.value_counts()
    total_samples = len(y)
    class_ratios = class_counts / total_samples
    return class_counts, class_ratios

# 計算類別數量和比例
train_class_counts, train_class_ratios = calculate_class_stats(y_train)
val_class_counts, val_class_ratios = calculate_class_stats(y_val)
test_class_counts, test_class_ratios = calculate_class_stats(y_test)

# 創建包含數量和比例的 DataFrame
class_stats = pd.DataFrame({
    'Train Count': train_class_counts,
    'Train Ratio': train_class_ratios,
    'Validation Count': val_class_counts,
    'Validation Ratio': val_class_ratios,
    'Test Count': test_class_counts,
    'Test Ratio': test_class_ratios
})

# 打印 DataFrame
print(class_stats)


   Train Count  Train Ratio  Validation Count  Validation Ratio  Test Count  \
1          322     0.533997               113          0.562189         113   
0          281     0.466003                88          0.437811          88   

   Test Ratio  
1    0.562189  
0    0.437811  


In [296]:
# Time period
print('Time Period')
print('From:', time_start)
print('To:', time_end, '\n')

# Sample size
print('Sample size:', X.shape[0])
# print('Feature:', X.columns, '\n')
print('Target:', y.name, '\n')
print('Train: Val: Test =', X_train.shape[0]/X.shape[0], X_test.shape[0]/X.shape[0], X_val.shape[0]/X.shape[0],
      X_train.shape[0], X_test.shape[0], X_val.shape[0])

Time Period
From: 2016-01-01 00:00:00
To: 2019-12-31 00:00:00 

Sample size: 1005
Target: excess_return_^GSPC_cate 

Train: Val: Test = 0.6 0.2 0.2 603 201 201


## Prediction

In [297]:
X_train.value_counts()

0    310
1    293
Name: pred_cate, dtype: int64

In [298]:
print(X_val)
print(y_val)

4887    1
4888    0
4889    1
4890    0
4891    0
       ..
5083    0
5084    0
5085    0
5086    1
5087    0
Name: pred_cate, Length: 201, dtype: category
Categories (2, int64): [0, 1]
4887    1
4888    0
4889    1
4890    1
4891    1
       ..
5083    1
5084    1
5085    0
5086    1
5087    1
Name: excess_return_^GSPC_cate, Length: 201, dtype: int64


In [299]:
def model(text, X, y):
  accuracy = accuracy_score(X, y)
  print(f"{text} acc：", accuracy)
model('Train     ', X_train, y_train)
model('Validation', X_val, y_val)
model('Test      ', X_test, y_test)


Train      acc： 0.5107794361525705
Validation acc： 0.3781094527363184
Test       acc： 0.472636815920398
