# 0 Setting

In [1]:
# Parameter Setting
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
folder_name = 'Research-(D5) Synthesized input model'
pretrained_model_name = 'model_microsoft.ckpt'

config = {
    'learning_rate': 1e-4,
    'batch_size': 32,
    'seq_length': 5,

    'shuffle': False,
    'criterion': torch.nn.MSELoss(),
    'seed': 42,
    'valid_ratio': 0.2,
    'test_ratio': 0.2,
    'max_length': 512,
    'n_epochs': 3000,
    'early_stop': 20,
    'device': device,

    'h_text_size': 64,
    'h_c_size': 1,
    'h_news_size': 1,
    'h_tech_size': 6,
    'h_size': 32,
}

company_list = [
    'microsoft',
    "amazon",
    "google",
    "tesla",
    "uber",
    "johnson & johnson",
    "alibaba",
    "intel",
    "ibm",
    "sony",
    "oracle",
    "paypal",
    "cisco",
    "airbnb",
    "nvidia",
    "qualcomm",
    "salesforce",
    "baidu",
    "adobe",
    "dell",
    "hp",
    "micron",
    "amd",
    "broadcom",
    "sap",
    "texas instruments",
    "applied material"
]

process_id = 1 #26

company_name = company_list[process_id]

config_2 = {'input_path': '/content/drive/MyDrive/Colab Notebooks/'+folder_name+'/data/2_'+company_name+'_for_model.csv',
            'save_path': '/content/drive/MyDrive/Colab Notebooks/'+folder_name+'/model_saved/model.ckpt',
            # 'pretrained_model_path': '/content/drive/MyDrive/Colab Notebooks/'+folder_name+'/premodel/' + pretrained_model_name,
            # 'continue_model_path': '/content/drive/MyDrive/Colab Notebooks/'+folder_name+'/model_saved/model_1.ckpt'
            }

feature = [
    # X_1
    'input_ids',
    'attention_mask',
    'section_dummy',
    'publication_dummy',

    # X_2
    # 1. tech indicator
    # 'Open',
    # 'High',
    # 'Low',
    # 'Close',
    # 'Volume',
    # 'Dividends',
    # 'Stock Splits',
    'today_return',
    # 'today_return_cate',
    # 'Sma',
    # 'Rsi',
    # 'Kd',
    # 'Ema_12',
    # 'Ema_26',
    # 'Macd',
    # 'sentiment',

    # 2. market index
    '^DJI',
    '^GSPC',
    '^NDX',
    '^IXIC',
    '^SOX',

    # y
    '1_day_return',
    # '2_day_return',
    # '3_day_return',
    # '4_day_return',
    # '5_day_return',
    # '1_day_return_cate',
    # '2_day_return_cate',
    # '3_day_return_cate',
    # '4_day_return_cate',
    # '5_day_return_cate',

    # Do not mark the datetime, it's for operation
    'datetime',
    ]

# All the news dataset
# time_start = '2016-01-01T00:00:00'
# time_end = '2020-04-02T00:00:00'

time_start = '2016-01-01T00:00:00'
time_end = '2019-12-31T00:00:00'

print(len(feature)-2)

10


## (1) Import

In [2]:
# Google
from google.colab import drive
drive.mount('/content/drive')

# pip installation
!pip install transformers

# Basic
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

# Sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# PyTorch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from transformers import BertTokenizer, BertModel, BertConfig

# others
from datetime import datetime, timedelta
from tqdm import tqdm
from torchsummary import summary
import ast

Mounted at /content/drive
Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m56.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m120.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m91.9 MB/s[0m

In [3]:
def same_seed(seed):
    '''Fixes random number generator seeds for reproducibility.'''
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Set seed for reproducibility
same_seed(config['seed'])


In [4]:
df = pd.read_csv(config_2['input_path'])
df.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits',
       'today_return', 'today_return_cate', '1_day_return', '2_day_return',
       '3_day_return', '4_day_return', '5_day_return', '1_day_return_cate',
       '2_day_return_cate', '3_day_return_cate', '4_day_return_cate',
       '5_day_return_cate', 'Sma', 'Rsi', 'Kd', 'Ema_12', 'Ema_26', 'Macd',
       'datetime', '^DJI', '^GSPC', '^NDX', '^IXIC', '^SOX', 'input_ids',
       'attention_mask', 'section_dummy', 'publication_dummy', 'sentiment'],
      dtype='object')

In [5]:
df = df.sort_values(by='datetime', ascending=True)
df.shape

(1070, 36)

In [6]:
# Only contain selected features
df = df[feature]
df.columns

Index(['input_ids', 'attention_mask', 'section_dummy', 'publication_dummy',
       'today_return', '^DJI', '^GSPC', '^NDX', '^IXIC', '^SOX',
       '1_day_return', 'datetime'],
      dtype='object')

## (2) check nan

In [7]:
df[df.isna().any(axis=1)]

Unnamed: 0,input_ids,attention_mask,section_dummy,publication_dummy,today_return,^DJI,^GSPC,^NDX,^IXIC,^SOX,1_day_return,datetime
1068,,,,,-0.013073,-0.013373,-0.011041,-0.012725,-0.013261,-0.018769,0.00904,2020-04-01
1069,,,,,0.00904,0.02853,0.027805,0.025488,0.023213,0.04098,,2020-04-02


In [8]:
df.isnull().sum()

input_ids            2
attention_mask       2
section_dummy        2
publication_dummy    2
today_return         0
^DJI                 0
^GSPC                0
^NDX                 0
^IXIC                0
^SOX                 0
1_day_return         1
datetime             0
dtype: int64

In [9]:
df = df.dropna()
df = df.reset_index(drop=True)
df.isnull().sum()

input_ids            0
attention_mask       0
section_dummy        0
publication_dummy    0
today_return         0
^DJI                 0
^GSPC                0
^NDX                 0
^IXIC                0
^SOX                 0
1_day_return         0
datetime             0
dtype: int64

## (2) Time Period Selection

In [10]:
# We use index to filter for time periods
df = df[(df['datetime']> time_start) & (df['datetime'] < time_end)]

# Drop datetime after using it
df.drop(columns=['datetime'], inplace=True)
df.shape

(1006, 11)

## (3) Transform str back to list

In [11]:
# 将字符串转换回列表的函数
def string_to_list(s):
    return ast.literal_eval(s)

# 将列中的字符串转换回列表
df['input_ids'] = df['input_ids'].apply(string_to_list)
df['attention_mask'] = df['attention_mask'].apply(string_to_list)
df['section_dummy'] = df['section_dummy'].apply(string_to_list)
df['publication_dummy'] = df['publication_dummy'].apply(string_to_list)

## (3) List: Same amount of elements

In [12]:
input_ids_list_length = len(df['input_ids'][0][0])
attention_mask_list_length = len(df['attention_mask'][0][0])
section_dummy_list_length = len(df['section_dummy'][0][0])
publication_dummy_list_length = len(df['publication_dummy'][0][0])

In [13]:

# 找到最大的內部列表長度
max_inner_length = max(df['input_ids'].apply(len))

# 定義一個函數來填充內部列表，使其長度達到最大值
def pad_inner_list(lst, zero_list):
    while len(lst) < max_inner_length:
        lst.append(zero_list)  # 這裡可以填充任何你想要的值，例如 None

# 將 "input_ids" 列中的每個內部列表填充到最大長度
df['input_ids'].apply(pad_inner_list, zero_list=[0] * input_ids_list_length)
df['attention_mask'].apply(pad_inner_list, zero_list=[0] * attention_mask_list_length)
df['section_dummy'].apply(pad_inner_list, zero_list=[0] * section_dummy_list_length)
df['publication_dummy'].apply(pad_inner_list, zero_list=[0] * publication_dummy_list_length)
df

Unnamed: 0,input_ids,attention_mask,section_dummy,publication_dummy,today_return,^DJI,^GSPC,^NDX,^IXIC,^SOX,1_day_return
0,"[[101, 13423, 2023, 8983, 2006, 2115, 2849, 20...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,...","[[0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, ...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, ...",-0.029408,-0.014739,-0.012531,0.002854,0.001111,0.008979,-0.020205
1,"[[101, 16716, 11503, 999, 2793, 19093, 999, 21...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, ...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, ...",-0.020205,0.000651,0.001455,-0.006175,-0.005370,-0.012003,0.017122
2,"[[101, 8042, 2003, 1037, 2600, 2732, 1999, 214...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, ...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, ...",0.017122,-0.014475,-0.010663,0.007758,0.004570,-0.011092,-0.022290
3,"[[101, 1005, 9979, 1997, 1996, 16517, 1005, 20...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, ...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, ...",-0.022290,-0.022161,-0.021271,-0.009984,-0.009917,-0.010667,-0.020350
4,"[[101, 1520, 5392, 11579, 4548, 2026, 10007, 1...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, ...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, ...",-0.020350,-0.010456,-0.012302,-0.015069,-0.016601,-0.019801,0.008588
...,...,...,...,...,...,...,...,...,...,...,...
1001,"[[101, 2129, 17163, 2006, 1996, 11142, 2150, 1...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,...","[[0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, ...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, ...",-0.002564,-0.001999,-0.000642,-0.000665,-0.000238,0.001204,0.037623
1002,"[[101, 2811, 17886, 1011, 2813, 2395, 3485, 10...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,...","[[0, 0, 0, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, ...","[[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, ...",0.037623,0.002871,0.003938,0.006924,0.005817,-0.000499,-0.006968
1003,"[[101, 1996, 4038, 2177, 2008, 2081, 4380, 100...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, ...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, ...",-0.006968,-0.001049,-0.002220,-0.003961,-0.004735,-0.006431,-0.014466
1004,"[[101, 1019, 2477, 2000, 2113, 2005, 2285, 286...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, ...","[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, ...",-0.014466,-0.006722,-0.005802,-0.006463,-0.006492,-0.006593,0.003170


In [14]:
df['input_ids']

0       [[101, 13423, 2023, 8983, 2006, 2115, 2849, 20...
1       [[101, 16716, 11503, 999, 2793, 19093, 999, 21...
2       [[101, 8042, 2003, 1037, 2600, 2732, 1999, 214...
3       [[101, 1005, 9979, 1997, 1996, 16517, 1005, 20...
4       [[101, 1520, 5392, 11579, 4548, 2026, 10007, 1...
                              ...                        
1001    [[101, 2129, 17163, 2006, 1996, 11142, 2150, 1...
1002    [[101, 2811, 17886, 1011, 2813, 2395, 3485, 10...
1003    [[101, 1996, 4038, 2177, 2008, 2081, 4380, 100...
1004    [[101, 1019, 2477, 2000, 2113, 2005, 2285, 286...
1005    [[101, 1019, 10036, 1006, 2003, 2232, 1007, 24...
Name: input_ids, Length: 1006, dtype: object

In [15]:
# 使用 apply 方法計算每個列表中元素的數量
temp = df['input_ids'].apply(len)

# 打印 DataFrame
temp

0       154
1       154
2       154
3       154
4       154
       ... 
1001    154
1002    154
1003    154
1004    154
1005    154
Name: input_ids, Length: 1006, dtype: int64

In [16]:
# 計算"input_ids"列中所有list的平均長度
average_length = df['input_ids'].apply(len).mean()

# 計算"input_ids"列中最長的list的長度
max_length = df['input_ids'].apply(len).max()

# 計算"input_ids"列中最短的list的長度
min_length = df['input_ids'].apply(len).min()

# 打印結果
print(f"平均長度: {average_length}")
print(f"最長長度: {max_length}")
print(f"最短長度: {min_length}")


平均長度: 154.0
最長長度: 154
最短長度: 154


## int to float (section, publication)

In [17]:
def recursive_convert_to_float(item):
    if isinstance(item, list):
        return [recursive_convert_to_float(x) if x is not None else None for x in item]
    else:
        return float(item) if item is not None else None

# 使用 apply 方法將函數應用於每個元素
df['section_dummy'] = df['section_dummy'].apply(recursive_convert_to_float)
df['publication_dummy'] = df['publication_dummy'].apply(recursive_convert_to_float)

## (4) Train_test_split

In [18]:
# 1. Set up X, y
to_remove_list = ['datetime', '1_day_return']

# Filter out values in to_remove_list
filtered_list = [x for x in feature if x not in to_remove_list]

X = df[filtered_list]
y = df['1_day_return']

In [19]:
# Check X, y shape
print('X:', X.shape)
print('y:', y.shape)

X: (1006, 10)
y: (1006,)


In [20]:
# 2. train_test_split
# val dataset for final examination

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=config['test_ratio'], random_state=config['seed'], shuffle=config['shuffle'])
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=config['valid_ratio'], random_state=config['seed'], shuffle=config['shuffle'])
X_train


Unnamed: 0,input_ids,attention_mask,section_dummy,publication_dummy,today_return,^DJI,^GSPC,^NDX,^IXIC,^SOX
0,"[[101, 13423, 2023, 8983, 2006, 2115, 2849, 20...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",-0.029408,-0.014739,-0.012531,0.002854,0.001111,0.008979
1,"[[101, 16716, 11503, 999, 2793, 19093, 999, 21...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",-0.020205,0.000651,0.001455,-0.006175,-0.005370,-0.012003
2,"[[101, 8042, 2003, 1037, 2600, 2732, 1999, 214...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.017122,-0.014475,-0.010663,0.007758,0.004570,-0.011092
3,"[[101, 1005, 9979, 1997, 1996, 16517, 1005, 20...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",-0.022290,-0.022161,-0.021271,-0.009984,-0.009917,-0.010667
4,"[[101, 1520, 5392, 11579, 4548, 2026, 10007, 1...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",-0.020350,-0.010456,-0.012302,-0.015069,-0.016601,-0.019801
...,...,...,...,...,...,...,...,...,...,...
638,"[[101, 2088, 15768, 7105, 2000, 3204, 4672, 10...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,...",0.017869,0.003434,0.007245,0.015417,0.013306,0.022504
639,"[[101, 4012, 10526, 9530, 22119, 2015, 2000, 6...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,...",-0.002749,0.002606,0.001519,-0.002311,-0.000635,0.001301
640,"[[101, 4701, 1005, 1055, 10093, 2401, 24545, 2...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,...",-0.009014,-0.002969,-0.001737,-0.001670,-0.000575,-0.000095
641,"[[101, 2671, 4349, 2003, 2025, 2591, 4507, 102...","[[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",-0.006197,0.000678,-0.000970,-0.003916,-0.002917,-0.004205


## (5) Scaler

In [21]:
scale_feature = [
    # X_2
    # 1. tech indicator
    # 'Open',
    # 'High',
    # 'Low',
    # 'Close',
    # 'Volume',
    # 'Dividends',
    # 'Stock Splits',
    'today_return',
    # 'Today_trend_cate',
    # 'Sma',
    # 'Rsi',
    # 'Kd',
    # 'Ema_12',
    # 'Ema_26',
    # 'Macd',

    # 2. market index
    '^DJI',
    '^GSPC',
    '^NDX',
    '^IXIC',
    '^SOX',
    # 'datetime'
    ]

def CustomScaler(X_train, X_val, X_test):
  scaler = MinMaxScaler()
  for i in scale_feature:

    # 對特定欄位進行標準化
    X_train_scaled = scaler.fit_transform(X_train[[i]])
    X_val_scaled = scaler.transform(X_val[[i]])
    X_test_scaled = scaler.transform(X_test[[i]])

    # 將標準化後的值重新賦值給 DataFrame
    X_train[i] = X_train_scaled
    X_val[i] = X_val_scaled
    X_test[i] = X_test_scaled

  return X_train, X_val, X_test

X_train, X_val, X_test = CustomScaler(X_train, X_val, X_test)

X_train

Unnamed: 0,input_ids,attention_mask,section_dummy,publication_dummy,today_return,^DJI,^GSPC,^NDX,^IXIC,^SOX
0,"[[101, 13423, 2023, 8983, 2006, 2115, 2849, 20...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.273178,0.355624,0.377236,0.525722,0.515721,0.591670
1,"[[101, 16716, 11503, 999, 2793, 19093, 999, 21...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.348177,0.557596,0.578553,0.424253,0.436452,0.374338
2,"[[101, 8042, 2003, 1037, 2600, 2732, 1999, 214...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.652391,0.359087,0.404126,0.580837,0.558035,0.383766
3,"[[101, 1005, 9979, 1997, 1996, 16517, 1005, 20...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.331185,0.258224,0.251419,0.381444,0.380842,0.388167
4,"[[101, 1520, 5392, 11579, 4548, 2026, 10007, 1...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.346998,0.411837,0.380522,0.324291,0.299087,0.293563
...,...,...,...,...,...,...,...,...,...,...
638,"[[101, 2088, 15768, 7105, 2000, 3204, 4672, 10...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,...",0.658474,0.594123,0.661905,0.666913,0.664887,0.731776
639,"[[101, 4012, 10526, 9530, 22119, 2015, 2000, 6...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,...",0.490444,0.583255,0.579473,0.467672,0.494370,0.512147
640,"[[101, 4701, 1005, 1055, 10093, 2401, 24545, 2...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,...",0.439387,0.510084,0.532605,0.474877,0.495106,0.497680
641,"[[101, 2671, 4349, 2003, 2025, 2591, 4507, 102...","[[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.462340,0.557952,0.543649,0.449631,0.466455,0.455104


## (6) Check number

In [22]:
def calculate_class_stats(y):
    class_counts = y.value_counts()
    total_samples = len(y)
    class_ratios = class_counts / total_samples
    return class_counts, class_ratios

# 計算類別數量和比例
train_class_counts, train_class_ratios = calculate_class_stats(y_train)
val_class_counts, val_class_ratios = calculate_class_stats(y_val)
test_class_counts, test_class_ratios = calculate_class_stats(y_test)

# 創建包含數量和比例的 DataFrame
class_stats = pd.DataFrame({
    'Train Count': train_class_counts,
    'Train Ratio': train_class_ratios,
    'Validation Count': val_class_counts,
    'Validation Ratio': val_class_ratios,
    'Test Count': test_class_counts,
    'Test Ratio': test_class_ratios
})

# 打印 DataFrame
print(class_stats)


           Train Count  Train Ratio  Validation Count  Validation Ratio  \
-0.072964          NaN          NaN               1.0          0.006211   
-0.062927          1.0     0.001555               NaN               NaN   
-0.061735          NaN          NaN               1.0          0.006211   
-0.059755          NaN          NaN               1.0          0.006211   
-0.055383          1.0     0.001555               NaN               NaN   
...                ...          ...               ...               ...   
 0.049307          NaN          NaN               1.0          0.006211   
 0.050457          NaN          NaN               1.0          0.006211   
 0.052215          NaN          NaN               1.0          0.006211   
 0.059774          1.0     0.001555               NaN               NaN   
 0.074520          NaN          NaN               1.0          0.006211   

           Test Count  Test Ratio  
-0.072964         NaN         NaN  
-0.062927         NaN      

In [23]:
# Time period
print('Time Period')
print('From:', time_start)
print('To:', time_end, '\n')

# Sample size
print('Sample size:', X.shape[0])
print('Feature:', X.columns, '\n')
print('Target:', y.name, '\n')
print('Train: Val: Test =', X_train.shape[0], X_test.shape[0], X_val.shape[0])

Time Period
From: 2016-01-01T00:00:00
To: 2019-12-31T00:00:00 

Sample size: 1006
Feature: Index(['input_ids', 'attention_mask', 'section_dummy', 'publication_dummy',
       'today_return', '^DJI', '^GSPC', '^NDX', '^IXIC', '^SOX'],
      dtype='object') 

Target: 1_day_return 

Train: Val: Test = 643 202 161


# Model

## (1) Dataset & Dataloader

In [24]:
# Dataset
X_1 =['input_ids', 'attention_mask', 'section_dummy', 'publication_dummy']


class CustomDataset(Dataset):
    def __init__(self, X, y, config):
        # X_1
        self.input_ids = X['input_ids']
        self.attention_mask = X['attention_mask']
        self.section = X['section_dummy']
        self.publication = X['publication_dummy']

        # X_2
        self.X_2 = torch.tensor(X.drop(columns=X_1).values, dtype=torch.float)

        # y
        self.y = torch.tensor(y.values, dtype=torch.float)

        # other setting
        self.len = X.shape[0]
        self.seq_length = config['seq_length']

    def __getitem__(self,idx):
        # X_1
        input_ids_list = self.input_ids[idx : idx + self.seq_length].tolist() # All to list
        input_ids = torch.tensor(input_ids_list) # Then to tensor
        attention_mask_list = self.attention_mask[idx : idx + self.seq_length].tolist()
        attention_mask = torch.tensor(attention_mask_list)
        section_list = self.section[idx : idx + self.seq_length].tolist()
        section = torch.tensor(section_list)
        publication_list = self.publication[idx : idx + self.seq_length].tolist()
        publication = torch.tensor(publication_list)

        # X_2
        X_2 = self.X_2[idx : idx + self.seq_length]

        # 3. y
        y = self.y[idx + self.seq_length - 1]

        return input_ids, attention_mask, section, publication, X_2, y

    def __len__(self):
        return self.len - self.seq_length

In [25]:
# DataLoader
train_dataset = CustomDataset(X_train, y_train, config)
val_dataset = CustomDataset(X_val, y_val, config)
test_dataset = CustomDataset(X_test, y_test, config)

train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=config['shuffle'], drop_last=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=config['batch_size'], shuffle=config['shuffle'], drop_last=True, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=config['shuffle'], drop_last=True, pin_memory=True)

# Check loader output
for batch in train_loader:
    input_ids, attention_mask, section, publication, X_2, y = batch

    # 打印批次数据的形状，以确保它们符合预期
    print("Input IDs shape:", input_ids.shape)
    print("Attention Mask shape:", attention_mask.shape)
    print("Section shape:", section.shape)
    print("Publication shape:", publication.shape)
    print("X_2 shape:", X_2.shape)
    print("Labels shape:", y.shape)

    # print("Input IDs:", input_ids)
    # print("Attention Mask:", attention_mask)
    # print("Section:", section)
    # print("Publication:", publication)
    # print("X_2:", X_2)
    # print("Labels:", y)

    break


Input IDs shape: torch.Size([32, 5, 154, 32])
Attention Mask shape: torch.Size([32, 5, 154, 32])
Section shape: torch.Size([32, 5, 154, 9])
Publication shape: torch.Size([32, 5, 154, 12])
X_2 shape: torch.Size([32, 5, 6])
Labels shape: torch.Size([32])


## (2) Model Architecture

### 1 Premodel

In [26]:
# # model_microsoft
# class PreModel(nn.Module):
#     def __init__(self, base_model):
#         super(PreModel, self).__init__()
#         self.base_model = BertModel.from_pretrained('bert-base-uncased')

#         # 2. FC layers
#         self.fc1 = nn.Linear(base_model.config.hidden_size, 256)
#         self.fc2 = nn.Linear(256, 64)
#         self.fc3 = nn.Linear(64, 1)
#         # self.fc4 = nn.Linear(256, 64)
#         # self.fc5 = nn.Linear(64, 5)

#     def forward(self, input_ids, attention_mask):
#         out = self.base_model(input_ids=input_ids, attention_mask=attention_mask)

#         out = out.pooler_output
#         out = self.fc1(out)
#         out = self.fc2(out)
#         out = self.fc3(out)
#         # out = self.fc4(out)
#         # out = self.fc5(out)

#         return out

### 2 Prediction model

In [27]:
# # New structure: Save Computation
# class MyModel(nn.Module):
#     def __init__(self, base_model, config, element_size, section_length, publication_length, X_2_length, batch_size):
#         super(MyModel, self).__init__()
#         self.seq_length = config['seq_length']
#         self.batch_size = batch_size
#         self.element_size = element_size
#         self.abandon_tensor = torch.tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#                               0, 0, 0, 0, 0, 0, 0, 0], device=device)

#         self.section_length = section_length
#         self.publication_length = publication_length
#         self.config = config

#         # 1. News
#         # text
#         self.base_model = base_model
#         self.fc1 = nn.Linear(768, 64)
#         self.fc2 = nn.Linear(64, config['h_text_size'])

#         # c
#         self.fc_h_c = nn.Linear(section_length + publication_length, config['h_c_size'])

#         # news (concated)
#         self.fc_h_news = nn.Linear(config['h_text_size'] + config['h_c_size'], config['h_news_size'])

#         # 2. Indicator
#         # tech
#         self.fc_h_tech = nn.Linear(X_2_length, config['h_tech_size'])

#         # 1&2. converge
#         # news + tech
#         self.fc_h = nn.Linear(config['h_news_size'] + config['h_tech_size'], config['h_size'])

#         # 3. LSTM
#         self.lstm_1 = nn.LSTM(config['h_size'], 32, dropout=0.2, num_layers=4, batch_first=True, bidirectional=False)
#         # self.lstm_2 = nn.LSTM(32, 64, num_layers=1, batch_first=True, bidirectional=False)
#         # self.lstm_3 = nn.LSTM(64, 16, num_layers=1, batch_first=True, bidirectional=False)
#         # self.dropout = nn.Dropout(0.2)
#         self.sequential = nn.Sequential(
#             nn.Linear(32, 3)
#             # nn.Linear(16, 1)
#         )
#         self.dropout = nn.Dropout(0.2)


#     def forward(self, input_ids, attention_mask, section, publication, X_2):
#         # 1. News
#         flattened_input_ids = input_ids.view(-1, 32)
#         flattened_attention_mask = attention_mask.view(-1, 32)
#         flattened_section = section.view(-1, self.section_length)
#         flattened_publication = publication.view(-1, self.publication_length)

#         e_list = []
#         for i in range(0, flattened_input_ids.size(0), self.element_size):
#           # 获取当前组的子张量
#           sub_input_ids = flattened_input_ids[i:i+self.element_size]
#           sub_attention_mask = flattened_attention_mask[i:i+self.element_size]
#           sub_section = flattened_section[i:i+self.element_size]
#           sub_publication = flattened_publication[i:i+self.element_size]

#           non_zero_mask = (sub_input_ids != 0).any(dim=1)
#           non_zero_input_ids = sub_input_ids[non_zero_mask]
#           non_zero_attention_mask = sub_attention_mask[non_zero_mask]
#           non_zero_section = sub_section[non_zero_mask]
#           non_zero_publication = sub_publication[non_zero_mask]

#           # input_ids, attention_mask
#           out = self.base_model(input_ids=non_zero_input_ids, attention_mask=non_zero_attention_mask)
#           out = out.pooler_output
#           out = self.fc1(out)
#           h_text = self.fc2(out)

#           # section, publication
#           out = torch.cat([non_zero_section, non_zero_publication], dim=1)
#           h_c = self.fc_h_c(out)

#           # h_news
#           out = torch.cat([h_text, h_c], dim=1)
#           out = self.fc_h_news(out)
#           h_news = self.dropout(out)
#           element_mean = torch.mean(h_news, dim=0)
#           e_list.append(element_mean)

#         temp_tensor = torch.stack(e_list)
#         b_tensor = temp_tensor.view(self.batch_size, self.seq_length, self.config['h_news_size'])

#         # 2. Indicator
#         # h_tech
#         # h_tech = self.fc_h_tech(X_2)
#         h_tech = X_2

#         # h
#         out = torch.cat([b_tensor, h_tech], dim=2)
#         h = self.fc_h(out)

#         # 3. LSTM
#         out, _ = self.lstm_1(h)
#         out = out[:, -1, :]  # Get the last one of LSTM output for prediction of next-term

#         final_out = self.sequential(out)

#         return final_out


3. Simplified model
without c

In [28]:
# New structure: Save Computation
class MyModel(nn.Module):
    def __init__(self, base_model, config, element_size, section_length, publication_length, X_2_length, batch_size):
        super(MyModel, self).__init__()
        self.seq_length = config['seq_length']
        self.batch_size = batch_size
        self.element_size = element_size
        self.abandon_tensor = torch.tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                              0, 0, 0, 0, 0, 0, 0, 0], device=device)

        self.section_length = section_length
        self.publication_length = publication_length
        self.config = config

        # 1. News
        # text
        self.base_model = base_model
        self.fc1 = nn.Linear(768, 128)
        self.fc2 = nn.Linear(128, config['h_text_size'])
        # self.fc3 = nn.Linear(32, config['h_text_size'])

        # self.fc1 = nn.Linear(768, config['h_text_size'])



        # 1&2. converge
        # news + tech
        # self.fc_h = nn.Linear(config['h_text_size'] + config['h_tech_size'], config['h_size'])

        # 3. LSTM
        self.lstm_1 = nn.LSTM(config['h_text_size']+X_2_length, 32, dropout=0.2, num_layers=2, batch_first=True, bidirectional=False)
        self.sequential = nn.Sequential(
            nn.Linear(32, 1)
        )
        self.dropout = nn.Dropout(0.2)


    def forward(self, input_ids, attention_mask, section, publication, X_2):
        # 1. News
        flattened_input_ids = input_ids.view(-1, 32)
        flattened_attention_mask = attention_mask.view(-1, 32)
        flattened_section = section.view(-1, self.section_length)
        flattened_publication = publication.view(-1, self.publication_length)

        e_list = []
        for i in range(0, flattened_input_ids.size(0), self.element_size):
          # 获取当前组的子张量
          sub_input_ids = flattened_input_ids[i:i+self.element_size]
          sub_attention_mask = flattened_attention_mask[i:i+self.element_size]
          sub_section = flattened_section[i:i+self.element_size]
          sub_publication = flattened_publication[i:i+self.element_size]

          non_zero_mask = (sub_input_ids != 0).any(dim=1)
          non_zero_input_ids = sub_input_ids[non_zero_mask]
          non_zero_attention_mask = sub_attention_mask[non_zero_mask]
          non_zero_section = sub_section[non_zero_mask]
          non_zero_publication = sub_publication[non_zero_mask]


          # input_ids, attention_mask
          out = self.base_model(input_ids=non_zero_input_ids, attention_mask=non_zero_attention_mask)
          out = out.pooler_output
          # out = self.dropout(out)
          out = self.fc1(out)
          out = self.dropout(out)
          out = self.fc2(out)
          # out = self.dropout(out)
          # out = self.fc3(out)


          element_mean = torch.mean(out, dim=0) # h_text's mean
          e_list.append(element_mean)

        temp_tensor = torch.stack(e_list)
        b_tensor = temp_tensor.view(self.batch_size, self.seq_length, self.config['h_text_size'])


        # 2. Indicator
        # h_tech
        h_tech = X_2

        # h
        h = torch.cat([b_tensor, h_tech], dim=2)

        # 3. LSTM
        out, _ = self.lstm_1(h)
        out = out[:, -1, :]  # Get the last one of LSTM output for prediction of next-term

        final_out = self.sequential(out)

        return final_out


## (4) Load Model

### 1. Load pretrain model

In [29]:
# 載入預訓練模型

# base_model = PreModel(base_model)
# base_model.load_state_dict(torch.load(config_2['pretrained_model_path']))
bert_config = BertConfig(
    hidden_dropout_prob=0.2,
    )

base_model = BertModel.from_pretrained('ProsusAI/finbert', config=bert_config)
# base_model = BertModel.from_pretrained('bert-base-uncased', config=bert_config)


# Parameter
element_size = len(df['input_ids'][0])  # 114
section_length = len(df['section_dummy'][0][0])
publication_length = len(df['publication_dummy'][0][0])
X_2_length = len(feature) - 6


Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

### 2. Initiate Model

In [30]:
model = MyModel(base_model, config, element_size, section_length, publication_length, X_2_length, config['batch_size'])

model.to(device)

MyModel(
  (base_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

### Extra: Contunue training

In [31]:
# model = MyModel(base_model, config, section_length, publication_length, X_2_length)
# model.load_state_dict(torch.load(config_2['continue_model_path']))
# model.to(device)

# # 分段訓練
# trainer2(model, train_loader, val_loader, config, device)
# trainer1(model, train_loader, val_loader, config, device)

## (5) Require_grad

In [32]:
# Freeze all layers
for param in model.base_model.parameters():
  param.requires_grad = False

# Unfreeze part of layers
# for param in model.base_model.encoder.layer[6].parameters():
#     param.requires_grad = True

# for param in model.base_model.encoder.layer[7].parameters():
#     param.requires_grad = True

# for param in model.base_model.encoder.layer[8].parameters():
#     param.requires_grad = True

# for param in model.base_model.encoder.layer[9].parameters():
#     param.requires_grad = True

# for param in model.base_model.encoder.layer[10].parameters():
#     param.requires_grad = True

for param in model.base_model.encoder.layer[11].parameters():
    param.requires_grad = True

# for param in model.base_model.fc1.parameters():
#     param.requires_grad = True

# for param in model.base_model.fc2.parameters():
#     param.requires_grad = True

# for param in model.base_model.fc3.parameters():
    # param.requires_grad = True

# Check requires_grad status
for name, param in model.named_parameters():
    print(name, param.requires_grad)

base_model.embeddings.word_embeddings.weight False
base_model.embeddings.position_embeddings.weight False
base_model.embeddings.token_type_embeddings.weight False
base_model.embeddings.LayerNorm.weight False
base_model.embeddings.LayerNorm.bias False
base_model.encoder.layer.0.attention.self.query.weight False
base_model.encoder.layer.0.attention.self.query.bias False
base_model.encoder.layer.0.attention.self.key.weight False
base_model.encoder.layer.0.attention.self.key.bias False
base_model.encoder.layer.0.attention.self.value.weight False
base_model.encoder.layer.0.attention.self.value.bias False
base_model.encoder.layer.0.attention.output.dense.weight False
base_model.encoder.layer.0.attention.output.dense.bias False
base_model.encoder.layer.0.attention.output.LayerNorm.weight False
base_model.encoder.layer.0.attention.output.LayerNorm.bias False
base_model.encoder.layer.0.intermediate.dense.weight False
base_model.encoder.layer.0.intermediate.dense.bias False
base_model.encoder.la

# Training

In [33]:
def trainer(model, train_loader, val_loader, config, device):

    criterion = config['criterion']
    # optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'])
    optimizer = torch.optim.AdamW(model.parameters(), lr=config['learning_rate'])


    writer = SummaryWriter()  # Writer of tensoboard.
    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], math.inf, 0, 0

    # 1. Training
    for epoch in range(n_epochs):
      model.train()  # Set the model to training mode
      loss_record = []

      train_pbar = tqdm(train_loader, position=0, leave=True)  # tqdm is a package to visualize your training progress.
      for input_ids, attention_mask, section, publication, X_2, y in train_loader:
        optimizer.zero_grad()  # Set gradient to zero

        # Forward pass
        input_ids, attention_mask, section, publication, X_2, y = input_ids.to(device), attention_mask.to(device), section.to(device), publication.to(device), X_2.to(device), y.to(device)
        pred = model(input_ids, attention_mask, section, publication, X_2)
        loss = criterion(pred, y)
        loss.backward()                     # Compute gradient(backpropagation).
        optimizer.step()                    # Update parameters.
        step += 1
        loss_record.append(loss.detach().item())

        # Display current epoch number and loss on tqdm progress bar.
        train_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
        train_pbar.set_postfix({'loss': loss.detach().item()})

      mean_train_loss = sum(loss_record)/len(loss_record)
      writer.add_scalar('Loss/train', mean_train_loss, step)

      # 2. Evaluation
      model.eval() # Set your model to evaluation mode.
      loss_record = []
      for input_ids, attention_mask, section, publication, X_2, y in val_loader:
          input_ids, attention_mask, section, publication, X_2, y = input_ids.to(device), attention_mask.to(device), section.to(device), publication.to(device), X_2.to(device), y.to(device)
          with torch.no_grad():
              pred = model(input_ids, attention_mask, section, publication, X_2)
              loss = criterion(pred, y)
              loss_record.append(loss.item())

      # Mean
      mean_valid_loss = sum(loss_record)/len(loss_record)
      print(f'Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss:.8f}, Valid loss: {mean_valid_loss:.8f}')
      writer.add_scalar('Loss/valid', mean_valid_loss, step)

      # 3. Judge of saving model
      if mean_valid_loss < best_loss:
          best_loss = mean_valid_loss
          torch.save(model.state_dict(), config_2['save_path']) # Save your best model
          print('Saving model with loss {:.3f}...'.format(best_loss))
          early_stop_count = 0
      else:
          early_stop_count += 1

      if early_stop_count >= config['early_stop']:
          print('\nModel is not improving, so we halt the training session.')
          return


In [34]:
# 全部訓練
trainer(model, train_loader, val_loader, config, device)


  return F.mse_loss(input, target, reduction=self.reduction)
Epoch [1/3000]:   0%|          | 0/19 [03:47<?, ?it/s, loss=0.000665]

Epoch [1/3000]: Train loss: 0.01654761, Valid loss: 0.00051509
Saving model with loss 0.001...


Epoch [1/3000]:   0%|          | 0/19 [04:59<?, ?it/s, loss=0.000665]
Epoch [2/3000]:   0%|          | 0/19 [03:47<?, ?it/s, loss=0.000437]

Epoch [2/3000]: Train loss: 0.00091351, Valid loss: 0.00053429


Epoch [2/3000]:   0%|          | 0/19 [04:47<?, ?it/s, loss=0.000437]
Epoch [3/3000]:   0%|          | 0/19 [03:46<?, ?it/s, loss=0.000556]

Epoch [3/3000]: Train loss: 0.00059273, Valid loss: 0.00051486
Saving model with loss 0.001...


Epoch [3/3000]:   0%|          | 0/19 [04:48<?, ?it/s, loss=0.000556]
Epoch [4/3000]:   0%|          | 0/19 [03:46<?, ?it/s, loss=0.000449]

Epoch [4/3000]: Train loss: 0.00053925, Valid loss: 0.00053798


Epoch [4/3000]:   0%|          | 0/19 [04:45<?, ?it/s, loss=0.000449]
Epoch [5/3000]:   0%|          | 0/19 [03:46<?, ?it/s, loss=0.000358]

Epoch [5/3000]: Train loss: 0.00045567, Valid loss: 0.00051765


Epoch [5/3000]:   0%|          | 0/19 [04:45<?, ?it/s, loss=0.000358]
Epoch [6/3000]:   0%|          | 0/19 [03:48<?, ?it/s, loss=0.000396]

Epoch [6/3000]: Train loss: 0.00044811, Valid loss: 0.00052084


Epoch [6/3000]:   0%|          | 0/19 [04:48<?, ?it/s, loss=0.000396]
Epoch [7/3000]:   0%|          | 0/19 [00:41<?, ?it/s, loss=0.000329]

KeyboardInterrupt: ignored

In [None]:
%reload_ext tensorboard
%tensorboard --logdir=./runs/

# Evaluate

In [35]:
# Evaluation Dataloader
con_train_loader = DataLoader(train_dataset, batch_size=1, shuffle=config['shuffle'], pin_memory=True)
con_val_loader = DataLoader(val_dataset, batch_size=1, shuffle=config['shuffle'], pin_memory=True)
con_test_loader = DataLoader(test_dataset, batch_size=1, shuffle=config['shuffle'], pin_memory=True)

loss_function = config['criterion']

model = MyModel(base_model, config, element_size, section_length, publication_length, X_2_length, batch_size=1)
model.load_state_dict(torch.load(config_2['save_path']))
model.to(device)
model.eval()

def evaluate(dataloader, name):
  loss_record = []
  # 1. Train part
  with torch.no_grad():
    predicted_labels_list = []
    targets_list = []
    for input_ids, attention_mask, section, publication, X_2, y in dataloader:
      input_ids, attention_mask, section, publication, X_2, y = input_ids.to(device), attention_mask.to(device), section.to(device), publication.to(device), X_2.to(device), y.to(device)
      pred = model(input_ids, attention_mask, section, publication, X_2)
      loss = loss_function(pred, y)

      # Save to lists
      loss_record.append(loss.detach().item())
    mean_train_loss = sum(loss_record)/len(loss_record)

  print(name, mean_train_loss)

print('My Model')
evaluate(con_train_loader, 'Train Loss: ')
evaluate(con_val_loader, 'Val Loss:   ')
evaluate(con_test_loader, 'Test Loss:  ')

# MSE
train_loss = (y_train**2).mean()
val_loss = (y_val**2).mean()
test_loss = (y_test**2).mean()
print('MSE')
print('Train Loss: ', train_loss)
print('Val Loss:   ', val_loss)
print('Test Loss:  ', test_loss)

#MAE
train_loss = np.abs(y_train).mean()
val_loss = np.abs(y_val).mean()
test_loss = np.abs(y_test).mean()
print("MAE")
print('Train Loss: ', train_loss)
print('Val Loss:   ', val_loss)
print('Test Loss:  ', test_loss)

My Model


  return F.mse_loss(input, target, reduction=self.reduction)


Train Loss:  0.00017137169543935272
Val Loss:    0.0004469222280379566
Test Loss:   0.00010382995561291009
MSE
Train Loss:  0.0001713896567262344
Val Loss:    0.00044773663293293844
Test Loss:   0.0001056560529369335
MAE
Train Loss:  0.00937602729399374
Val Loss:    0.015078638429645502
Test Loss:   0.007678729042802529
