<a href="https://colab.research.google.com/github/yuann403/financial/blob/main/week11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

* 設計一個多模態模型，採用**(a)早期融合**、(b)晚期融合或(c)中期融合的方式進行數據整合（擇一實現）。

* 多模態資料來源可包括以下組合：
    **新聞情緒指標 + 股價資料**

* 模型目標可針對**分類任務**（如股價漲跌預測）

新聞+股價資料集：https://www.kaggle.com/competitions/stock-market-prediction-and-sentimental-analysis/overview

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.optim.lr_scheduler import StepLR
from sklearn.metrics import classification_report

In [2]:
# 加載數據
news_data = pd.read_csv('/content/Combined_News_DJIA(train).csv')
stock_data = pd.read_csv('/content/DJIA_table(train).csv')

# # 查看數據結構
# print("新聞數據：")
# print(news_data.head())
# print(news_data.info())

# print("股價數據：")
# print(stock_data.head())
# print(stock_data.info())

In [3]:
# 假設需要合併的欄位是 'Top1', 'Top2', 'Top3', ..., 'TopN'
news_columns = [col for col in news_data.columns if 'Top' in col]  # 自動檢索包含 'Top' 的列名
print("需要合併的欄位：", news_columns)

# 合併欄位
news_data['News'] = news_data[news_columns].fillna('').agg(' '.join, axis=1)

# 查看新的數據結構
print(news_data[['News']].head())

需要合併的欄位： ['Top1', 'Top2', 'Top3', 'Top4', 'Top5', 'Top6', 'Top7', 'Top8', 'Top9', 'Top10', 'Top11', 'Top12', 'Top13', 'Top14', 'Top15', 'Top16', 'Top17', 'Top18', 'Top19', 'Top20', 'Top21', 'Top22', 'Top23', 'Top24', 'Top25']
                                                News
0  b"Georgia 'downs two Russian warplanes' as cou...
1  b'Why wont America and Nato help us? If they w...
2  b'Remember that adorable 9-year-old who sang a...
3  b' U.S. refuses Israel weapons to attack Iran:...
4  b'All the experts admit that we should legalis...


In [4]:
# 添加移動平均線
stock_data['SMA_5'] = stock_data['Close'].rolling(window=5).mean()
stock_data['SMA_10'] = stock_data['Close'].rolling(window=10).mean()

# 計算 RSI
def calculate_rsi(data, window=14):
    delta = data['Close'].diff(1)
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

stock_data['RSI'] = calculate_rsi(stock_data)

In [5]:
# 統一日期格式
news_data['Date'] = pd.to_datetime(news_data['Date'])
stock_data['Date'] = pd.to_datetime(stock_data['Date'])

# 處理空值和重複值
news_data = news_data.drop_duplicates(subset='Date').dropna(subset=['Date'])
stock_data = stock_data.drop_duplicates(subset='Date').dropna(subset=['Date'])

# 過濾日期範圍的交集
common_dates = set(news_data['Date']).intersection(set(stock_data['Date']))
news_data = news_data[news_data['Date'].isin(common_dates)]
stock_data = stock_data[stock_data['Date'].isin(common_dates)]

# 合併數據
merged_data = pd.merge(news_data, stock_data, on='Date', how='inner')

  stock_data['Date'] = pd.to_datetime(stock_data['Date'])


In [6]:
# 確認需要刪除的欄位
columns_to_drop = [col for col in merged_data.columns if col.startswith('Top')]

# 刪除欄位
merged_data = merged_data.drop(columns=columns_to_drop, axis=1)

# 查看刪除後的數據
print("刪除後的數據:")
print(merged_data.columns)

刪除後的數據:
Index(['Date', 'Label', 'News', 'Open', 'High', 'Low', 'Close', 'Volume',
       'Adj Close', 'SMA_5', 'SMA_10', 'RSI'],
      dtype='object')


In [7]:
# 提取特徵與標籤
emotion_features = ['News']  # 假設新聞文本列名為 'News'
stock_features = ['Close','RSI']  # 假設股價數據列
label_column = 'Label'  # 假設分類標籤列名為 'Label'

# 確認數據是否完整
data = merged_data.dropna(subset=emotion_features + stock_features + [label_column])

# 提取特徵與標籤
news_texts = data[emotion_features[0]].values  # 新聞文本
stock_data = data[stock_features].values  # 股價數據
labels = data[label_column].values  # 分類標籤

In [8]:
# 標準化股價數據
scaler = StandardScaler()
scaled_stock_data = scaler.fit_transform(stock_data)

In [9]:
# 確保數據一致
min_length = min(len(news_texts), len(scaled_stock_data), len(labels))

news_texts = news_texts[:min_length]
scaled_stock_data = scaled_stock_data[:min_length]
labels = labels[:min_length]

In [10]:
# 加載 BERT 分詞器
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# 分詞新聞文本
def tokenize_news(texts):
    return tokenizer(list(texts), truncation=True, padding=True, max_length=128, return_tensors="pt")

tokenized_news = tokenize_news(news_texts)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [11]:
print(f"tokenized_news size: {len(tokenized_news['input_ids']) if 'input_ids' in tokenized_news else len(tokenized_news)}")

tokenized_news size: 1850


In [12]:
# 確保 tokenized_news 的樣本數一致
print(f"Tokenized news samples: {len(tokenized_news['input_ids'])}")
print(f"Stock data samples: {len(scaled_stock_data)}")
print(f"Labels samples: {len(labels)}")

Tokenized news samples: 1850
Stock data samples: 1850
Labels samples: 1850


In [13]:
input_ids = tokenized_news['input_ids']
attention_mask = tokenized_news['attention_mask']

In [14]:
# 分割數據集
X_train_ids, X_test_ids, X_train_mask, X_test_mask, X_train_stock, X_test_stock, y_train, y_test = train_test_split(
    input_ids, attention_mask, scaled_stock_data, labels, test_size=0.2, random_state=42
)

In [15]:
class MidFusionModel(nn.Module):
    def __init__(self, bert_model_name, stock_feature_dim, hidden_dim, num_classes):
        super(MidFusionModel, self).__init__()
        # BERT 模型
        self.bert = BertModel.from_pretrained(bert_model_name)
        # 股價特徵處理
        self.stock_fc = nn.Sequential(
            nn.Linear(stock_feature_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        # 結合後的分類器
        self.classifier = nn.Sequential(
            nn.Linear(768 + hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, input_ids, attention_mask, stock_features):
        # BERT 特徵提取
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_features = bert_outputs.pooler_output

        # 股價特徵提取
        stock_features = self.stock_fc(stock_features)

        # 拼接特徵
        combined_features = torch.cat((text_features, stock_features), dim=1)
        output = self.classifier(combined_features)
        return output

In [16]:
# 構建 TensorDataset
train_dataset = TensorDataset(
    torch.tensor(X_train_ids, dtype=torch.long),
    torch.tensor(X_train_mask, dtype=torch.long),
    torch.tensor(X_train_stock, dtype=torch.float32),
    torch.tensor(y_train, dtype=torch.long)
)

test_dataset = TensorDataset(
    torch.tensor(X_test_ids, dtype=torch.long),
    torch.tensor(X_test_mask, dtype=torch.long),
    torch.tensor(X_test_stock, dtype=torch.float32),
    torch.tensor(y_test, dtype=torch.long)
)

# 創建 DataLoader
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

  torch.tensor(X_train_ids, dtype=torch.long),
  torch.tensor(X_train_mask, dtype=torch.long),
  torch.tensor(X_test_ids, dtype=torch.long),
  torch.tensor(X_test_mask, dtype=torch.long),


In [17]:
# 初始化模型
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MidFusionModel("bert-base-uncased", stock_feature_dim=X_train_stock.shape[1], hidden_dim=128, num_classes=2)
model.to(device)

# 訓練過程
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scheduler = StepLR(optimizer, step_size=5, gamma=0.5)  # 每5個epoch將學習率減半

epochs = 20
for epoch in range(epochs):
  model.train()
  total_loss = 0
  for input_ids, attention_mask, stock_features, labels in train_loader:
    input_ids, attention_mask, stock_features, labels = (
        input_ids.to(device), attention_mask.to(device), stock_features.to(device), labels.to(device)
    )
    outputs = model(input_ids, attention_mask, stock_features)
    loss = criterion(outputs, labels)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    total_loss += loss.item()
  scheduler.step()  # 調整學習率
  print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

Epoch 1/20, Loss: 32.8632
Epoch 2/20, Loss: 32.9233
Epoch 3/20, Loss: 32.7194
Epoch 4/20, Loss: 32.6973
Epoch 5/20, Loss: 32.7677
Epoch 6/20, Loss: 32.7089
Epoch 7/20, Loss: 32.7026
Epoch 8/20, Loss: 32.6130
Epoch 9/20, Loss: 32.6122
Epoch 10/20, Loss: 32.5617
Epoch 11/20, Loss: 32.5570
Epoch 12/20, Loss: 32.5387
Epoch 13/20, Loss: 32.5325
Epoch 14/20, Loss: 32.5015
Epoch 15/20, Loss: 32.5647
Epoch 16/20, Loss: 32.5407
Epoch 17/20, Loss: 32.5021
Epoch 18/20, Loss: 32.5392
Epoch 19/20, Loss: 32.5187
Epoch 20/20, Loss: 32.4878


In [18]:
# 測試模型
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for input_ids, attention_mask, stock_features, labels in test_loader:
        input_ids, attention_mask, stock_features, labels = (
            input_ids.to(device), attention_mask.to(device), stock_features.to(device), labels.to(device)
        )
        outputs = model(input_ids, attention_mask, stock_features)
        preds = torch.argmax(outputs, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# 評估
from sklearn.metrics import classification_report
print(classification_report(all_labels, all_preds, target_names=["Down", "Up"]))


              precision    recall  f1-score   support

        Down       0.00      0.00      0.00       156
          Up       0.58      1.00      0.73       214

    accuracy                           0.58       370
   macro avg       0.29      0.50      0.37       370
weighted avg       0.33      0.58      0.42       370



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
from sklearn.metrics import accuracy_score

# 計算準確度
accuracy = accuracy_score(all_labels, all_preds)
print(f"模型準確度: {accuracy:.2f}")


模型準確度: 0.58
