<a href="https://colab.research.google.com/github/tsengcc2023/Financial-Big-Data-Analysis/blob/main/week_11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

設計一個多模態模型，採用中期融合的方式進行數據整合。

多模態資料來源包括新聞情緒指標 + 股價資料，模型目標針對分類任務（如股價漲跌預測）



新聞+股價資料集：https://www.kaggle.com/competitions/stock-market-prediction-and-sentimental-analysis/data


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 資料預處理

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 加載數據
news_data = pd.read_csv('/content/drive/MyDrive/NCHU/113-1/金融大數據分析/week11_dataset/Combined_News_DJIA(train).csv')  # 新聞數據
stock_data = pd.read_csv('/content/drive/MyDrive/NCHU/113-1/金融大數據分析/week11_dataset/DJIA_table(train).csv')  # 股價數據

# 日期處理
news_data['Date'] = pd.to_datetime(news_data['Date'])
stock_data['Date'] = pd.to_datetime(stock_data['Date'])

# 合併數據集（基於日期）
merged_data = pd.merge(news_data, stock_data, on='Date')

# 文本處理：合併 Top1~Top25 的新聞標題
merged_data['combined_news'] = merged_data.iloc[:, 2:27].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

# 文本向量化（TF-IDF）
vectorizer = TfidfVectorizer(max_features=5000)  # 限制最多5000個特徵
text_features = vectorizer.fit_transform(merged_data['combined_news']).toarray()

# 股價特徵
stock_features = merged_data[['Open', 'High', 'Low', 'Close', 'Volume']].values

# 標籤（漲跌分類）
labels = merged_data['Label'].values

# 時間序列化股價數據
def create_time_series(data, time_steps):
    sequences = []
    for i in range(len(data) - time_steps):
        sequences.append(data[i:i + time_steps])
    return np.array(sequences)

time_steps = 30  # 時間序列窗口大小
stock_time_series = create_time_series(stock_features, time_steps)
labels = labels[time_steps:]
text_features = text_features[time_steps:]

# 訓練測試集切割
X_train_text, X_test_text, X_train_stock, X_test_stock, y_train, y_test = train_test_split(
    text_features, stock_time_series, labels, test_size=0.2, random_state=42)

print(f"文本特徵形狀: {X_train_text.shape}, 時間序列形狀: {X_train_stock.shape}")


  stock_data['Date'] = pd.to_datetime(stock_data['Date'])


文本特徵形狀: (1466, 5000), 時間序列形狀: (1466, 30, 5)


# 模型設計

In [3]:
from tensorflow.keras.layers import Input, Dense, Dropout

def build_text_branch(input_dim):
    input_text = Input(shape=(input_dim,))
    x = Dense(128, activation='relu')(input_text)
    x = Dropout(0.3)(x)
    x = Dense(64, activation='relu')(x)
    return input_text, x

In [4]:
from tensorflow.keras.layers import LSTM

def build_stock_branch(time_steps, feature_dim):
    input_stock = Input(shape=(time_steps, feature_dim))
    x = LSTM(64, return_sequences=False)(input_stock)
    x = Dropout(0.3)(x)
    x = Dense(64, activation='relu')(x)
    return input_stock, x

In [5]:
from tensorflow.keras.layers import Concatenate

def build_multimodal_model(text_dim, time_steps, stock_dim):
    # 文本分支
    input_text, text_features = build_text_branch(text_dim)

    # 股價分支
    input_stock, stock_features = build_stock_branch(time_steps, stock_dim)

    # 中期融合
    combined = Concatenate()([text_features, stock_features])

    # 分類頭部
    x = Dense(128, activation='relu')(combined)
    x = Dropout(0.3)(x)
    output = Dense(1, activation='sigmoid')(x)  # 二元分類（漲跌）

    # 模型
    model = Model(inputs=[input_text, input_stock], outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# 模型訓練

In [7]:
from tensorflow.keras.models import Model
text_dim = X_train_text.shape[1]
stock_dim = X_train_stock.shape[2]
model = build_multimodal_model(text_dim, time_steps, stock_dim)

# 查看模型結構
model.summary()

In [8]:
history = model.fit(
    [X_train_text, X_train_stock],
    y_train,
    validation_data=([X_test_text, X_test_stock], y_test),
    epochs=20,
    batch_size=32
)

Epoch 1/20
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 33ms/step - accuracy: 0.5303 - loss: 0.6943 - val_accuracy: 0.4796 - val_loss: 0.6970
Epoch 2/20
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 36ms/step - accuracy: 0.5071 - loss: 0.6971 - val_accuracy: 0.5014 - val_loss: 0.6939
Epoch 3/20
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 40ms/step - accuracy: 0.6859 - loss: 0.6203 - val_accuracy: 0.5095 - val_loss: 0.8040
Epoch 4/20
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 29ms/step - accuracy: 0.9092 - loss: 0.2734 - val_accuracy: 0.5041 - val_loss: 1.2766
Epoch 5/20
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 73ms/step - accuracy: 0.9885 - loss: 0.0563 - val_accuracy: 0.5095 - val_loss: 1.8120
Epoch 6/20
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 61ms/step - accuracy: 0.9995 - loss: 0.0115 - val_accuracy: 0.5204 - val_loss: 2.1854
Epoch 7/20
[1m46/46[0m [32m━━━━

In [9]:
test_loss, test_accuracy = model.evaluate([X_test_text, X_test_stock], y_test)
print(f"Test Accuracy: {test_accuracy:.2f}")

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.5511 - loss: 3.3007
Test Accuracy: 0.52
