In [None]:
!pip install pandas numpy xgboost scikit-learn finlab openfe

In [None]:
# `data` is finlab's data-access module; import it here so the cell runs on a
# fresh kernel instead of relying on state left over from another session.
from finlab import data

# NOTE(review): the dataset key 股價淨值比 translates to "price-to-book ratio",
# while the variable name suggests book-to-price. Confirm which one the
# downstream features expect before using it.
BP_ratio = data.get("price_earning_ratio:股價淨值比")

# Inputs for return on equity: recurring after-tax net income and total
# shareholders' equity (per the dataset keys).
net_income = data.get('fundamental_features:經常稅後淨利')
holder_equity = data.get('financial_statement:股東權益總額')
roe = net_income/holder_equity
# ... add all other data-loading code here ...

In [None]:
def above_ma_rank(n, ma_window=5):
    """Rank columns by how often `close` was above its rolling mean recently.

    Reads the module-level `close` frame. For every column, counts the rows in
    the trailing `n`-row window whose value exceeded the `ma_window`-row
    rolling mean, then converts those counts into percentile ranks across
    columns (axis=1).

    Args:
        n: Length of the trailing window over which "above the mean" rows are
            counted.
        ma_window: Length of the rolling mean that `close` is compared with.
            Defaults to 5, the value that used to be hard-coded, so existing
            single-argument calls behave exactly as before.

    Returns:
        A frame with the same shape as `close` holding percentile ranks;
        rows with fewer than `n` observations are NaN.
    """
    # Comparisons against the NaN warm-up rows of the rolling mean are False,
    # so those rows simply count as "not above".
    is_above = close > close.rolling(ma_window).mean()
    rows_above = is_above.rolling(n).sum()
    return rows_above.rank(axis=1, pct=True)


def avg(n):
    """Rank columns by `close` relative to its `n`-period average.

    Reads the module-level `close` frame, divides it by `close.average(n)` and
    converts the ratio into percentile ranks across columns (axis=1).

    The window used to be hard-coded to 5, which made every `avg<window>`
    feature identical regardless of the requested window; it now honours `n`.

    Args:
        n: Window length passed to `close.average`.
            NOTE(review): `average` is not a plain pandas method; it is
            presumably provided by the frame type returned by finlab.

    Returns:
        A frame of percentile ranks with the same shape as `close`.
    """
    relative_to_average = close / close.average(n)
    return relative_to_average.rank(axis=1, pct=True)

# ... 添加所有其他函數定義 ...

In [None]:
def create_features(functions, windows):
    """Evaluate every feature function at each of its configured windows.

    Args:
        functions: Mapping of feature-name prefix to a callable that accepts a
            single window argument.
        windows: Mapping of the same prefixes to an iterable of window values.
            Prefixes missing from this mapping produce no features.

    Returns:
        Dict keyed by ``f"{prefix}{window}"`` whose values are the results of
        calling the matching function with that window, in iteration order of
        `functions` and then of each window list.
    """
    return {
        f"{prefix}{span}": builder(span)
        for prefix, builder in functions.items()
        for span in windows.get(prefix, [])
    }


# `mlf.combine` is used below; import it in this cell so the notebook runs on
# a fresh kernel (same import style as the `mll` label import).
from finlab.ml import feature as mlf

# Feature functions keyed by name prefix; each takes a window length.
functions = {
    'vol': vol.average,
    'avg': avg,
    # ... add all other functions ...
}

# Window lengths to evaluate for each function (keys must match `functions`).
windows = {
    'vol': [20, 60],
    'avg': [5, 20, 60, 120, 240],
    # ... add all other windows ...
}

# Build the windowed features. The intermediate dict gets its own name so
# that `features` only ever refers to the combined result below.
feature_frames = create_features(functions, windows)

# Features that do not follow the "function x window" pattern.
additional_features = {
    'cap': cap,
    'roe': roe,
    'amt': amt.average(5),
    # ... add all other special features ...
}

feature_frames.update(additional_features)

# Merge every feature with mlf.combine, resampled with the 'W' rule.
features = mlf.combine(feature_frames, resample='W')

In [None]:
from finlab.ml import label as mll

# Labels come from finlab's `excess_over_mean`, built on the feature index
# with resample='4W'.
# NOTE(review): presumably the return in excess of the cross-sectional mean
# over the following 4 weeks. If so, labels of the last training rows reach
# past the split date; confirm whether a gap between train and test is needed.
labels = mll.excess_over_mean(features.index, resample='4W')

# Rows dated before 2021-01-01 are used for training, the rest for testing.
sample_dates = features.index.get_level_values('datetime')
is_train = sample_dates < '2021-01-01'

# Usable rows have every feature present and a known label.
notna = features.notna().all(axis=1) & labels.notna()

train_rows = is_train & notna
test_rows = ~is_train & notna

train_x, train_y = features.loc[train_rows], labels.loc[train_rows]
test_x, test_y = features.loc[test_rows], labels.loc[test_rows]

In [None]:
# OpenFE and transform are used below; import them in this cell so the
# notebook runs on a fresh kernel.
from openfe import OpenFE, transform

n_jobs = 4  # adjust to the number of CPU cores available

# Search for new candidate features on the training rows only. The index is
# dropped for the OpenFE calls and restored afterwards.
ofe = OpenFE()
features2 = ofe.fit(data=train_x.reset_index(drop=True),
                    label=train_y.reset_index(drop=True),
                    n_jobs=n_jobs, verbose=False, n_data_blocks=128, min_candidate_features=100)

# Apply the discovered features to both splits.
# NOTE(review): the transform step runs with n_jobs=1 rather than `n_jobs`;
# kept as is, confirm whether that is deliberate.
train_x2, test_x2 = transform(train_x.reset_index(drop=True),
                              test_x.reset_index(drop=True),
                              features2, n_jobs=1)

# Restore the original row index so the frames align with the labels again.
train_x2.index = train_y.index
test_x2.index = test_y.index

In [None]:
import matplotlib.pyplot as plt

# numpy and tree_to_formula are used below; import them in this cell so the
# notebook runs on a fresh kernel.
import numpy as np
from openfe import tree_to_formula


def information_coefficient(values, target):
    """Pearson correlation between a feature and the label.

    Only rows where both series are present and the feature is finite are
    used. Returns NaN when fewer than two such rows remain or when either
    side is constant, since the correlation is undefined in those cases.
    """
    # Generated features can contain +/-inf; treat them as missing so a single
    # infinite value does not turn the whole coefficient into NaN.
    values = values.replace([np.inf, -np.inf], np.nan)
    valid = values.notna() & target.notna()
    x = values[valid].to_numpy(dtype=float)
    y = target[valid].to_numpy(dtype=float)
    if len(x) < 2 or np.ptp(x) == 0 or np.ptp(y) == 0:
        return np.nan
    return np.corrcoef(x, y)[0, 1]


# Score every OpenFE feature by its IC against the labels. The row mask lives
# inside the helper so the global `notna` mask from the train/test split is
# not overwritten.
new_features = []
for feature in ofe.new_features_list:
    formula = tree_to_formula(feature)
    f_series = feature.calculate(features)
    new_features.append((formula, information_coefficient(f_series, labels)))

# Sort by |IC|, highest first. Undefined (NaN) ICs are pushed to the end,
# because NaN sort keys would otherwise leave the order unspecified.
sorted_features = sorted(
    new_features,
    key=lambda item: -1.0 if np.isnan(item[1]) else abs(item[1]),
    reverse=True,
)

# Show the top 10 features and their IC values.
for formula, ic in sorted_features[:10]:
    print(f"Formula: {formula}, IC: {ic}")

# Visualise |IC| for all generated features.
plt.figure(figsize=(12, 6))
plt.bar(range(len(sorted_features)), [abs(ic) for _, ic in sorted_features])
plt.title('Feature Importance (Absolute IC)')
plt.xlabel('Feature Index')
plt.ylabel('|IC|')
plt.show()

In [None]:
# These names are used below; import them in this cell so the notebook runs
# on a fresh kernel.
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Combine the original features with the OpenFE-generated ones.
# NOTE(review): OpenFE's `transform` appears to return the input columns plus
# the new features, in which case this concat repeats every original column
# and DMatrix rejects the duplicate feature names. Keeping only the first
# occurrence of each column name is a no-op if the frames do not overlap.
train_combined = pd.concat([train_x, train_x2], axis=1)
train_combined = train_combined.loc[:, ~train_combined.columns.duplicated()]
test_combined = pd.concat([test_x, test_x2], axis=1)
test_combined = test_combined.loc[:, ~test_combined.columns.duplicated()]

# Build the XGBoost datasets.
dtrain = xgb.DMatrix(train_combined, label=train_y)
dtest = xgb.DMatrix(test_combined, label=test_y)

# XGBoost hyper-parameters.
params = {
    'max_depth': 6,
    'eta': 0.3,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse'
}

# Train the model; the test set is passed as an evaluation set, so its RMSE
# is reported during boosting.
num_round = 100
bst = xgb.train(params, dtrain, num_round, evals=[(dtest, "Test")])

# Predict on the test set.
preds = bst.predict(dtest)

# Evaluate the model.
rmse = np.sqrt(mean_squared_error(test_y, preds))
print(f"Test RMSE: {rmse}")

# Feature importance by gain, highest first.
importance = bst.get_score(importance_type='gain')
importance = sorted(importance.items(), key=lambda x: x[1], reverse=True)
print("Top 10 important features:")
for feat, score in importance[:10]:
    print(f"{feat}: {score}")

# Visualise the feature importances.
plt.figure(figsize=(12, 6))
plt.bar(range(len(importance)), [score for _, score in importance])
plt.title('XGBoost Feature Importance')
plt.xlabel('Feature Index')
plt.ylabel('Importance Score')
plt.show()

In [None]:
# Analyse the predictions.
from sklearn.metrics import r2_score, mean_absolute_error

# Scatter of predicted against actual labels; the dashed red diagonal spans
# the range of the actual values and marks where prediction equals actual.
pred_fig, pred_ax = plt.subplots(figsize=(12, 6))
pred_ax.scatter(test_y, preds)
actual_min, actual_max = test_y.min(), test_y.max()
pred_ax.plot([actual_min, actual_max], [actual_min, actual_max], 'r--', lw=2)
pred_ax.set_xlabel('Actual Values')
pred_ax.set_ylabel('Predicted Values')
pred_ax.set_title('Actual vs Predicted')
plt.show()

# Compute and print the remaining evaluation metrics.
r2 = r2_score(test_y, preds)
mae = mean_absolute_error(test_y, preds)

print(f"R-squared: {r2}")
print(f"Mean Absolute Error: {mae}")

# Discussion template printed verbatim; fill in the conclusions after a run.
discussion_lines = (
    "結果分析：",
    "1. 模型性能：根據RMSE、R-squared和MAE，我們可以看出...",
    "2. 重要特徵：從特徵重要性分析中，我們可以發現...",
    "3. OpenFE的貢獻：通過比較原始特徵和OpenFE生成的新特徵，我們可以得出...",
    "4. 改進方向：基於以上分析，我們可以考慮以下幾個改進方向：",
    "   a. 調整XGBoost的超參數",
    "   b. 進一步優化特徵工程過程",
    "   c. 嘗試其他模型，如LightGBM或者集成方法",
)
for discussion_line in discussion_lines:
    print(discussion_line)