In [None]:
import json
import urllib.request
from datetime import datetime, timezone, timedelta
import time

def fetch_gnews(api_key, query, start_date):
    """Fetch up to 10 English/US GNews articles matching ``query`` for one UTC day.

    Args:
        api_key: GNews API key.
        query: Search keywords. May contain spaces or reserved URL characters;
            every parameter is percent-encoded before the request is sent.
        start_date: Day to fetch, formatted ``YYYY-MM-DD``. The request covers
            midnight UTC of that day up to midnight UTC of the following day.

    Returns:
        ``{start_date: articles}`` where ``articles`` is the ``"articles"`` list
        of the JSON response (an empty list when that key is absent).

    Raises:
        ValueError: If ``start_date`` does not match ``YYYY-MM-DD``.
        urllib.error.URLError: If the HTTP request fails (includes ``HTTPError``).
    """
    from urllib.parse import urlencode  # local import keeps this block self-contained

    day_start = datetime.strptime(start_date, "%Y-%m-%d").replace(tzinfo=timezone.utc)
    day_end = day_start + timedelta(days=1)
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"

    params = {
        "q": query,
        "lang": "en",
        "country": "us",
        "max": 10,
        "from": day_start.strftime(timestamp_format),
        "to": day_end.strftime(timestamp_format),
        "sortby": "relevance",
        "apikey": api_key,
    }
    # urlencode escapes spaces, '&', '=' etc. so the query cannot corrupt the URL;
    # ':' is left readable because it only appears inside the timestamps.
    url = "https://gnews.io/api/v4/search?" + urlencode(params, safe=":")

    # A timeout prevents one stalled request from hanging a multi-day crawl.
    with urllib.request.urlopen(url, timeout=30) as response:
        data = json.loads(response.read().decode("utf-8"))

    return {start_date: data.get("articles", [])}

def fetch_news_over_time(api_key, query, start_date, end_date, output_file="gnews_master_nvd.json"):
    """Crawl one day of news at a time, walking backwards from ``start_date`` to ``end_date``.

    Results are cached in ``output_file`` as ``{"YYYY-MM-DD": [articles...]}``.
    Days already present in that file are skipped, and the file is rewritten
    after every fetched day, so an interrupted crawl can be resumed.

    Args:
        api_key: GNews API key, passed through to ``fetch_gnews``.
        query: Search keywords, passed through to ``fetch_gnews``.
        start_date: Most recent day to fetch (``YYYY-MM-DD``); the loop starts here.
        end_date: Oldest day to fetch (``YYYY-MM-DD``), inclusive. If it is later
            than ``start_date`` nothing is fetched.
        output_file: Path of the JSON cache to read and update.
    """
    current_date = datetime.strptime(start_date, "%Y-%m-%d")
    end_date = datetime.strptime(end_date, "%Y-%m-%d")

    # A missing or unreadable cache simply means starting from scratch.
    try:
        with open(output_file, "r", encoding="utf-8") as f:
            master_data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        master_data = {}

    while current_date >= end_date:
        date_str = current_date.strftime("%Y-%m-%d")
        if date_str not in master_data:
            print(f"Fetching news for {date_str}...")
            master_data.update(fetch_gnews(api_key, query, date_str))

            # Persist after every day so progress survives a crash or quota error.
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(master_data, f, indent=4)

            time.sleep(1)  # short pause between requests so the API does not start billing
        else:
            print(f"Skipping {date_str}, already fetched.")

        current_date -= timedelta(days=1)

    # Report the file that was actually written, not a fixed name.
    print(f"saved: {output_file}")


import getpass
import os

# Register for your own GNews API key; the free tier only lets you crawl about
# 100 days of news per day. The key is read from the GNEWS_API_KEY environment
# variable (or prompted for) so it is never stored in the notebook.
# NOTE: the key that used to be hard-coded here should be treated as leaked and rotated.
api_key = os.environ.get("GNEWS_API_KEY") or getpass.getpass("GNews API key: ")
query = "NVIDIA"  # search keyword for the news query
start_date = "2024-12-31"  # most recent day to fetch; keep the whole span within 100 days
end_date = "2024-09-23"  # oldest day to fetch; ideally cover 2024-08-01 to 2025-04-08 overall (the range used for the Apple stock)

fetch_news_over_time(api_key, query, start_date, end_date)

In [1]:
import json

# Merge two crawled batches into a single file. In principle at least ~200 days
# of news are needed, because the data is later split into training and test sets.
with open('gnews_master_nvd0101-0408.json', 'r', encoding='utf-8') as f1, open('gnews_master_nvd20240923-20241231.json', 'r', encoding='utf-8') as f2:
    data1, data2 = json.load(f1), json.load(f2)

# Every date seen in either batch; articles for a shared date are concatenated
# with the first batch's articles coming first.
all_dates = data1.keys() | data2.keys()
merged = {day: data1.get(day, []) + data2.get(day, []) for day in all_dates}

# Newest date first (ISO date strings sort chronologically).
merged_sorted = {day: merged[day] for day in sorted(merged, reverse=True)}

with open('merged.json', 'w', encoding='utf-8') as f_out:
    f_out.write(json.dumps(merged_sorted, ensure_ascii=False, indent=2))



In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn.functional import softmax
import json

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Use the GPU when one is available. Without this explicit device selection the
# code below ran on the CPU by default, which is rather slow; a few places further
# down were adjusted accordingly.

def load_finbert(model_name="yiyanghkust/finbert-tone"):
    """Load the FinBERT tokenizer and sequence-classification model.

    Args:
        model_name: Identifier passed to ``from_pretrained``. Defaults to the
            Hugging Face checkpoint ``yiyanghkust/finbert-tone``; if downloading is
            slow or you work offline, download the model first and pass its local
            directory here instead.

    Returns:
        ``(tokenizer, model)`` with the model moved to the module-level ``device``
        and switched to evaluation mode.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_name).to(device)
    model.eval()  # the model is only used for inference, so keep dropout disabled
    return tokenizer, model

def get_sentiment(text, tokenizer, model):
    """Score one text as P(positive) - P(negative), a value in [-1, 1].

    The class indices are looked up in ``model.config.id2label`` instead of being
    hard-coded. For ``yiyanghkust/finbert-tone`` the order is 0 = neutral,
    1 = positive, 2 = negative, so the previous ``scores[2] - scores[0]`` actually
    returned P(negative) - P(neutral). If the model exposes no usable label names,
    the finbert-tone order (positive = 1, negative = 2) is assumed.

    Args:
        text: The text to classify (truncated to 512 tokens).
        tokenizer: Callable returning a mapping of PyTorch tensors for ``text``.
        model: Sequence-classification model whose output has a ``logits`` attribute.

    Returns:
        The sentiment score as a Python ``float``.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Send the inputs to wherever the model lives rather than relying on a global.
    target_device = getattr(model, "device", None)
    if target_device is None:
        target_device = next(model.parameters()).device
    inputs = {k: v.to(target_device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits
    probs = softmax(logits, dim=1).cpu().numpy()[0]

    id2label = getattr(getattr(model, "config", None), "id2label", None) or {}
    index_of = {str(label).lower(): int(idx) for idx, label in id2label.items()}
    positive_idx = index_of.get("positive", 1)
    negative_idx = index_of.get("negative", 2)
    return float(probs[positive_idx] - probs[negative_idx])

#这里在用Finbert给新闻打分，它是每天所有的新闻放在一起算个平均分，花费的时间看你新闻的量

def process_sentiment(news_data, tokenizer, model):
    """Compute one average sentiment score per day from article headlines.

    Args:
        news_data: Mapping of date string to a list of article dicts.
        tokenizer: Tokenizer forwarded to ``get_sentiment``.
        model: Model forwarded to ``get_sentiment``.

    Returns:
        A list of ``{'date': ..., 'sentiment_score': ...}`` dicts in the iteration
        order of ``news_data``. A day without articles gets a score of 0.0.
    """
    def _daily_mean(day_articles):
        # Only the headline is scored; description and content are not used.
        title_scores = [
            get_sentiment(item.get("title", ""), tokenizer, model)
            for item in day_articles
        ]
        return np.mean(title_scores) if title_scores else 0.0

    daily_rows = []
    for day, day_articles in news_data.items():
        print(day, end = "\r")  # in-place progress indicator
        daily_rows.append({'date': day, 'sentiment_score': _daily_mean(day_articles)})
    return daily_rows

# JSON text is UTF-8; set the encoding explicitly so headlines with non-ASCII
# characters load identically on every platform (the default is locale-dependent).
# NOTE(review): earlier cells write 'gnews_master_nvd.json' / 'merged.json' -
# point this at whichever crawl file you actually produced.
with open('gnews_master.json', 'r', encoding='utf-8') as f:
    news_data = json.load(f)

tokenizer, finbert_model = load_finbert()

# One averaged headline-sentiment score per day.
sentiment_scores = process_sentiment(news_data, tokenizer, finbert_model)

sentiment_df = pd.DataFrame(sentiment_scores)
sentiment_df.to_csv('sentiment_scores_title.csv', index=False)

print("saved: sentiment_scores_title.csv")

In [3]:
!pip install alpha_vantage

Collecting alpha_vantage
  Downloading alpha_vantage-3.0.0-py3-none-any.whl.metadata (12 kB)
Downloading alpha_vantage-3.0.0-py3-none-any.whl (35 kB)
Installing collected packages: alpha_vantage
Successfully installed alpha_vantage-3.0.0


In [3]:
from alpha_vantage.timeseries import TimeSeries
import pandas as pd
import os

import getpass

# The Alpha Vantage key is read from the ALPHAVANTAGE_API_KEY environment variable
# (or prompted for) so it is never stored in the notebook. This API did not seem
# to enforce a usage limit in practice.
# NOTE: the key that used to be hard-coded here should be treated as leaked and rotated.
API_KEY = os.environ.get("ALPHAVANTAGE_API_KEY") or getpass.getpass("Alpha Vantage API key: ")

ts = TimeSeries(key=API_KEY, output_format='pandas')


ticker = "NVDA"  # US ticker symbol of the company to download (e.g. AAPL is Apple); look up the one you need
data, meta_data = ts.get_daily(symbol=ticker, outputsize="full")

data.to_csv(f"{ticker}_historical.csv")

print("saved:", data.head())

saved:             1. open   2. high    3. low  4. close    5. volume
date                                                          
2025-04-11  108.500  111.5499  107.4800    110.93  313417265.0
2025-04-10  109.370  110.8600   99.1500    107.57  437812353.0
2025-04-09   98.890  115.1000   97.5301    114.33  612918336.0
2025-04-08  103.805  105.8500   94.4600     96.30  476243392.0
2025-04-07   87.460  101.7500   86.6200     97.64  611041347.0


In [5]:
import pandas as pd

# df = pd.read_csv("AAPL_historical.csv")

# df["date"] = pd.to_datetime(df["date"])

# df = df.sort_values("date")

# df["trend"] = df["4. close"].diff().apply(lambda x: "increase" if x > 0 else ("decrease" if x < 0 else "stable"))

# df[["date", "4. close", "trend"]].to_csv("AAPL_trend.csv", index=False)

#上面这一段是用绝对值的大小判断涨跌，下面这一段会以0.7%为界限判断稳定，应该是下面这种合理，但是测下来这个趋势判断对预测价格的训练没什么用
#里面用到的文件名称自己改

# Daily moves smaller than +/-0.7% are labelled "stable".
threshold = 0.007


def _label_move(change):
    """Map a fractional daily change to a trend label.

    NaN (the first row has no previous close) fails both comparisons and is
    therefore labelled "stable".
    """
    if change > threshold:
        return "increase"
    if change < -threshold:
        return "decrease"
    return "stable"


df = (
    pd.read_csv("NVDA_historical.csv")
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .sort_values("date")  # oldest first so pct_change compares with the previous row
    .assign(pct_change=lambda frame: frame["4. close"].pct_change())
    .assign(trend=lambda frame: frame["pct_change"].map(_label_move))
    .rename(columns={"4. close": "closingValue"})
)

df[["date", "closingValue", "trend"]].to_csv("NVDA_trend2.csv", index=False)

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
import wandb

random_state = 42

# Seed every random number generator used below so runs are repeatable.
for seed_fn in (np.random.seed, random.seed, torch.manual_seed):
    seed_fn(random_state)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_state)

stock_data = pd.read_csv("NVDA_trend2.csv", parse_dates=["date"])
sentiment_data = pd.read_csv("sentiment_scores_title.csv", parse_dates=["date"])

# Attach each day's sentiment to its price row, order chronologically, and keep
# only rows that have both a closing price and a sentiment score.
merged_data = (
    stock_data.merge(sentiment_data, on="date", how="left")
    .sort_values("date")
    .dropna(subset=["closingValue", "sentiment_score"])
    .assign(closingValue=lambda frame: frame["closingValue"].astype(float))
)


class StockPriceLSTM(nn.Module):
    """LSTM followed by a linear layer that emits one value per input sequence.

    The hidden size and layer count are constructor arguments rather than fixed
    values so that the wandb sweep below can search for a good configuration.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def _zero_state(self, batch_size, device):
        """Return an all-zero state tensor of shape (num_layers, batch, hidden)."""
        shape = (self.num_layers, batch_size, self.hidden_size)
        return torch.zeros(shape, device=device)

    def forward(self, x):
        """Map a batch-first input ``x`` to a ``(batch, 1)`` prediction.

        The LSTM starts from zero hidden and cell states, and only the output of
        the last time step is passed to the linear layer.
        """
        batch_size = x.size(0)
        initial_state = (
            self._zero_state(batch_size, x.device),
            self._zero_state(batch_size, x.device),
        )
        sequence_out, _ = self.lstm(x, initial_state)
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

X_days = 7  # number of previous closing prices used to predict the next day; try other values to see if results improve
Y_days = 1  # number of most recent sentiment scores appended to each feature vector
# cutoff_date separates the training period from the evaluation period. The
# prediction code uses matching dates; do not change them, so that different
# stocks are all evaluated on the same period.
cutoff_date = pd.to_datetime("2025-03-01")
train_data = merged_data[merged_data["date"] < cutoff_date].copy()
closing_values = train_data["closingValue"].values
sentiment_scores = train_data["sentiment_score"].values

# Each sample is [X_days closing prices..., Y_days sentiment scores...] taken from
# the rows before row i; the label is the closing price of row i.
features, labels = [], []
for i in range(max(X_days, Y_days), len(train_data)):
    stock_features = closing_values[i - X_days:i]
    sentiment_feature = sentiment_scores[i - Y_days:i]
    if np.any(np.isnan(stock_features)) or np.any(np.isnan(sentiment_feature)) or np.isnan(closing_values[i]):
        continue
    feature_vector = np.concatenate([stock_features, sentiment_feature])
    features.append(feature_vector)
    labels.append(closing_values[i])

features = np.array(features)
labels = np.array(labels)

if len(features) == 0:
    raise ValueError(
        "No training samples could be built: need more than "
        f"{max(X_days, Y_days)} rows before {cutoff_date.date()}, got {len(train_data)}."
    )

# Split BEFORE scaling, then fit the scalers on the training part only, so the
# hold-out samples do not leak their mean/variance into the training features.
# The split indices are the same as before (same sample count and random_state).
# NOTE(review): train_test_split shuffles by default; windows of neighbouring days
# overlap, so this hold-out is optimistic for a time series. Consider shuffle=False.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    features, labels, test_size=0.2, random_state=random_state
)

scaler_X = StandardScaler()
scaler_y = StandardScaler()
X_train = scaler_X.fit_transform(X_train_raw)
X_test = scaler_X.transform(X_test_raw)
y_train = scaler_y.fit_transform(y_train_raw.reshape(-1, 1)).flatten()
y_test = scaler_y.transform(y_test_raw.reshape(-1, 1)).flatten()

# Kept for compatibility with code that expects the fully scaled arrays.
features_scaled = scaler_X.transform(features)
labels_scaled = scaler_y.transform(labels.reshape(-1, 1)).flatten()

# The LSTM reads each feature vector as a sequence of scalar time steps:
# (samples, X_days + Y_days, 1).
X_train_seq = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test_seq = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_seq, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# To use wandb's automatic hyper-parameter search, register an account and get an API key first.
search_space = {
    "hidden_size": {"values": [32, 64]},
    "num_layers": {"values": [1, 2, 3]},
    "lr": {"values": [0.001, 0.0005]},
    "num_epochs": {"value": 3000},
}
sweep_config = {
    "method": "bayes",
    "metric": {"name": "test_mse", "goal": "minimize"},
    "parameters": search_space,
}

# Change the project name to your own; the link wandb prints can be opened to follow training.
sweep_id = wandb.sweep(sweep_config, project="NVDA_stock_forecast")


def train():
    """Run one wandb sweep trial with the hyper-parameters in ``wandb.config``.

    Trains a ``StockPriceLSTM`` full-batch on the training tensors, logs the
    hold-out MSE every 100 epochs, then saves the final-epoch weights and both
    scalers to disk.
    """
    wandb.init()
    config = wandb.config
    model = StockPriceLSTM(1, config.hidden_size, config.num_layers).to(device)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)

    # Move all tensors to the training device once, outside the loop.
    X_train_dev, y_train_dev, X_test_dev, y_test_dev = (
        tensor.to(device)
        for tensor in (X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor)
    )

    for epoch in range(config.num_epochs):
        model.train()
        loss = criterion(model(X_train_dev), y_train_dev)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Evaluate only on every 100th epoch.
        if (epoch + 1) % 100 != 0:
            continue

        model.eval()
        with torch.no_grad():
            val_loss = criterion(model(X_test_dev), y_test_dev)

        wandb.log({
            "epoch": epoch + 1,
            "train_loss": loss.item(),
            "test_mse": val_loss.item(),
            "hidden_size": config.hidden_size,
            "num_layers": config.num_layers,
            "lr": config.lr
        })
        print(f"Epoch {epoch+1}: Test MSE = {val_loss.item():.4f}")

    # The checkpoint name encodes the hyper-parameters of this trial.
    torch.save(model.state_dict(), f"best_model_h{config.hidden_size}_l{config.num_layers}_lr{config.lr}.pth")
    joblib.dump(scaler_X, "scaler_X.save")
    joblib.dump(scaler_y, "scaler_y.save")
    wandb.finish()

wandb.agent(sweep_id, train, count=10)  # count is the number of sweep trials to run; adjust as needed (5 or 10 are both fine)


In [7]:
#这里开始使用刚才跑出来的模型做预测
def predict_future(model_path, hidden_size=None, num_layers=None,
                   start_date="2025-03-01", end_date="2025-04-08",
                   output_path="future_predicted_NVDA.csv"):
    """Predict one closing price per calendar day in a date range and save them as CSV.

    For each day, the most recent ``max(X_days, Y_days)`` rows of ``merged_data``
    dated strictly before that day form the input window; days without enough
    history are skipped.

    Args:
        model_path: Path to a ``StockPriceLSTM`` state dict saved by ``train``.
        hidden_size: LSTM hidden size. ``None`` (default) reads it from the
            checkpoint, so the architecture no longer has to be edited by hand.
        num_layers: Number of LSTM layers. ``None`` (default) reads it from the
            checkpoint.
        start_date: First day to predict. Keep the default so all stocks are
            evaluated on the same period.
        end_date: Last day to predict (inclusive).
        output_path: CSV file to write, with columns ``date`` and
            ``predicted_closingValue``.
    """
    state_dict = torch.load(model_path, map_location=device)
    if hidden_size is None:
        # fc is Linear(hidden_size, 1), so its weight has shape (1, hidden_size).
        hidden_size = state_dict["fc.weight"].shape[1]
    if num_layers is None:
        # A unidirectional nn.LSTM stores one weight_ih_l{k} tensor per layer.
        num_layers = sum(1 for key in state_dict if key.startswith("lstm.weight_ih_l"))

    model = StockPriceLSTM(1, hidden_size, num_layers).to(device)
    model.load_state_dict(state_dict)
    model.eval()

    scaler_X = joblib.load("scaler_X.save")
    scaler_y = joblib.load("scaler_y.save")

    window = max(X_days, Y_days)
    full_data = merged_data.set_index("date")
    # date_range yields every calendar day; a day with no new row in the data
    # is predicted from the latest available window.
    predict_dates = pd.date_range(start=pd.to_datetime(start_date), end=pd.to_datetime(end_date))

    predictions = []
    for current_date in predict_dates:
        past_window = full_data.loc[:current_date - pd.Timedelta(days=1)].tail(window)
        if len(past_window) < window:
            continue  # not enough history yet

        # Same layout as in training: X_days closes followed by Y_days sentiment scores.
        closing_seq = past_window["closingValue"].values[-X_days:]
        sentiment_seq = past_window["sentiment_score"].values[-Y_days:]

        input_vector = np.concatenate([closing_seq, sentiment_seq]).reshape(1, -1)
        input_scaled = scaler_X.transform(input_vector).reshape(1, -1, 1)
        input_tensor = torch.tensor(input_scaled, dtype=torch.float32).to(device)

        with torch.no_grad():
            predicted_scaled = model(input_tensor).cpu().item()
        # Undo the label scaling to get a price in the original units.
        predicted_real = scaler_y.inverse_transform([[predicted_scaled]])[0][0]
        predictions.append((current_date.strftime("%Y-%m-%d"), predicted_real))

    pred_df = pd.DataFrame(predictions, columns=["date", "predicted_closingValue"])
    pred_df.to_csv(output_path, index=False)
    print(f"saved: {output_path}")

if __name__ == "__main__":
    # Adjust this path to the checkpoint of the sweep run with the lowest loss.
    best_model_path = "best_model_h64_l2_lr0.001.pth"
    predict_future(model_path=best_model_path)

saved: future_predicted_NVDA.csv


In [8]:
!pip install pyecharts

Collecting pyecharts
  Downloading pyecharts-2.0.8-py3-none-any.whl.metadata (1.6 kB)
Downloading pyecharts-2.0.8-py3-none-any.whl (153 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/153.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.7/153.7 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyecharts
Successfully installed pyecharts-2.0.8


In [9]:
#这里开始做数据可视化
from pyecharts.charts import Line
from pyecharts import options as opts
import pandas as pd

# Load the predictions and keep only the real closing prices for the predicted dates.
pred_df = pd.read_csv("future_predicted_NVDA.csv", parse_dates=["date"])
real_df = pd.read_csv("NVDA_trend2.csv", parse_dates=["date"])
in_forecast_window = real_df["date"].isin(pred_df["date"])
real_df = real_df.loc[in_forecast_window, ["date", "closingValue"]]

df = pred_df.merge(real_df, on="date")
df["date"] = df["date"].dt.strftime("%Y-%m-%d")

dates = df["date"].tolist()
actual_prices = df["closingValue"].round(2).tolist()
predicted_prices = df["predicted_closingValue"].round(2).tolist()

# Solid line: actual price. Dashed line: predicted price.
line = Line()
line.add_xaxis(dates)
line.add_yaxis("真实股价", actual_prices, is_smooth=True)
line.add_yaxis(
    "预测股价",
    predicted_prices,
    is_smooth=True,
    linestyle_opts=opts.LineStyleOpts(type_="dashed"),
)
line.set_global_opts(
    # Only change the ticker in this title and in the file names above; leave the rest alone.
    title_opts=opts.TitleOpts(title="NVDA 预测"),
    tooltip_opts=opts.TooltipOpts(trigger="axis"),
    xaxis_opts=opts.AxisOpts(type_="category", axislabel_opts=opts.LabelOpts(rotate=45)),
    yaxis_opts=opts.AxisOpts(name="收盘价（USD）"),
    datazoom_opts=[opts.DataZoomOpts(), opts.DataZoomOpts(type_="inside")],
    legend_opts=opts.LegendOpts(pos_top="5%"),
)

line.render("NVDA_prediction_chart.html")
print("已保存 NVDA_prediction_chart.html")


已保存 NVDA_prediction_chart.html
