In [188]:
import pandas as pd
import tqdm
import datetime

from langchain_core import outputs

df = pd.read_csv("./daily/stock_2024-12-23.csv")
cdf = pd.read_csv("stock_today.csv")
df = pd.concat([df,cdf])
df.dropna(inplace=True)
df.head()

Unnamed: 0,date,open,high,low,close,volume,amount,code,name
0,2024-11-18,27.99,28.25,27.15,27.32,3599179.0,99040517.0,sz301071,力量钻石
1,2024-11-19,27.35,28.06,27.17,28.05,2994500.0,82818869.0,sz301071,力量钻石
2,2024-11-20,27.92,28.31,27.8,28.2,2931460.0,82266534.0,sz301071,力量钻石
3,2024-11-21,28.18,28.33,27.5,27.82,3080719.0,86109404.0,sz301071,力量钻石
4,2024-11-22,27.8,27.95,26.66,26.66,3874720.0,105882085.0,sz301071,力量钻石


In [189]:
from sklearn.preprocessing import  KBinsDiscretizer
kbins = KBinsDiscretizer(n_bins=10,strategy='kmeans',subsample=100000)
kbins.fit(df["volume"].dropna().values.reshape(-1,1))
bin_edges = kbins.bin_edges_[0]


class ChangeVocab(object):

    def __init__(self,gap=20,buff=0.1):
        super().__init__()
        self.gap = gap
        self.buff = buff
        self.dim = 2*self.gap+1

    def __call__(self,v):
        if v<-self.gap or abs(v+self.gap)<self.buff:
            return 0
        elif v>self.gap or abs(v-self.gap)<self.buff:
            return self.gap+self.gap
        else:
            return int(round(v))+self.gap

class AmountVocab(object):
    def __init__(self,bin_edges=bin_edges):

        super().__init__()
        self.bin_edges = bin_edges
        self.dim = len(self.bin_edges)+1

    def __call__(self,v):
        for idx, e in enumerate(self.bin_edges):
            if v<e:
                return  idx
        return self.dim-1
cdate = "2024-12-24"
avocab = AmountVocab()
cvocab,pvocab = ChangeVocab(gap=5),ChangeVocab(gap=10)
import numpy as np
codes, names, targets, features = [], [], [],[]
for k,g in tqdm.tqdm(df.sort_values(by="date", ascending=False).groupby("code"),desc=f"processing date[{cdate}]"):
    cur = g[g.date==cdate]
    g = g[g.date<cdate]
    cg, pg = g[:10],g[10:]
    row = np.zeros((10,6),dtype=int)
    if cg.shape[0]==10 and pg.shape[0]>0 and cur.shape[0]>0:
        amount = pg["amount"].mean()
        cr,pr = cur.iloc[0],cg.iloc[0]
        codes.append(cr["code"])
        names.append(cr["name"])
        change = (cr.close-pr.close)*100/pr.close
        targets.append(change)
        for i in range(cg.shape[0]):
            r, p= g.iloc[i],g.iloc[i+1]
            close = pvocab((r.close - p.close)*100/p.close)
            _open = pvocab((r.open - p.close)*100/p.close)
            high = pvocab((r.high - p.close)*100/p.close)
            low = pvocab((r.low - p.close)*100/p.close)
            volume = cvocab(np.log2(r.amount/amount))
            amount = avocab(r.amount)
            row[i,:] = [close,_open,high,low,volume,amount]
        features.append(row)
fdf = pd.DataFrame({"code":codes,"name":names,"feature":features, "target": targets})

processing date[2024-12-24]: 100%|██████████| 4536/4536 [00:08<00:00, 544.05it/s]


In [190]:
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split


class StockDataset(Dataset):
    def __init__(self, X,y = None):
        self.X = X
        self.y = y
    def __len__(self):
        return self.X.shape[0]
    def __getitem__(self, idx):
        return (torch.tensor(self.X.iloc[idx]["feature"], dtype=int),
                torch.tensor(self.y.iloc[idx],dtype=torch.float32))


pdf =fdf.copy()
y = pdf.pop("target")
X_train,X_test,y_train,y_test = train_test_split(pdf,y,test_size=0.1)
trainset = StockDataset(X_train,y_train)
testset = StockDataset(X_test,y_test)
trainloader = DataLoader(trainset,batch_size=32,shuffle=True)
testloader = DataLoader(testset,batch_size=32,shuffle=False)

In [191]:
from torch.nn import MSELoss
from torch.optim import Adam, SGD
from torch import nn
import torch

class DailyLSTM(nn.Module):
    def __init__(self):
        super().__init__()
        self.pembedding = nn.Embedding(pvocab.dim, 100)
        self.cembedding = nn.Embedding(cvocab.dim, 100)
        self.aembedding = nn.Embedding(avocab.dim, 100)
        self.rnn = nn.GRU(100*6, 100,num_layers=2,batch_first=True, bidirectional=False)
        self.linear = nn.Linear(100, 1)
    def forward(self,x):
        bs,length  = x.shape[0], x.shape[1]
        pe = self.pembedding(x[:,:,:4])
        ce = self.cembedding(x[:,:,4:5])
        ae = self.aembedding(x[:,:,5:])
        embedding = torch.cat((pe,ce,ae),dim=2)
        x = embedding.view((bs,length,-1))
        hx,y = self.rnn(x)
        y = y[-1,:,:].view((bs,-1))
        ouputs = self.linear(y)
        ouputs = torch.tanh(ouputs)
        return ouputs*10
    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                m.bias.data.zero_()
            elif isinstance(m, (nn.LSTM, nn.GRU)):
                nn.init.xavier_uniform_(m.weight)


model = DailyLSTM()
optimizer = Adam(model.parameters(),lr=0.001)
# optimizer = SGD(model.parameters(),lr=0.001)
loss_fn = MSELoss()
for epoch in range(10):
    for i, batch in enumerate(trainloader):
        optimizer.zero_grad()
        X, y = batch
        output = model(X)
        loss = loss_fn(output,y.view(-1,1))
        loss.backward()
        if i % 20 == 0:
            print(f"Epoch:{epoch+1}, batch:{i}", loss.item())


Epoch:1, batch:0 10.595415115356445
Epoch:1, batch:20 11.482601165771484
Epoch:1, batch:40 12.196684837341309
Epoch:1, batch:60 8.124990463256836
Epoch:1, batch:80 15.814774513244629
Epoch:1, batch:100 16.822551727294922
Epoch:1, batch:120 8.96056842803955
Epoch:2, batch:0 12.25876235961914
Epoch:2, batch:20 9.78048038482666
Epoch:2, batch:40 6.146042823791504
Epoch:2, batch:60 7.314659595489502
Epoch:2, batch:80 9.381875038146973
Epoch:2, batch:100 15.070804595947266
Epoch:2, batch:120 11.968202590942383
Epoch:3, batch:0 17.13756561279297
Epoch:3, batch:20 15.020370483398438
Epoch:3, batch:40 8.805839538574219
Epoch:3, batch:60 15.46458911895752
Epoch:3, batch:80 16.390592575073242
Epoch:3, batch:100 13.846244812011719
Epoch:3, batch:120 12.758565902709961
Epoch:4, batch:0 11.666915893554688
Epoch:4, batch:20 12.09241008758545
Epoch:4, batch:40 8.289555549621582
Epoch:4, batch:60 11.251978874206543
Epoch:4, batch:80 13.68069839477539
Epoch:4, batch:100 9.793963432312012
Epoch:4, batch

In [195]:
model.eval()
lst, targets = [],[]
for i, batch in enumerate(testloader):
    X, y = batch
    targets.append(y)
    output = model(X)
    lst.append(output)
preds = torch.concat(lst)
targets = torch.concat(targets).view(-1,1)
loss_fn(preds,targets)

X_test["prediction"] = preds.detach().numpy()
X_test["target"] = targets.detach().numpy()
X_test.sort_values(by="prediction",ascending=False)[:100]

Unnamed: 0,code,name,feature,prediction,target
1697,sz000099,中信海直,"[[7, 10, 12, 7, 3, 6], [12, 10, 13, 10, 10, 6], [9, 8, 10, 8, 10, 5], [11, 11, 12, 10, 10, 5], [6, 9, 10, 5, 10, 6], [9, 9, 12, 8, 10, 7], [7, 9, 10, 7, 10, 7], [11, 9, 12, 8, 10, 7], [11, 9, 12, 8, 10, 7], [12, 14, 15, 12, 10, 7]]",3.483998,2.753392
2691,sz002622,皓宸医疗,"[[0, 9, 9, 0, 5, 3], [14, 10, 17, 9, 10, 3], [10, 10, 12, 8, 10, 3], [9, 10, 12, 8, 10, 3], [4, 10, 10, 3, 10, 3], [8, 9, 11, 7, 10, 3], [8, 9, 13, 8, 10, 4], [11, 10, 12, 7, 10, 4], [11, 9, 12, 6, 10, 4], [7, 11, 19, 5, 10, 5]]",2.179329,-1.618123
963,sh603005,晶方科技,"[[10, 10, 11, 7, 5, 7], [14, 10, 15, 9, 10, 8], [12, 8, 13, 7, 10, 7], [13, 10, 14, 9, 10, 7], [7, 9, 12, 7, 10, 6], [7, 10, 10, 6, 10, 6], [12, 10, 15, 8, 10, 7], [10, 10, 10, 8, 10, 6], [11, 10, 12, 9, 10, 6], [10, 14, 14, 10, 10, 7]]",1.912951,1.472099
4067,sz301091,深城交,"[[7, 10, 12, 7, 4, 5], [12, 10, 13, 10, 10, 5], [9, 9, 11, 8, 10, 4], [9, 11, 11, 9, 10, 4], [5, 9, 11, 5, 10, 5], [10, 8, 13, 6, 10, 6], [11, 9, 20, 6, 10, 7], [12, 9, 12, 7, 10, 6], [9, 9, 11, 8, 10, 5], [14, 15, 16, 12, 10, 6]]",1.830304,2.394958
1542,sh605018,长华集团,"[[3, 10, 11, 2, 5, 2], [11, 10, 12, 10, 10, 2], [9, 10, 11, 7, 10, 2], [8, 9, 13, 6, 10, 3], [4, 9, 11, 2, 10, 3], [10, 13, 13, 6, 10, 4], [12, 8, 14, 8, 10, 4], [9, 9, 13, 5, 10, 4], [13, 9, 17, 8, 10, 4], [14, 11, 16, 8, 10, 4]]",1.733886,0.658617
...,...,...,...,...,...
52,sh600073,光明肉业,"[[7, 10, 10, 7, 5, 3], [11, 10, 12, 10, 10, 3], [9, 9, 10, 8, 10, 3], [9, 10, 11, 9, 10, 3], [7, 9, 10, 7, 10, 3], [9, 10, 10, 8, 10, 3], [8, 9, 9, 7, 10, 4], [13, 10, 13, 9, 10, 4], [14, 10, 14, 10, 10, 4], [11, 12, 13, 11, 10, 3]]",-0.228242,2.167630
277,sh600380,健康元,"[[9, 10, 11, 9, 4, 3], [10, 10, 11, 10, 10, 3], [10, 9, 10, 8, 10, 3], [10, 10, 11, 10, 10, 3], [9, 10, 10, 9, 10, 3], [9, 10, 11, 9, 10, 3], [8, 10, 10, 8, 10, 4], [11, 10, 11, 10, 10, 4], [10, 10, 11, 10, 10, 3], [10, 12, 12, 10, 10, 4]]",-0.233772,1.878354
1634,sh605499,东鹏饮料,"[[10, 9, 11, 9, 4, 4], [11, 10, 13, 10, 10, 5], [7, 10, 10, 7, 10, 5], [8, 8, 11, 7, 10, 5], [11, 9, 11, 8, 10, 5], [10, 11, 12, 7, 10, 6], [12, 9, 15, 9, 10, 8], [12, 10, 13, 9, 10, 6], [18, 9, 19, 9, 10, 7], [11, 11, 13, 10, 10, 5]]",-0.237293,0.053721
1848,sz000682,东方电子,"[[9, 10, 12, 9, 4, 3], [9, 10, 10, 8, 10, 3], [11, 10, 11, 8, 10, 3], [10, 10, 11, 9, 10, 3], [9, 10, 11, 9, 10, 3], [10, 10, 11, 9, 10, 3], [8, 10, 11, 8, 10, 3], [10, 10, 10, 9, 10, 3], [11, 10, 12, 10, 10, 4], [10, 11, 12, 9, 10, 4]]",-0.237491,1.623687
