## Import pacages

In [1]:
import os
import random

import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind

## Torch Model

In [2]:
class LinearRegression(torch.nn.Module):
    def __init__(self):
        super(LinearRegression, self).__init__()
        self.linear = torch.nn.Linear(1, 1)

    def forward(self, x):
        x = self.linear(x**4)
        return x
    
    
class Model():
    def __init__(self, model, weights, bias):
        self.model = model
        self.optimizer = torch.optim.SGD(self.model.parameters(),
                                         lr=5e-2)
        self.criterion = torch.nn.MSELoss()
        self.train_loss = []
        self.pred_loss = []
        
        with torch.no_grad():
            model.linear.weight.copy_(weights)
            model.linear.bias.copy_(bias)
        print(model)

    def trainAnEpoch(self, inputs, answers):
        self.optimizer.zero_grad()
        outputs = self.model(inputs)
        loss = self.criterion(outputs, answers)
        loss.backward()
        self.optimizer.step()
        self.train_loss.append(float(loss))
        
    def predict(self, inputs):
        return self.model(inputs)
    

## Manual model

In [3]:
class linear_regression():
    def __init__(self, weights, bias):
        self.w = weights
        self.b = bias
        self.a = 1e-3
        self.train_loss = []
        self.pred_loss = []
    
    def predict(self, x):
        return self.w*x**4 + self.b
    
    def loss_func(self, pred_y, true_y):
        return ((true_y - pred_y)**2).mean()
    
    def grad(self, x, y):
        grad_w = -2*x**4*(y - self.w*x**4 - self.b)
        grad_b = -2*(y - self.w*x**4 - self.b)
        self.w = self.w - self.a*grad_w
        self.b = self.b - self.a*grad_b
        
    def train_an_epoch(self, inputs, answers):
        for x, y in zip(inputs, answers):
            self.grad(x, y)
            pred_ans = self.predict(inputs)
        loss = self.loss_func(pred_ans, answers)
        self.train_loss.append(loss)
        

## Load data

In [4]:
def loadData():
    dataSet = {}
    validPoints = 0
    totalPoints = 0
    print("Detected stocks: " + str(len(os.listdir("./data/stocks"))))
    for fileName in os.listdir("./data/stocks"):
        dataSet[fileName] = pd.read_csv("./data/stocks/" + fileName,
                                        sep=",",
                                        header=0,
                                        usecols=range(3,22),
                                        encoding="gbk",
                                        dtype="float64"
                                        ).dropna()
        totalPoints += dataSet[fileName].shape[0]
#         for i in range(1, dataSet[fileName].shape[1]):
#             dataSet[fileName].iloc[:, i] = dataSet[fileName].iloc[:, i] - dataSet[fileName].iloc[:, i].min()
#             dataSet[fileName].iloc[:, i] = dataSet[fileName].iloc[:, i] / dataSet[fileName].iloc[:, i].max()
        if dataSet[fileName].empty:
            # print("Warning: problematic stock dataset " + fileName)
            del dataSet[fileName]
        else:
            
            validPoints += dataSet[fileName].shape[0]
    print("Total data points: " + str(totalPoints))
    print("Valid data points: " + str(validPoints))
    print("Valid stocks: " + str(len(dataSet.keys())))

    return dataSet

## Influence function

In [5]:
def InfFunc(w, b, x, y, xTest, yTest):
    L = (w*x**4+b-y)**2
    dLossOverdW = 2*(w*xTest**4 + b - yTest)
    Hw = 2*(x**4)*(x**4 + b - y)
    dLossOverdWdX = 2*(4*w*x**3 + b - y)*(x**4) + 2*(w*x**4 + b - y)*(4*x**3)
    return float(-dLossOverdW*dLossOverdWdX/Hw)

## 1. Load Data

In [30]:
dataSet = loadData()

Detected stocks: 4702
Total data points: 35232
Valid data points: 35232
Valid stocks: 4690


In [33]:
for rowId in range(6, stock.shape[0]):
        print(np.asarray(stock.iloc[rowId-6:rowId-1,:]))

[[ 2.20300000e+01 -5.00000000e-02  0.00000000e+00  2.23000000e+01
   2.20800000e+01  2.23000000e+01  2.18800000e+01  7.87260000e+02
   1.74109143e+06  2.13046421e-03  2.47667285e-01  3.03540337e-01
   1.90217391e-02  1.89913793e+01  8.14063795e+08  1.07065800e+09
   1.16000000e+00  1.78076966e+07]
 [ 2.16500000e+01  1.40000000e-01  0.00000000e+00  2.15300000e+01
   2.15100000e+01  2.19100000e+01  2.13200000e+01  1.43364000e+03
   3.09347740e+06  3.87968233e-03  5.22972426e-01 -6.00000000e-01
   2.74291027e-02  1.86637931e+01  8.00021842e+08  1.05219000e+09
   1.16000000e+00  1.78076966e+07]
 [ 2.17200000e+01  2.00000000e-02  0.00000000e+00  2.15400000e+01
   2.17000000e+01  2.18700000e+01  2.15400000e+01  7.00780000e+02
   1.52476560e+06  1.89643410e-03  5.39987825e-01 -1.34199134e-01
   1.52073733e-02  1.87241379e+01  8.02608517e+08  1.05559200e+09
   1.16000000e+00  1.78076966e+07]
 [ 2.12000000e+01 -5.50000000e-01  0.00000000e+00  2.17500000e+01
   2.17500000e+01  2.18100000e+01  2.

In [34]:
for rowId in range(6, stock.shape[0]):
        print(np.delete(np.asarray(stock.iloc[rowId-6:rowId-1,:]), 1, 1))

[[ 2.20300000e+01 -5.00000000e-02  0.00000000e+00  2.23000000e+01
   2.20800000e+01  2.23000000e+01  2.18800000e+01  7.87260000e+02
   1.74109143e+06  2.13046421e-03  2.47667285e-01  3.03540337e-01
   1.90217391e-02  1.89913793e+01  8.14063795e+08  1.07065800e+09
   1.16000000e+00  1.78076966e+07]
 [ 2.16500000e+01  1.40000000e-01  0.00000000e+00  2.15300000e+01
   2.15100000e+01  2.19100000e+01  2.13200000e+01  1.43364000e+03
   3.09347740e+06  3.87968233e-03  5.22972426e-01 -6.00000000e-01
   2.74291027e-02  1.86637931e+01  8.00021842e+08  1.05219000e+09
   1.16000000e+00  1.78076966e+07]
 [ 2.17200000e+01  2.00000000e-02  0.00000000e+00  2.15400000e+01
   2.17000000e+01  2.18700000e+01  2.15400000e+01  7.00780000e+02
   1.52476560e+06  1.89643410e-03  5.39987825e-01 -1.34199134e-01
   1.52073733e-02  1.87241379e+01  8.02608517e+08  1.05559200e+09
   1.16000000e+00  1.78076966e+07]
 [ 2.12000000e+01 -5.50000000e-01  0.00000000e+00  2.17500000e+01
   2.17500000e+01  2.18100000e+01  2.

In [35]:
inputs = []
answers = []

for stock in dataSet.values():
    if stock.shape[0] <= 6:
        continue
    stockInputs = []
    stockTargets = []
    for rowId in range(6, stock.shape[0]):
        stockInput = np.delete(np.asarray(stock.iloc[rowId-6:rowId-1,:]), 1, 1)
        stockInputs.append(torch.tensor(stockInput))
        stockTargets.append(torch.tensor(stock.iloc[rowId, 1]))
    inputs = inputs + stockInputs
    answers = answers + stockTargets
print(len(inputs))
print(len(answers))

7329
7329


In [36]:
inputs[0].shape

torch.Size([5, 19])

In [37]:
answers[0]

tensor(0.0012, dtype=torch.float64)

## 2. Analyze the data relationship

In [38]:
analyzeData = []
for inputPoint in inputs:
    analyzePoint = [inputPoint[:, columnId].mean() for columnId in range(inputPoint.shape[1])]
    analyzeData.append(torch.stack(analyzePoint))
analyzeData = torch.stack(analyzeData)
analyzeData = torch.cat((analyzeData[:, 0].reshape(-1, 1), torch.stack(answers).reshape(-1, 1), analyzeData[:, 1:]), dim=1)

### 2.1 correlation coefficient ranking

In [39]:
corXtoY = [(i, float(num)) for num, i in zip(torch.corrcoef(analyzeData.T)[1].abs(), range(19))]
corXtoY.sort(key=lambda x: x[1], reverse=True)

In [40]:
for i in corXtoY:
    print(i, dataSet["0600000.csv"].columns[i[0]])

(1, 1.0) 涨跌幅
(14, 0.09663154038446149) 市盈率
(11, 0.06530880666267473) 量比
(13, 0.0545571093498036) 振幅
(9, 0.053494021163484874) 成交额
(10, 0.049688644811378246) 换手率
(17, 0.04909941383828275) 每股收益
(3, 0.04747274617390823) 5分钟涨
(16, 0.04554364245846945) 总市值
(4, 0.02905291450468196) 今开
(12, 0.014239141431938666) 委比
(18, 0.010443191959068975) 净利润
(15, 0.00713626621151309) 流通市值
(8, 0.004519691668115285) 成交量
(7, 0.003972575992256286) 最低
(5, 0.0039594998478338144) 昨收
(0, 0.003720907991651575) 价格
(6, 0.0031436537773441046) 最高
(2, 0.0027407813950708106) 涨跌额


### 2.2 P-value

In [41]:
for columnId in range(analyzeData.shape[1]):
    print(columnId, ttest_ind(analyzeData[:, columnId]**2, analyzeData[:, 1])[1])

0 9.760317284140269e-08
1 1.7350327717497267e-64
2 1.449475009015624e-77
3 0.00011903829326757542
4 3.6142873971144363e-81
5 6.189688382010474e-08
6 5.4930087046329136e-08
7 6.311000710436078e-08
8 1.0688702291407458e-07
9 1.265400454455798e-11
10 8.580018505345825e-26
11 2.377216362924443e-51
12 0.0
13 0.0
14 5.736595567338387e-41
15 0.061282902757509236
16 0.00017899914973459796
17 1.0221006106334856e-06
18 7.880115238913953e-11
19 0.0003189494189550446


In [42]:
for columnId in range(analyzeData.shape[1]):
    print(columnId, ttest_ind(analyzeData[:, columnId], analyzeData[:, 1])[1])

0 0.0
1 1.0
2 4.884795582429762e-52
3 0.5624100077680763
4 3.642938480994255e-77
5 0.0
6 0.0
7 0.0
8 0.0
9 0.0
10 0.0
11 0.0
12 0.0
13 6.238680133785559e-26
14 0.0
15 0.10821889303826714
16 1.0146069964565168e-120
17 1.875673609166355e-126
18 5.390490436617155e-265
19 1.8538068437028882e-26


## 3. Split dataset into train and test

In [15]:
testIndexs = random.sample(range(0, len(answers) - 1), 2000)
train = []
trainAns = []
test = []
testAns = []
for i in range(len(answers)):
    if i not in testIndexs:
        train.append(inputs[i][:, 11].mean())
        trainAns.append(answers[i])
    else:
        test.append(inputs[i][:, 11].mean())
        testAns.append(answers[i])
        
train = torch.stack(train).reshape(-1, 1).float()
trainAns = torch.stack(trainAns).reshape(-1, 1).float()
test = torch.stack(test).reshape(-1, 1).float()
testAns = torch.stack(testAns).reshape(-1, 1).float()

In [16]:
trainAns.shape

torch.Size([5329, 1])

In [17]:
weights = torch.tensor(random.random())
bias = torch.tensor(random.random())

In [18]:
weights, bias

(tensor(0.4494), tensor(0.1019))

### 3.1.1 pytorch model

In [19]:
model = Model(LinearRegression(), weights, bias)

LinearRegression(
  (linear): Linear(in_features=1, out_features=1, bias=True)
)


In [20]:
for epoch in range(1000):
    model.trainAnEpoch(train, trainAns)
    test_pred = model.predict(test)
    model.pred_loss.append(float(model.criterion(testAns, test_pred)))
    print("Epoch {}, loss {}".format(epoch, model.train_loss[-1]))

Epoch 0, loss 0.01028471626341343
Epoch 1, loss 0.008448484353721142
Epoch 2, loss 0.006961140315979719
Epoch 3, loss 0.005756395868957043
Epoch 4, loss 0.0047805532813072205
Epoch 5, loss 0.003990123048424721
Epoch 6, loss 0.0033498757984489202
Epoch 7, loss 0.0028312760405242443
Epoch 8, loss 0.002411210909485817
Epoch 9, loss 0.0020709580276161432
Epoch 10, loss 0.0017953530186787248
Epoch 11, loss 0.001572112669236958
Epoch 12, loss 0.001391287543810904
Epoch 13, loss 0.0012448186753317714
Epoch 14, loss 0.0011261781910434365
Epoch 15, loss 0.001030078623443842
Epoch 16, loss 0.0009522373438812792
Epoch 17, loss 0.000889185001142323
Epoch 18, loss 0.0008381116203963757
Epoch 19, loss 0.0007967412821017206
Epoch 20, loss 0.0007632304332219064
Epoch 21, loss 0.0007360856398008764
Epoch 22, loss 0.0007140974048525095
Epoch 23, loss 0.0006962858606129885
Epoch 24, loss 0.0006818576948717237
Epoch 25, loss 0.0006701696547679603
Epoch 26, loss 0.000660701422020793
Epoch 27, loss 0.000653

Epoch 270, loss 0.0006190136773511767
Epoch 271, loss 0.00061900814762339
Epoch 272, loss 0.0006190026761032641
Epoch 273, loss 0.0006189972627907991
Epoch 274, loss 0.0006189917330630124
Epoch 275, loss 0.0006189863779582083
Epoch 276, loss 0.0006189809064380825
Epoch 277, loss 0.0006189754349179566
Epoch 278, loss 0.0006189699051901698
Epoch 279, loss 0.0006189644918777049
Epoch 280, loss 0.0006189590785652399
Epoch 281, loss 0.000618953665252775
Epoch 282, loss 0.0006189480773173273
Epoch 283, loss 0.0006189426640048623
Epoch 284, loss 0.0006189372506923974
Epoch 285, loss 0.0006189317791722715
Epoch 286, loss 0.0006189263076521456
Epoch 287, loss 0.0006189208361320198
Epoch 288, loss 0.0006189154810272157
Epoch 289, loss 0.0006189099512994289
Epoch 290, loss 0.000618904537986964
Epoch 291, loss 0.0006188990664668381
Epoch 292, loss 0.0006188936531543732
Epoch 293, loss 0.0006188881816342473
Epoch 294, loss 0.0006188827683217824
Epoch 295, loss 0.0006188772968016565
Epoch 296, loss 

Epoch 640, loss 0.0006170252454467118
Epoch 641, loss 0.0006170198321342468
Epoch 642, loss 0.0006170145934447646
Epoch 643, loss 0.0006170093547552824
Epoch 644, loss 0.0006170039996504784
Epoch 645, loss 0.0006169987609609962
Epoch 646, loss 0.000616993464063853
Epoch 647, loss 0.0006169882253743708
Epoch 648, loss 0.0006169829866848886
Epoch 649, loss 0.0006169776897877455
Epoch 650, loss 0.0006169723346829414
Epoch 651, loss 0.0006169670959934592
Epoch 652, loss 0.0006169617990963161
Epoch 653, loss 0.000616956502199173
Epoch 654, loss 0.0006169513217173517
Epoch 655, loss 0.0006169459666125476
Epoch 656, loss 0.0006169406697154045
Epoch 657, loss 0.0006169353728182614
Epoch 658, loss 0.0006169301923364401
Epoch 659, loss 0.000616924895439297
Epoch 660, loss 0.0006169195985421538
Epoch 661, loss 0.0006169143598526716
Epoch 662, loss 0.0006169090629555285
Epoch 663, loss 0.0006169037660583854
Epoch 664, loss 0.0006168984691612422
Epoch 665, loss 0.0006168931722640991
Epoch 666, loss

In [21]:
print(model.model.linear.weight)
print(model.model.linear.bias)

Parameter containing:
tensor([[0.4327]], requires_grad=True)
Parameter containing:
tensor([0.0037], requires_grad=True)


### 3.1.2 Calculate influence

In [22]:
w = float(list(model.model.parameters())[0])
b = float(list(model.model.parameters())[1])

In [23]:
influence = []
i = 1
for x, y in zip(train, trainAns):
    pointInfluence = []
    print(i)
    for xTest, yTest in zip(test, testAns):
        pointInfluence.append(InfFunc(w, b, x, y, xTest, yTest))
    influence.append(pointInfluence)
    i += 1

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


KeyboardInterrupt: 

### 3.2 manually model

In [None]:
weights, bias

In [None]:
lr_model = linear_regression(weights, bias)

In [None]:
for epoch in range(1000):
    lr_model.train_an_epoch(train, trainAns)
    lr_model.pred_loss.append(lr_model.loss_func(lr_model.predict(test), testAns))
    print("Epoch {}, loss {}".format(epoch, lr_model.train_loss[-1]))

In [None]:
print(lr_model.w)
print(lr_model.b)

## 4. Visualization loss

#### 4.1 torch model

In [None]:
train_loss = model.train_loss
pred_loss = model.pred_loss

In [None]:
plt.plot(train_loss)
plt.ylabel("train loss")
plt.xlabel("points")
plt.show()

In [None]:
plt.plot(pred_loss, color="red")
plt.ylabel("pred loss")
plt.xlabel("points")
plt.show()

#### 4.1 manual model

In [None]:
train_loss = lr_model.train_loss
pred_loss = lr_model.pred_loss

In [None]:
plt.plot(train_loss)
plt.ylabel("train loss")
plt.xlabel("points")
plt.show()

In [None]:
plt.plot(pred_loss, color="red")
plt.ylabel("pred loss")
plt.xlabel("points")
plt.show()

## 5. test

### 5.1 Sampling

##### 5.1.1 torch model

In [None]:
predict = model.predict(test)
plt.plot(torch.detach(predict[:300]), color="blue", linestyle='dashed', label="predict")
plt.plot(testAns[:300], color="red", label="answer")
plt.ylabel("Ups and downs")
plt.xlabel("points")
plt.title("Pred & true-value")
plt.show()

In [None]:
plt.plot(torch.detach(testAns-predict)[:300], color="green")
plt.title("Error fluctuation")

In [None]:
print("std testAns: {}".format(testAns.std()))

In [None]:
print("std predict: {}".format(predict.std()))

In [None]:
print("std diff: {}".format(torch.abs(testAns-predict).std()))

In [None]:
print("max diff: {}".format(torch.abs(testAns-predict).max()))

In [None]:
print("ave diff: {}".format(torch.abs(testAns-predict).mean()))

##### 5.1.2 manual model

In [None]:
predict = lr_model.predict(test)
plt.plot(torch.detach(predict[:300]), color="blue", linestyle='dashed', label="predict")
plt.plot(testAns[:300], color="red", label="answer")
plt.ylabel("Ups and downs")
plt.xlabel("points")
plt.title("Pred & true-value")
plt.show()

In [None]:
plt.plot(torch.detach(testAns-predict)[:300], color="green")
plt.title("Error fluctuation")

In [None]:
print("std testAns: {}".format(testAns.std()))

In [None]:
print("std predict: {}".format(predict.std()))

In [None]:
print("std diff: {}".format(torch.abs(testAns-predict).std()))

In [None]:
print("max diff: {}".format(torch.abs(testAns-predict).max()))

In [None]:
print("ave diff: {}".format(torch.abs(testAns-predict).mean()))

In [None]:
totalInfluence = [sum(pointInfluence) for pointInfluence in influence]

In [None]:
totalInfluence