This repository has been archived by the owner on Feb 19, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
nCov_19.py
349 lines (279 loc) · 13.1 KB
/
nCov_19.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
# This is a project to predict the tendancy of WuHan 19 nCoVirus
# The 19_nCoV source data are from the project:
# https://github.com/globalcitizen/2019-wuhan-coronavirus-data
import pandas as pd
import os
from os.path import join as opj
import re
import time
import datetime
import json
import numpy as np
import matplotlib.pyplot as plt
import geatpy as ea
from utils import residual_square
import utils
import math
def rough_data_clean():
time_pt = re.compile(r'[0-9]')
dp = './19_nCoV_data/data-sources/dxy/data/'
namelist = os.listdir(dp)
namelist = [name for name in namelist if name[-3:] == 'csv']
for n in namelist:
data_org = opj(dp, n)
save_path = opj('./19_nCoV_clean_data/rough',n[:15]+'.csv')
a = pd.read_csv(data_org, header=None)
# time = time_pt.findall(list(a.iloc[1])[0])
# time_day = '-'.join([''.join(time[:4]),''.join(time[4:6]), ''.join(time[6:8])])
# time_sec = ':'.join([''.join(time[8:10]), ''.join(time[10:12]), ''.join(time[12:])])
# time = time_day + ' ' + time_sec
time = n[:15]
# time = ' '.join(['-'.join([time[0][:4], time[0][4:6], time[0][6:]]),
# ':'.join([time[1][:2], time[1][2:4], time[1][4:]])])
time = datetime.datetime.strptime(time, "%Y%m%d-%H%M%S")
a = a.iloc[3:-1].reset_index(drop=True)
pp = [[], [], []]
for each in a[0]:
pp[0].append(each.split('|')[0])
pp[1].append(int(each.split('|')[1]))
pp[2].append(int(each.split('|')[2]))
pp[0].append('China Total')
pp[1].append(sum(pp[1]))
pp[2].append(sum(pp[2]))
places = pd.Series(pp[0])
confs = pd.Series(pp[1])
deaths = pd.Series(pp[2])
new_df = pd.DataFrame(columns=['place', 'confirmed','deaths'])
new_df['place'] = places
new_df['confirmed'] = confs
new_df['deaths'] = deaths
new_df.to_csv(save_path, header=True, index=False)
## save all the death data
def detail_data_clean(prov_shortname,cityname):
# para: prov_shortname: shortname of the province you are predicting
# para: cityname: shortname of the city you are predicting
dp = './19_nCoV_data/data-sources/dxy/data/'
namelist = os.listdir(dp)
namelist = [name for name in namelist if name[-4:] == 'json']
times, confirmedCount, suspectedCount, curedCount, deadCount = [], [], [], [], []
for n in namelist:
time = n[:15]
time = datetime.datetime.strptime(time, "%Y%m%d-%H%M%S")
times.append(time)
with open(opj(dp, n), 'r', encoding='utf-8') as f:
data = json.load(f)
for eachprov in data:
if eachprov['provinceShortName'] != prov_shortname:
continue
for eachcity in eachprov['cities']:
if eachcity['cityName'] != cityname:
continue
# Place.append(eachprov['cityName'])
confirmedCount.append(eachcity['confirmedCount'])
suspectedCount.append(eachcity['suspectedCount'])
curedCount.append(eachcity['curedCount'])
deadCount.append(eachcity['deadCount'])
city_data = pd.DataFrame(columns=[
'time', 'confirmedCount', 'suspectedCount', 'curedCount','deadCount'])
satistics = [times, confirmedCount, suspectedCount, curedCount, deadCount]
for idx, eachcol in enumerate(list(city_data.columns)):
city_data[eachcol] = pd.Series(satistics[idx])
return city_data
class E_SEIR(object):
"""
build up an E-SEIR model for 19_nCoV plague, help with projects following:
https://github.com/XuelongSun/Dynamic-Model-of-Infectious-Diseases
"""
def __init__(self, T=12):
# population
self.N = 14186500 # data from the website
# simuation Time / Day
self.T = T
# susceptiable ratio
self.s = np.zeros([self.T])
# exposed ratio
self.e = np.zeros([self.T])
# infective ratio
self.i = np.zeros([self.T])
# remove ratio
self.r = np.zeros([self.T])
# contact rate
self.lamda = 0 # = 感染者日均接触人数 * 接触传染概率 / 总人数
self.omiga = 0.1 # effect the lower rate of the lamda
self.d = 20 # deviation of time
# isolate rate 患病的人中被隔离的比率 (zf措施)
self.beta = 0.8
# remove rate # 死亡率+治愈率
self.gamma = 0.3
# exposed period # 平均潜伏期的倒数
self.sigma = 1 / 7
# initial infective people
self.i[0] = 1.0 / self.N
self.s[0] = (1e7 + 4e6) / self.N
self.e[0] = 10.0 / self.N
def deduce(self):
for t in range(self.T-1):
self.lamda = 1 - 1 / (1 + math.exp(-1 * self.omiga * (t-self.d)))
self.s[t + 1] = self.s[t] \
- self.lamda * self.s[t] * (self.i[t] * (1- self.beta) + self.e[t])
self.e[t + 1] = self.e[t] \
+ self.lamda * self.s[t] * (self.i[t] * (1-self.beta) + self.e[t]) \
- self.sigma * self.e[t]
self.i[t + 1] = self.i[t] + self.sigma * self.e[t] - self.gamma * self.i[t]
self.r[t + 1] = self.r[t] + self.gamma * self.i[t]
def draw_curves(self, start_day=0, end_day=1000):
self.s *= self.N
self.e *= self.N
self.i *= self.N
self.r *= self.N
if end_day > self.T:
end_day = self.T
fig, ax = plt.subplots(figsize=(10,6))
# ax.plot(self.s[start_day:end_day], c='b', lw=2, label='S')
ax.plot(self.e[start_day:end_day], c='orange', lw=2, label='E')
ax.plot(self.i[start_day:end_day], c='r', lw=2, label='I')
ax.plot(self.r[start_day:end_day], c='g', lw=2, label='R')
ax.set_xlabel('Day',fontsize=15)
ax.set_ylabel('Number of people', fontsize=15)
ax.grid(1)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.legend()
plt.show()
def train_process(model, gt):
def aim(Phen):
# ld = Phen[:, [0]] # 取出第1列,得到所有个体的第1个自变量
# gm = Phen[:, [1]] # 取出第2列,得到所有个体的第2个自变量
fits = np.zeros((len(Phen),1)) # 所有个体的健康程度评估
# print(Phen)
for idx in range(len(Phen)):
model.omiga = Phen[idx][1]
model.gamma = Phen[idx][0]
# model.beta = Phen[idx][2]
model.d = Phen[idx][2]
model.deduce()
# print(np.array(model.i * model.N))
exam_day = int(Phen[idx][3])
model_preds_i = model.i[exam_day:exam_day+len(gt)] * model.N
gt_i = np.array(gt['confirmedCount'])
model_preds_r = model.r[exam_day:exam_day+len(gt)] * model.N
gt_r = np.array(gt['deadCount']+gt['curedCount'])
infective_fitness = residual_square(model_preds_i, gt_i)
recov_fitness = residual_square(model_preds_r, gt_r)
fits[idx] = infective_fitness + recov_fitness
# print(model_preds_i, gt_i)
# print(fits)
return fits
# settings
omiga = [0, 1] # 自变量范围
gamma = [0, 1]
# beta = [0, 1]
d = [0, 100]
exam_d = [10, 60]
b1 = [0, 0] # 自变量边界, 1 表示包含边界, 0 表示不包含边界
b2 = [0, 0]
# b3 = [0, 0]
b4 = [0, 0]
b5 = [1, 1]
varTypes = np.array([0, 0, 0, 0]) # 自变量的类型,0表示连续,1表示离散
Encoding = 'BG' # 'BG'表示采用二进制/格雷编码
codes = [1, 1, 1, 1] # 变量的编码方式,2个变量均使用格雷编码
precisions =[6, 6, 6, 6] # 变量的编码精度
scales = [0, 0, 0, 0] # 采用算术刻度
ranges=np.vstack([gamma, omiga, d, exam_d]).T # 生成自变量的范围矩阵
borders=np.vstack([b2, b1, b4, b5]).T # 生成自变量的边界矩阵
# params of GA
NIND = 400 # 种群个体数目
MAXGEN = 100 # 最大遗传代数
maxormins = [1] # 最小化目标函数, 元素为-1表示最大化目标函数
FieldD = ea.crtfld(Encoding,varTypes,
ranges, borders,precisions,codes,scales) # 调用函数创建区域描述器
Lind = int(np.sum(FieldD[0, :])) # 计算编码后的染色体长度
obj_trace = np.zeros((MAXGEN, 2)) # 定义目标函数值记录器
var_trace = np.zeros((MAXGEN, Lind)) # 定义染色体记录器,记录每一代最优个体的染色体
start_time = time.time() # 开始计时
Chrom = ea.crtbp(NIND, Lind) # 生成种群染色体矩阵
variable = ea.bs2real(Chrom, FieldD) # 对初始种群进行解码
ObjV = aim(variable) # 计算初始种群个体的目标函数值
best_ind = np.argmin(ObjV) # 计算当代最优个体的序号
# 开始进化
for gen in range(MAXGEN):
FitnV = ea.ranking(maxormins * ObjV) # 根据目标函数大小分配适应度值(由于遵循目标最小化约定,因此最大化问题要对目标函数值乘上-1)
SelCh=Chrom[ea.selecting('rws', FitnV, NIND-1), :] # 选择,采用'rws'轮盘赌选择
SelCh=ea.recombin('xovsp', SelCh, 0.7) # 重组(采用两点交叉方式,交叉概率为0.7)
SelCh=ea.mutbin(Encoding, SelCh) # 二进制种群变异
# 把父代精英个体与子代合并
Chrom = np.vstack([Chrom[best_ind, :], SelCh])
variables = ea.bs2real(Chrom, FieldD) # 对育种种群进行解码(二进制转十进制)
ObjV = aim(variables) # 求育种个体的目标函数值
# 记录
best_ind = np.argmin(ObjV) # 计算当代最优个体的序号
obj_trace[gen, 0] = np.sum(ObjV) / NIND # 记录当代种群的目标函数均值
obj_trace[gen, 1] = ObjV[best_ind] # 记录当代种群最优个体目标函数值
var_trace[gen, :] = Chrom[best_ind, :] # 记录当代种群最优个体的变量值
# 进化完成
end_time = time.time() # 结束计时
# 绘制图像
ea.trcplot(obj_trace, [['种群个体平均目标函数值', '种群最优个体目标函数值']])
best_gen = np.argmin(obj_trace[:, [1]])
print('最优解的目标函数值: ', obj_trace[best_gen, 1])
opt_variable = ea.bs2real(var_trace[[best_gen], :], FieldD)
print('最优解的决策变量为:')
for i in range(opt_variable.shape[1]):
print(opt_variable[0, i])
print('用时: ', end_time - start_time, '秒')
model.omiga = opt_variable[0, 1]
model.gamma = opt_variable[0, 0]
# model.beta = opt_variable[0, 2]
model.d = opt_variable[0, 2]
model.deduce()
model.draw_curves()
return opt_variable[0]
def get_evaluation(model, fit_data, pre_gt, T=150):
assert T >= 2 * len(fit_data+pre_gt), 'Should set a more long period'
opt_vars = train_process(model, fit_data) # [gamma, omiga, d, exam_day]
test_model = E_SEIR(T=T)
test_model.gamma = opt_vars[0]
test_model.omiga = opt_vars[1]
test_model.d = opt_vars[2]
exam_day = opt_vars[3]
exam_day = int(exam_day)+len(fit_data)
test_model.deduce()
test_model.draw_curves()
pre_model_i = test_model.i[exam_day: exam_day+len(pre_gt)]
pre_model_r = test_model.r[exam_day: exam_day+len(pre_gt)]
pre_gt_i = np.array(pre_gt['confirmedCount'])
pre_gt_r = np.array(pre_gt['deadCount']+ pre_gt['curedCount'])
i_error = utils.related_error_rate(pre_model_i, pre_gt_i)
r_error = utils.related_error_rate(pre_model_r, pre_gt_r)
print('误差率--- 感染者误差率:{}, 移除者误差率:{}'.format(i_error, r_error))
print(pre_gt_i, pre_model_i)
def get_R0(t, Yt, Tg, Ti):
# t: 初始病发至今,某一时刻
# Yt:t时刻对应病患数
# Tg: 潜伏期时长
# Ti: 感染期时长
lamda = math.log(Yt) / t
r0 = (1 + Tg * lamda) * (1 + Ti * lamda)
return r0
if __name__ == '__main__':
data = detail_data_clean('湖北','武汉')
# only save the data on day
data['time'] = data['time'].dt.date
data_day = data.groupby(by=['time']).head(1).reset_index(drop=True)
# confirmed count = accumulated - removed
data_day['confirmedCount'] = data_day['confirmedCount'] \
- (data_day['curedCount']+data_day['deadCount'])
# 提前n天划分数据集
n = 7
train_day = data_day.iloc[:-n]
test_day = data_day.iloc[-n:]
print(train_day)
model = E_SEIR(T=150)
print('划分数据集 训练天数和测试天数分别为:',train_day.shape[0], test_day.shape[0])
get_evaluation(model, train_day, test_day)
# model.deduce()
# fitness = residual_square(np.array(model.i), np.array(data_day['confirmedCount']))
# model.draw_curves()
R0 = get_R0(40, 572, 7, 1)
print('基本再生指数:', R0)