# PaperQA实现代码
### 整体思路：

1.   将任务所提供的数据集转化成squad格式，并使用bert官方提供的run_squad.py得到训练文件及对测试集的预测结果。
2.   调用"bert-as-server"框架，对预测结果与测试集两个answer进行比较，选取与预测结果余弦距离更近的answer作为正确answer



---



### 向squad格式转换：得到答案在原abstract中的answer_start



In [23]:
def answer_start(context, answer):
    context = list(context)
    answer = list(answer)
    
    cur_pos = 0
    ans_length = len(answer)
    con_length = len(context)
    while True:
        if context[cur_pos] == answer[0] and context[cur_pos:cur_pos+ans_length] == answer:    
            return cur_pos
        cur_pos += 1
        if cur_pos == con_length:
            return "unvalid"

### 将原.md文件转化为字典函数便于使用question编号索引问题实际内容

In [27]:
def get_question():
    q_dict ={
        'A1':'What is the objective / aim of this paper ?',
        'A2':'What problem(s) does this paper address ?',
        'A41':'What method / approach does this paper propose ?',
        'A51':'What is this method based on ?',
        'A61':'How does the proposed method differ from previous methods / approaches ?',
        'A42':'What model does this paper propose ?',
        'A52':'What is this model based on ?',
        'A62':'How does the proposed model differ from previous models ?',
        'A43':'What algorithm does this paper propose ?',
        'A53':'What is this algorithm based on ?',
        'A63':'How does the proposed algorithm differ from previous algorithms ?',
        'A44':'What framework does this paper propose ?',
        'A54':'What is this framework based on ?',
        'A64':'How does the proposed framework differ from previous frameworks ?',
        'A45':'What datasetdoes this paper propose ?',
        'A7':'What experiment does this paper carry out to evaluate the result ?',
        'A81':'What does the result of this paper show(demonstrated by the experiment)',
        'A82':'What does the result of this paper show(demonstrated by the experiment)',
        'A83':'What does the result of this paper show(demonstrated by the experiment)',
        'A10':'How does this result outperform existing work ?' 
        }
    return q_dict

### 通过问题编号得到问题实际字面内容

In [28]:
def get_liter_questions(ques_ID):
    return get_question()[ques_ID]

### 将原数据集转化为类squad的json格式

In [16]:
import csv
import json

def data2json():
    csvFile1 = open('train.csv', 'r', encoding='utf-8')
    train_file = csv.reader(csvFile1)
    
    train_data = {}
    train_data["data"] = []
    inner_data = {}
    inner_data["title"] = "train_data"
    inner_data["paragraphs"] = []

    Id = 0
    for line in train_file:
        # 忽略第一行
        if train_file.line_num == 1:
            continue
            
        ques_ID = line[0]    
        QA_abs = line[1]
        answer_text = line[2]
        
        Answer_start = answer_start(QA_abs, answer_text)
        if Answer_start == "unvalid":
            continue
        else:
            QA_info = {}
            QA_info["context"] = QA_abs
            QA_info["qas"] = []

            pure_QA = {}

            pure_QA["answers"] = [] 
            answer_info = {}
            answer_info["answer_start"]  = Answer_start

            answer_info["text"] = answer_text
            pure_QA["answers"].append(answer_info)
                
            pure_QA["question"] = get_liter_questions(ques_ID)
            pure_QA["id"] = Id
            Id += 1
            
            QA_info["qas"].append(pure_QA)

            inner_data["paragraphs"].append(QA_info)
        
    train_data["data"].append(inner_data)
    train_data["version"] = "0.0"
            
    with open('train.json', 'w') as f:
        json.dump(train_data, f)
        
#     with open("submit.json") as f:
#          pop_data = json.load(f)    

data2json()

In [30]:
def test_data2json():
    csvFile1 = open('/home/xuhui/10002/Machine_Learning/test.csv', 'r', encoding='utf-8')
    test_file = csv.reader(csvFile1)
    
    csvFile2 = open('/home/xuhui/10002/Machine_Learning/sample.csv', 'r', encoding='utf-8')
    sample_file = csv.reader(csvFile2)
    
    ID2Category = {}
    test_data = {}
    test_data["data"] = []
    inner_data = {}
    inner_data["title"] = "test_data"
    inner_data["paragraphs"] = []

    for line in sample_file:
        if sample_file.line_num == 1:
            continue
        ID2Category[line[0]] = line[1]
    
    Id = 0
    for line in test_file:
        # 忽略第一行
        if test_file.line_num == 1:
            continue
            
        ques_ID = line[1]    
        QA_abs = line[2]
        if ID2Category[line[0]] == '1':
            answer_text = line[3]
        elif ID2Category[line[0]] == '2':
            answer_text = line[4]
        
        Answer_start = answer_start(QA_abs, answer_text)
        if Answer_start == "unvalid":
            continue
        else:
            QA_info = {}
            QA_info["context"] = QA_abs
            QA_info["qas"] = []

            pure_QA = {}

            pure_QA["answers"] = [] 
            answer_info = {}
            answer_info["answer_start"]  = Answer_start

            answer_info["text"] = answer_text
            pure_QA["answers"].append(answer_info)
                
            pure_QA["question"] = get_liter_questions(ques_ID)
            pure_QA["id"] = Id
            Id += 1
            
            QA_info["qas"].append(pure_QA)

            inner_data["paragraphs"].append(QA_info)
        
    test_data["data"].append(inner_data)
    test_data["version"] = "0.0"
            
    with open('test.json', 'w') as f:
        json.dump(test_data, f)
        
#     with open("submit.json") as f:
#          pop_data = json.load(f)    

test_data2json()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141


3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117


### 运行bert模型，得到预测结果

In [2]:
!python bert-master/run_squad.py \
  --vocab_file=BERT_BASE_DIR/vocab.txt \
  --bert_config_file=BERT_BASE_DIR/bert_config.json \
  --init_checkpoint=BERT_BASE_DIR/bert_model.ckpt \
  --do_train=True \
  --train_file=SQUAD_DIR/csv2squad_train.json \
  --do_predict=True \
  --predict_file=SQUAD_DIR/csv2squad_test.json \
  --train_batch_size=4 \
  --learning_rate=3e-5 \
  --num_train_epochs=2.0 \
  --max_seq_length=384 \
  --doc_stride=128 \
  --output_dir=bert_result


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

SQUAD_DIR/csv2squad_train.json




INFO:tensorflow:Using config: {'_model_dir': 'bert_result', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f47efdac748>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=8, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_p

INFO:tensorflow:*** Example ***
INFO:tensorflow:unique_id: 1000000009
INFO:tensorflow:example_index: 5
INFO:tensorflow:doc_span_index: 0
INFO:tensorflow:tokens: [CLS] what is this model based on ? [SEP] in this paper , we propose a ref ##ined scene text det ##ector with a novel feature en ##hance ##ment network ( fe ##n ) for region proposal and text detection ref ##ine ##ment . ret ##rospective ##ly , both region proposal with only 3 x 3 sl ##iding - window feature and text detection ref ##ine ##ment with single scale high level feature are ins ##uff ##icient , especially for smaller scene text . therefore , we design a new fe ##n network with task - specific , low and high level sem ##anti ##c features fusion to improve the performance of text detection . be ##sides , since unit ##ary position - sensitive roi pool ##ing in general object detection is un ##rea ##sona ##ble for variable text regions , an ada ##ptive ##ly weight ##ed position - sensitive roi pool ##ing layer is de #

INFO:tensorflow:*** Example ***
INFO:tensorflow:unique_id: 1000000024
INFO:tensorflow:example_index: 13
INFO:tensorflow:doc_span_index: 0
INFO:tensorflow:tokens: [CLS] what is the objective / aim of this paper ? [SEP] with the expansion of data , increasing im ##bala ##nce ##d data has emerged . when the im ##bala ##nce ratio of data is high , most existing im ##bala ##nce ##d learning methods decline in classification performance . to address this problem , a few highly im ##bala ##nce ##d learning methods have been presented . however , most of them are still sensitive to the high im ##bala ##nce ratio . this work aims to provide an effective solution for the highly im ##bala ##nce ##d data classification problem . we conduct highly im ##bala ##nce ##d learning from the perspective of feature learning . we partition the majority class into multiple blocks with each being balance ##d to the minority class and combine each block with the minority class to construct a balance ##d sa

INFO:tensorflow:***** Running training *****
INFO:tensorflow:  Num orig examples = 11366
INFO:tensorflow:  Num split examples = 20833
INFO:tensorflow:  Batch size = 4
INFO:tensorflow:  Num steps = 5683
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use `tf.data.experimental.map_and_batch(...)`.
Instructions for updating:
Use tf.cast instead.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Running train on CPU
INFO:tensorflow:*** Features ***
INFO:tensorflow:  name = end_positions, shape = (4,)
INFO:tensorflow:  name = input_ids, shape = (4, 200)
INFO:tensorflow:  name = input_mask, shape = (4, 200)
INFO:tensorflow:  name = segment_ids, shape = (4, 200)
INFO:tensorflow:  name = start_positions, shape = (4,)
INFO:tensorflow:  name = unique_ids, shape = (4,)
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use keras.layers.dense instead.
INFO:t

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
2019-06-16 11:59:13.103398: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2019-06-16 11:59:13.135398: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2297355000 Hz
2019-06-16 11:59:13.138400: I tensorflow/compiler/xla/service/service.cc:150] XLA service 0x559e986d4640 executing computations on platform Host. Devices:
2019-06-16 11:59:13.138436: I tensorflow/compiler/xla/service/service.cc:158]   StreamExecutor device (0): <undefined>, <undefined>
2019-06-16 11:59:13.961214: I tensorflow/compiler/xla/service/service.cc:150] XLA service 0x559e9b2bce40 executing computations on platform CUDA. Devices:
2019-06-16 11:59:13.961257: I tensorflow/compiler/xla/service/service.c

INFO:tensorflow:global_step/sec: 5.41239
INFO:tensorflow:examples/sec: 21.6496
INFO:tensorflow:Saving checkpoints for 4000 into bert_result/model.ckpt.
INFO:tensorflow:global_step/sec: 2.52856
INFO:tensorflow:examples/sec: 10.1142
INFO:tensorflow:global_step/sec: 5.55838
INFO:tensorflow:examples/sec: 22.2335
INFO:tensorflow:global_step/sec: 5.45992
INFO:tensorflow:examples/sec: 21.8397
INFO:tensorflow:global_step/sec: 5.36482
INFO:tensorflow:examples/sec: 21.4593
INFO:tensorflow:global_step/sec: 5.33098
INFO:tensorflow:examples/sec: 21.3239
INFO:tensorflow:global_step/sec: 5.35504
INFO:tensorflow:examples/sec: 21.4202
INFO:tensorflow:global_step/sec: 5.4076
INFO:tensorflow:examples/sec: 21.6304
INFO:tensorflow:global_step/sec: 5.38427
INFO:tensorflow:examples/sec: 21.5371
INFO:tensorflow:global_step/sec: 5.38596
INFO:tensorflow:examples/sec: 21.5439
INFO:tensorflow:global_step/sec: 5.40493
INFO:tensorflow:examples/sec: 21.6197
INFO:tensorflow:Saving checkpoints for 5000 into bert_resul

INFO:tensorflow:input_ids: 101 14796 15107 10531 14493 10950 12713 19757 26636 11424 136 102 118 13435 27153 18576 10114 80870 22494 10108 10105 11561 27404 10146 16065 10146 14128 10111 43250 66191 10948 22494 10188 12902 27237 10146 11206 10146 27237 10188 12902 27404 22441 119 10106 10105 42671 10123 19072 16199 117 11408 69617 118 13596 62474 14010 10111 22021 118 13596 44771 10301 11206 75980 24367 100308 10111 10105 14444 103675 23058 10115 18527 10948 15374 22062 102374 10105 11531 24440 10350 13213 11949 10108 27404 22494 119 14586 19594 117 11951 23332 10380 13301 10105 15363 27404 11639 118 12604 11165 14488 107 27404 118 122 10147 117 107 10319 53963 21377 122 12473 22494 25030 10106 13547 58553 58496 10107 119 34176 17466 10135 107 27404 118 122 10147 107 10111 107 27404 11249 107 98343 10105 71577 11195 10108 17446 21622 23068 119 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1

INFO:tensorflow:token_to_orig_map: 17:0 18:1 19:2 20:3 21:4 22:5 23:5 24:6 25:7 26:8 27:9 28:9 29:9 30:10 31:11 32:11 33:12 34:13 35:14 36:15 37:15 38:16 39:17 40:18 41:19 42:20 43:21 44:22 45:23 46:24 47:25 48:26 49:27 50:28 51:29 52:30 53:31 54:32 55:33 56:33 57:34 58:35 59:36 60:37 61:38 62:39 63:40 64:41 65:42 66:43 67:44 68:45 69:46 70:47 71:47 72:47 73:48 74:49 75:50 76:51 77:52 78:53 79:54 80:54 81:55 82:56 83:57 84:58 85:59 86:60 87:61 88:62 89:62 90:63 91:64 92:64 93:64 94:64 95:65 96:66 97:66 98:67 99:67 100:68 101:69 102:69 103:70 104:71 105:72 106:73 107:73 108:73 109:74 110:74 111:74 112:75 113:76 114:77 115:78 116:78 117:78 118:79 119:80 120:81 121:82 122:83 123:84 124:85 125:86 126:87 127:87 128:88 129:89 130:90 131:91 132:91 133:91 134:92 135:93 136:94 137:95 138:96 139:97 140:98 141:99 142:100 143:100 144:100 145:101 146:102 147:103 148:104 149:104 150:105 151:105 152:105 153:106 154:107 155:108 156:109 157:110 158:111 159:112 160:113 161:114 162:115 163:116 164:116 16

INFO:tensorflow:*** Example ***
INFO:tensorflow:unique_id: 1000000036
INFO:tensorflow:example_index: 19
INFO:tensorflow:doc_span_index: 0
INFO:tensorflow:tokens: [CLS] what is the objective / aim of this paper ? [SEP] logic - based ben ##ders de ##com ##position ( lb ##b ##d ) is a powerful hybrid op ##timi ##sation technique that can combine the strong dual bound ##s of mixed inte ##ger programming ( mi ##p ) with the com ##bina ##torial search strength ##s of con ##stra ##int programming ( c ##p ) . a major draw ##back of lb ##b ##d is that it is a far more involved process to implement an lb ##b ##d solution to a problem than the " model - and - run " approach provided by both c ##p and mi ##p . we propose an auto ##mated approach that accept ##s an ar ##bit ##rary mini ##zin ##c model and solve ##s it using lb ##b ##d with no additional intervention on the part of the modell ##er . the design of this approach also reveals an interesting dual ##ity between lb ##b ##d and large n

INFO:tensorflow:***** Running predictions *****
INFO:tensorflow:  Num orig examples = 4927
INFO:tensorflow:  Num split examples = 9073
INFO:tensorflow:  Batch size = 8
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Running infer on CPU
INFO:tensorflow:*** Features ***
INFO:tensorflow:  name = input_ids, shape = (?, 200)
INFO:tensorflow:  name = input_mask, shape = (?, 200)
INFO:tensorflow:  name = segment_ids, shape = (?, 200)
INFO:tensorflow:  name = unique_ids, shape = (?,)
INFO:tensorflow:**** Trainable Variables ****
INFO:tensorflow:  name = bert/embeddings/word_embeddings:0, shape = (119547, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/position_embeddings:0, shape = (512, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/LayerNorm/gamma:0, shape = (768,), 

INFO:tensorflow:Graph was finalized.
2019-06-16 12:20:06.765638: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1512] Adding visible gpu devices: 0, 1, 2, 3
2019-06-16 12:20:06.765867: I tensorflow/core/common_runtime/gpu/gpu_device.cc:984] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-06-16 12:20:06.765878: I tensorflow/core/common_runtime/gpu/gpu_device.cc:990]      0 1 2 3 
2019-06-16 12:20:06.765895: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 0:   N Y N N 
2019-06-16 12:20:06.765899: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 1:   Y N N N 
2019-06-16 12:20:06.765902: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 2:   N N N Y 
2019-06-16 12:20:06.765906: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 3:   N N Y N 
2019-06-16 12:20:06.766852: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 9247 MB memory) -> physical GPU (de

### 调用"bert-as-server"框架

In [4]:
!pip install bert-serving-server

Collecting bert-serving-server
  Using cached https://files.pythonhosted.org/packages/5e/3e/44d79e1a739b8619760051410c61af67f95477c87fbe43e3e9426427feb5/bert_serving_server-1.9.1-py3-none-any.whl
Collecting termcolor>=1.1 (from bert-serving-server)
  Using cached https://files.pythonhosted.org/packages/8a/48/a76be51647d0eb9f10e2a4511bf3ffb8cc1e6b14e9e4fab46173aa79f981/termcolor-1.1.0.tar.gz
Collecting GPUtil>=1.3.0 (from bert-serving-server)
  Using cached https://files.pythonhosted.org/packages/ed/0e/5c61eedde9f6c87713e89d794f01e378cfd9565847d4576fa627d758c554/GPUtil-1.4.0.tar.gz
Building wheels for collected packages: termcolor, GPUtil
  Building wheel for termcolor (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/xuhui/.cache/pip/wheels/7c/06/54/bc84598ba1daf8f970247f550b175aaaee85f68b4b0c5ab2c6
  Building wheel for GPUtil (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/xuhui/.cache/pip/wheels/3d/77/07/80562de4bb0786e5ea186911a2c831fdd0018bda69beab71fd
Succe

In [6]:
!bert-serving-start -model_dir '/home/xuhui/10002/Machine_Learning/multi_cased_L-12_H-768_A-12/multi_cased_L-12_H-768_A-12'/ -num_worker=4 

usage: /home/xuhui/anaconda3/bin/bert-serving-start -model_dir /home/xuhui/10002/Machine Learning/multi_cased_L-12_H-768_A-12/multi_cased_L-12_H-768_A-12/ -num_worker=4
                 ARG   VALUE
__________________________________________________
           ckpt_name = bert_model.ckpt
         config_name = bert_config.json
                cors = *
                 cpu = False
          device_map = []
       do_lower_case = True
  fixed_embed_length = False
                fp16 = False
 gpu_memory_fraction = 0.5
       graph_tmp_dir = None
    http_max_connect = 10
           http_port = None
        mask_cls_sep = False
      max_batch_size = 256
         max_seq_len = 25
           model_dir = /home/xuhui/10002/Machine Learning/multi_cased_L-12_H-768_A-12/multi_cased_L-12_H-768_A-12/
          num_worker = 4
       pooling_layer = [-2]
    pooling_strategy = REDUCE_MEAN
                port = 5555
            port_out = 5556
       prefetch_size = 10
 priority_batch_size = 16
show

### 利用Bert构建句向量并计算比较预测结果与供选answer的相似度，同时得到提交文件

In [None]:
#all nbest_prediction
import csv
import json
from bert_serving.client import BertClient
import numpy as np

def cosine(a,b):
    return a.dot(b)/(np.linalg.norm(a)*np.linalg.norm(b))

def get_submission():
    csvFile_test = open('/home/xuhui/10002/Machine_Learning/test.csv', 'r', encoding='UTF-8')
    test_file = csv.reader(csvFile_test)
    
    csvFile_submit = open('submission_n.csv','w',newline ='')
    submission = csv.writer(csvFile_submit)
    fileHeader = ["Id","Category"]
    submission.writerow(fileHeader)
    
#     with open("predictions/predictions.json") as f:
#         prediction = json.load(f)

    with open("predictions/nbest_predictions.json") as f:
        nbest_prediction = json.load(f)
    print('load完成')
        
    bc = BertClient()  
    ID = 1
    for line in test_file:
        # 忽略第一行
        if test_file.line_num == 1:
            continue
        else:
            ans1 = line[3]
            ans2 = line[4]

            dis1 = 0
            dis2 = 0
            for pre in nbest_prediction[str(ID)]:
                dis1 += np.array(bc.encode([pre['text'], ans1]))
                dis2 += np.array(bc.encode([pre['text'], ans2]))
                
            #print(ID,dis1,dis2)
            if np.linalg.norm(dis1) <= np.linalg.norm(dis2):
                submission.writerow([ID, 1])
            else:
                submission.writerow([ID, 2])
        ID = ID + 1
        print(ID)
    csvFile_test.close()
    csvFile_submit.close()

get_submission()

load完成


here is what you can do:
- or, start a new server with a larger "max_seq_len"
  '- or, start a new server with a larger "max_seq_len"' % self.length_limit)


2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
27

In [55]:
def evaluate():
    csvFile1 = open('/home/xuhui/10002/Machine_Learning/sample.csv', 'r', encoding='utf-8')
    sample_file = csv.reader(csvFile1)
    
    csvFile2 = open('/home/xuhui/10002/Machine_Learning/submission_n.csv', 'r', encoding='utf-8')
    pre_file = csv.reader(csvFile2)
    
    True_ans = []
    Pre_ans = []
    
    for line in sample_file:
        if sample_file.line_num == 1:
            continue
        True_ans.append(int(line[1]))
    
    for line in pre_file:
        if pre_file.line_num == 1:
            continue
        Pre_ans.append(int(line[1]))
    
    real = []
    for i in True_ans:
        if i == 1:
            real.extend([1,0])
        elif i == 2:
            real.extend([0,1])
    pre = []
    for i in Pre_ans:
        if i == 1:
            pre.extend([1,0])
        elif i == 2:
            pre.extend([0,1])
    
    real = np.array(real)
    pre = np.array(pre)
    precision = ((real==1)*(pre==1)).sum()/(pre==1).sum()
    recall = ((real==1)*(pre==1)).sum()/(real==1).sum()
    F1 = precision*recall/(precision+recall)
    print(precision,F1)

evaluate()

0.49198295108585344 0.24599147554292672
