# import lib

In [None]:
#参考https://blog.csdn.net/ZesenChen/article/details/84347553

In [1]:
import pandas as pd 
import tensorflow as tf
import numpy as np

import os 
import sys 
module_path = os.path.abspath(os.path.join('..')) 
if module_path not in sys.path: 
    sys.path.append(module_path)

# load data

In [2]:
csv_train = pd.read_csv("train.csv.zip", compression='zip')
csv_test = pd.read_csv("test.csv.zip", compression='zip')
csv_train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [3]:
csv_test.head()

Unnamed: 0,qid,question_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arroga...
1,00002bd4fb5d505b9161,When should I apply for RV college of engineer...
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitio...
3,000086e4b7e1c7146103,Who are entrepreneurs?
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?


In [4]:
train_labels = csv_train['target']
train_labels.value_counts(normalize=True)

0    0.93813
1    0.06187
Name: target, dtype: float64

In [5]:
display(csv_train.isnull().sum())
display(csv_test.isnull().sum())

qid              0
question_text    0
target           0
dtype: int64

qid              0
question_text    0
dtype: int64

# preprocess data

In [6]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
csv_train['question_text']= csv_train['question_text'].apply(lambda x : x.lower())
csv_test['question_text'] = csv_test['question_text'].apply(lambda x : x.lower())

In [8]:
csv_train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,how did quebec nationalists see their province...,0
1,000032939017120e6e44,"do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,why does velocity affect time? does velocity a...,0
3,000042bf85aa498cd78e,how did otto von guericke used the magdeburg h...,0
4,0000455dfa3e01eae3af,can i convert montra helicon d to a mountain b...,0


In [9]:
#split train and test
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(csv_train['question_text'], csv_train['target'], test_size = 0.2, train_size = 0.5)
# train_test_split()

In [10]:
from mypackage import util
vocab_size = 50000
doc_maxlen = 70

train_X = util.onehot_postpad_docs(train_X, vocab_size, doc_maxlen)
test_X = util.onehot_postpad_docs(test_X, vocab_size, doc_maxlen)

Using TensorFlow backend.


onehot and padded shape : (653061, 70) 
onehot and padded shape : (261225, 70) 


In [12]:
print(train_X[0])

[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
 46512 19743 43927 49756 49056 47079 27035 29649 47079  5147]


In [6]:
# vocab_size = 50000
# onehot_docs = [one_hot(doc, vocab_size) for doc in csv_train['question_text']]

# doc_maxlen = 70
# padded_docs = pad_sequences(onehot_docs, maxlen=doc_maxlen, padding='post')

# print(padded_docs.shape)

(1306122, 70)


# define model

In [29]:
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPool2D
from tensorflow.keras.layers import Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Reshape

In [30]:
def create_model(vocab_size, doc_maxlen):
    inputs = Input(shape = (doc_maxlen, ))           
    embedding_size = 50
    x = Embedding(input_dim = vocab_size, output_dim = embedding_size, input_length = doc_maxlen)(inputs)
    x = Reshape((doc_maxlen, embedding_size, 1))(x)
    filter_num = 8
    filter_conv_size = (2, embedding_size)
    x = Conv2D(filters = filter_num, kernel_size = filter_conv_size)(x)
    x = MaxPool2D(pool_size = (doc_maxlen - filter_conv_size[0] + 1, 1))(x)
    x = Flatten()(x)
    x = Dropout(0.2)(x)
    outputs = Dense(1, activation='sigmoid')(x)
    model = Model(inputs = inputs, outputs = outputs)
    return model
 
model = create_model(vocab_size, doc_maxlen)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
    

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 70)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 70, 50)            2500000   
_________________________________________________________________
reshape_1 (Reshape)          (None, 70, 50, 1)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 69, 1, 8)          808       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 1, 1, 8)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 8)                 0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 8)                 0         
__________

# train model

In [31]:
model.fit(train_X, train_y, epochs=5, verbose=1, validation_data=(test_X, test_y))

Train on 653061 samples, validate on 261225 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x18fd56710>

In [32]:
loss, acc = model.evaluate(test_X, test_y)
print(loss, acc)

0.12285197804582576 0.95227104


In [33]:
test_probs = model.predict(test_X)

In [34]:
print(test_probs)

[[0.00900665]
 [0.05216676]
 [0.02460027]
 ...
 [0.01335958]
 [0.00236544]
 [0.00790581]]


In [35]:
from mypackage import util
max_f1_score = util.compute_f1_maxscore(test_y, test_probs)
print("max_f1_score=", max_f1_score)

F1 score at threshold 0.1 is 0.5143650823851744
F1 score at threshold 0.11 is 0.5259902696527408
F1 score at threshold 0.12 is 0.535831743067116
F1 score at threshold 0.13 is 0.5441397759571978
F1 score at threshold 0.14 is 0.550984513748874
F1 score at threshold 0.15 is 0.5574137968925126
F1 score at threshold 0.16 is 0.5638455660928581
F1 score at threshold 0.17 is 0.569094122237268
F1 score at threshold 0.18 is 0.5733027222040014
F1 score at threshold 0.19 is 0.5769138927360429
F1 score at threshold 0.2 is 0.5799868494751966
F1 score at threshold 0.21 is 0.582824825122786
F1 score at threshold 0.22 is 0.5851799505325324
F1 score at threshold 0.23 is 0.5878426943217989
F1 score at threshold 0.24 is 0.589509394572025
F1 score at threshold 0.25 is 0.5918399830220713
F1 score at threshold 0.26 is 0.5935716593087111
F1 score at threshold 0.27 is 0.5946034610022144
F1 score at threshold 0.28 is 0.5958284795029678
F1 score at threshold 0.29 is 0.5969166713554267
F1 score at threshold 0.3 i

In [42]:
# from mypackage import util
from sklearn import metrics
def compute_auc(label_y, pred_y_prob):
    """
    type = {0, 1}
    """
    fprs, tprs, thresholds = metrics.roc_curve(label_y, pred_y_prob)
    
#     for i,  (fpr, tpr, thres) in enumerate(zip(fprs, tprs, thresholds)):
#         print("The {} thresh value={} computes fpr={}, tpr={}".format(i, thres, fpr, tpr))
    auc = metrics.auc(fprs, tprs) 
    print("auc = ", auc)
    return auc

auc = compute_auc(test_y, test_probs)
# print("auc=", auc)

auc =  0.943407723601038


# predict

In [43]:
from mypackage import util
test_padded_docs = util.onehot_postpad_docs(csv_test['question_text'], vocab_size, doc_maxlen)

onehot and padded shape : (375806, 70) 


In [46]:
pred_probs = model.predict(test_padded_docs, batch_size=16)

In [47]:
print(pred_probs)
def convert_prob_to_label(label_probs, prob_threshold:float,  positive_label,  negative_label):
    labels = [ positive_label if prob > prob_threshold  else  negative_label for prob in label_probs ]
    return labels
print(convert_prob_to_label(pred_labels, 0.3, 0, 1))

[[0.80515873]
 [0.00241938]
 [0.00436732]
 ...
 [0.00110356]
 [0.00120443]
 [0.08574923]]
[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 