In [27]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words

In [28]:
import re
import string
def clean_text(text):
    text = text.lower()
    text = text.encode('ascii', 'ignore').decode('utf-8') 
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)  #网址
    text = re.sub(r'@\S+', ' ', text)  # 艾特
    text = re.sub(r"'", ' ', text)  
    text = re.sub(r'%20', ' ', text)
    text = re.sub(r'\w*\d+\w*', '', text) # 12abc12
    text = re.sub(r' \d+ ', ' ', text)
    table = str.maketrans('','',string.punctuation) # 其他符号
    text = text.translate(table)
    tmp = text.split()
    text = " ".join(list(filter(lambda x: x not in sklearn_stop_words,tmp)))
    return text

In [29]:
train_df = pd.read_csv("train.csv")[:6000]
val_df = pd.read_csv("train.csv")[6000:]
test_df = pd.read_csv("test.csv")

train_df.keyword = train_df.keyword.fillna("")
test_df.keyword = test_df.keyword.fillna("")
val_df.keyword = val_df.keyword.fillna("")

train_df["text"] = train_df["text"] +" "+ train_df["keyword"]
test_df["text"] = test_df["text"] +" "+ test_df["keyword"]
val_df["text"] = val_df["text"] +" "+ val_df["keyword"]

train_df["text"] = train_df["text"].apply(clean_text)
test_df["text"] = test_df["text"].apply(clean_text)
val_df["text"] = val_df["text"].apply(clean_text)
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,deeds reason earthquake allah forgive,1
1,4,,,forest near la ronge sask canada,1
2,5,,,residents asked shelter place notified officer...,1
3,6,,,people receive wildfires evacuation orders cal...,1
4,7,,,just got sent photo ruby alaska smoke wildfire...,1
...,...,...,...,...,...
5995,8560,screams,BrasÌ_lia,echoes screams screams,0
5996,8561,screams,POFFIN,bts song jimin screams screams,0
5997,8562,screams,#Gladiator Û¢860Û¢757Û¢,casually phone jasmine cries screams spider sc...,0
5998,8567,screams,W.I.T.S Academy,screams screams,0


In [30]:
from gensim.models.word2vec import Word2Vec
df = pd.read_csv("train.csv")
df.keyword = df.keyword.fillna("")
df["text"] = df["text"] +" "+ df["keyword"]
df["text"] = df["text"].apply(clean_text)
token_list = [l.split() for l in df.text]
num_features = 32
min_word_count = 1
num_workers = 2
window_size = 3  # 上下文窗口大小
subsampling = 1e-3 # 高频词条降采样率

model = Word2Vec(token_list, workers=num_workers, vector_size=num_features, 
                 min_count=min_word_count,window=window_size, sample=subsampling)


# 保存已训练的模型
model_name = "twitter_32_word2vec_model"
model.save(model_name)

# # 测试模型效果

In [31]:
from keras.preprocessing.text import Tokenizer
from keras import models
from keras import layers
from keras import regularizers
from keras.preprocessing import sequence

In [32]:
max_words = 12000
embedding_dim = 32
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(train_df.text)

In [33]:
tok.index_word

{1: 's',
 2: 't',
 3: 'emergency',
 4: 'just',
 5: 'like',
 6: 'amp',
 7: 'body',
 8: 'disaster',
 9: 'burning',
 10: 'buildings',
 11: 'm',
 12: 'new',
 13: 'people',
 14: 'accident',
 15: 'news',
 16: 'nuclear',
 17: 'police',
 18: 'collapse',
 19: 'don',
 20: 'mass',
 21: 'crash',
 22: 'fires',
 23: 'video',
 24: 'attack',
 25: 'forest',
 26: 'dead',
 27: 'death',
 28: 'fatal',
 29: 'flames',
 30: 'fear',
 31: 'california',
 32: 'damage',
 33: 'man',
 34: 'day',
 35: 'floods',
 36: 'bomb',
 37: 'got',
 38: 'know',
 39: 'flood',
 40: 'storm',
 41: 'injured',
 42: 'flooding',
 43: 'time',
 44: 'going',
 45: 'debris',
 46: 'fatalities',
 47: 'evacuation',
 48: 'bags',
 49: 'oil',
 50: 'army',
 51: 'world',
 52: 'truck',
 53: 'today',
 54: 'military',
 55: 'evacuate',
 56: 'outbreak',
 57: 'love',
 58: 'derailment',
 59: 'spill',
 60: 'explosion',
 61: 'plan',
 62: 'harm',
 63: 'collided',
 64: 'deluge',
 65: 'screaming',
 66: 'destroy',
 67: 'deaths',
 68: 'bloody',
 69: 'ambulance',
 

In [34]:
model_name = "twitter_32_word2vec_model"
Word2Vec_model = Word2Vec.load(model_name)
weight = np.zeros((max_words,embedding_dim))
for i in range(1,max_words):
    weight[i] = Word2Vec_model.wv[tok.index_word.get(i,'?')]

In [35]:
train_seq = tok.texts_to_sequences(train_df.text)
val_seq = tok.texts_to_sequences(val_df.text)
test_seq = tok.texts_to_sequences(test_df.text)

train_matrix = tok.texts_to_matrix(train_df.text)
val_matrix = tok.texts_to_matrix(val_df.text)
test_matrix = tok.texts_to_matrix(test_df.text)

In [36]:
max_len = 25

In [37]:
train_seq_mat = sequence.pad_sequences(train_seq,maxlen=max_len)
test_seq_mat = sequence.pad_sequences(test_seq,maxlen=max_len)
val_seq_mat = sequence.pad_sequences(val_seq,maxlen=max_len)

## DNN

In [38]:
model = models.Sequential()
model.add(layers.Dense(32, activation='relu',input_shape=(max_words,)))
model.add(layers.Dense(8, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
             loss = 'binary_crossentropy',
             metrics=["accuracy"])
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 32)                384032    
_________________________________________________________________
dense_13 (Dense)             (None, 8)                 264       
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 9         
Total params: 384,305
Trainable params: 384,305
Non-trainable params: 0
_________________________________________________________________


In [39]:
history_dense = model.fit(train_matrix,
                    train_df.target,
                   epochs=20,
                   batch_size = 128,
                   validation_data =(val_matrix,val_df.target))

Train on 6000 samples, validate on 1613 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## RNN

In [40]:
model = models.Sequential()
model.add(layers.Embedding(input_dim=max_words, output_dim=embedding_dim,
                           input_length = max_len))

model.add(layers.SimpleRNN(32))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
             loss = 'binary_crossentropy',
             metrics=["accuracy"])
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 25, 32)            384000    
_________________________________________________________________
simple_rnn_3 (SimpleRNN)     (None, 32)                2080      
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 33        
Total params: 386,113
Trainable params: 386,113
Non-trainable params: 0
_________________________________________________________________


In [41]:
history_rnn = model.fit(train_seq_mat,
                    train_df.target,
                   epochs=20,
                   batch_size = 128,
                   validation_data =(val_seq_mat,val_df.target))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 6000 samples, validate on 1613 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [42]:
model = models.Sequential()
model.add(layers.Embedding(input_dim=max_words, output_dim=embedding_dim,
                           input_length = max_len,
                           weights=[weight],
                           trainable=False))

model.add(layers.SimpleRNN(32))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
             loss = 'binary_crossentropy',
             metrics=["accuracy"])
model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 25, 32)            384000    
_________________________________________________________________
simple_rnn_4 (SimpleRNN)     (None, 32)                2080      
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 33        
Total params: 386,113
Trainable params: 2,113
Non-trainable params: 384,000
_________________________________________________________________


In [43]:
history_rnn_pretrain = model.fit(train_seq_mat,
                    train_df.target,
                   epochs=20,
                   batch_size = 128,
                   validation_data =(val_seq_mat,val_df.target))

Train on 6000 samples, validate on 1613 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## LSTM

In [77]:
model = models.Sequential()
model.add(layers.Embedding(input_dim=max_words, output_dim=embedding_dim,
                           input_length = max_len))


model.add(layers.LSTM(32))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
             loss = 'binary_crossentropy',
             metrics=["accuracy"])
model.summary()

Model: "sequential_27"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_25 (Embedding)     (None, 25, 32)            384000    
_________________________________________________________________
lstm_14 (LSTM)               (None, 32)                8320      
_________________________________________________________________
dense_46 (Dense)             (None, 1)                 33        
Total params: 392,353
Trainable params: 392,353
Non-trainable params: 0
_________________________________________________________________


In [78]:
history_lstm = model.fit(train_seq_mat,
                    train_df.target,
                   epochs=20,
                   batch_size = 128,
                   validation_data =(val_seq_mat,val_df.target))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 6000 samples, validate on 1613 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [59]:
model = models.Sequential()
model.add(layers.Embedding(input_dim=max_words, output_dim=embedding_dim,
                           input_length = max_len,
                          weights = [weight],
                          trainable=False))


model.add(layers.LSTM(32))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
             loss = 'binary_crossentropy',
             metrics=["accuracy"])
model.summary()

Model: "sequential_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 25, 32)            384000    
_________________________________________________________________
lstm_10 (LSTM)               (None, 25, 16)            3136      
_________________________________________________________________
lstm_11 (LSTM)               (None, 16)                2112      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_28 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_29 (Dense)             (None, 1)                 17        
Total params: 389,537
Trainable params: 5,537
Non-trainable params: 384,000
___________________________________________

In [60]:
history_lstm_pretrain = model.fit(train_seq_mat,
                    train_df.target,
                   epochs=20,
                   batch_size = 128,
                   validation_data =(val_seq_mat,val_df.target))

Train on 6000 samples, validate on 1613 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## CNN

In [70]:
model = models.Sequential()
model.add(layers.Embedding(input_dim=max_words, output_dim=embedding_dim,
                           input_length = max_len))

model.add(layers.Conv1D(32, 3, activation='relu'))
model.add(layers.MaxPooling1D(3))
model.add(layers.Flatten())
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
             loss = 'binary_crossentropy',
             metrics=["accuracy"])
model.summary()

Model: "sequential_24"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_22 (Embedding)     (None, 25, 32)            384000    
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 23, 32)            3104      
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 7, 32)             0         
_________________________________________________________________
flatten_9 (Flatten)          (None, 224)               0         
_________________________________________________________________
dense_40 (Dense)             (None, 32)                7200      
_________________________________________________________________
dense_41 (Dense)             (None, 1)                 33        
Total params: 394,337
Trainable params: 394,337
Non-trainable params: 0
_______________________________________________

In [71]:
history_cnn = model.fit(train_seq_mat,
                    train_df.target,
                   epochs=20,
                   batch_size = 128,
                   validation_data =(val_seq_mat,val_df.target))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 6000 samples, validate on 1613 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [50]:
model = models.Sequential()
model.add(layers.Embedding(input_dim=max_words, output_dim=embedding_dim,
                           input_length = max_len,
                          weights = [weight],
                          trainable=False))

model.add(layers.Conv1D(32, 3, activation='relu'))
model.add(layers.MaxPooling1D(3))
model.add(layers.Flatten())
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
             loss = 'binary_crossentropy',
             metrics=["accuracy"])
model.summary()

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 25, 32)            384000    
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 23, 32)            3104      
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 7, 32)             0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 224)               0         
_________________________________________________________________
dense_21 (Dense)             (None, 32)                7200      
_________________________________________________________________
dense_22 (Dense)             (None, 1)                 33        
Total params: 394,337
Trainable params: 10,337
Non-trainable params: 384,000
__________________________________________

In [51]:
history_cnn_pretrain = model.fit(train_seq_mat,
                    train_df.target,
                   epochs=20,
                   batch_size = 128,
                   validation_data =(val_seq_mat,val_df.target))

Train on 6000 samples, validate on 1613 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [52]:
np.save('history_dense.npy', history_dense.history)

np.save('history_rnn.npy', history_rnn.history)
np.save('history_rnn_pretrain.npy', history_rnn_pretrain.history)


np.save('history_lstm.npy', history_lstm.history)
np.save('history_lstm_pretrain.npy', history_lstm_pretrain.history)

np.save('history_cnn.npy', history_cnn.history)
np.save('history_cnn_pretrain.npy', history_cnn_pretrain.history)