In [2]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd 
import numpy as np
import os
import matplotlib.pyplot as plt 
import time

In [3]:
df = pd.read_csv("C:/Users/zzy/DL_dataset/disaster_nlp_pred_train.csv")

In [4]:
df.shape

(7613, 5)

In [5]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


1代表灾难。0表示不是灾难

然后我们查看一下灾难和非灾难的总数各是多少

In [6]:
print((df.target==1).sum())
print((df.target==0).sum())

3271
4342


接下来我们进行数据的预处理

In [22]:
import re #正则表达式
import string
def remove_url(text):
    url=re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"",text)
def remove_punct(text):
    translator = str.maketrans("","",string.punctuation)
    return text.translate(translator)

In [23]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

下面我们来看一个删除的例子

In [24]:
pattern = re.compile(r"https?://(\S+|www)\.\S+")
for t in df.text:
    matches = pattern.findall(t)
    for match in matches:
        print(t)
        print(match)
        print(pattern.sub(r"",t))
    if len(matches)>0:
        break

我们再删除一些stop words

In [25]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zzy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
stop = set(stopwords.words("english"))
stop

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [27]:
def remove_stopwords(text):
    filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
    return " ".join(filtered_words)

In [28]:
df["text"] = df.text.map(remove_stopwords)
df["text"] = df.text.map(remove_url)
df["text"] = df.text.map(remove_punct)

In [29]:
df.text

0            deeds reason earthquake may allah forgive us
1                   forest fire near la ronge sask canada
2       residents asked shelter place notified officer...
3       13000 people receive wildfires evacuation orde...
4       got sent photo ruby alaska smoke wildfires pou...
                              ...                        
7608    two giant cranes holding bridge collapse nearb...
7609    ariaahrary thetawniest control wild fires cali...
7610                      m194 0104 utc5km volcano hawaii
7611    police investigating ebike collided car little...
7612    latest homes razed northern california wildfir...
Name: text, Length: 7613, dtype: object

我们来统计一下每一个唯一单词的个数

In [30]:
from collections import Counter
def counter_word(text_col):
    count=Counter()
    for text in text_col.values:
        for word in text.split():
            count[word] += 1
    return count

In [31]:
counter = counter_word(df.text)
counter

Counter({'deeds': 2,
         'reason': 20,
         'earthquake': 50,
         'may': 88,
         'allah': 9,
         'forgive': 2,
         'us': 164,
         'forest': 65,
         'fire': 250,
         'near': 54,
         'la': 25,
         'ronge': 1,
         'sask': 1,
         'canada': 11,
         'residents': 8,
         'asked': 9,
         'shelter': 6,
         'place': 26,
         'notified': 1,
         'officers': 8,
         'evacuation': 50,
         'orders': 11,
         'expected': 15,
         '13000': 4,
         'people': 196,
         'receive': 2,
         'wildfires': 11,
         'california': 117,
         'got': 112,
         'sent': 13,
         'photo': 41,
         'ruby': 1,
         'alaska': 6,
         'smoke': 48,
         'pours': 1,
         'school': 66,
         'rockyfire': 4,
         'update': 37,
         'hwy': 9,
         '20': 26,
         'closed': 20,
         'directions': 1,
         'due': 31,
         'lake': 14,
         'co

In [32]:
len(counter)

18075

In [33]:
counter.most_common(5)

[('like', 345), ('im', 299), ('amp', 298), ('fire', 250), ('get', 229)]

In [37]:
num_unique_words=len(counter)

将数据分离为训练集和验证集

In [34]:
train_size = int(df.shape[0]*0.8)
train_df = df[:train_size]
val_df = df[train_size:]

分离数据为文本和标签

In [35]:
train_sentences = train_df.text.to_numpy()
train_labels = train_df.target.to_numpy()
val_sentences = val_df.text.to_numpy()
val_labels = val_df.target.to_numpy()

In [36]:
train_sentences.shape ,val_sentences.shape

((6090,), (1523,))

标记器

In [38]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences)

每一个单词都有一个唯一的索引

In [40]:
word_index =tokenizer.word_index 
word_index

{'like': 1,
 'amp': 2,
 'fire': 3,
 'im': 4,
 'get': 5,
 'via': 6,
 'new': 7,
 'people': 8,
 'news': 9,
 'emergency': 10,
 'one': 11,
 '2': 12,
 'us': 13,
 'video': 14,
 'disaster': 15,
 'burning': 16,
 'body': 17,
 'would': 18,
 'buildings': 19,
 'police': 20,
 'crash': 21,
 'first': 22,
 'california': 23,
 'still': 24,
 'man': 25,
 'got': 26,
 'know': 27,
 'day': 28,
 'back': 29,
 'going': 30,
 'two': 31,
 'time': 32,
 'full': 33,
 'accident': 34,
 'see': 35,
 'world': 36,
 'attack': 37,
 'nuclear': 38,
 'youtube': 39,
 'may': 40,
 'love': 41,
 'go': 42,
 'rt': 43,
 'many': 44,
 'cant': 45,
 '3': 46,
 'watch': 47,
 'collapse': 48,
 'dead': 49,
 'today': 50,
 'car': 51,
 'mass': 52,
 'want': 53,
 'years': 54,
 'work': 55,
 'train': 56,
 'last': 57,
 'good': 58,
 'think': 59,
 'families': 60,
 'hiroshima': 61,
 'life': 62,
 'fires': 63,
 'best': 64,
 'could': 65,
 'say': 66,
 'u': 67,
 'death': 68,
 'hot': 69,
 'forest': 70,
 'way': 71,
 'killed': 72,
 'need': 73,
 'legionnaires': 74,


现在我们再将原始文本变成索引文本

In [41]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)

In [42]:
print(train_sentences[10:15])
print(train_sequences[10:15])

['three people died heat wave far'
 'haha south tampa getting flooded hah wait second live south tampa gonna gonna fvck flooding'
 'raining flooding florida tampabay tampa 18 19 days ive lost count'
 'flood bago myanmar we arrived bago'
 'damage school bus 80 multi car crash breaking']
[[527, 8, 399, 155, 301, 415], [760, 478, 2280, 137, 2281, 2853, 528, 620, 189, 478, 2280, 190, 190, 5749, 115], [2854, 115, 1914, 5750, 2280, 1307, 1475, 529, 259, 653, 2855], [97, 3791, 621, 1089, 1476, 3791], [109, 90, 340, 3792, 3793, 51, 21, 316]]


现在我们发现每个sentences的文本长度不同，每个索引文本之间的长度也不相同。<br>
但我们希望是相同的<br>
所以我们来填充索引文本

In [43]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_length = 20
train_padded = pad_sequences(train_sequences,maxlen=max_length,padding="post",truncating="post")
val_padded = pad_sequences(val_sequences,maxlen=max_length,padding="post",truncating="post")
train_padded.shape,val_padded.shape

((6090, 20), (1523, 20))

上述看来，我们把句子的列都变成20了

现在我们来打印一个填充序列

In [44]:
train_padded[10]

array([527,   8, 399, 155, 301, 415,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0])

上边的填充序列可知，我们的填充是填充序号0

In [45]:
print(train_sentences[10])
print(train_sequences[10])
print(train_padded[10])

three people died heat wave far
[527, 8, 399, 155, 301, 415]
[527   8 399 155 301 415   0   0   0   0   0   0   0   0   0   0   0   0
   0   0]


上面我们的word_index型式是这样的“like:1”,但是我们现在想让它转换成“1:like”这样的

In [46]:
reverse_word_index = dict([(idx,word) for (word,idx) in word_index.items()])
reverse_word_index

{1: 'like',
 2: 'amp',
 3: 'fire',
 4: 'im',
 5: 'get',
 6: 'via',
 7: 'new',
 8: 'people',
 9: 'news',
 10: 'emergency',
 11: 'one',
 12: '2',
 13: 'us',
 14: 'video',
 15: 'disaster',
 16: 'burning',
 17: 'body',
 18: 'would',
 19: 'buildings',
 20: 'police',
 21: 'crash',
 22: 'first',
 23: 'california',
 24: 'still',
 25: 'man',
 26: 'got',
 27: 'know',
 28: 'day',
 29: 'back',
 30: 'going',
 31: 'two',
 32: 'time',
 33: 'full',
 34: 'accident',
 35: 'see',
 36: 'world',
 37: 'attack',
 38: 'nuclear',
 39: 'youtube',
 40: 'may',
 41: 'love',
 42: 'go',
 43: 'rt',
 44: 'many',
 45: 'cant',
 46: '3',
 47: 'watch',
 48: 'collapse',
 49: 'dead',
 50: 'today',
 51: 'car',
 52: 'mass',
 53: 'want',
 54: 'years',
 55: 'work',
 56: 'train',
 57: 'last',
 58: 'good',
 59: 'think',
 60: 'families',
 61: 'hiroshima',
 62: 'life',
 63: 'fires',
 64: 'best',
 65: 'could',
 66: 'say',
 67: 'u',
 68: 'death',
 69: 'hot',
 70: 'forest',
 71: 'way',
 72: 'killed',
 73: 'need',
 74: 'legionnaires',


现在我们已经有了索引序列和其对应的文本序列，那么我们来试一下，如何通过索引读到对应的文本

In [47]:
def decode(sequence):
    return " ".join([reverse_word_index.get(idx,"?") for idx in sequence])

In [48]:
decode_text = decode(train_sequences[10])
print(train_sequences[10])
print(decode_text)

[527, 8, 399, 155, 301, 415]
three people died heat wave far


根据上述的字典来看，解码是对的

现在我们来训练我们的模型

In [51]:
from tensorflow.keras import layers
model = keras.models.Sequential()
model.add(layers.Embedding(num_unique_words,32,input_length=max_length))
model.add(layers.LSTM(64,dropout=0.1))
model.add(layers.Dense(1,activation="sigmoid"))
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 32)            578400    
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 603,297
Trainable params: 603,297
Non-trainable params: 0
_________________________________________________________________


In [53]:
loss = keras.losses.BinaryCrossentropy(from_logits=False)
optim = keras.optimizers.Adam(learning_rate=0.001)
metrics=["accuracy"]
model.compile(loss=loss,optimizer = optim,metrics=metrics)

In [54]:
epochs = 30
early_stopping= keras.callbacks.EarlyStopping(
    monitor="loss",
    patience=5,
    verbose=2
)
model.fit(train_padded,
          train_labels,
          epochs=epochs,
          callbacks=[early_stopping],
          validation_data=(val_padded,val_labels),
          verbose=2)

Epoch 1/30
191/191 - 4s - loss: 0.5488 - accuracy: 0.7120 - val_loss: 0.4685 - val_accuracy: 0.7807 - 4s/epoch - 18ms/step
Epoch 2/30
191/191 - 2s - loss: 0.2892 - accuracy: 0.8921 - val_loss: 0.5209 - val_accuracy: 0.7741 - 2s/epoch - 9ms/step
Epoch 3/30
191/191 - 2s - loss: 0.1642 - accuracy: 0.9433 - val_loss: 0.7296 - val_accuracy: 0.7551 - 2s/epoch - 9ms/step
Epoch 4/30
191/191 - 2s - loss: 0.1069 - accuracy: 0.9677 - val_loss: 0.8024 - val_accuracy: 0.7630 - 2s/epoch - 9ms/step
Epoch 5/30
191/191 - 2s - loss: 0.0849 - accuracy: 0.9749 - val_loss: 0.7870 - val_accuracy: 0.7472 - 2s/epoch - 9ms/step
Epoch 6/30
191/191 - 2s - loss: 0.0798 - accuracy: 0.9767 - val_loss: 0.8320 - val_accuracy: 0.7354 - 2s/epoch - 8ms/step
Epoch 7/30
191/191 - 2s - loss: 0.0641 - accuracy: 0.9785 - val_loss: 0.9544 - val_accuracy: 0.7426 - 2s/epoch - 9ms/step
Epoch 8/30
191/191 - 2s - loss: 0.0486 - accuracy: 0.9816 - val_loss: 1.0218 - val_accuracy: 0.7426 - 2s/epoch - 9ms/step
Epoch 9/30
191/191 - 2s

<keras.callbacks.History at 0x15428ddce20>

In [55]:
predictions = model.predict(train_padded)
predictions = [1 if p>0.5 else 0 for p in predictions]

In [56]:
print(train_sentences[10:20])
print(train_labels[10:20])
print(predictions[10:20])

['three people died heat wave far'
 'haha south tampa getting flooded hah wait second live south tampa gonna gonna fvck flooding'
 'raining flooding florida tampabay tampa 18 19 days ive lost count'
 'flood bago myanmar we arrived bago'
 'damage school bus 80 multi car crash breaking' 'whats man' 'love fruits'
 'summer lovely' 'car fast' 'goooooooaaaaaal']
[1 1 1 1 1 0 0 0 0 0]
[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
