In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.image as mpimg
from collections import Counter

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


Predict whether a given tweet is about a real distaster or not.

In [5]:
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)

In [6]:
def vocab_counter(text,key):
    vocabulary = Counter()
    for tweet in text[key]:
        tweet = tf.strings.split(tweet)
        vocabulary.update(list(tweet.numpy()))
    return vocabulary

def truncated_vocabulary(vocabulary,vocab_size,):
    truncated_vocabulary = [ word for word, count in vocabulary.most_common()[:vocab_size]]
    return truncated_vocabulary

def preprocess(truncated_vocabulary, num_oov_buckets):
    words = tf.constant(truncated_vocabulary)
    len_truncated_vocab = len(truncated_vocabulary)
    word_ids = tf.range(len_truncated_vocab, dtype=tf.int64)
    vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)
    table = tf.lookup.StaticVocabularyTable(vocab_init,num_oov_buckets)
    return table

def encode_words(X_batch, y_batch):
    return table.lookup(X_batch), y_batch

def encode_test_words(X_batch, y_batch=None):
    return table.lookup(X_batch)

def train_target(train_data,text_label,target_label):
    train_set = train_data[[text_label,target_label]]
    target = train_set.pop(target_label)
    target = target.values.reshape(len(target),1)
    dataset = tf.data.Dataset.from_tensor_slices((train_set.values, target))
    train_set_model= dataset.batch(32).map(encode_words).prefetch(1)
    return dataset,train_set_model

def test(test_data, text_label):
    test_set = test_data[text_label]
    test_set = test_set.values.reshape(len(test_set),1)
    test_set = tf.data.Dataset.from_tensor_slices(test_data[text_label])
    test = test_set.batch(32).map(encode_test_words).prefetch(1)
    return test

In [7]:
vocab_size = 10000
num_oov_buckets = 1000
v = vocab_counter(train_data,'text')
t= truncated_vocabulary(v,num_oov_buckets)
table= preprocess(t,num_oov_buckets)
train_set_model = train_target(train_data,'text','target')



In [8]:
test_set = test(test_data,'text')
test_set

<PrefetchDataset shapes: (None,), types: tf.int64>

In [9]:
dataset = train_set_model[0]
train_set_model1 = train_set_model[1]

In [10]:
len(v.keys())

31924

It looks like we have 31924 words in our dictionary. Lets take the top 10,000 of them and used them in our model.

In [11]:
table.lookup(tf.constant([b'this movie was faaaaaaantastic'.split()]))

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[  24,  337,   21, 1770]])>

Lets make sure that the dataset converted correctly. We are going to iterate over the dataset and print it's features and target

In [12]:
for feat, targ in dataset.take(5):
  print ('Features: {}, Target: {}'.format(feat, targ))

Features: [b'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'], Target: [1]
Features: [b'Forest fire near La Ronge Sask. Canada'], Target: [1]
Features: [b"All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"], Target: [1]
Features: [b'13,000 people receive #wildfires evacuation orders in California '], Target: [1]
Features: [b'Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school '], Target: [1]


In [128]:
my_callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='loss',patience=3),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.2, patience=2, min_lr=0.0001)
]
adam_optimizer = keras.optimizers.Adam(learning_rate=0.001,
                                                 beta_1=0.9,
                                                 beta_2=0.999)

sgd_optimizer = keras.optimizers.SGD(lr=0.2, momentum=0.9, decay=0.01)


In [44]:
# og model
embed_size = 128
model1 = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size, input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(1, activation='sigmoid')
])
model1.compile(loss= 'binary_crossentropy', optimizer='Adam', metrics=['accuracy'])

trained_model1 = model.fit(train_set_model1, epochs=25)


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [131]:
embed_size = 128
model3 = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size, input_shape=[None], mask_zero=True),
    keras.layers.SpatialDropout1D(0.2),
    keras.layers.Bidirectional(keras.layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2,return_sequences=True)),
    keras.layers.Bidirectional(keras.layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2,)),
    keras.layers.Dense(128),
    keras.layers.Dense(1, activation='sigmoid')
])
model3.compile(loss= 'binary_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])

trained_model3 = model.fit(train_set_model1, epochs=25, callbacks= my_callbacks)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25


In [35]:
model1_predictions = model1.predict(test_set)
model1_predictions
model3_predictions = model3.predict(test_set)
model3_predictions
#np.savetxt('predictions3'.csv, model3_predictions, delimeters=',')
#np.savetxt('predictions1'.csv, model1_predictions, delimeters=',')



array([[0.50071144],
       [0.49993077],
       [0.5001224 ],
       ...,
       [0.5001627 ],
       [0.49998587],
       [0.5001549 ]], dtype=float32)

In [39]:
import tensorflow_hub as hub

model4 = keras.Sequential([
    hub.KerasLayer('https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1', dtype=tf.string, input_shape=[], output_shape=[50]),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])
model4.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [41]:
# This is broken
batch_size = 32
#train_set = dataset.batch(batch_size).prefetch(1)
tweet_model = model.fit(train_set_model1, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
model4_predictions = model4.predict(test_set)
model4_predictions
#np.savetxt('predictions4'.csv, model4_predictions, delimeters=',')