In [1]:
from collections import Counter

import matplotlib.image as mpimg
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras

In [2]:
# Load the labelled training tweets and the unlabelled test tweets from the
# working directory.
train_data, test_data = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

In [3]:
# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Show column dtypes and non-null counts for the training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


Predict whether a given tweet is about a real disaster or not.

In [5]:
# Character-level Keras tokenizer.
# NOTE(review): no other cell shown here references `tokenizer`; the word
# vocabulary is built separately with a Counter — confirm this is still needed.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)

In [6]:
# Count how often each whitespace-separated token occurs across all training
# tweets. Tokens are kept as the byte strings returned by TensorFlow.
vocabulary = Counter()
for raw_text in train_data['text']:
    tokens = tf.strings.split(raw_text).numpy()
    vocabulary.update(tokens.tolist())

In [7]:
# Number of distinct tokens seen in the training tweets.
len(vocabulary)

31924

It looks like we have 31924 words in our dictionary. Let's take the top 10,000 of them and use them in our model.

In [8]:
# Keep only the `vocab_size` most frequent tokens; everything else will be
# handled by the out-of-vocabulary buckets of the lookup table.
vocab_size = 10000
truncated_vocabulary = [token for token, _ in vocabulary.most_common(vocab_size)]

In [9]:
# Build a static lookup table that maps each kept token to its index in
# `truncated_vocabulary`; unknown tokens are hashed into one of
# `num_oov_buckets` extra IDs placed after the vocabulary.
num_oov_buckets = 1000
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)
table = tf.lookup.StaticVocabularyTable(
    initializer=vocab_init,
    num_oov_buckets=num_oov_buckets,
)

In [10]:
# Sanity check: look up a hand-split sentence. A token that is not in the
# vocabulary should come back with an ID >= vocab_size (an OOV bucket).
table.lookup(tf.constant([b'this movie was faaaaaaantastic'.split()]))

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   24,   337,    21, 10770]], dtype=int64)>

In [11]:
def encode_words(X_batch, y_batch=0, max_tokens=50):
    """Split tweets into whitespace tokens and map each token to its vocabulary ID.

    Previously the raw tweet strings were handed straight to ``table.lookup``,
    so a whole tweet was treated as one vocabulary key (practically always an
    out-of-vocabulary bucket) and the model only ever saw sequences of length
    one. The text is now tokenized first.

    Every tweet is padded/truncated to exactly ``max_tokens`` tokens so the
    function works both when mapped over already-batched data and when mapped
    over single tweets that are batched afterwards.

    Args:
        X_batch: String tensor of tweet text. Either a scalar (one tweet) or
            any batched shape such as ``(batch,)`` or ``(batch, 1)``.
        y_batch: Labels, passed through unchanged. Defaults to 0 so the
            function can also be mapped over unlabelled datasets.
        max_tokens: Fixed number of tokens kept per tweet. Longer tweets are
            truncated; shorter ones are padded with ``b'<pad>'``.

    Returns:
        Tuple ``(ids, y_batch)`` where ``ids`` holds the looked-up token IDs
        with shape ``(max_tokens,)`` for a scalar input and
        ``(n_tweets, max_tokens)`` otherwise.
    """
    X_batch = tf.convert_to_tensor(X_batch, dtype=tf.string)
    single_tweet = X_batch.shape.rank == 0

    # Flatten to a 1-D list of tweets so (batch, 1) and scalar inputs are
    # handled by the same code path.
    tweets = tf.reshape(X_batch, [-1])
    tokens = tf.strings.split(tweets)  # ragged: one row of tokens per tweet

    # Dense, fixed-width token matrix; the pad token is looked up like any
    # other token (it lands in an OOV bucket unless it is in the vocabulary).
    padded = tokens.to_tensor(default_value=b'<pad>', shape=[None, max_tokens])
    ids = table.lookup(padded)

    if single_tweet:
        # Drop the artificial batch axis added by the reshape above.
        ids = ids[0]
    return ids, y_batch

In [12]:
# Features: a one-column frame with the raw tweet text. An explicit copy is
# taken so later changes never touch `train_data`.
train_set = train_data[['text']].copy()

# Labels as an (n_rows, 1) column vector. -1 lets NumPy infer the row count,
# so this keeps working if the training file changes size.
target = train_data['target'].values.reshape((-1, 1))

# Pair each tweet with its label, then batch and encode the text for the
# GRU model.
dataset = tf.data.Dataset.from_tensor_slices((train_set.values, target))
train_set_model1 = dataset.batch(32).map(encode_words).prefetch(1)

Let's make sure that the dataset was converted correctly. We are going to iterate over the dataset and print its features and target.

In [13]:
# Print the first five (features, target) pairs of the unbatched dataset.
line_template = 'Features: {}, Target: {}'
for features, label in dataset.take(5):
    print(line_template.format(features, label))

Features: [b'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'], Target: [1]
Features: [b'Forest fire near La Ronge Sask. Canada'], Target: [1]
Features: [b"All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"], Target: [1]
Features: [b'13,000 people receive #wildfires evacuation orders in California '], Target: [1]
Features: [b'Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school '], Target: [1]


In [14]:
embed_size = 128

# Binary classifier over sequences of vocabulary IDs:
# embedding -> two stacked GRU layers -> single sigmoid unit.
model = keras.models.Sequential()
model.add(
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size, input_shape=[None])
)
model.add(keras.layers.GRU(128, return_sequences=True))  # full sequence feeds the next GRU
model.add(keras.layers.GRU(128))                         # only the final state is kept
model.add(keras.layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])

# Train on the encoded, batched tweets; the returned History is kept for inspection.
trained_model1 = model.fit(train_set_model1, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [15]:
# Second model: a pre-trained NNLM sentence-embedding layer from TF-Hub
# (50-dimensional output) followed by a small dense classifier. The hub layer
# takes raw strings, so no vocabulary lookup is needed for this model.
# `hub` (tensorflow_hub) is imported in the notebook's import cell.
model1 = keras.Sequential([
    hub.KerasLayer(
        'https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1',
        dtype=tf.string,
        input_shape=[],
        output_shape=[50],
    ),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
])
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model1.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer (KerasLayer)     (None, 50)                48190600  
_________________________________________________________________
dense_1 (Dense)              (None, 128)               6528      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 48,197,257
Trainable params: 6,657
Non-trainable params: 48,190,600
_________________________________________________________________


In [17]:
# Train the TF-Hub model on the raw (un-encoded) tweet text in batches.
batch_size = 32
train_set = dataset.batch(batch_size).prefetch(buffer_size=1)
tweet_model = model1.fit(x=train_set, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [100]:
# Encode every test tweet as a tensor of vocabulary IDs, one tensor per tweet.
# - The comprehension variable is deliberately not called `val`: a loop
#   variable with that name would overwrite the `val()` threshold helper
#   whenever this cell is re-run after the helper has been defined.
# - dtype=tf.string keeps the lookup valid even for a tweet with no tokens,
#   where tf.constant([]) would otherwise not be a string tensor.
# NOTE(review): no later cell shown here reads `converted_test_data` — confirm
# whether it is still needed.
converted_test_data = [
    table.lookup(tf.constant(tweet.split(), dtype=tf.string))
    for tweet in test_data['text']
]

In [101]:
# Unbatched tf.data pipeline over the raw test tweet text (no labels).
t_data = tf.data.Dataset.from_tensor_slices(test_data['text'])

In [102]:
# Encode each test tweet, batch, and score with the GRU model.
test_batches = t_data.map(encode_words).batch(32).prefetch(1)
model_predictions = model.predict(test_batches)

In [87]:
# Sigmoid scores from the TF-Hub model, which is fed the raw tweet strings.
preds = model1.predict(test_data['text'])

In [107]:
def val(arr):
    """Turn a predicted probability into a hard 0/1 class label.

    Args:
        arr: A scalar score or a one-element array holding the score.

    Returns:
        1 when the score is at least 0.5, otherwise 0.
    """
    return 1 if arr >= .5 else 0

In [21]:
# Threshold every TF-Hub model score into a 0/1 label.
results = [val(score) for score in preds]

In [22]:
# Assemble the submission frame: predicted label plus the matching tweet id.
results = pd.DataFrame(results)
test_ids = test_data['id'].astype('int32')
results['id'] = pd.to_numeric(test_ids, errors='coerce')
results.columns = ['target', 'id']

In [23]:
# Write the TF-Hub model's predictions without the index column.
results.to_csv('model_results.csv', index=False)

In [24]:
# Spot-check the id stored in the row labelled 1.
results.loc[1, 'id']

2

In [110]:
# Threshold every GRU model score (one row of scores per tweet) into a 0/1 label.
results_1 = [val(score) for row in model_predictions for score in row]

In [111]:
# Assemble the submission frame for the GRU model: label plus tweet id.
results_1 = pd.DataFrame(results_1)
test_ids_1 = test_data['id'].astype('int32')
results_1['id'] = test_ids_1
results_1.columns = ['target', 'id']

In [112]:
# Write the GRU model's predictions without the index column.
results_1.to_csv('model1_results.csv', index=False)