<a href="https://colab.research.google.com/github/whaledarn/MMRVaccineResearch/blob/main/rnn_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from __future__ import absolute_import, division, print_function

import tensorflow as tf
from tensorflow import keras

import numpy as np
from numpy import array

import os

import matplotlib.pyplot as plt

import pandas as pd
import random

from preproc import tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn import metrics
from sklearn.metrics import f1_score

# Load the raw tweets and their annotation columns from the two spreadsheets.
df = pd.read_excel(r'Tweets.xls')
df2 = pd.read_excel(r'Data.xlsx')
tweets = list(df['Field'])
tone = list(df2['tone/emotion'])
# Named `message_type` (not `type`) so the builtin `type` is not shadowed.
message_type = list(df2['type of message'])
attitude = list(df2['attitude toward vaccine'])

# Only rows that exist in every column can form a complete example (the zip
# below stops at the shortest column), so derive the usable row count from
# the data instead of hard-coding it. A hard-coded count raises IndexError on
# a shorter sheet and leaves trailing rows untokenized on a longer one.
length_of_data = min(len(tweets), len(tone), len(message_type), len(attitude))
seed = 50

# Tokenize exactly the rows that will be used.
tweets = [tokenize(tweet) for tweet in tweets[:length_of_data]]

# Keep each tweet together with its three labels while shuffling.
mapped = list(zip(tweets, tone, message_type, attitude))

# Fixed seed so the train/validation/test split is reproducible.
random.seed(seed)
random.shuffle(mapped)

# Split by position: first 2100 train, next 300 validation, remainder test.
train_tweets, train_tone, train_type, train_attitude = zip(*mapped[:2100])
validation_tweets, validation_tone, validation_type, validation_attitude = zip(*mapped[2100:2400])
testing_tweets, testing_tone, testing_type, testing_attitude = zip(*mapped[2400:])


# Change these three names to train on a different task
# (e.g. train_tone / validation_tone / testing_tone).
selected_train = train_attitude
selected_validation = validation_attitude
selected_testing = testing_attitude

# Fit ONE label encoder on the labels of all three splits. Fitting a separate
# encoder per split (as was done before) lets the label -> column mapping, and
# even the one-hot width, differ between splits whenever a class is missing
# from one of them, which silently corrupts validation/test metrics.
label_encoder = LabelEncoder()
label_encoder.fit(array(selected_train + selected_validation + selected_testing))

# Fit the one-hot encoder on every integer class id so all splits get the
# same number of columns.
# NOTE(review): newer scikit-learn releases rename `sparse` to
# `sparse_output`; `sparse=False` is kept to match the original environment.
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoder.fit(np.arange(len(label_encoder.classes_)).reshape(-1, 1))


def _one_hot(labels):
    """Return the dense one-hot matrix for `labels` using the shared encoders."""
    integer_encoded = label_encoder.transform(array(labels))
    return onehot_encoder.transform(integer_encoded.reshape(-1, 1))


train_select = _one_hot(selected_train)
valid_select = _one_hot(selected_validation)
testing_select = _one_hot(selected_testing)



# Build the vocabulary from the training tweets only, then convert every
# split to sequences of integer word ids using that same vocabulary.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_tweets)
train_tweets = tokenizer.texts_to_sequences(train_tweets)
validation_tweets = tokenizer.texts_to_sequences(validation_tweets)
testing_tweets = tokenizer.texts_to_sequences(testing_tweets)

vocab_size = len(tokenizer.word_index) + 1

# A dictionary mapping words to an integer index.
# The ids must stay exactly as the tokenizer assigned them, because the
# sequences above were produced with those ids. Shifting them by 3, as the
# IMDB tutorial does for its pre-encoded dataset, would make
# reverse_word_index decode every token to the wrong word.
# Copy so the tokenizer's own table is not modified.
word_index = dict(tokenizer.word_index)

# The Keras Tokenizer never assigns index 0 to a word, so it is free to be
# used as the padding id.
word_index["<PAD>"] = 0

reverse_word_index = {index: word for word, index in word_index.items()}

def decode_review(text):
    """Turn a sequence of integer ids back into a space-separated string.

    Each id in `text` is looked up in the module-level `reverse_word_index`;
    ids that are not present there are rendered as '?'.
    """
    words = (reverse_word_index.get(token_id, '?') for token_id in text)
    return ' '.join(words)

# Bring every split to a fixed length of 50 ids: shorter sequences are filled
# at the end ('post') with the <PAD> id, longer ones are cut according to
# pad_sequences' default `truncating` setting. All three splits share the
# same options so the model sees identically shaped input everywhere.
pad_kwargs = {
    "value": word_index["<PAD>"],
    "padding": 'post',
    "maxlen": 50,
}

train_data, valid_data, testing_data = (
    keras.preprocessing.sequence.pad_sequences(sequences, **pad_kwargs)
    for sequences in (train_tweets, validation_tweets, testing_tweets)
)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [None]:
from keras.layers import Embedding, LSTM, Conv1D, MaxPooling1D, GlobalMaxPooling1D 

# looks at the top 10000 words
# Must be at least the `num_words` given to the Tokenizer so that every id
# appearing in the padded sequences is a valid embedding row.
vocab_size = 10000

# Width of the output layer, taken from the one-hot labels so the model keeps
# working when the selected task has a different number of classes.
num_classes = train_select.shape[1]

model = keras.Sequential()
# Learn a 200-dimensional vector per word id.
# NOTE(review): inputs are post-padded with id 0 and no masking is enabled,
# so the LSTM also steps over the padding; consider `mask_zero=True`.
model.add(keras.layers.Embedding(vocab_size, 200))
# Only the final LSTM state is passed on (return_sequences=False).
model.add(keras.layers.LSTM(64, return_sequences=False,
               dropout=0.1, recurrent_dropout=0.1))

# Fully connected layer
model.add(keras.layers.Dense(64, activation='relu'))

# Dropout for regularization
model.add(keras.layers.Dropout(0.5))

# One softmax unit per class.
model.add(keras.layers.Dense(num_classes, activation="softmax"))


model.summary()

# categorical_crossentropy matches the one-hot encoded targets.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])


# Train silently (verbose=0); per-epoch metrics are kept in `history`.
history = model.fit(train_data,
                    train_select,
                    epochs=15,
                    batch_size=64,
                    validation_data=(valid_data, valid_select),
                    verbose=0)

# Evaluate on the held-out test split; returns [loss, accuracy].
results = model.evaluate(testing_data, testing_select)

print(results)

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 200)         2000000   
_________________________________________________________________
lstm_4 (LSTM)                (None, 64)                67840     
_________________________________________________________________
dense_12 (Dense)             (None, 64)                4160      
_________________________________________________________________
dropout_6 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_13 (Dense)             (None, 3)                 195       
Total params: 2,072,195
Trainable params: 2,072,195
Non-trainable params: 0
_________________________________________________________________
[0.7338001903277546, 0.7721943]
