In [2]:
import csv
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from numpy import array
from numpy import asarray
from numpy import zeros
from sklearn.model_selection import train_test_split

from keras.callbacks import ModelCheckpoint
from keras.layers import Conv1D
from keras.layers import Flatten
from keras.layers import GlobalMaxPooling1D,MaxPooling1D,GRU
from keras.layers import LSTM, Bidirectional
from keras.layers.core import Activation, Dropout, Dense
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import Tokenizer

## Prepare data

In [None]:
# Load the labelled tweets and keep only the text and label columns.
df_full = pd.read_csv("/content/drive/My Drive/ml/tweets_full.csv")
df_tweets_full = df_full[['tweet', 'label']]

# Hold out 5% of the rows as a test set; the seed is fixed for repeatable splits.
X_train, X_test, y_train, y_test = train_test_split(
    df_tweets_full['tweet'],
    df_tweets_full['label'],
    test_size=0.05,
    random_state=42,
)

# Build the word -> integer dictionary from the training tweets only.
tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(X_train)

# Index 0 is reserved, hence the +1.
vocab_size = len(tokenizer.word_index) + 1
print("vocab_size:", vocab_size)

maxlen = 100

# Replace every word by its integer id, then pad each tweet at the end
# (padding='post') so all sequences have length `maxlen`.
X_train = pad_sequences(
    tokenizer.texts_to_sequences(X_train), padding='post', maxlen=maxlen)
X_test = pad_sequences(
    tokenizer.texts_to_sequences(X_test), padding='post', maxlen=maxlen)

## Prepare embedding matrix

In [None]:
# Width of the pre-trained vectors; every row of embedding_matrix has this size.
embedding_dim = 200

# Read the pre-trained word vectors: each usable line is a token followed by
# `embedding_dim` float components, separated by whitespace.
# NOTE(review): the original comment credited https://nlp.stanford.edu/projects/glove/
# but the file name suggests word2vec vectors - confirm the provenance.
embeddings_dictionary = dict()
with open('/content/drive/My Drive/ml/w2v_full_w20_min4.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        # Skip blank lines and lines of the wrong width (e.g. the
        # "<count> <dim>" header that word2vec-format text files may start
        # with). A one-value line would otherwise be silently broadcast
        # across a whole row of the matrix below.
        if len(records) != embedding_dim + 1:
            continue
        embeddings_dictionary[records[0]] = asarray(records[1:], dtype='float32')

# Row i holds the vector of the word with tokenizer index i; words without a
# pre-trained vector (and the reserved row 0) stay all-zero.
embedding_matrix = zeros((vocab_size, embedding_dim))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

## Bidirectional LSTM RNN

In [None]:
# Bidirectional LSTM classifier on top of the frozen pre-trained embeddings.
model = Sequential()
model.add(Embedding(
    vocab_size,
    embedding_matrix.shape[1],  # embedding width taken from the matrix itself
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# The metric is registered under the name 'acc' so that its validation
# counterpart is logged as 'val_acc', the key the checkpoint monitors below.
# With metrics=['accuracy'] some Keras versions log 'val_accuracy' instead and
# the checkpoint never finds its monitored value.
# NOTE(review): sigmoid + binary_crossentropy expects 0/1 labels, while the
# prediction cell writes -1/1 - confirm the encoding of the 'label' column.
model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()  # prints the summary itself; wrapping it in print() adds "None"

# Keep the model from the epoch with the best validation accuracy on disk.
best_model_checkpoint = ModelCheckpoint(
    filepath='./RNN_bi_lstm_best_weights.hdf5',
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max')

history = model.fit(
    X_train, y_train,
    batch_size=32,
    epochs=6,
    verbose=1,
    validation_split=0.05,
    callbacks=[best_model_checkpoint])

# NOTE(review): this evaluates the weights of the last epoch; the best
# checkpoint saved above is not reloaded anywhere in this notebook.
score = model.evaluate(X_test, y_test, verbose=1)
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

## Create prediction file

In [None]:
def create_csv_submission(y_pred, path):
    """
    Creates an output file in csv format for submission to kaggle.

    Rows are numbered from 1 in the order of `y_pred` and written under the
    header ``Id,Prediction``; each prediction is converted with ``int()``.

    Arguments: y_pred (predicted class labels, one per sample; either a flat
                       sequence or an array with one value per row, such as
                       the (n, 1) output of ``model.predict``)
               path (string name of .csv output file to be created)
    Raises: ValueError if a row of y_pred holds more than one value
    """
    labels = np.asarray(y_pred)
    if labels.ndim > 1:
        # Validate before opening the file so a bad input never truncates an
        # existing submission.
        if labels.size != labels.shape[0]:
            raise ValueError(
                "y_pred must contain exactly one prediction per sample, got shape %s"
                % (labels.shape,))
        # Flatten (n, 1) -> (n,): calling int() on a one-element array is
        # deprecated in recent NumPy releases.
        labels = labels.reshape(-1)

    with open(path, 'w', newline='') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(
            {'Id': row_id, 'Prediction': int(label)}
            for row_id, label in enumerate(labels, start=1))


In [None]:
# Load the cleaned test tweets and shift the row index so it starts at 1.
to_predict = pd.read_csv("test_cleaned.csv")
to_predict.index = to_predict.index + 1

# Keep only the tweet text, forced to str.
to_predict = to_predict['tweet'].astype(str)

# Encode with the tokenizer fitted on the training tweets, then pad at the end
# to `maxlen`, exactly as was done for the training data.
to_predict = pad_sequences(
    tokenizer.texts_to_sequences(to_predict), padding='post', maxlen=maxlen)

# The sigmoid output layer yields values in [0, 1].
result_test = model.predict(to_predict)

# Map the scores to class labels: below 0.5 -> -1, otherwise -> 1.
# Both masks are computed before the array is modified.
below_threshold = result_test < 0.5
at_or_above_threshold = result_test >= 0.5
result_test[below_threshold] = -1
result_test[at_or_above_threshold] = 1

In [None]:
# Write the -1/1 test predictions to the submission file.
create_csv_submission(y_pred=result_test, path="submissions.csv")