In [1]:
# Mount Google Drive inside the Colab VM; later cells read the dataset from
# a path below this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Dense, Dropout, LSTM, Embedding, Conv1D, Flatten, MaxPooling1D, GRU
from keras.models import Sequential
from keras.optimizers import Adam

In [0]:
def build_model_cnn_lstm(X,
                         y,
                         vocabulary_size,
                         max_length,
                         callbacks_list=None,
                         Embedding_size=200,
                         batch_size=16384,
                         validation_split=0.04,
                         epochs=100,
                         conv_filters=64,
                         kernel_size=5,
                         pool_size=4,
                         dropout_rate=0.2):
    """
    Build, compile and train a Convolutional Neural Network followed by a
    Long Short-Term Memory network for binary classification.

    Layer stack: Embedding -> Dropout -> Conv1D (relu) -> MaxPooling1D ->
    LSTM -> Dense(1, sigmoid). The model is compiled with binary
    cross-entropy loss, the 'adam' optimizer and the 'accuracy' metric.

    INPUT:
        X : Multidimensional list - The training features
        y : list                  - The training labels
        vocabulary_size           - Input dimension of the Embedding layer
        max_length                - Passed to the Embedding layer as input_length
        callbacks_list            - The callback options for the model
                                    (None means no callbacks)
        Embedding_size            - The size of the embedding; also used as
                                    the number of LSTM units
        batch_size                - The size of the batch in the neural network
        validation_split          - Forwarded to fit() as validation_split
        epochs                    - The number of epochs
        conv_filters              - Number of filters of the Conv1D layer
        kernel_size               - Kernel size of the Conv1D layer
        pool_size                 - Pool size of the MaxPooling1D layer
        dropout_rate              - Rate of the Dropout layer placed after
                                    the embedding
    OUTPUT:
        Returns a tuple (model, history): the trained model and the object
        returned by model.fit()
    """
    print('Using Convolutional Neural Network with a Long Short-Term Memory Network')

    model_conv = Sequential()
    model_conv.add(
        Embedding(vocabulary_size, Embedding_size, input_length=max_length))
    model_conv.add(Dropout(dropout_rate))
    model_conv.add(Conv1D(conv_filters, kernel_size, activation='relu'))
    model_conv.add(MaxPooling1D(pool_size=pool_size))
    # The LSTM width deliberately follows the embedding width, as before.
    model_conv.add(LSTM(Embedding_size))
    # Single sigmoid unit: the model outputs one value per sample.
    model_conv.add(Dense(1, activation='sigmoid'))
    model_conv.compile(
        loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    model_conv.summary()

    history_conv = model_conv.fit(
        X,
        y,
        validation_split=validation_split,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=callbacks_list)

    return model_conv, history_conv

In [0]:
# Pickled tweets DataFrame on the mounted Google Drive.
# (For a local run, point this at "./data/tweets.pkl" instead.)
TWEETS_PICKLE_PATH = "/content/drive/My Drive/EPFL/Machine Learning/MA1/tweets.pkl"

# NOTE: read_pickle unpickles the file, so only load files you trust.
# Rows are shuffled with a fixed seed to mix the positives and negatives,
# then re-indexed from 0.
train_set = (
    pd.read_pickle(TWEETS_PICKLE_PATH)
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

# Plain Python list of the 'tweet' column, in shuffled order.
tweets = list(train_set['tweet'].values)

In [0]:
# --- Turn the raw tweets into fixed-length integer sequences ---

# Vectorisation settings, reused later when building the model.
vocabulary_size = 100000
max_length = 32

# Fit the tokenizer on the full tweet corpus.
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(tweets)

# Encode every tweet, then pad/truncate each one to `max_length` entries.
sequences = tokenizer.texts_to_sequences(tweets)
X = pad_sequences(sequences, maxlen=max_length)

# Targets come straight from the 'label' column (same row order as X).
y = train_set['label'].values

In [30]:
# Train the CNN + LSTM classifier on the padded tweet sequences.
#
# No callbacks are passed here, so training runs for the full `epochs`.
# To save the best weights and stop early, pass for example:
#
#   callbacks_list=[
#       ModelCheckpoint(
#           filepath='CNN_LSTM_best_weights.hdf5',
#           monitor='val_acc',
#           verbose=1,
#           save_best_only=True,
#           mode='max'),
#       EarlyStopping(
#           monitor='val_acc', patience=3, mode='max')
#   ]
#
# NOTE(review): the monitored key depends on the Keras version; with
# metrics=['accuracy'] newer releases log 'val_accuracy' rather than
# 'val_acc' -- confirm against the installed version before enabling.
#
# The returned model and history are bound to names so that later cells can
# evaluate, save or plot them; previously they were only echoed as the
# cell's output and then lost.
model_cnn_lstm, history_cnn_lstm = build_model_cnn_lstm(
    X,
    y,
    vocabulary_size,
    max_length,
    Embedding_size=200,
    batch_size=16384,
    validation_split=0.3,
    epochs=100)

Using Convolutional Neural Network with a Long Short-Term Memory Network
Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 32, 200)           20000000  
_________________________________________________________________
dropout_8 (Dropout)          (None, 32, 200)           0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 28, 64)            64064     
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 7, 64)             0         
_________________________________________________________________
lstm_7 (LSTM)                (None, 200)               212000    
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 201       
Total params: 20,276,265
Trainable params: 20,2

(<keras.engine.sequential.Sequential at 0x7f81beff16d8>,
 <keras.callbacks.History at 0x7f847bb2c9b0>)