In [49]:
import tweets_processor
import numpy as np
import keras
import mlflow.keras
from keras.models import load_model
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Embedding
from keras.layers import Conv1D, MaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.backend import expand_dims
from sklearn.model_selection import train_test_split

In [61]:
# load the data
# tweets_processor.get_tweets_from_csv() returns two values, unpacked here as
# the tweet texts and their region labels.
# NOTE(review): presumably two parallel sequences (one label per tweet), since
# their processed forms are passed together to train_test_split below —
# confirm against tweets_processor.
tweets_text, tweets_regions = tweets_processor.get_tweets_from_csv()

In [62]:
# Run every raw tweet through the project's preprocessor; the resulting list
# keeps the same order as tweets_text.
processed_tweets = [
    tweets_processor.preprocessor(raw_tweet) for raw_tweet in tweets_text
]

In [63]:
# Convert the region labels to integers using the project helper.
# NOTE(review): the ids are later one-hot encoded with to_categorical, so they
# are presumably 0-based and contiguous — confirm in tweets_processor.encode_labels.
encoded_labels = tweets_processor.encode_labels(tweets_regions)

In [64]:
# Hold out 33% of the samples as the test set. random_state is fixed so the
# same split is produced every time the cell is run.
(train_data, test_data,
 train_labels, test_labels) = train_test_split(
    processed_tweets,
    encoded_labels,
    test_size=0.33,
    random_state=0,
)

In [65]:
# Create the tokenizer at character level (char_level=True).
# For a word-level vocabulary, construct it as Tokenizer() instead.
# The tokenizer is fitted on the training split only; the test split is
# encoded later with this same fitted tokenizer.
t = Tokenizer(char_level=True)
t.fit_on_texts(train_data)

In [66]:
# Vocabulary size used as the Embedding input dimension.
# Sized from word_index (the token -> id map actually used by
# texts_to_sequences) rather than word_counts: word_index also contains the
# OOV token when one is configured, so this stays correct if the tokenizer
# settings change. The +1 accounts for id 0, which is reserved for padding.
vocab_size = len(t.word_index) + 1

In [67]:
# Fixed input length (in tokens) that every sample is padded/truncated to.
max_len = 100  # hyperparameter

# Step 1: map each training text to its sequence of token ids.
encoded_train_data = t.texts_to_sequences(train_data)

# Step 2: bring all sequences to exactly max_len ids, padding at the end.
# NOTE(review): `truncating` is left at its default, so the side on which
# over-long sequences are cut is whatever pad_sequences defaults to — confirm
# that is the intended side given padding='post'.
padded_train_data = pad_sequences(
    encoded_train_data,
    maxlen=max_len,
    padding='post',
)

# Step 3: convert to a numpy array for model.fit.
final_train_data = np.array(padded_train_data)

In [68]:
# Step 1: map each test text to token ids with the tokenizer fitted on the
# training split.
encoded_test_data = t.texts_to_sequences(test_data)

# Step 2: pad/truncate to the same max_len as the training data. It has to
# match, because the model's Embedding layer is built with input_length=max_len.
padded_test_data = pad_sequences(
    encoded_test_data,
    maxlen=max_len,
    padding='post',
)

# Step 3: convert to a numpy array for model.evaluate.
final_test_data = np.array(padded_test_data)

In [69]:
# One-hot encode the integer training labels.
# The number of classes is taken from tweets_processor.regions_mapping, the
# same source that sizes the model's softmax output layer, so the label width
# and the output width cannot drift apart (23 regions at the time of writing).
# Note: despite its name, x_train holds the training *labels*, not features;
# the name is kept because the model cell passes it to model.fit.
x_train = keras.utils.to_categorical(
    train_labels, num_classes=len(tweets_processor.regions_mapping)
)
x_train[0]

array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0.], dtype=float32)

In [70]:
# One-hot encode the integer test labels.
# The number of classes is taken from tweets_processor.regions_mapping, the
# same source that sizes the model's softmax output layer, so the label width
# and the output width cannot drift apart (23 regions at the time of writing).
y_test = keras.utils.to_categorical(
    test_labels, num_classes=len(tweets_processor.regions_mapping)
)

In [71]:
# Build, train and evaluate a 1D-CNN region classifier over the padded id
# sequences, recording hyperparameters, test metrics and the fitted model in
# a single MLflow run.
model = Sequential()

with mlflow.start_run():
    # Hyperparameters (all of them are candidates for tuning).
    embedding_features = 20  # embedding output_dim; arbitrary starting value
    first_filter_num = 32
    first_conv_len = 3
    first_pool_size = 2
    second_filter_num = 64
    second_conv_len = 3
    second_pool_size = 2
    first_dense_dim = 128
    dropout = 0.5
    activation = 'relu'  # activation of the conv layers and the hidden dense layer
    batch_size = 32
    epochs = 10

    # Log the configuration BEFORE training, so that a run which fails or is
    # interrupted during fit()/evaluate() still records what was attempted.
    run_params = {
        "max_input_len": max_len,
        "embedding_features": embedding_features,
        "first_filter_num": first_filter_num,
        "first_conv_len": first_conv_len,
        "first_pool_size": first_pool_size,
        "second_filter_num": second_filter_num,
        "second_conv_len": second_conv_len,
        "second_pool_size": second_pool_size,
        "first_dense_dim": first_dense_dim,
        "dropout": dropout,
        "batch_size": batch_size,
        "epochs": epochs,
        # Derived from the fitted tokenizer so the logged value cannot go
        # stale if the tokenizer is switched between char and word level.
        "level": 'character_level' if t.char_level else 'word_level',
        "activation": activation,
    }
    for param_name, param_value in run_params.items():
        mlflow.log_param(param_name, param_value)

    # Embedding over the tokenizer vocabulary; input_length is required so
    # that the Flatten layer further down sees a fixed-size tensor.
    model.add(Embedding(vocab_size, output_dim=embedding_features, input_length=max_len))

    # First conv stage: two stacked convolutions followed by max pooling.
    model.add(Conv1D(first_filter_num, first_conv_len, activation=activation))
    model.add(Conv1D(first_filter_num, first_conv_len, activation=activation))
    model.add(MaxPooling1D(first_pool_size))

    # Second conv stage: same layout with more filters.
    model.add(Conv1D(second_filter_num, second_conv_len, activation=activation))
    model.add(Conv1D(second_filter_num, second_conv_len, activation=activation))
    model.add(MaxPooling1D(second_pool_size))

    # Classifier head: flatten -> dense -> dropout -> softmax over the regions.
    model.add(Flatten())
    model.add(Dense(first_dense_dim, activation=activation))
    model.add(Dropout(dropout))
    model.add(Dense(len(tweets_processor.regions_mapping), activation='softmax'))  # one unit per region

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # x_train holds the one-hot training labels (see the cell that creates it).
    model.fit(final_train_data, x_train, batch_size=batch_size, epochs=epochs)

    # score is [loss, accuracy], matching the loss/metrics given to compile().
    score = model.evaluate(final_test_data, y_test)
    print(score)

    mlflow.log_metric("evaluation_loss", score[0])
    mlflow.log_metric("evaluation_accuracy", score[1])

    # Persist the fitted model as an artifact of this run.
    mlflow.keras.log_model(model, "cnn_models")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[2.866049933000044, 0.13272727272727272]
