In [0]:
# https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
!pip install ndjson



In [0]:
import ndjson
from collections import Counter
import pandas as pd
pd.set_option('display.max_colwidth', -1)
from sklearn.preprocessing import LabelBinarizer
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.metrics import categorical_accuracy
from keras.callbacks import ModelCheckpoint
from numpy import array
from numpy import asarray
from numpy import zeros
from numpy import vectorize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem.snowball import FrenchStemmer
import nltk
# packages settings
nltk.data.path.append('./')
nltk.download('stopwords', download_dir='./')
nltk.download('punkt', download_dir='./')
import re

Using TensorFlow backend.


[nltk_data] Downloading package stopwords to ./...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to ./...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
stop_words = set(stopwords.words('french'))

In [0]:
def loadTweetsFromNDJson(filepath):
    f = open(filepath)
    content = f.read()
    return ndjson.loads(content)

def save(fileName, content):
    f = open(fileName, 'w')
    f.write(content)
    f.close()
    print("Wrote in {}".format(fileName))

def setSplitter(complete_dataset, train_quota, validation_quota, test_quota):
    splitting_dataset = complete_dataset.copy()

    relative_validation_quota = round(validation_quota / (1 - train_quota), 1)
    relative_test_quota = round(test_quota / (1 - train_quota - validation_quota), 1)

    train_data = splitting_dataset.sample(frac = validation_quota)

    # remove training_data from splitting_dataset
    splitting_dataset = splitting_dataset.drop(train_data.index)

    validation_data = splitting_dataset.sample(frac=relative_validation_quota)

    # remove validation_data from splitting_dataset
    splitting_dataset = splitting_dataset.drop(validation_data.index)        

    test_data = splitting_dataset.sample(frac=relative_test_quota)

    return (train_data, validation_data, test_data)

def toXY(dataframe_input):
    X = ()
    Y = ()

    lb = LabelBinarizer()

    raw_Y = dataframe_input['polarity']
    Y = lb.fit_transform(raw_Y)

    print(lb.classes_)

    X = dataframe_input.drop(columns=['polarity'])

    return (X['encoded_message'].tolist(), Y)

def loadGloVe(file_path):
    embeddings_index = dict()
    f = open(file_path, encoding='utf8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

    f.close()
    print('Loaded %s word vectors' % len(embeddings_index))

    return embeddings_index


def createWeightMatrix(vocab_size, tokenizer, embeddings_index):
    embedding_matrix = zeros((vocab_size, 100))
    
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings_index.get(word)

        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix


def removeStopWordsFromMessage(dataframe):
    dataframe['message'] = dataframe['message'].apply(lambda x: ' '.join([item for item in word_tokenize(x) if item not in stop_words]))
    return dataframe

def removeVariousTwitterElementsFromMessage(dataframe):
    regex_filter = "(@[a-zA-ZÀ-ÿ0-9]+)|(#[a-zA-ZÀ-ÿ0-9]+)"
    dataframe['message'] = dataframe['message'].apply(lambda x: ' '.join(re.sub(regex_filter, ' ', x).split()))
    return dataframe

def removeUrlsFromMessage(dataframe):
    dataframe['message'] = dataframe['message'].apply(lambda x: re.split('https?:\/\/.*', str(x))[0])
    return dataframe

def messageStemming(dataframe):
    stemmer = FrenchStemmer()
    dataframe['message'] = dataframe['message'].apply(lambda x: ' '.join([stemmer.stem(y) for y in word_tokenize(x)]))
    return dataframe

def getOnlyAlphaFromMessage(dataframe):
    dataframe['message'] = dataframe['message'].apply(lambda x: ' '.join([word.lower() for word in word_tokenize(x) if word.isalpha()]))
    return dataframe

def prepareDataframeMessage(dataframe_source):
    df = dataframe_source.copy()
    return (df
            .pipe(removeUrlsFromMessage)
            .pipe(removeVariousTwitterElementsFromMessage)
            .pipe(getOnlyAlphaFromMessage)
            .pipe(removeStopWordsFromMessage)
            .pipe(messageStemming)
            )

In [0]:
# load tweets from json (ndjson)
tweetsRecord = loadTweetsFromNDJson('./drive/My Drive/Cours/application_innovation/datasets/project_annotated-hashtags-textblob.json')
#tweetsRecord = loadTweetsFromNDJson('./project_svm_annotated.json')

# load tweets in pandas dataframe
tweetsDataframe = pd.DataFrame(tweetsRecord)
# filter columns to use only message and polarity
tweetsDataframe = tweetsDataframe[['message', 'polarity']]

print(tweetsDataframe.head())
print(tweetsDataframe.describe())

                                                                                                                                       message polarity
0  #2017LeDebat C'était un suicide prémédité par peur du pouvoir et des responsabilités, ou Le pen est juste une cruche vide ?                  negatif
1  ET L'ÉCOLOGIE ? #2017LeDebat                                                                                                                 autre  
2  Conclusion du journaliste : "Madame LePen, vous ne respectez même pas les règles que vous avez vous-même fixées". Tout est dit #2017LeDebat  positif
3  "Et là Marine s'est écroulée comme une merde" #2017LeDebat  #debat2017  #2017LeDébat  #LeGrandDebat … https://t.co/Pk9LWRmZmF                negatif
4  Ces élections c'est une grosse mascarade en fait, un vieux monde a l'agonnie qui fait tout pour survivre #2017LeDebat #rendeznousmelenchon   positif
                                                                                        

In [0]:
tweetsDataframe = prepareDataframeMessage(tweetsDataframe)
tweetsDataframe.head()

Unnamed: 0,message,polarity
0,suicid prémed peur pouvoir respons pen just cruch vid,negatif
1,,autre
2,conclus journal madam lepen respect regl fix tout dit,positif
3,là marin écroul comm merd,negatif
4,élect gross mascarad fait vieux mond a fait tout survivr,positif


In [0]:
# Keras text tokenizer
tokenizer = Tokenizer()

tokenizer.fit_on_texts(tweetsDataframe['message'].tolist())

In [0]:
longest_message_length = tweetsDataframe.message.str.len().max()
#words_list = Counter()
#tweetsDataframe['message'].str.lower().str.split().apply(words_list.update)

#vocab_size = len(words_list.items())
vocab_size = len(tokenizer.word_index) + 1
labels_size = len(tweetsDataframe['polarity'].unique())
print("Labels count : {}".format(labels_size))
print("Longest message length : {}".format(longest_message_length))
print("Vocabulary size : {}".format(vocab_size))

Labels count : 4
Longest message length : 111
Vocabulary size : 14572


In [0]:
encoded_messages_list = tokenizer.texts_to_sequences(tweetsDataframe['message'].tolist())
padded_messages_list = pad_sequences(
    encoded_messages_list, 
    maxlen=longest_message_length, 
    padding='post')

In [0]:
embeddings_index = loadGloVe('./drive/My Drive/Cours/application_innovation/glove.6B.100d.txt')
embeddings_matrix = createWeightMatrix(vocab_size, tokenizer, embeddings_index)

Loaded 400000 word vectors


In [0]:
# encode message column
# tweetsDataframe['encoded_message'] = tweetsDataframe['message'].apply(lambda x: one_hot(x, vocab_size))
# tweetsDataframe['encoded_message'] = pad_sequences(tweetsDataframe['encoded_message'], maxlen=longest_message_length, padding='post').tolist()

tweetsDataframe['encoded_message'] = array(padded_messages_list).tolist()

tweetsDataframe = tweetsDataframe.drop(columns=['message'])

tweetsDataframe.head()


Unnamed: 0,polarity,encoded_message
0,negatif,"[559, 9645, 141, 234, 572, 7, 70, 3660, 135, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"
1,autre,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"
2,positif,"[82, 80, 51, 3, 94, 517, 1731, 13, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"
3,negatif,"[55, 6, 3661, 22, 89, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"
4,positif,"[274, 250, 1809, 9, 1068, 108, 2, 9, 13, 5055, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"


In [0]:
# Split the set

# 70% for training
training_set_percentage = 0.7
# 15% for validation
validation_set_percentage = 0.15
# 15% for evaluation
evaluation_set_percentage = 0.15

(training_set, validation_set, evaluation_set) = setSplitter(
    tweetsDataframe, 
    training_set_percentage, 
    validation_set_percentage,
    evaluation_set_percentage
    )

In [0]:
# Transform dataframe to X and Y values, to feed to the network
(X_training_set, Y_training_set) = toXY(training_set)
(X_validation_set, Y_validation_set) = toXY(validation_set)
(x_evaluation_set, Y_evaluation_set) = toXY(evaluation_set)

['autre' 'mixte' 'negatif' 'positif']
['autre' 'mixte' 'negatif' 'positif']
['autre' 'mixte' 'negatif' 'positif']


In [0]:
X_training_set = array(X_training_set)
X_validation_set = array(X_validation_set)

In [0]:
# Define model
model = Sequential()
model.add(Embedding(
    vocab_size, 
    100, 
    input_length=longest_message_length,
    weights=[embeddings_matrix],
    trainable=False))
model.add(Flatten())
#model.add(Dense(labels_size, activation='relu'))
model.add(Dense(labels_size, activation='softmax'))












In [0]:
# ModelCheckPoint configuration
MODEL_SAVE_PATH = './sequential.hdf5'
modelCheckpointCallback = ModelCheckpoint(
    MODEL_SAVE_PATH,
    monitor='val_categorical_accuracy',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    mode='max',
    period=1
)

In [0]:
# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=[categorical_accuracy])
model.summary()



Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 111, 100)          1457200   
_________________________________________________________________
flatten_1 (Flatten)          (None, 11100)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 44404     
Total params: 1,501,604
Trainable params: 44,404
Non-trainable params: 1,457,200
_________________________________________________________________


In [0]:
# fit the model
model.fit(
    X_training_set, 
    Y_training_set, 
    epochs=10, 
    verbose=1, 
    validation_data=(X_validation_set, Y_validation_set),
    callbacks=[modelCheckpointCallback])

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 10542 samples, validate on 29868 samples
Epoch 1/10

Epoch 00001: val_categorical_accuracy improved from -inf to 0.53529, saving model to ./sequential.hdf5
Epoch 2/10

Epoch 00002: val_categorical_accuracy improved from 0.53529 to 0.55370, saving model to ./sequential.hdf5
Epoch 3/10

Epoch 00003: val_categorical_accuracy improved from 0.55370 to 0.55648, saving model to ./sequential.hdf5
Epoch 4/10

Epoch 00004: val_categorical_accuracy improved from 0.55648 to 0.56428, saving model to ./sequential.hdf5
Epoch 5/10

Epoch 00005: val_categorical_accuracy improved from 0.56428 to 0.57024, saving model to ./sequential.hdf5
Epoch 6/10

Epoch 00006: val_categorical_accuracy improved from 0.57024 to 0.57044, saving model to ./sequential.hdf5
Epoch 7/10

Epoch 00007: val_categorical_accuracy improved from 0.57044 to 0.57158, saving model to ./sequential.hdf5
Epoch 8/10

Epoch 00008: val_ca

<keras.callbacks.History at 0x7f9882980748>

In [0]:
# evaluate the model
loss, accuracy = model.evaluate(X_validation_set, Y_validation_set, verbose=1)
print("Loss : {}".format(loss))
print("Accuracy : {}".format(accuracy))

Loss : 0.9704706542332584
Accuracy : 0.5760345520209449


In [0]:
# ----------------------
# Predict on test data
# ----------------------
# --- Use same tokenizer than model

# load tweets from json (ndjson)
testTweetsRecord = loadTweetsFromNDJson('./drive/My Drive/Cours/application_innovation/datasets/test-euapv.json')

# load tweets in pandas dataframe
testDataframe = pd.DataFrame(testTweetsRecord)

# filter columns to use only message
testDataframe = testDataframe[['message']]

print(testDataframe.describe())

testDataframe = prepareDataframeMessage(testDataframe)

# Encode messages using previously fitted tokenizer
encoded_messages_list = tokenizer.texts_to_sequences(testDataframe['message'].tolist())
padded_messages_list = pad_sequences(
    encoded_messages_list, 
    maxlen=longest_message_length, 
    padding='post')

# Set new column in dataframe with endoded date
testDataframe['encoded_message'] = array(padded_messages_list).tolist()

testDataframe = testDataframe.drop(columns=['message'])

# Use encoded data from pandas dataframe
X_predict = array(testDataframe['encoded_message'].tolist())

Y = model.predict(X_predict, verbose=1)
classes = Y.argmax(axis=-1)
labels = ['mixte', 'negatif', 'autre', 'positif'] # from tweets_cnn.ipnyb (model training)
class_to_label = lambda t: labels[t]
vfunc = vectorize(class_to_label)
Y_labels = vfunc(classes)


out = './prediction.txt'
out_content = ''

for i in range(0, len(testTweetsRecord)):
    identifier = testTweetsRecord[i]['identifier']
    label = Y_labels[i]
    line = "{} {}\n".format(identifier, label)
    out_content += line

save(out, out_content)

                                                message
count   1714                                           
unique  1713                                           
top     "Macron c'est la France soumise !" #2017LeDebat
freq    2                                              
Wrote in ./prediction.txt
