In [0]:
# https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
!pip install ndjson

Collecting ndjson
  Downloading https://files.pythonhosted.org/packages/f3/03/7dce7f71bce783fae64015e74b123b5e26074e356401664051d6f2339e7c/ndjson-0.2.0-py2.py3-none-any.whl
Installing collected packages: ndjson
Successfully installed ndjson-0.2.0


In [4]:
import ndjson
from collections import Counter
import pandas as pd
pd.set_option('display.max_colwidth', -1)
from sklearn.preprocessing import LabelBinarizer
from keras.models import Model
from keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout
from keras.initializers import Constant
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.metrics import categorical_accuracy
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from numpy import array
from numpy import asarray
from numpy import zeros
from numpy import vectorize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem.snowball import FrenchStemmer
import nltk
# packages settings
nltk.data.path.append('./')
nltk.download('stopwords', download_dir='./')
nltk.download('punkt', download_dir='./')
import re

Using TensorFlow backend.


[nltk_data] Downloading package stopwords to ./...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to ./...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
stop_words = set(stopwords.words('french'))

In [0]:
def loadTweetsFromNDJson(filepath):
    f = open(filepath)
    content = f.read()
    return ndjson.loads(content)

def save(fileName, content):
    f = open(fileName, 'w')
    f.write(content)
    f.close()
    print("Wrote in {}".format(fileName))

def setSplitter(complete_dataset, train_quota, validation_quota, test_quota):
    splitting_dataset = complete_dataset.copy()

    relative_validation_quota = round(validation_quota / (1 - train_quota), 1)
    relative_test_quota = round(test_quota / (1 - train_quota - validation_quota), 1)

    train_data = splitting_dataset.sample(frac = validation_quota)

    # remove training_data from splitting_dataset
    splitting_dataset = splitting_dataset.drop(train_data.index)

    validation_data = splitting_dataset.sample(frac=relative_validation_quota)

    # remove validation_data from splitting_dataset
    splitting_dataset = splitting_dataset.drop(validation_data.index)        

    test_data = splitting_dataset.sample(frac=relative_test_quota)

    return (train_data, validation_data, test_data)

def toXY(dataframe_input):
    X = ()
    Y = ()

    lb = LabelBinarizer()

    raw_Y = dataframe_input['polarity']
    Y = lb.fit_transform(raw_Y)

    print(lb.classes_)

    X = dataframe_input.drop(columns=['polarity'])

    return (X['encoded_message'].tolist(), Y)

def loadGloVe(file_path):
    embeddings_index = dict()
    f = open(file_path, encoding='utf8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

    f.close()
    print('Loaded %s word vectors' % len(embeddings_index))

    return embeddings_index


def createWeightMatrix(vocab_size, tokenizer, embeddings_index):
    embedding_matrix = zeros((vocab_size, 100))
    
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings_index.get(word)

        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

def removeStopWordsFromMessage(dataframe):
    dataframe['message'] = dataframe['message'].apply(lambda x: ' '.join([item for item in word_tokenize(x) if item not in stop_words]))
    return dataframe

def removeVariousTwitterElementsFromMessage(dataframe):
    regex_filter = "(@[a-zA-ZÀ-ÿ0-9]+)|(#[a-zA-ZÀ-ÿ0-9]+)"
    dataframe['message'] = dataframe['message'].apply(lambda x: ' '.join(re.sub(regex_filter, ' ', x).split()))
    return dataframe

def removeUrlsFromMessage(dataframe):
    dataframe['message'] = dataframe['message'].apply(lambda x: re.split('https?:\/\/.*', str(x))[0])
    return dataframe

def messageStemming(dataframe):
    stemmer = FrenchStemmer()
    dataframe['message'] = dataframe['message'].apply(lambda x: ' '.join([stemmer.stem(y) for y in word_tokenize(x)]))
    return dataframe

def getOnlyAlphaFromMessage(dataframe):
    dataframe['message'] = dataframe['message'].apply(lambda x: ' '.join([word.lower() for word in word_tokenize(x) if word.isalpha()]))
    return dataframe

def prepareDataframeMessage(dataframe_source):
    df = dataframe_source.copy()
    return (df
            .pipe(removeUrlsFromMessage)
            .pipe(removeVariousTwitterElementsFromMessage)
            .pipe(getOnlyAlphaFromMessage)
            .pipe(removeStopWordsFromMessage)
            .pipe(messageStemming)
            )

In [7]:
# load tweets from json (ndjson)
tweetsRecord = loadTweetsFromNDJson('./drive/My Drive/Cours/application_innovation/datasets/project_tp_annotated-hashtags-textblob.json')
#tweetsRecord = loadTweetsFromNDJson('./project_svm_annotated.json')

# load tweets in pandas dataframe
tweetsDataframe = pd.DataFrame(tweetsRecord)
# filter columns to use only message and polarity
tweetsDataframe = tweetsDataframe[['message', 'polarity']]

tweetsDataframe.describe()

Unnamed: 0,message,polarity
count,75159,75159
unique,48321,4
top,"""En parlant constamment de similarité entre prog FN & de la #FI , vous journalistes faites la campagne de M Lepen"" @RaquelGarridoFI #LeDebat",positif
freq,19,28374


In [8]:
tweetsDataframe = prepareDataframeMessage(tweetsDataframe)
tweetsDataframe.head()

Unnamed: 0,message,polarity
0,suicid prémed peur pouvoir respons pen just cruch vid,negatif
1,,autre
2,conclus journal madam lepen respect regl fix tout dit,positif
3,là marin écroul comm merd,negatif
4,élect gross mascarad fait vieux mond a fait tout survivr,positif


In [0]:
# Keras text tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tweetsDataframe['message'].tolist())

In [10]:
longest_message_length = tweetsDataframe.message.str.len().max()
vocab_size = len(tokenizer.word_index) + 1
labels_size = len(tweetsDataframe['polarity'].unique())
print("Labels count : {}".format(labels_size))
print("Longest message length : {}".format(longest_message_length))
print("Vocabulary size : {}".format(vocab_size))

Labels count : 4
Longest message length : 114
Vocabulary size : 16643


In [0]:
encoded_messages_list = tokenizer.texts_to_sequences(tweetsDataframe['message'].tolist())
padded_messages_list = pad_sequences(
    encoded_messages_list, 
    maxlen=longest_message_length, 
    padding='post')

In [12]:
embeddings_index = loadGloVe('./drive/My Drive/Cours/application_innovation/glove.6B.100d.txt')
embeddings_matrix = createWeightMatrix(vocab_size, tokenizer, embeddings_index)

Loaded 400000 word vectors


In [13]:
# encode message column
tweetsDataframe['encoded_message'] = array(padded_messages_list).tolist()

tweetsDataframe = tweetsDataframe.drop(columns=['message'])

tweetsDataframe.head()

Unnamed: 0,polarity,encoded_message
0,negatif,"[608, 10551, 138, 219, 501, 7, 71, 3995, 143, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"
1,autre,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"
2,positif,"[87, 81, 57, 3, 97, 543, 1696, 13, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"
3,negatif,"[53, 6, 3996, 20, 94, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"
4,positif,"[279, 254, 1956, 10, 1092, 100, 2, 10, 13, 5518, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"


In [0]:
# Split the set

# 70% for training
training_set_percentage = 0.7
# 15% for validation
validation_set_percentage = 0.15
# 15% for evaluation
evaluation_set_percentage = 0.15

(training_set, validation_set, evaluation_set) = setSplitter(
    tweetsDataframe, 
    training_set_percentage, 
    validation_set_percentage,
    evaluation_set_percentage
    )

In [15]:
# Transform dataframe to X and Y values, to feed to the network
(X_training_set, Y_training_set) = toXY(training_set)
(X_validation_set, Y_validation_set) = toXY(validation_set)
(x_evaluation_set, Y_evaluation_set) = toXY(evaluation_set)

['autre' 'mixte' 'negatif' 'positif']
['autre' 'mixte' 'negatif' 'positif']
['autre' 'mixte' 'negatif' 'positif']


In [0]:
X_training_set = array(X_training_set)
X_validation_set = array(X_validation_set)

In [0]:
# ModelCheckPoint configuration
MODEL_SAVE_PATH = './sequential.hdf5'
modelCheckpointCallback = ModelCheckpoint(
    MODEL_SAVE_PATH,
    monitor='val_categorical_accuracy',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    mode='max',
    period=1
)

In [18]:
# Define model
# https://keras.io/examples/pretrained_word_embeddings/
embedding_layer = Embedding(
    vocab_size,
    100,
    input_length=longest_message_length,
    embeddings_initializer=Constant(embeddings_matrix),
    trainable=False
)
sequence_input = Input(shape=(longest_message_length,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 3, activation='relu')(embedded_sequences)
x = MaxPooling1D(3)(x)
x = Conv1D(128, 3, activation='relu')(x)
x = MaxPooling1D(3)(x)
x = Conv1D(128, 3, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(labels_size, activation='softmax')(x)
model = Model(sequence_input, preds)







In [19]:
# compile the model
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=[categorical_accuracy])
model.summary()



Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 114)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 114, 100)          1664300   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 112, 128)          38528     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 37, 128)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 35, 128)           49280     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 11, 128)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 9, 128)            49

In [20]:
# fit the model
model.fit(
    X_training_set, 
    Y_training_set, 
    epochs=10, 
    verbose=1, 
    validation_data=(X_validation_set, Y_validation_set),
    callbacks=[modelCheckpointCallback])

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 11274 samples, validate on 31942 samples
Epoch 1/10






Epoch 00001: val_categorical_accuracy improved from -inf to 0.58018, saving model to ./sequential.hdf5
Epoch 2/10

Epoch 00002: val_categorical_accuracy improved from 0.58018 to 0.61950, saving model to ./sequential.hdf5
Epoch 3/10

Epoch 00003: val_categorical_accuracy improved from 0.61950 to 0.68753, saving model to ./sequential.hdf5
Epoch 4/10

Epoch 00004: val_categorical_accuracy did not improve from 0.68753
Epoch 5/10

Epoch 00005: val_categorical_accuracy did not improve from 0.68753
Epoch 6/10

Epoch 00006: val_categorical_accuracy did not improve from 0.68753
Epoch 7/10

Epoch 00007: val_categorical_accuracy did not improve from 0.68753
Epoch 8/10

Epoch 00008: val_categorical_accuracy did not improve from 0.68753
Epoch 9/10

Epoch 00009: val_categorical_accuracy did not improve from 0.68753
Epoch 10/10

Epoch 0001

<keras.callbacks.History at 0x7fe817e39f28>

In [21]:
# evaluate the model
best_model = load_model(MODEL_SAVE_PATH)
loss, accuracy = best_model.evaluate(X_validation_set, Y_validation_set, verbose=1)
print("Loss : {}".format(loss))
print("Accuracy : {}".format(accuracy))

Loss : 0.761925192902154
Accuracy : 0.6875273934042705


In [22]:
# ----------------------
# Predict on test data
# ----------------------
# --- Use same tokenizer than model

# load tweets from json (ndjson)
testTweetsRecord = loadTweetsFromNDJson('./drive/My Drive/Cours/application_innovation/datasets/test-euapv.json')

# load tweets in pandas dataframe
testDataframe = pd.DataFrame(testTweetsRecord)

# filter columns to use only message
testDataframe = testDataframe[['message']]

print(testDataframe.describe())

testDataframe = prepareDataframeMessage(testDataframe)

# Encode messages using previously fitted tokenizer
encoded_messages_list = tokenizer.texts_to_sequences(testDataframe['message'].tolist())
padded_messages_list = pad_sequences(
    encoded_messages_list, 
    maxlen=longest_message_length, 
    padding='post')

# Set new column in dataframe with endoded date
testDataframe['encoded_message'] = array(padded_messages_list).tolist()

testDataframe = testDataframe.drop(columns=['message'])

# Use encoded data from pandas dataframe
X_predict = array(testDataframe['encoded_message'].tolist())

Y = best_model.predict(X_predict, verbose=1)
classes = Y.argmax(axis=-1)
labels = ['autre', 'mixte', 'negatif', 'positif']
class_to_label = lambda t: labels[t]
vfunc = vectorize(class_to_label)
Y_labels = vfunc(classes)


out = './prediction.txt'
out_content = ''

for i in range(0, len(testTweetsRecord)):
    identifier = testTweetsRecord[i]['identifier']
    label = Y_labels[i]
    line = "{} {}\n".format(identifier, label)
    out_content += line

save(out, out_content)

                                                message
count   1714                                           
unique  1713                                           
top     "Macron c'est la France soumise !" #2017LeDebat
freq    2                                              
Wrote in ./prediction.txt
