# Loading data and tools

In [None]:
import json

# Load the labelled data: the file is read line by line and every non-empty
# line is kept as a raw string (the preprocessing cell parses each one with
# json.loads).
data = []
with open('/data/labelled_data_fr.json', 'r', encoding='utf-8') as input_file:  # file with data
    for line in input_file:
        # Append to `data`, the list created above. The previous code appended to
        # an undefined name (`input_data`) and failed on a fresh kernel.
        # Blank lines are skipped so json.loads() does not choke on them later.
        if line.strip():
            data.append(line)

In [None]:
import stanfordnlp # Natural Language Processing library https://stanfordnlp.github.io/stanfordnlp/
from spacy_stanfordnlp import StanfordNLPLanguage # library for lemmatization/normalization of text data

# Build the StanfordNLP pipeline configured for French (lang="fr").
snlp = stanfordnlp.Pipeline(lang="fr")
# Wrap the pipeline with StanfordNLPLanguage; the resulting `nlp` callable is what
# the preprocessing loop applies to each message to obtain tokens and their lemmas.
nlp = StanfordNLPLanguage(snlp)

# Preprocessing of the tweets

In [None]:
import numpy as np
from nltk.probability import FreqDist

import re

# Ordered clean-up rules applied to every raw message before lemmatization.
# Patterns are raw strings: escapes such as '\!' or '\_' in ordinary literals
# are invalid escape sequences and trigger warnings on recent Python versions.
_CLEANING_RULES = [
    # Links: a bare "www." address or an http/https URL. The character class
    # after "www." must be non-whitespace ([^\s]); the former [\s] only matched
    # "www." followed by spaces and never an actual address.
    (r'(www\.[^\s]+)|(https?://[^\s]+)', 'URL'),
    # Mentions. Removed BEFORE underscores are turned into spaces and with "_"
    # allowed in the handle, otherwise "@foo_bar" would leave "bar" in the text.
    (r'@[A-Za-z0-9_]+', ' '),
    (r'!', ' ATTENTION'),   # keep exclamation marks as an explicit token
    (r'\?', ' QUESTION'),   # keep question marks as an explicit token
    (r'\W', ' '),           # remaining symbols
    (r'_', ' '),            # "_" counts as a word character, so \W leaves it behind
    (r'\s+', ' '),          # collapse runs of whitespace
]


def clean_message(text):
    """Apply every rule of _CLEANING_RULES to `text`, in order, and return the result."""
    for pattern, replacement in _CLEANING_RULES:
        text = re.sub(pattern, replacement, text)
    return text


corpus = []             # all lemmas in the collection (with repetitions)
processed_content = []  # one list of lemmas per message
string_content = []     # one space-joined string of lemmas per message

print('Corpus creation...')
for raw_line in data:
    # Each entry of `data` is one JSON document; only its 'text' field is used here.
    message = str(json.loads(raw_line)['text'])
    lemmas = [token.lemma_ for token in nlp(clean_message(message).lower())]
    processed_content.append(lemmas)
    corpus.extend(lemmas)
    # Every lemma is followed by a single space (including the last one).
    string_content.append(''.join(str(lemma) + ' ' for lemma in lemmas))
print("Finish")

# Creating corpus of tweets

In [None]:
fdist = FreqDist(corpus)

print('Stopwords filtering')
# Give every lemma a 1-based rank in descending frequency order, but only if its
# corpus frequency is below the number of messages. Lemmas that occur at least
# as often as there are messages are treated as stopwords and get no index.
message_count = len(processed_content)
fdist_dict = {}
for word, frequency in fdist.most_common():
    if frequency < message_count:
        fdist_dict[word] = len(fdist_dict) + 1

# Encode each message as the list of indices of its non-stopword lemmas.
digit_content = [
    [fdist_dict[word] for word in message if word in fdist_dict]
    for message in processed_content
]

# Messages have different lengths, so the nested list is ragged. dtype=object is
# required for that: recent NumPy versions refuse to build an array from ragged
# input without it.
content = np.array(digit_content, dtype=object)
content[:5]  # preview only; avoid dumping the whole array into the notebook

# Evaluation of algorithms

In [None]:
# Evaluations
import tensorflow as tf
from keras import backend as K

# There are several possible metrics. We prefer to use AUC: https://en.wikipedia.org/wiki/Receiver_operating_characteristic
def f1(y_true, y_pred):
    """F1 score of `y_pred` against `y_true`, built from Keras backend ops.

    Precision and recall are derived from clipped-and-rounded sums, then
    combined as 2 * p * r / (p + r). K.epsilon() is added to every denominator
    so that none of the divisions can be by zero.
    """
    def _rounded_total(tensor):
        # Clip to [0, 1], round to 0/1, and add everything up.
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _rounded_total(y_true * y_pred)
    precision_value = hits / (_rounded_total(y_pred) + K.epsilon())
    recall_value = hits / (_rounded_total(y_true) + K.epsilon())
    harmonic = (precision_value * recall_value) / (precision_value + recall_value + K.epsilon())
    return 2 * harmonic

def recall(y_true, y_pred):
    """Recall: rounded true positives divided by rounded actual positives.

    Both sums are taken after clipping to [0, 1] and rounding; K.epsilon() in
    the denominator keeps the division defined when there are no positives.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual_positives + K.epsilon())

def precision(y_true, y_pred):
    """Precision: rounded true positives divided by rounded predicted positives.

    Both sums are taken after clipping to [0, 1] and rounding; K.epsilon() in
    the denominator keeps the division defined when nothing is predicted positive.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (flagged_positives + K.epsilon())

def auc(y_true, y_pred):
    """AUC metric for Keras, backed by tf.metrics.auc.

    Returns the second element of the pair produced by tf.metrics.auc
    (presumably the update op of the TF1 metrics API -- confirm against the
    installed TensorFlow version).
    """
    _, second_output = tf.metrics.auc(y_true, y_pred)
    # The metric is created first, then TensorFlow's local variables are
    # initialized in the Keras session.
    K.get_session().run(tf.local_variables_initializer())
    return second_output

# Loading embeddings

In [None]:
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding


# Others
import nltk
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

vocabulary_size = 20000
embedding_dim = 300  # this parameter depends on the vector size of the embedding file

# Fit a Keras tokenizer on the lemmatized messages; its word_index decides which
# row of the embedding matrix belongs to which word.
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(string_content)

# Read the pre-trained vectors: each usable line is "<word> <v1> ... <vN>".
# you can download embeddings file from http://vectors.nlpl.eu/repository/
embeddings_index = {}
with open('resources/file', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        # Skip anything that is not "<word> + embedding_dim numbers" (for
        # instance a header line, or an entry whose token contains spaces);
        # such lines would otherwise pollute the index or break the conversion.
        if len(values) != embedding_dim + 1:
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

if not embeddings_index:
    # Fail loudly instead of silently training on an all-zero embedding matrix.
    raise ValueError(
        'No %d-dimensional vectors found in the embeddings file; '
        'check the path and embedding_dim.' % embedding_dim)

# Row i holds the vector of the word whose tokenizer index is i; rows of words
# without a pre-trained vector stay at zero.
embedding_matrix = np.zeros((vocabulary_size, embedding_dim))
for word, index in tokenizer.word_index.items():
    if index >= vocabulary_size:
        # Outside the matrix; `continue` avoids relying on dict ordering.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

# Deep learning algorithms evaluation (cross-validation)

In [None]:
#CNN + embeddings
from sklearn.model_selection import train_test_split
from statistics import mean

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D

# set parameters
seed = 7           # base seed; every run offsets it to get a different split
maxlen = 50        # sequences are padded / truncated to this length
batch_size = 32
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 15
n_runs = 10        # number of independent train / evaluate repetitions

acc_list = []
score_list = []
auc_list = []
precision_list = []
recall_list = []

# NOTE(review): the loading cell builds `data` as a plain list of JSON lines,
# which has no `.topic` attribute -- confirm where the label column comes from.
labels = data.topic

# NOTE(review): `content` holds indices taken from `fdist_dict`, while the rows of
# `embedding_matrix` are keyed by `tokenizer.word_index`. Verify that both index
# spaces agree (and stay below vocabulary_size) before trusting the frozen embeddings.

for run in range(n_runs):
    # A different random_state per run yields a different split each time; with a
    # single fixed seed every repetition was evaluated on exactly the same split.
    # NOTE(review): 85% of the data is held out here, whereas the other evaluation
    # cells hold out 20% -- confirm this is intentional.
    x_train, x_test, y_train, y_test = train_test_split(
        content, labels, test_size=0.85, random_state=seed + run)

    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

    model = Sequential()
    # Frozen pre-trained embeddings; the vector size and the input length are
    # derived from embedding_matrix / maxlen instead of being repeated as literals.
    model.add(Embedding(vocabulary_size, embedding_matrix.shape[1],
                        input_length=maxlen, weights=[embedding_matrix], trainable=False))
    model.add(Dropout(0.1))

    # we add a Convolution1D, which will learn filters
    # word group filters of size filter_length:
    model.add(Conv1D(filters,
                     kernel_size,
                     padding='valid',
                     activation='relu',
                     strides=1))
    # we use max pooling:
    model.add(GlobalMaxPooling1D())

    # We add a vanilla hidden layer:
    model.add(Dense(hidden_dims))
    model.add(Dropout(0.2))
    model.add(Activation('relu'))

    # We project onto a single unit output layer, and squash it with a sigmoid:
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', auc, precision, recall])

    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_test, y_test))

    # evaluate() returns the loss followed by the metrics in compile() order.
    # The result names avoid `precision_score` / `recall_score`, which are the
    # scikit-learn functions used by later cells.
    score, acc, auc_value, precision_value, recall_value = model.evaluate(
        x_test, y_test, batch_size=batch_size)

    acc_list.append(acc)
    score_list.append(score)
    auc_list.append(auc_value)
    precision_list.append(precision_value)
    recall_list.append(recall_value)

print('Test accuracy:', mean(acc_list))
print('Test precision:', mean(precision_list))
print('Test recall:', mean(recall_list))  # was averaging precision_list by mistake
print('Test auc:', mean(auc_list))

In [None]:
#LSTM + embeddings

from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM

# cut texts after this number of words
maxlen = 50
batch_size = 32
seed = 7           # base seed; every run offsets it to get a different split
epochs = 15
n_runs = 10        # number of independent train / evaluate repetitions

acc_list = []
score_list = []
auc_list = []
precision_list = []
recall_list = []

# NOTE(review): the loading cell builds `data` as a plain list of JSON lines,
# which has no `.topic` attribute -- confirm where the label column comes from.
labels = data.topic

# NOTE(review): `content` holds indices taken from `fdist_dict`, while the rows of
# `embedding_matrix` are keyed by `tokenizer.word_index`. Verify that both index
# spaces agree (and stay below vocabulary_size) before trusting the frozen embeddings.

for run in range(n_runs):
    # A different random_state per run yields a different 80/20 split each time;
    # with a single fixed seed every repetition used exactly the same split.
    x_train, x_test, y_train, y_test = train_test_split(
        content, labels, test_size=0.2, random_state=seed + run)

    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

    model = Sequential()
    # Frozen pre-trained embeddings; the vector size and the input length are
    # derived from embedding_matrix / maxlen instead of being repeated as literals.
    model.add(Embedding(vocabulary_size, embedding_matrix.shape[1],
                        input_length=maxlen, weights=[embedding_matrix], trainable=False))
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', auc, precision, recall])

    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_test, y_test))

    # evaluate() returns the loss followed by the metrics in compile() order.
    # The result names avoid `precision_score` / `recall_score`, which are the
    # scikit-learn functions used by later cells.
    score, acc, auc_value, precision_value, recall_value = model.evaluate(
        x_test, y_test, batch_size=batch_size)

    acc_list.append(acc)
    score_list.append(score)
    auc_list.append(auc_value)
    precision_list.append(precision_value)
    recall_list.append(recall_value)

print('Test accuracy:', mean(acc_list))
print('Test precision:', mean(precision_list))
print('Test recall:', mean(recall_list))  # was averaging precision_list by mistake
print('Test auc:', mean(auc_list))

# Machine learning algorithms evaluation (cross-validation)

In [None]:
# Preprocessing modules
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

# Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn import metrics

In [None]:
# Random forest
from sklearn.ensemble import RandomForestClassifier

acc_list = []
precision_list = []
recall_list = []
auc_list = []

# NOTE(review): the loading cell builds `data` as a plain list of JSON lines,
# which has no `.topic` attribute -- confirm where the label column comes from.
labels = data.topic

for run in range(10):
    print('Validation', run + 1)
    # A different random_state per run yields a different 80/20 split each time.
    # With one fixed split seed and a fixed classifier seed, all ten runs were
    # identical and the averages below were just a single result repeated.
    X_train, X_test, Y_train, Y_test = train_test_split(
        string_content, labels, test_size=0.2, random_state=run)

    # RandomForest
    # The pipeline does the vectorization itself (bag of words -> TF-IDF), so no
    # separate vectorizer / transformer needs to be fitted inside the loop.
    # max_features="sqrt" is what "auto" meant for classifiers; "auto" is no
    # longer accepted by recent scikit-learn releases.
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', RandomForestClassifier(n_estimators=500, max_depth=None,
                                       min_samples_split=20, max_features="sqrt",
                                       random_state=42)),
    ])
    text_clf.fit(X_train, Y_train)
    predicted = text_clf.predict(X_test)
    # Probability of the second class in classes_ (the positive label for 0/1
    # targets). ROC needs continuous scores: hard 0/1 predictions collapse the
    # curve to a single operating point.
    positive_scores = text_clf.predict_proba(X_test)[:, 1]

    acc_list.append(accuracy_score(Y_test, predicted))
    precision_list.append(precision_score(Y_test, predicted, average='binary'))
    recall_list.append(recall_score(Y_test, predicted, average='binary'))
    fpr, tpr, thresholds = metrics.roc_curve(Y_test, positive_scores)
    auc_list.append(metrics.auc(fpr, tpr))

print('Validation finished')

print('Test accuracy:', mean(acc_list))
print('Test precision:', mean(precision_list))
print('Test recall:', mean(recall_list))  # was averaging precision_list by mistake
print('Test auc:', mean(auc_list))

In [None]:
#Stochastic Gradient Descent

from sklearn.linear_model import SGDClassifier

acc_list = []
precision_list = []
recall_list = []
auc_list = []

# NOTE(review): the loading cell builds `data` as a plain list of JSON lines,
# which has no `.topic` attribute -- confirm where the label column comes from.
labels = data.topic

for run in range(10):
    print('Validation', run + 1)
    # A different random_state per run yields a different 80/20 split each time.
    # With one fixed split seed and a fixed classifier seed, all ten runs were
    # identical and the averages below were just a single result repeated.
    X_train, X_test, Y_train, Y_test = train_test_split(
        string_content, labels, test_size=0.2, random_state=run)

    # The pipeline does the vectorization itself (bag of words -> TF-IDF), so no
    # separate vectorizer / transformer needs to be fitted inside the loop.
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(loss='hinge', alpha=1e-3, random_state=42,
                              max_iter=100, tol=None)),
    ])
    text_clf.fit(X_train, Y_train)
    predicted = text_clf.predict(X_test)
    # Signed distance to the decision boundary. ROC needs continuous scores:
    # hard 0/1 predictions collapse the curve to a single operating point.
    decision_scores = text_clf.decision_function(X_test)

    acc_list.append(accuracy_score(Y_test, predicted))
    precision_list.append(precision_score(Y_test, predicted, average='binary'))
    recall_list.append(recall_score(Y_test, predicted, average='binary'))
    fpr, tpr, thresholds = metrics.roc_curve(Y_test, decision_scores)
    auc_list.append(metrics.auc(fpr, tpr))

print('Validation finished')

print('Test accuracy:', mean(acc_list))
print('Test precision:', mean(precision_list))
print('Test recall:', mean(recall_list))  # was averaging precision_list by mistake
print('Test auc:', mean(auc_list))

In [None]:
#ExtraTrees
from sklearn.ensemble import ExtraTreesClassifier

acc_list = []
precision_list = []
recall_list = []
auc_list = []

# NOTE(review): the loading cell builds `data` as a plain list of JSON lines,
# which has no `.topic` attribute -- confirm where the label column comes from.
labels = data.topic

for run in range(10):
    print('Validation', run + 1)
    # A different random_state per run yields a different 80/20 split each time.
    # With one fixed split seed and a fixed classifier seed, all ten runs were
    # identical and the averages below were just a single result repeated.
    X_train, X_test, Y_train, Y_test = train_test_split(
        string_content, labels, test_size=0.2, random_state=run)

    # The pipeline does the vectorization itself (bag of words -> TF-IDF), so no
    # separate vectorizer / transformer needs to be fitted inside the loop.
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', ExtraTreesClassifier(n_estimators=10, max_depth=None,
                                     min_samples_split=2, random_state=0)),
    ])
    text_clf.fit(X_train, Y_train)
    predicted = text_clf.predict(X_test)
    # Probability of the second class in classes_ (the positive label for 0/1
    # targets). ROC needs continuous scores: hard 0/1 predictions collapse the
    # curve to a single operating point.
    positive_scores = text_clf.predict_proba(X_test)[:, 1]

    acc_list.append(accuracy_score(Y_test, predicted))
    precision_list.append(precision_score(Y_test, predicted, average='binary'))
    recall_list.append(recall_score(Y_test, predicted, average='binary'))
    fpr, tpr, thresholds = metrics.roc_curve(Y_test, positive_scores)
    auc_list.append(metrics.auc(fpr, tpr))

print('Validation finished')

print('Test accuracy:', mean(acc_list))
print('Test precision:', mean(precision_list))
print('Test recall:', mean(recall_list))  # was averaging precision_list by mistake
print('Test auc:', mean(auc_list))

In [None]:
#LinearSVM
from sklearn.svm import LinearSVC

acc_list = []
precision_list = []
recall_list = []
auc_list = []

# NOTE(review): the loading cell builds `data` as a plain list of JSON lines,
# which has no `.topic` attribute -- confirm where the label column comes from.
labels = data.topic

for run in range(10):
    print('Validation', run + 1)
    # A different random_state per run yields a different 80/20 split each time.
    # With one fixed split seed and a fixed classifier seed, all ten runs were
    # identical and the averages below were just a single result repeated.
    X_train, X_test, Y_train, Y_test = train_test_split(
        string_content, labels, test_size=0.2, random_state=run)

    # The pipeline does the vectorization itself (bag of words -> TF-IDF), so no
    # separate vectorizer / transformer needs to be fitted inside the loop.
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
                          intercept_scaling=1, loss='hinge', max_iter=1000,
                          multi_class='ovr', penalty='l2', random_state=42,
                          tol=0.0001, verbose=0)),
    ])
    text_clf.fit(X_train, Y_train)
    predicted = text_clf.predict(X_test)
    # Signed distance to the decision boundary. ROC needs continuous scores:
    # hard 0/1 predictions collapse the curve to a single operating point.
    decision_scores = text_clf.decision_function(X_test)

    acc_list.append(accuracy_score(Y_test, predicted))
    precision_list.append(precision_score(Y_test, predicted, average='binary'))
    recall_list.append(recall_score(Y_test, predicted, average='binary'))
    fpr, tpr, thresholds = metrics.roc_curve(Y_test, decision_scores)
    auc_list.append(metrics.auc(fpr, tpr))

print('Validation finished')

print('Test accuracy:', mean(acc_list))
print('Test precision:', mean(precision_list))
print('Test recall:', mean(recall_list))  # was averaging precision_list by mistake
print('Test auc:', mean(auc_list))