In [None]:
import csv
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.tokenize import WordPunctTokenizer

# Database import

In [None]:
from utils import sample_data

# -- Get the data -- #
# Number of tweets in the sample files; it is part of their file names.
NB_SAMPLES = 1000

# File locations get their own names so that TRAIN_SAMPLE / VALID_SAMPLE only
# ever refer to data, never to a Path.
SAMPLES_DIR = Path("../data/samples")
TRAIN_SAMPLE_PATH = SAMPLES_DIR / "sample_{}_train.csv".format(NB_SAMPLES)
VALID_SAMPLE_PATH = SAMPLES_DIR / "sample_{}_validation.csv".format(NB_SAMPLES)

# Raw content of the CSV files, as numpy arrays (one row per tweet).
TRAIN_SAMPLE_RAW = pd.read_csv(TRAIN_SAMPLE_PATH).to_numpy()
VALID_SAMPLE_RAW = pd.read_csv(VALID_SAMPLE_PATH).to_numpy()


# -- Clean the data -- #
from utils.clean_data import clean_data
# The cleaned arrays are what every later cell works with.
TRAIN_SAMPLE = clean_data(TRAIN_SAMPLE_RAW)
VALID_SAMPLE = clean_data(VALID_SAMPLE_RAW)

In [None]:
# Preview the first rows only; printing the whole array floods the cell output.
print(TRAIN_SAMPLE[:5])

# Pre-processing
#### TWEET_ORIGINALS : Original text of each tweet : Array, shape = (nb_tweets,)
#### TWEET_STRINGS : Words of each tweet : List of lists of strings, length = nb_tweets
#### TWEET_SCALARS : Numerical description of each tweet (the sentiment is stored in the last column) : Array, shape = (nb_tweets, sentence_size * word_size + 1)
#### IMPORTANT_WORDS : Labels of each tweet : Array, shape = (nb_tweets, sentence_size)

In [None]:
# -- Parameters -- #
# Listed in the same order as they are packed into OPTIONS below.
WORD_SIZE = 50  # 50 or 100 or 200 or 300
SENTENCE_SIZE = 50  # Arbitrary choice
FILL_WITH = 0  # If a word is not in the dictionary, [0, ..., 0] will describe it.
SENTIMENT_WEIGHT = 1  # Multiply the sentiment by a factor
OPTIONS = [WORD_SIZE, SENTENCE_SIZE, FILL_WITH, SENTIMENT_WEIGHT]


# -- Get the original tweets -- #
# Column 1 of each sample array is used as the tweet text.
TWEET_ORIGINALS_TRAIN, TWEET_ORIGINALS_VALID = TRAIN_SAMPLE[:, 1], VALID_SAMPLE[:, 1]

# Quick look at the first training tweet and at the array shape.
print("First tweet :", TWEET_ORIGINALS_TRAIN[0], sep="\n")
print("Shape of TWEET_ORIGINAL :", TWEET_ORIGINALS_TRAIN.shape)

In [None]:
from descriptors.tweet_string.create_strings import create_strings
from descriptors.tokenizer.tokenizer import Tokenizer

# One tokenizer instance, shared by every create_strings call in the notebook
TOKENIZER = Tokenizer()

# -- Get the decomposition of the tweets -- #
# Same call for both splits: training tweets first, then validation tweets.
TWEET_STRINGS_TRAIN, TWEET_STRINGS_VALID = (
    create_strings(tweets, TOKENIZER, SENTENCE_SIZE)
    for tweets in (TWEET_ORIGINALS_TRAIN, TWEET_ORIGINALS_VALID)
)

# Quick look at the first decomposed tweet and at the number of tweets.
print("Decomposition of the first tweet :", TWEET_STRINGS_TRAIN[0], sep="\n")
print("Length of TWEET_STRING :", len(TWEET_STRINGS_TRAIN))

In [None]:
from descriptors.descriptor_glove.descriptor_glove import tweet_scalar_glove
from utils.standardize import standardize


# Get the dictionary
PATH_DICTIONARY = Path("../data/glove_descriptor/glove.6B.{}d.txt".format(WORD_SIZE))
# PATH_DICTIONARY = Path("../data/glove_descriptor/sample_test.txt")
# The GloVe file is plain space-separated text and its vocabulary contains the
# '"' token itself. With pandas' default quoting that character opens a quoted
# field, which makes the parse fail or swallow lines, so quoting is disabled.
# NOTE(review): tokens such as "nan" or "null" are still read as missing values
# by default -- confirm whether tweet_scalar_glove needs them
# (keep_default_na=False would keep them as strings).
DICTIONARY = pd.read_csv(PATH_DICTIONARY, sep=" ", header=None, quoting=csv.QUOTE_NONE)

# Additional dictionary
ADDITIONAL_DIC = {"..": "...", "<3": "love"}

# Get the sentiments (last column of each sample array)
SENTIMENTS_TRAIN = TRAIN_SAMPLE[:, -1]
SENTIMENTS_VALID = VALID_SAMPLE[:, -1]

# -- Get the descriptions of each tweet -- #
TWEET_SCALARS_TRAIN = tweet_scalar_glove(TWEET_STRINGS_TRAIN, SENTIMENTS_TRAIN, DICTIONARY, ADDITIONAL_DIC, OPTIONS)
TWEET_SCALARS_VALID = tweet_scalar_glove(TWEET_STRINGS_VALID, SENTIMENTS_VALID, DICTIONARY, ADDITIONAL_DIC, OPTIONS)

# Standardize the tweet descriptions
# NOTE(review): the return values are discarded, so these calls only have an
# effect if standardize works in place -- confirm. If it does rescale in place,
# also check that the last (sentiment) column still holds exactly 1 / -1, since
# the positive/negative selection further down compares against those values.
standardize(TWEET_SCALARS_TRAIN)
standardize(TWEET_SCALARS_VALID)

print("Description of the first tweet :")
print(TWEET_SCALARS_TRAIN[0])
print("Shape of TWEET_SCLALAR :", TWEET_SCALARS_TRAIN.shape)
print(TWEET_SCALARS_VALID.shape)

In [None]:
from descriptors.tweet_label.create_labels import create_labels

# Label texts (column 2 of each sample array), decomposed with the same
# tokenizer and sentence size as the tweets
LABEL_ORIGINALS_TRAIN, LABEL_ORIGINALS_VALID = TRAIN_SAMPLE[:, 2], VALID_SAMPLE[:, 2]
LABEL_STRINGS_TRAIN, LABEL_STRINGS_VALID = (
    create_strings(labels, TOKENIZER, SENTENCE_SIZE)
    for labels in (LABEL_ORIGINALS_TRAIN, LABEL_ORIGINALS_VALID)
)

# -- Get the labels -- #
IMPORTANT_WORDS_TRAIN = create_labels(TWEET_STRINGS_TRAIN, LABEL_STRINGS_TRAIN, SENTENCE_SIZE)
IMPORTANT_WORDS_VALID = create_labels(TWEET_STRINGS_VALID, LABEL_STRINGS_VALID, SENTENCE_SIZE)

# Show one example: the tweet, its label text, then the labels built from them
IDX = 5
print(
    TWEET_ORIGINALS_TRAIN[IDX],
    LABEL_ORIGINALS_TRAIN[IDX],
    "Labels :",
    IMPORTANT_WORDS_TRAIN[IDX],
    sep="\n",
)
print("Shape of IMPORTANT_WORDS :", IMPORTANT_WORDS_TRAIN.shape)

# Classification

In [None]:
from tensorflow.python.keras.losses import binary_crossentropy
from classifiers.classifier_mlp.classifier_mlp import ClassifierConv, ClassifierDense
from utils.post_processing import preds_to_strings
from utils.loss import mean_jaccard

#### All the sentiments in the same Classifier

In [None]:
# --- Training --- #
# Hyper-parameters
NB_EPOCHS, BATCH_SIZE = 2, 20

# Column-wise mean of the training labels (sum over the tweets / number of tweets).
# NOTE(review): Keras documents `class_weight` as a dict mapping class indices
# to weights; confirm that passing an array here is accepted and has the
# intended effect.
CLASS_WEIGHT = np.sum(IMPORTANT_WORDS_TRAIN, axis=0) / len(IMPORTANT_WORDS_TRAIN)

# Build and compile the dense classifier
CLASSIFIER = ClassifierDense(WORD_SIZE, SENTENCE_SIZE)
CLASSIFIER.compile(optimizer='adam', loss=binary_crossentropy, metrics=['accuracy'])

# Data evaluated at the end of every epoch
VALID_DATA = (TWEET_SCALARS_VALID, IMPORTANT_WORDS_VALID)

# Fit on the full training sample (all sentiments together)
CLASSIFIER.fit(
    TWEET_SCALARS_TRAIN,
    IMPORTANT_WORDS_TRAIN,
    batch_size=BATCH_SIZE,
    epochs=NB_EPOCHS,
    validation_data=VALID_DATA,
    class_weight=CLASS_WEIGHT,
)

In [None]:
# --- Testing --- #
# Parameters: NB_THRESHOLDS evenly spaced thresholds from THRESHOLD_MIN to THRESHOLD_MAX
THRESHOLD_MAX = 0.01
THRESHOLD_MIN = 0
NB_THRESHOLDS = 30

# Best threshold / score found so far, and the score obtained for every threshold
THRESHOLD_OPT = 1
JACCARD_ACC_MAX = 0
JACCARD_LIST = []

# Predictions
# The model trained in the previous cell is CLASSIFIER. The name used here
# before, CLASSIFIER_CONV, is not defined in the cells above and raised a
# NameError on a fresh kernel.
CLASSIFIER.trainable = False
PREDICTIONS = CLASSIFIER.predict(TWEET_SCALARS_VALID)
print(PREDICTIONS)

for threshold in np.linspace(THRESHOLD_MIN, THRESHOLD_MAX, NB_THRESHOLDS):
    # Get the predictions with the threshold (boolean mask)
    PRED_THRESHOLD = PREDICTIONS > threshold
    print(PRED_THRESHOLD[0])

    # Get the string predictions
    PRED_STRING = preds_to_strings(TWEET_ORIGINALS_VALID, TWEET_STRINGS_VALID, PRED_THRESHOLD)

    # Compute the Jaccard score against the original label texts
    JACCARD_ACC = mean_jaccard(LABEL_ORIGINALS_VALID, PRED_STRING)

    # Print results
    print("Jaccard score", JACCARD_ACC)
    print("Threshold", threshold)
    print("\n")

    # Updates: record the score and keep the threshold with the highest one
    JACCARD_LIST.append(JACCARD_ACC)
    if JACCARD_ACC > JACCARD_ACC_MAX:
        JACCARD_ACC_MAX = JACCARD_ACC
        THRESHOLD_OPT = threshold

In [None]:
# Plot the scores against the thresholds that were evaluated (same grid as the
# testing loop), so the "threshold" axis shows threshold values instead of
# list indices.
plt.plot(np.linspace(THRESHOLD_MIN, THRESHOLD_MAX, NB_THRESHOLDS), JACCARD_LIST)
plt.xlabel("threshold")
plt.ylabel("Jaccard score")
plt.savefig("../results/mlp_all.jpg")
plt.show()

In [None]:
# --- Save the weights --- #
# File the weights are written to; its name records the sample size and the
# number of thresholds used in this run
PATH_SAVE_WEIGHTS = Path("../weights") / "nb_samples_{}_nb_thresholds_{}.h5".format(
    NB_SAMPLES, NB_THRESHOLDS
)

# Write the weights of the classifier trained on all sentiments
CLASSIFIER.save_weights(str(PATH_SAVE_WEIGHTS))

#### Positive and negative sentiment together

In [None]:
# --- Training --- #
# Parameters
NB_EPOCHS_POS_NEG = 3
BATCH_SIZE_POS_NEG = 20

# Select the positive and the negative sentiments (row indices, positives first)
# NOTE(review): relies on the last column holding exactly 1 / -1 -- confirm this
# still holds after the standardize step and with SENTIMENT_WEIGHT != 1.
TRAIN_SELECTION_POS = np.where(TWEET_SCALARS_TRAIN[:, -1] == 1)[0]
TRAIN_SELECTION_NEG = np.where(TWEET_SCALARS_TRAIN[:, -1] == -1)[0]
TRAIN_SELECTION_POS_NEG = np.concatenate((TRAIN_SELECTION_POS, TRAIN_SELECTION_NEG))
# Select the rows and drop the sentiment column
TWEET_SCALARS_TRAIN_SELECT = TWEET_SCALARS_TRAIN[TRAIN_SELECTION_POS_NEG][:, : -1]
IMPORTANT_WORDS_TRAIN_POS_NEG = IMPORTANT_WORDS_TRAIN[TRAIN_SELECTION_POS_NEG]

VALID_SELECTION_POS = np.where(TWEET_SCALARS_VALID[:, -1] == 1)[0]
VALID_SELECTION_NEG = np.where(TWEET_SCALARS_VALID[:, -1] == -1)[0]
VALID_SELECTION_POS_NEG = np.concatenate((VALID_SELECTION_POS, VALID_SELECTION_NEG))
# Select the rows and drop the sentiment column
TWEET_SCALARS_VALID_SELECT = TWEET_SCALARS_VALID[VALID_SELECTION_POS_NEG][:, : -1]
IMPORTANT_WORDS_VALID_POS_NEG = IMPORTANT_WORDS_VALID[VALID_SELECTION_POS_NEG]
# The same selection is applied to everything the testing cell needs
LABEL_ORIGINALS_VALID_POS_NEG = LABEL_ORIGINALS_VALID[VALID_SELECTION_POS_NEG]
TWEET_ORIGINALS_VALID_POS_NEG = TWEET_ORIGINALS_VALID[VALID_SELECTION_POS_NEG]
TWEET_STRINGS_VALID_POS_NEG = np.array(TWEET_STRINGS_VALID, dtype=object)[VALID_SELECTION_POS_NEG]

# Modify the data to be adapted to the convolution:
# (nb_tweets, SENTENCE_SIZE * WORD_SIZE) -> (nb_tweets, SENTENCE_SIZE, WORD_SIZE)
TWEET_SCALARS_TRAIN_POS_NEG = np.reshape(TWEET_SCALARS_TRAIN_SELECT, (len(TWEET_SCALARS_TRAIN_SELECT), SENTENCE_SIZE, WORD_SIZE))
TWEET_SCALARS_VALID_POS_NEG = np.reshape(TWEET_SCALARS_VALID_SELECT, (len(TWEET_SCALARS_VALID_SELECT), SENTENCE_SIZE, WORD_SIZE))

# Validation data
VALID_DATA_POS_NEG = (TWEET_SCALARS_VALID_POS_NEG, IMPORTANT_WORDS_VALID_POS_NEG)

# Set the class weight (column-wise mean of the selected training labels)
CLASS_WEIGHT_POS_NEG = np.sum(IMPORTANT_WORDS_TRAIN_POS_NEG, axis=0) / len(IMPORTANT_WORDS_TRAIN_POS_NEG)

# The classifier
CLASSIFIER_POS_NEG = ClassifierConv(WORD_SIZE, SENTENCE_SIZE)

# Compile the classifier
CLASSIFIER_POS_NEG.compile(optimizer='adam', loss=binary_crossentropy, metrics=['accuracy'])

# The epoch count is NB_EPOCHS_POS_NEG, defined at the top of this cell; the
# name used here before, NB_EPOCHS_, is not defined and raised a NameError.
CLASSIFIER_POS_NEG.fit(TWEET_SCALARS_TRAIN_POS_NEG, IMPORTANT_WORDS_TRAIN_POS_NEG, batch_size=BATCH_SIZE_POS_NEG,
                       epochs=NB_EPOCHS_POS_NEG, validation_data=VALID_DATA_POS_NEG, class_weight=CLASS_WEIGHT_POS_NEG)

In [None]:
# --- Testing --- #
# Threshold grid: NB_THRESHOLDS_POS_NEG evenly spaced values between the bounds
THRESHOLD_MIN_POS_NEG, THRESHOLD_MAX_POS_NEG = 0, 0.1
NB_THRESHOLDS_POS_NEG = 30

# Running optimum and the score obtained for every threshold
THRESHOLD_OPT_POS_NEG, JACCARD_ACC_MAX_POS_NEG = 1, 0
JACCARD_LIST_POS_NEG = []

# Predictions on the positive/negative validation tweets
CLASSIFIER_POS_NEG.trainable = False
PREDICTIONS_POS_NEG = CLASSIFIER_POS_NEG.predict(TWEET_SCALARS_VALID_POS_NEG)
print(PREDICTIONS_POS_NEG)

for threshold in np.linspace(THRESHOLD_MIN_POS_NEG, THRESHOLD_MAX_POS_NEG, NB_THRESHOLDS_POS_NEG):
    # Boolean mask of the predictions above the current threshold
    PRED_THRESHOLD_POS_NEG = PREDICTIONS_POS_NEG > threshold

    # String predictions, then their mean Jaccard score against the label texts
    PRED_STRING_POS_NEG = preds_to_strings(
        TWEET_ORIGINALS_VALID_POS_NEG, TWEET_STRINGS_VALID_POS_NEG, PRED_THRESHOLD_POS_NEG
    )
    JACCARD_ACC_POS_NEG = mean_jaccard(LABEL_ORIGINALS_VALID_POS_NEG, PRED_STRING_POS_NEG)

    # Report
    print("Jaccard score", JACCARD_ACC_POS_NEG)
    print("Threshold", threshold)
    print("\n")

    # Record the score; remember the threshold whenever the score improves
    JACCARD_LIST_POS_NEG.append(JACCARD_ACC_POS_NEG)
    if JACCARD_ACC_POS_NEG > JACCARD_ACC_MAX_POS_NEG:
        JACCARD_ACC_MAX_POS_NEG, THRESHOLD_OPT_POS_NEG = JACCARD_ACC_POS_NEG, threshold

In [None]:
# Plot the scores against the thresholds that were evaluated (same grid as the
# testing loop), so the "threshold" axis shows threshold values instead of
# list indices.
plt.plot(
    np.linspace(THRESHOLD_MIN_POS_NEG, THRESHOLD_MAX_POS_NEG, NB_THRESHOLDS_POS_NEG),
    JACCARD_LIST_POS_NEG,
)
plt.xlabel("threshold")
plt.ylabel("Jaccard score")
plt.savefig("../results/mlp_pos_neg.jpg")
plt.show()

In [None]:
# --- Save the weights for positive and negative sentiments --- #
# Path to save the weights. The file name uses this experiment's own threshold
# count (NB_THRESHOLDS_POS_NEG) rather than the one of the all-sentiments run.
PATH_SAVE_WEIGHTS_POS_NEG = Path("../weights/conv_nb_samples_{}_nb_thresholds_{}_pos_neg.h5".format(NB_SAMPLES, NB_THRESHOLDS_POS_NEG))

# Save the weights
CLASSIFIER_POS_NEG.save_weights(str(PATH_SAVE_WEIGHTS_POS_NEG))