In [None]:
from pathlib import Path
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import WordPunctTokenizer

# Database import

In [None]:
from utils.split_feelings import split

# Sample CSV files, given relative to the notebook's working directory.
PATH_TRAIN = Path("../data/samples/sample_1000_train.csv")
PATH_VALID = Path("../data/samples/sample_1000_validation.csv")

# True  -> each file goes through `split` and only its first returned array is kept.
# False -> each CSV is loaded whole as a NumPy array.
SPLIT_FEELINGS = True

if not SPLIT_FEELINGS:
    train_data = pd.read_csv(PATH_TRAIN).to_numpy()
    validation_data = pd.read_csv(PATH_VALID).to_numpy()

else:
    # Forwarded to `split` as its second argument.
    # NOTE(review): presumably merges positive and negative tweets when True — confirm
    # in utils.split_feelings.
    POSITIVES_WITH_NEGATIVES = False

    # `split` returns three arrays; judging by the names they hold the positive,
    # neutral and negative tweets — TODO confirm.
    train_pos, train_neutral, train_neg = split(PATH_TRAIN, POSITIVES_WITH_NEGATIVES)
    valid_pos, valid_neutral, valid_neg = split(PATH_VALID, POSITIVES_WITH_NEGATIVES)

    # Only the first ("pos") arrays are used by the rest of the notebook.
    train_data, validation_data = train_pos, valid_pos

In [None]:
# Sanity check: shapes of the train / validation arrays as loaded.
print(*(split_array.shape for split_array in (train_data, validation_data)))

In [None]:
from utils.clean_data import clean_data 

# Run both splits through the project's clean_data helper.
# NOTE: each name is rebound to the helper's result, so re-running this cell feeds
# already-cleaned data back into clean_data; re-run the loading cell first when re-executing.
train_data = clean_data(train_data)
validation_data = clean_data(validation_data)

In [None]:
# Sanity check: shapes of the train / validation arrays after clean_data.
print(*(split_array.shape for split_array in (train_data, validation_data)))

# Pre-processing

In [None]:
# -- Pull the working columns out of the data arrays -- #
# Column 1 feeds the X_* names, column 2 the Y_* names and column 3 the FEELINGS_* names.
# NOTE(review): presumably 1 = full tweet, 2 = selected text, 3 = sentiment — confirm
# against the CSV header.

# Training split
X_TR_ORIGINAL = train_data[:, 1]
Y_TR_ORIGINAL = train_data[:, 2]
FEELINGS_TR = train_data[:, 3]

# Validation split
X_VAL_ORIGINAL = validation_data[:, 1]
Y_VAL_ORIGINAL = validation_data[:, 2]
FEELINGS_VAL = validation_data[:, 3]

In [None]:
from descriptors.tokenizer.tokenizer import Tokenizer
from descriptors.tweet_string.create_strings import create_strings

# -- Descriptor settings (also read by the descriptor cell further down) -- #
WORD_SIZE = 100       # forwarded to the descriptor builders
SENTENCE_SIZE = 100   # forwarded to the tokenizer, create_strings and the descriptor builders
FILL_WITH = "$"       # forwarded to load_data; presumably the padding character — TODO confirm
FEELING_WEIGHT = 1    # forwarded to the descriptor builders together with the feelings column

# -- Define the tokenizer -- #
TOKENIZER = Tokenizer()

# -- Create sentences -- #
X_TR_STRING = create_strings(X_TR_ORIGINAL, TOKENIZER, SENTENCE_SIZE)
X_VAL_STRING = create_strings(X_VAL_ORIGINAL, TOKENIZER, SENTENCE_SIZE)

Y_TR_STRING = create_strings(Y_TR_ORIGINAL, TOKENIZER, SENTENCE_SIZE)
Y_VAL_STRING = create_strings(Y_VAL_ORIGINAL, TOKENIZER, SENTENCE_SIZE)

# -- Quick visual check of the tokenizer on one sample tweet -- #
# A single example is kept: the previous first assignment to `test` was overwritten
# before ever being read, so it had no effect on the output.
test = "OMG  I BROKE DOWN AND HAD PIZZA BECAUSE I WAS STRESSED OUT     you mad at me?"
print(f'input tweet : \n"{test}"')
print('\ntokenized tweet : ')
print(TOKENIZER.tokenize(test, SENTENCE_SIZE))

In [None]:
from descriptors.descriptor_ascii.load_data import load_data
from descriptors.descriptor_one_hot.descriptor import descriptor_one_hot
from descriptors.tweet_label.create_labels import create_labels

# Currently unused options, kept for reference:
# ALPHANUM_ONLY = False
# SPLIT_PUNCTUATION = False  # whether punctuation ("!?.;,/" etc.) stays attached to a word or not

# True  -> descriptors come from descriptor_one_hot.
# False -> descriptors come from load_data (descriptor_ascii package), which also
#          returns the filled strings.
ONE_HOT = False


def _build_descriptors(strings, feelings):
    """Build the numeric descriptors of one data split with the configured descriptor.

    Args:
        strings: tokenized tweets of the split, as produced by ``create_strings``.
        feelings: feelings column of the same split.

    Returns:
        A pair ``(filled_strings, descriptors)``. ``filled_strings`` is ``None`` when
        ``ONE_HOT`` is set, because ``descriptor_one_hot`` only returns the descriptors.
    """
    if ONE_HOT:
        descriptors = descriptor_one_hot(
            strings, SENTENCE_SIZE, WORD_SIZE,
            feelings=feelings, feeling_weight=FEELING_WEIGHT, fill_with_ones=True)
        return None, descriptors

    return load_data(
        strings, WORD_SIZE, SENTENCE_SIZE, FILL_WITH, FEELING_WEIGHT, feelings=feelings)


# training set --------------------------------------------
# The *_STRING_FILLED names are always assigned (None in one-hot mode), so a value left
# over from an earlier run with the other descriptor can never be picked up silently.
X_TR_STRING_FILLED, X_TR_SCALAR = _build_descriptors(X_TR_STRING, FEELINGS_TR)
Y_TR = create_labels(X_TR_STRING, Y_TR_STRING, SENTENCE_SIZE)

# validation set ------------------------------------------
X_VAL_STRING_FILLED, X_VAL_SCALAR = _build_descriptors(X_VAL_STRING, FEELINGS_VAL)
Y_VAL = create_labels(X_VAL_STRING, Y_VAL_STRING, SENTENCE_SIZE)

In [None]:
# Show one training sample at each stage of the pre-processing.
IDX = 4
print("Original data :")
print(X_TR_ORIGINAL[IDX])

# The filled sentence only comes out of the load_data branch of the descriptor cell;
# in one-hot mode it is not produced, so this part is skipped instead of failing.
if not ONE_HOT:
    print("\nFilled sentence :")
    print(X_TR_STRING_FILLED[IDX])

print("\nDescriptor :")
print(X_TR_SCALAR[IDX])

### Standardisation

In [None]:
# Standardise every descriptor column except the last one, using statistics computed on
# the training set only (the validation set reuses them).
# NOTE(review): the last column is presumably the feeling feature — confirm in the
# descriptor code. Its mean is forced to 0 and it is never rescaled.
mu = np.concatenate([X_TR_SCALAR[:, :-1].mean(axis=0), [0]])
sigma = X_TR_SCALAR[:, :-1].std(axis=0)

# NOTE: both arrays are rebound to their centred versions, so this cell is not
# idempotent; rebuild the descriptors before re-running it.
X_TR_SCALAR = X_TR_SCALAR - mu
X_VAL_SCALAR = X_VAL_SCALAR - mu

# Constant columns (sigma == 0) are divided by 1, i.e. left unchanged.
safe_sigma = np.where(sigma != 0, sigma, 1.0)
X_TR_SCALAR[:, :-1] /= safe_sigma
X_VAL_SCALAR[:, :-1] /= safe_sigma

# Classification

In [None]:
#!pip install xgboost

In [None]:
from sklearn.multioutput import MultiOutputRegressor

from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import xgboost as xgb

In [None]:
# Number of neighbours consulted for each prediction.
nb_neighbors = 20

# Alternative models tried during experimentation (kept for reference):
#
# regressor = KNeighborsRegressor(nb_neighbors, weights="distance", metric="jaccard")
#
# regressor = MultiOutputRegressor(
#     xgb.XGBRegressor(
#         objective="reg:squaredlogerror",
#         learning_rate=0.3, gamma=0, max_depth=1000, reg_lambda=1, tree_method="hist"
#     )
# )
#
# regressor = MultiOutputRegressor(
#    SVR(kernel="poly", degree=2)
# )

# Active model: distance-weighted k-nearest-neighbours regression, fitted on the
# training descriptors and labels.
regressor = KNeighborsRegressor(n_neighbors=nb_neighbors, weights="distance")
regressor.fit(X_TR_SCALAR, Y_TR)

In [None]:
# Raw regressor outputs for both splits; they are thresholded in the next cell.
predictions_tr = regressor.predict(X_TR_SCALAR)
predictions_val = regressor.predict(X_VAL_SCALAR)

# Peek at the raw output for the validation sample at index 1.
print(predictions_val[1])

In [None]:
from utils.post_processing import preds_to_strings

# Decision threshold ("seuil" is French for threshold): a prediction strictly above it
# counts as selected. The name is also read by the per-tweet report cell further down.
proba_seil = 0.5

# Convert the thresholded (boolean) predictions back into strings with the project's
# preds_to_strings helper, for the validation and training splits.
results_val = preds_to_strings(X_VAL_ORIGINAL, X_VAL_STRING, predictions_val > proba_seil)
results_tr = preds_to_strings(X_TR_ORIGINAL, X_TR_STRING, predictions_tr > proba_seil)

# Accuracy and comparisons

In [None]:
from utils.loss import mean_jaccard

### Average jaccard of predictions
Similarity between prediction and ground truth

In [None]:
# Training split: ground truth (column 2, held in Y_TR_ORIGINAL) vs. predicted strings.

mean_jaccard(Y_TR_ORIGINAL, results_tr)

In [None]:
# Validation split: ground truth (column 2, held in Y_VAL_ORIGINAL) vs. predicted strings.

mean_jaccard(Y_VAL_ORIGINAL, results_val)

Similarity between actual prediction and trivial prediction (full tweet)

In [None]:
# Training split: full tweet (column 1, held in X_TR_ORIGINAL) vs. predicted strings.

mean_jaccard(X_TR_ORIGINAL, results_tr)

In [None]:
# Validation split: full tweet (column 1, held in X_VAL_ORIGINAL) vs. predicted strings.

mean_jaccard(X_VAL_ORIGINAL, results_val)

### Average jaccard of full tweets
Similarity between trivial prediction and ground truth

In [None]:
# Validation split: ground truth (column 2) vs. the full tweet (column 1), i.e. the score
# obtained by returning every tweet unchanged.

mean_jaccard(Y_VAL_ORIGINAL, X_VAL_ORIGINAL)

### Average jaccard of labels
Similarity between the constructed labels and the ground truth.
This is $\neq 100\%$ because of word and sentence cropping, and because of flaws in the database.

In [None]:
# print(np.where(Y_TR[:, 0] == 0))
# print(np.where(Y_VAL[:, 0] == 0))

In [None]:
from utils.loss import mean_jaccard

# Mean Jaccard score between the validation ground truth (Y_VAL_ORIGINAL, i.e. column 2
# of validation_data) and the strings rebuilt from the predictions.
accuracy = mean_jaccard(Y_VAL_ORIGINAL, results_val)
print(accuracy)

In [None]:
# Per-tweet report on the validation split: label vector vs. thresholded prediction,
# followed by the corresponding strings. Blank separator lines are emitted through `end=`.
for i in range(len(predictions_val)):
    print("Tweet:", i, end="\n\n")
    print("Ground truth", Y_VAL[i])
    print("Predictions", (predictions_val[i] > proba_seil).astype(float), end="\n\n")
    print("Original Tweet:", X_VAL_ORIGINAL[i])
    print("Cut tweet", X_VAL_STRING[i], end="\n\n")
    print("Meaningfull part:", validation_data[i, 2])
    print("Result:", results_val[i], end="\n\n\n")

In [None]:
def test_knn(neighbors, threshold):
    """Score one (neighbours, threshold) setting of the KNN regressor on the validation split.

    Reads the notebook-level arrays X_TR_SCALAR, Y_TR, X_VAL_SCALAR, X_VAL_ORIGINAL,
    X_VAL_STRING and Y_VAL_ORIGINAL.

    Args:
        neighbors: number of neighbours given to ``KNeighborsRegressor``.
        threshold: a prediction strictly greater than this value is binarised to 1,
            anything else to 0.

    Returns:
        The value of ``mean_jaccard`` between the validation ground truth and the
        strings rebuilt from the binarised predictions.
    """
    knn = KNeighborsRegressor(n_neighbors=neighbors, weights="distance").fit(X_TR_SCALAR, Y_TR)
    scores = knn.predict(X_VAL_SCALAR)

    # Boolean mask turned into 0/1 integers, as expected by the original call.
    selected = (scores > threshold) * 1
    rebuilt = preds_to_strings(X_VAL_ORIGINAL, X_VAL_STRING, selected)

    return mean_jaccard(Y_VAL_ORIGINAL, rebuilt)

In [None]:
# Threshold sweep for a fixed neighbourhood size of 300.
# The fit and the prediction do not depend on the threshold, so they are done once
# instead of once per threshold; only the binarisation and scoring are repeated.
# NOTE: this cell rebinds `regressor`, `predictions`, `results_val` and `accuracy`,
# which earlier cells also use.
regressor = KNeighborsRegressor(300, weights="distance")
regressor.fit(X_TR_SCALAR, Y_TR)
predictions = regressor.predict(X_VAL_SCALAR)

# Thresholds 0, 0.005, ..., 0.04.
thresholds = [step / 200 for step in range(9)]

y = []
for threshold in thresholds:
    predictions_bin = (predictions > threshold) * 1
    results_val = preds_to_strings(X_VAL_ORIGINAL, X_VAL_STRING, predictions_bin)
    accuracy = mean_jaccard(Y_VAL_ORIGINAL, results_val)
    y.append(accuracy)

# Plot against the actual threshold values rather than the loop index.
plt.plot(thresholds, y)
plt.xlabel("threshold")
plt.ylabel("mean Jaccard score (validation)")
plt.show()

In [None]:
# Grid search over the number of neighbours and the decision threshold.
# Bounds follow range() semantics: MIN is included, MAX is excluded.
NB_NEIGHBORS_MAX = 82
NB_NEIGHBORS_MIN = 2
STEP_NEIGH = 10

# Thresholds are expressed in hundredths: a grid value t is passed to test_knn as t / 100.
THRESHOLD_MAX = 15
THRESHOLD_MIN = 0
STEP_THRES = 5

neighbor_grid = range(NB_NEIGHBORS_MIN, NB_NEIGHBORS_MAX, STEP_NEIGH)
threshold_grid = range(THRESHOLD_MIN, THRESHOLD_MAX, STEP_THRES)

# Size the result array from the grids themselves (rows = thresholds, columns = neighbours)
# so that it always matches the loops below, including when the bounds are not multiples
# of the step or when THRESHOLD_MIN is not 0.
jaccard_accu = np.zeros((len(threshold_grid), len(neighbor_grid)))

for col, nb_neigh in enumerate(neighbor_grid):
    for row, threshold in enumerate(threshold_grid):
        jaccard_accu[row, col] = test_knn(nb_neigh, threshold / 100)

In [None]:
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401  (import registers the "3d" projection on older Matplotlib)
from matplotlib.pyplot import cm
from pathlib import Path

PATH_SAVE = Path("../results")
# savefig does not create missing directories, so make sure the target exists.
PATH_SAVE.mkdir(parents=True, exist_ok=True)

# Create the grid and the axes
fig = plt.figure(figsize=(14, 10))
LIST_NEIGHBORS = np.arange(NB_NEIGHBORS_MIN, NB_NEIGHBORS_MAX, STEP_NEIGH)
LIST_THRES = np.arange(THRESHOLD_MIN, THRESHOLD_MAX, STEP_THRES) / 100
(NEIGHBORS, THRES) = np.meshgrid(LIST_NEIGHBORS, LIST_THRES)
# Ask the figure for a 3-D subplot: a bare Axes3D(fig) is no longer attached to the
# figure automatically on recent Matplotlib releases, which leaves the plot empty.
ax = fig.add_subplot(projection="3d")

# Show the plots
surf = ax.plot_surface(NEIGHBORS, THRES, jaccard_accu, cmap=cm.coolwarm)
ax.set_xlabel('nb neighbors')
ax.set_ylabel('threshold')
ax.set_zlabel('Jaccard Score')
fig.colorbar(surf, ax=ax, shrink=0.5, aspect=5)

# Best grid cell: i indexes the thresholds (rows), j the neighbour counts (columns).
i, j = np.unravel_index(np.argmax(jaccard_accu), jaccard_accu.shape)
print("Le maximum est atteint en", (LIST_THRES[i], LIST_NEIGHBORS[j]))
print("Pour un score de", jaccard_accu[i, j])

# NOTE(review): the file name hard-codes "ascii" and "positives" whatever the values of
# ONE_HOT and SPLIT_FEELINGS — confirm this is intended before comparing saved figures.
plt.savefig(PATH_SAVE / ("ascii_t" + str(LIST_THRES[i]) + "_n" + str(LIST_NEIGHBORS[j]) + "_positives.jpg"))
plt.show()

In [None]:
# Shape of the score grid: (number of thresholds, number of neighbour counts).
np.shape(jaccard_accu)

In [None]:
# Score obtained when the constructed labels themselves (Y_VAL) are used as predictions:
# the best value the label encoding allows on the validation split.
label_strings_val = preds_to_strings(X_VAL_ORIGINAL, X_VAL_STRING, Y_VAL)
mean_jaccard(Y_VAL_ORIGINAL, label_strings_val)

In [None]:
# Trivial baseline on the validation split: the full tweet (X_VAL_ORIGINAL) is scored
# against the ground truth (Y_VAL_ORIGINAL).
mean_jaccard(Y_VAL_ORIGINAL, X_VAL_ORIGINAL)