In [None]:
from pathlib import Path
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import WordPunctTokenizer

# Dataset import

In [None]:
from utils.split_feelings import split

# Locations of the sampled train / validation CSV files.
PATH_TRAIN = Path("../data/samples/sample_1000_train.csv")
PATH_VALID = Path("../data/samples/sample_100_validation.csv")

# Toggle: True -> keep only the subset returned by `split`; False -> use every row.
SPLIT_FEELINGS = True

if not SPLIT_FEELINGS:
    # Whole CSV files, converted to NumPy arrays.
    train_data, validation_data = (
        pd.read_csv(csv_path).to_numpy() for csv_path in (PATH_TRAIN, PATH_VALID)
    )
else:
    # Flag forwarded as the second argument of `split`.
    POSITIVES_WITH_NEGATIVES = True

    train_pos, train_neutral, train_neg = split(PATH_TRAIN, POSITIVES_WITH_NEGATIVES)
    valid_pos, valid_neutral, valid_neg = split(PATH_VALID, POSITIVES_WITH_NEGATIVES)

    # Only the first element of each triple is used by the rest of the notebook.
    train_data, validation_data = train_pos, valid_pos

In [None]:
# Sanity check: shapes of the two datasets right after loading.
print(train_data.shape, validation_data.shape)

In [None]:
from utils.clean_data import clean_data 

# Run `clean_data` on both splits and rebind the original names so that every
# later cell works on the cleaned arrays.
# The validation result used to be stored under a misspelled name, which left
# `validation_data` uncleaned for the rest of the notebook.
train_data = clean_data(train_data)
validation_data = clean_data(validation_data)

In [None]:
# Print the shapes again, after the cleaning cell, to see how many rows remain.
print(train_data.shape, validation_data.shape)

# Pre-processing

In [None]:
# -- Slice the columns used below out of train_data / validation_data -- #
# NOTE(review): column 1 is presumably the original tweet, column 2 the target
# text and column 3 the feeling tag -- confirm against the CSV schema.

X_TR_ORIGINAL, Y_TR_ORIGINAL, FEELINGS_TR = (
    train_data[:, column] for column in (1, 2, 3)
)

X_VAL_ORIGINAL, Y_VAL_ORIGINAL, FEELINGS_VAL = (
    validation_data[:, column] for column in (1, 2, 3)
)

In [None]:
from descriptors.tokenizer.tokenizer import Tokenizer
from descriptors.tweet_string.create_strings import create_strings

# Descriptor settings; all four are forwarded to `load_data` in the next cell,
# and SENTENCE_SIZE is also passed to `create_strings` / `create_labels`.
# NOTE(review): presumably sizes are in characters per word and tokens per
# sentence, and FILL_WITH is the padding symbol -- confirm in the helpers.
WORD_SIZE = 30
SENTENCE_SIZE = 50
FILL_WITH = "$"
FEELING_WEIGHT = 1

# -- Single tokenizer instance shared by every call below -- #
TOKENIZER = Tokenizer()

# -- Build the sentences (inputs first, then targets; train before validation) -- #
X_TR_STRING, X_VAL_STRING, Y_TR_STRING, Y_VAL_STRING = [
    create_strings(texts, TOKENIZER, SENTENCE_SIZE)
    for texts in (X_TR_ORIGINAL, X_VAL_ORIGINAL, Y_TR_ORIGINAL, Y_VAL_ORIGINAL)
]

# Quick look at the first validation sentence.
print(X_VAL_STRING[0])

In [None]:
from descriptors.descriptor_ascii.load_data import load_data
from descriptors.descriptor_one_hot.descriptor import descriptor_one_hot
from descriptors.tweet_label.create_labels import create_labels

# Unused options kept for reference:
# ALPHANUM_ONLY = False
# SPLIT_PUNCTUATION = False  # whether punctuation ("!?.;,/", etc.) stays attached to a word or not

# NOTE(review): statement order is kept as-is on purpose -- `load_data` and
# `create_labels` both receive the same X_*_STRING object and it is not visible
# here whether `load_data` modifies it in place.

# training set --------------------------------------------
# `load_data` returns a pair: the filled strings and the numeric descriptor.
X_TR_STRING_FILLED, X_TR_SCALAR = load_data(
    X_TR_STRING, WORD_SIZE, SENTENCE_SIZE, FILL_WITH, FEELING_WEIGHT, feelings=FEELINGS_TR
)

# Alternative one-hot descriptor (disabled):
# X_TR_SCALAR = descriptor_one_hot(
#     X_TR_STRING, SENTENCE_SIZE, WORD_SIZE,
#     feelings=FEELINGS_TR, feeling_weight=FEELING_WEIGHT, fill_with_ones=True
# )

# Training targets built from the input sentences and the target sentences.
Y_TR = create_labels(X_TR_STRING, Y_TR_STRING, SENTENCE_SIZE)

# validation set ------------------------------------------
# Same processing as for the training set, applied to the validation split.
X_VAL_STRING_FILLED, X_VAL_SCALAR = load_data(
    X_VAL_STRING, WORD_SIZE, SENTENCE_SIZE, FILL_WITH, FEELING_WEIGHT, feelings=FEELINGS_VAL
)

# Alternative one-hot descriptor (disabled):
# X_VAL_SCALAR = descriptor_one_hot(
#     X_VAL_STRING, SENTENCE_SIZE, WORD_SIZE,
#     feelings=FEELINGS_VAL, feeling_weight=FEELING_WEIGHT, fill_with_ones=True
# )

# Validation targets, built the same way as the training targets.
Y_VAL = create_labels(X_VAL_STRING, Y_VAL_STRING, SENTENCE_SIZE)

In [None]:
# Show one training sample at each stage of the pipeline.
IDX = 1

# Each call prints its arguments one per line, so the output matches
# one `print` per value.
print("Original data :", X_TR_ORIGINAL[IDX], sep="\n")
# print("\nFilled sentence :")
# print(X_TR_STRING_FILLED[IDX])
print("\nLabel :", Y_TR[IDX], Y_TR_ORIGINAL[IDX], Y_TR_STRING[IDX], sep="\n")
print("\nDescriptor :", X_TR_SCALAR[IDX], sep="\n")

### Standardisation

In [None]:
# Statistics come from the training set only. The last column is left out of
# the standardisation: its offset is forced to 0 and it has no sigma entry.
mu = np.concatenate([X_TR_SCALAR[:, :-1].mean(axis=0), [0]])
sigma = X_TR_SCALAR[:, :-1].std(axis=0)

# Centre both splits with the training mean.
X_TR_SCALAR = X_TR_SCALAR - mu
X_VAL_SCALAR = X_VAL_SCALAR - mu

# Scale every standardised column by the training std; columns whose std is 0
# are divided by 1, i.e. left untouched.
X_TR_SCALAR[:, :-1] /= np.where(sigma != 0, sigma, 1)
X_VAL_SCALAR[:, :-1] /= np.where(sigma != 0, sigma, 1)

# Classification

In [None]:
#!pip install xgboost

In [None]:
from sklearn.multioutput import MultiOutputRegressor

from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import xgboost as xgb

In [None]:
# Baseline model: distance-weighted k-nearest-neighbours regression.
nb_neighbors = 5
# regressor = KNeighborsRegressor(nb_neighbors, weights="distance", metric="jaccard")
regressor = KNeighborsRegressor(n_neighbors=nb_neighbors, weights="distance")

# Alternative models (disabled):
# regressor = MultiOutputRegressor(
#     xgb.XGBRegressor(objective="reg:squaredlogerror", eta=1, gamma=0, max_depth=10)
# )

#regressor = MultiOutputRegressor(
#    SVR(kernel="poly", degree=2)
#)

# Train on the standardised training descriptors.
regressor.fit(X_TR_SCALAR, Y_TR)

In [None]:
# Predict the validation targets with the fitted model and show one sample
# (index 1) of the raw, not yet thresholded, output.
predictions = regressor.predict(X_VAL_SCALAR)
print(predictions[1])

In [None]:
# print(X_VAL_ORIGINAL[17])
# print(X_VAL_STRING[17])

In [None]:
# Binarise the predictions: values strictly above 0.5 become 1, the rest 0.
predictions_bin = (predictions > 0.5)*1

In [None]:
from utils.post_processing import preds_to_strings

# Convert the binary predictions to strings with `preds_to_strings`, using the
# original tweets and their sentence form.
results_val = preds_to_strings(X_VAL_ORIGINAL, X_VAL_STRING, predictions_bin)

In [None]:
# for i in range(3,8):
#     print(i)
#     print(Y_VAL[i])
#     print(predictions[i])
#     print(predictions_bin[i])
#     print(X_VAL_ORIGINAL[i])
#     print(Y_VAL_ORIGINAL[i])
#     print(results_val[i])
#     print("\n")

In [None]:
# print(np.where(Y_TR[:, 0] == 0))
# print(np.where(Y_VAL[:, 0] == 0))

In [None]:
from utils.loss import mean_jaccard

# Score of the baseline on the validation split: `mean_jaccard` between the
# reference strings and the decoded predictions.
accuracy = mean_jaccard(Y_VAL_ORIGINAL, results_val)
print(accuracy)

In [None]:
def test_knn(neighbors, threshold):
    """Fit a distance-weighted KNN regressor and score it on the validation set.

    Relies on the notebook globals X_TR_SCALAR, Y_TR, X_VAL_SCALAR,
    X_VAL_ORIGINAL, X_VAL_STRING and Y_VAL_ORIGINAL.

    Parameters
    ----------
    neighbors : first positional argument of ``KNeighborsRegressor``
        (the number of neighbours).
    threshold : cut-off; predictions strictly above it are mapped to 1,
        all others to 0.

    Returns
    -------
    The value returned by ``mean_jaccard`` for the validation references and
    the strings decoded from the thresholded predictions.
    """
    # `fit` returns the estimator itself, so training and prediction can be chained.
    raw_scores = (
        KNeighborsRegressor(neighbors, weights="distance")
        .fit(X_TR_SCALAR, Y_TR)
        .predict(X_VAL_SCALAR)
    )
    binary_mask = (raw_scores > threshold) * 1
    decoded = preds_to_strings(X_VAL_ORIGINAL, X_VAL_STRING, binary_mask)
    return mean_jaccard(Y_VAL_ORIGINAL, decoded)

In [None]:
# Threshold sweep for a fixed number of neighbours (14).
# The model and its raw predictions do not depend on the threshold, so they
# are computed once instead of being refitted on every iteration.
regressor = KNeighborsRegressor(14, weights="distance")
regressor.fit(X_TR_SCALAR, Y_TR)
predictions = regressor.predict(X_VAL_SCALAR)

y = []
for i in range(5):
    # Thresholds tested: 0.0, 0.1, 0.2, 0.3, 0.4.
    predictions_bin = (predictions > i/10)*1
    results_val = preds_to_strings(X_VAL_ORIGINAL, X_VAL_STRING, predictions_bin)
    accuracy = mean_jaccard(Y_VAL_ORIGINAL, results_val)
    y.append(accuracy)

# Plot against the actual threshold values rather than the loop index.
plt.plot([i / 10 for i in range(5)], y)
plt.xlabel("threshold")
plt.ylabel("mean Jaccard score")
plt.show()

In [None]:
# Grid-search bounds: number of neighbours in [MIN, MAX) with step STEP_NEIGH.
NB_NEIGHBORS_MAX = 20
NB_NEIGHBORS_MIN = 2
STEP_NEIGH = 2

# Thresholds are expressed in hundredths (divided by 100 before use).
THRESHOLD_MAX = 30
THRESHOLD_MIN = 0
STEP_THRES = 5

# The grid is sized from the ranges themselves, so it stays correct when
# THRESHOLD_MIN is not 0 or when a span is not a multiple of its step.
neighbor_grid = range(NB_NEIGHBORS_MIN, NB_NEIGHBORS_MAX, STEP_NEIGH)
threshold_grid = range(THRESHOLD_MIN, THRESHOLD_MAX, STEP_THRES)

# Rows index the thresholds, columns index the neighbour counts.
jaccard_accu = np.zeros((len(threshold_grid), len(neighbor_grid)))

for col, nb_neigh in enumerate(neighbor_grid):
    for row, threshold in enumerate(threshold_grid):
        jaccard_accu[row, col] = test_knn(nb_neigh, threshold / 100)

In [None]:
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.pyplot import cm


# Create the grid and the axes
fig = plt.figure(figsize=(14,10))
LIST_NEIGHBORS = np.arange(NB_NEIGHBORS_MIN, NB_NEIGHBORS_MAX, STEP_NEIGH)
LIST_THRES = np.arange(THRESHOLD_MIN, THRESHOLD_MAX, STEP_THRES) / 100
(NEIGHBORS, THRES) = np.meshgrid(LIST_NEIGHBORS, LIST_THRES)
# `Axes3D(fig)` no longer attaches the axes to the figure in recent Matplotlib
# releases (blank plot); `add_subplot` is the supported way to get 3D axes.
ax = fig.add_subplot(projection="3d")

# Show the plots
surf = ax.plot_surface(NEIGHBORS, THRES, jaccard_accu, cmap=cm.coolwarm)
ax.set_xlabel('nb neighbors')
ax.set_ylabel('threshold')
ax.set_zlabel('Jaccard Score')
fig.colorbar(surf, shrink=0.5, aspect=5)
plt.show()

# Locate the best score. `argmax` returns an index into the flattened array;
# it is converted back to (row, column) = (threshold, neighbours) using the
# array's own shape. Dividing by the number of thresholds was wrong because
# the row stride is the number of columns.
x = np.argmax(jaccard_accu)
i, j = np.unravel_index(x, jaccard_accu.shape)
print("Le maximum est atteint en", (LIST_THRES[i], LIST_NEIGHBORS[j]))
print("Pour un score de ", jaccard_accu[i, j])

In [None]:
# Shape of the score grid: (number of thresholds, number of neighbour counts).
jaccard_accu.shape

In [None]:
# Score obtained when the reference labels Y_VAL themselves are decoded back to
# strings. NOTE(review): presumably the best score the label encoding /
# decoding pipeline can reach -- confirm.
mean_jaccard(validation_data[:, 2], preds_to_strings(X_VAL_ORIGINAL, X_VAL_STRING, Y_VAL))