In [None]:
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt

# Dataset import

In [None]:
from utils import sample_data

# Sampling configuration: number of rows requested, the source CSV, and the
# file name (which embeds the sample count) handed to SampleData as path_save.
nb_samples = 10
path_csv = Path("../data/train.csv")
path_save = Path("../data/samples") / f"sample_{nb_samples}.csv"

# Build the sample object used by every later cell.
# NOTE(review): presumably `save=True` writes the sample to `path_save` —
# confirm against utils.sample_data.SampleData.
SMALL_DATA = sample_data.SampleData(
    path_csv,
    nb_samples=nb_samples,
    save=True,
    path_save=path_save,
)

In [None]:
# Show the sampled rows held by SMALL_DATA as a quick visual check of the import.
print(SMALL_DATA.sample_data)

# Pre-processing

In [None]:
from utils.data_descriptor import vectorize_string, convert_labels, descriptor


# Pre-processing settings. Each value is forwarded as a keyword argument to the
# utils.data_descriptor helpers in the feature-building cell.

# Passed as `alphanumeric_only` to vectorize_string and descriptor.
ALPHANUM_ONLY = False
# Passed as `word_size`; X_train has WORD_SIZE * SENTENCE_SIZE columns per sample.
WORD_SIZE = 12
# Passed as `sentence_size`; X_string_train and Y_train have SENTENCE_SIZE columns.
SENTENCE_SIZE = 20
# Passed as `fill_with` — presumably the padding character for short
# words/sentences (TODO confirm in vectorize_string).
FILL_WITH = "$"

In [None]:
# Build the training arrays from the sampled rows:
#   X_string_train - per-sample output of vectorize_string on the "text" column
#   X_train        - flattened descriptor of each vectorized text
#   Y_train        - per-sample output of convert_labels, comparing the vectorized
#                    text with the vectorized "selected_text"

# Extract both columns once instead of converting them on every iteration.
raw_texts = SMALL_DATA.sample_data["text"].to_numpy()
raw_selected_texts = SMALL_DATA.sample_data["selected_text"].to_numpy()

# Size the arrays from the rows actually present rather than from the requested
# nb_samples: a mismatch would otherwise raise IndexError (more rows than
# requested) or leave zero-filled rows in the training data (fewer rows).
n_rows = len(raw_texts)

# Both columns must be vectorized with exactly the same settings.
vectorize_kwargs = dict(
    alphanumeric_only=ALPHANUM_ONLY,
    sentence_size=SENTENCE_SIZE,
    word_size=WORD_SIZE,
    fill_with=FILL_WITH,
)

X_string_train = np.zeros((n_rows, SENTENCE_SIZE), dtype=object)
X_train = np.zeros((n_rows, WORD_SIZE * SENTENCE_SIZE))
Y_train = np.zeros((n_rows, SENTENCE_SIZE))

for i in range(n_rows):
    X_string_train[i] = vectorize_string(raw_texts[i], **vectorize_kwargs)
    # The descriptor output is flattened to fit one row of X_train.
    X_train[i] = descriptor(
        X_string_train[i], alphanumeric_only=ALPHANUM_ONLY
    ).reshape(-1)
    Y_train[i] = convert_labels(
        X_string_train[i],
        vectorize_string(raw_selected_texts[i], **vectorize_kwargs),
    )

In [None]:
# Inspect the first sample at each stage: the vectorized text, its flattened
# descriptor row, and its label row.
print(X_string_train[0])
print(X_train[0])
print(Y_train[0])

# Classification

In [None]:
from sklearn.neighbors import  KNeighborsRegressor

In [None]:
# Number of neighbours consulted for each prediction.
nb_neighbors = 2

# Distance-weighted k-nearest-neighbours regressor, fitted on the descriptor
# rows (X_train) against the per-sample label rows (Y_train).
knn = KNeighborsRegressor(n_neighbors=nb_neighbors, weights="distance")
knn.fit(X_train, Y_train)

In [None]:
from utils.post_processing import pred_to_sentence, sentence_to_string, filter_character

# Predict label rows and turn them back into strings.
# NOTE(review): predictions are made on X_train, the same rows the model was
# fitted on, so anything computed from `results` is a training-set figure and
# not a held-out estimate.
pred = knn.predict(X_train)
meaning_sentences = pred_to_sentence(X_string_train, pred)

# Strip the padding character from every word, drop words that end up empty,
# and join the rest. Each kept word is followed by a single space, so a
# non-empty result ends with a trailing space.
# The padding character comes from FILL_WITH so this step stays in sync with
# the value given to vectorize_string during pre-processing.
results = []
for sentence in meaning_sentences:
    kept_words = [filter_character(word, FILL_WITH) for word in sentence]
    results.append(
        "".join(word + " " for word in kept_words if len(word) != 0)
    )
results = np.array(results, dtype=object)

In [None]:
# Show the reconstructed strings, one entry per sample.
print(results)

In [None]:
# Show the reference "selected_text" column for a side-by-side visual comparison.
print(SMALL_DATA.sample_data["selected_text"].to_numpy())

In [None]:
from utils.loss import jaccard

# Mean of jaccard(prediction, reference) over all reconstructed strings.

# Convert the reference column once; the original rebuilt this array on every
# loop iteration.
selected_texts = SMALL_DATA.sample_data["selected_text"].to_numpy()

scores = [
    jaccard(predicted, selected_texts[i])
    for i, predicted in enumerate(results)
]
avg = sum(scores) / len(scores)

print(avg)