In [1]:
from pathlib import Path
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import WordPunctTokenizer

# Data import

In [2]:
# Locations of the sampled CSV files, relative to the notebook directory.
train_csv = Path("../data/samples/sample_1000_train.csv")
validation_csv = Path("../data/samples/sample_100_validation.csv")

# Both splits are converted to plain numpy arrays; later cells index them by column position.
train_data = pd.read_csv(train_csv).to_numpy()
validation_data = pd.read_csv(validation_csv).to_numpy()

In [3]:
# Sanity check: (rows, columns) of the raw training and validation arrays.
print(train_data.shape, validation_data.shape)

(1000, 4) (100, 4)


# Pre-processing

In [4]:
from utils.load_data import load_data
from utils.data_descriptor import convert_labels

# Pre-processing configuration, forwarded positionally to load_data / convert_labels.
ALPHANUM_ONLY = False
WORD_SIZE = 30  # every word is padded with FILL_WITH up to this many characters
SENTENCE_SIZE = 50  # every sentence is padded up to this many words
FILL_WITH = "$"  # padding character
SPLIT_PUNCTUATION = False  # whether punctuation ("!?.;,/" etc.) is split from the word it is attached to
FEELING_WEIGHT = 14  # NOTE(review): presumably the weight given to the sentiment feature — confirm in utils.load_data


def _prepare_split(data):
    """Run the pre-processing pipeline on one data split.

    Parameters
    ----------
    data : numpy.ndarray
        Raw split as loaded from CSV; column 1 holds the original sentence.

    Returns
    -------
    tuple
        ``(x_string, x_scalar, x_original, y)``: the two outputs of
        ``load_data``, the original sentences (column 1 of ``data``) and
        the labels produced by ``convert_labels``.
    """
    x_string, x_scalar = load_data(
        data, WORD_SIZE, SENTENCE_SIZE, FILL_WITH, SPLIT_PUNCTUATION, ALPHANUM_ONLY, FEELING_WEIGHT
    )
    x_original = data[:, 1]
    y = convert_labels(data, SENTENCE_SIZE, SPLIT_PUNCTUATION)
    return x_string, x_scalar, x_original, y


# Train and validation go through exactly the same pipeline with the same settings.
X_TR_STRING, X_TR_SCALAR, X_TR_ORIGINAL, Y_TR = _prepare_split(train_data)
X_VAL_STRING, X_VAL_SCALAR, X_VAL_ORIGINAL, Y_VAL = _prepare_split(validation_data)

In [5]:
# Shapes of the raw data and of every array derived from it.
print(train_data.shape, X_TR_STRING.shape, X_TR_SCALAR.shape, Y_TR.shape)

# Show the first training sample in each of its representations.
first_sample_views = (
    ("\nOriginal data :", X_TR_ORIGINAL),
    ("\nFilled sentence :", X_TR_STRING),
    ("\nLabel :", Y_TR),
    ("\nDescriptor :", X_TR_SCALAR),
)
for header, samples in first_sample_views:
    print(header)
    print(samples[0])

(1000, 4) (1000, 50) (1000, 1501) (1000, 50)

Original data :
i feel really weird

Filled sentence :
['i$$$$$$$$$$$$$$$$$$$$$$$$$$$$$' 'feel$$$$$$$$$$$$$$$$$$$$$$$$$$'
 'really$$$$$$$$$$$$$$$$$$$$$$$$' 'weird$$$$$$$$$$$$$$$$$$$$$$$$$'
 '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$' '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
 '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$' '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
 '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$' '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
 '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$' '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
 '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$' '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
 '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$' '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
 '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$' '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
 '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$' '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
 '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$' '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
 '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$' '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
 '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$' '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
 '$$$$$$$$$$$$$$$$$$$$$$$$$$

### Standardisation

In [6]:
# Standardise the descriptors with statistics computed on the training set
# only; the same shift and scale are then applied to the validation set.
# The last column is deliberately left untouched: its mean is forced to 0
# and its scale to 1.
mu = np.concatenate([X_TR_SCALAR[:, :-1].mean(axis=0), [0]])
sigma = X_TR_SCALAR[:, :-1].std(axis=0)

# Constant columns (std == 0) are only centred: dividing them by 1 instead
# of 0 avoids producing inf/nan values.
scale = np.concatenate([np.where(sigma != 0, sigma, 1.0), [1.0]])

X_TR_SCALAR = (X_TR_SCALAR - mu) / scale
X_VAL_SCALAR = (X_VAL_SCALAR - mu) / scale

# Classification

In [7]:
# Uncomment to install xgboost into the kernel's environment if it is missing:
# %pip install xgboost

In [8]:
from sklearn.multioutput import MultiOutputRegressor

from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import xgboost as xgb

In [9]:
# Alternative estimators that were tried, kept for reference:
#   KNeighborsRegressor(26, weights="distance")
#   MultiOutputRegressor(xgb.XGBClassifier(objective="reg:logistic"))

# SVR handles a single target, so it is wrapped in MultiOutputRegressor,
# which fits one independent SVR per column of Y_TR.
svr_estimator = SVR(kernel="poly", degree=2)
regressor = MultiOutputRegressor(svr_estimator)

regressor.fit(X_TR_SCALAR, Y_TR)

MultiOutputRegressor(estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=2,
                                   epsilon=0.1, gamma='scale', kernel='poly',
                                   max_iter=-1, shrinking=True, tol=0.001,
                                   verbose=False),
                     n_jobs=None)

In [10]:
# Predict on the validation descriptors (standardised with the training statistics).
predictions = regressor.predict(X_VAL_SCALAR)

In [11]:
from utils.post_processing import preds_to_strings

# Turn the raw regressor outputs back into text, using the original and the
# padded validation sentences.
# NOTE(review): presumably keeps the words whose predicted score is high
# enough — confirm in utils.post_processing.
results_val = preds_to_strings(X_VAL_ORIGINAL, X_VAL_STRING, predictions)

In [12]:
from utils.loss import mean_jaccard

# Score the predicted strings against column 2 of the validation data.
# NOTE(review): despite its name, `accuracy` holds a mean Jaccard score,
# apparently on a 0-100 scale — confirm in utils.loss.
accuracy = mean_jaccard(validation_data[:, 2], results_val)
print(accuracy)

56.9449031353699
