In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import WordPunctTokenizer

# Dataset import

In [2]:
from utils import sample_data

# Size of the working sample and the locations of the full / sampled CSV files.
NB_SAMPLES = 10
PATH_CSV = Path("../data/train.csv")
PATH_SAMPLE = Path("../data/samples/sample_{}.csv".format(NB_SAMPLES))

# Build the sample from the full training CSV. With save=True and path_save set,
# SampleData presumably writes the sample to PATH_SAMPLE, which is read back
# just below -- confirm in utils/sample_data.
SMALL_DATA = sample_data.SampleData(
    PATH_CSV,
    nb_samples=NB_SAMPLES,
    save=True,
    path_save=PATH_SAMPLE,
)

# Load the sampled rows, both as a DataFrame and as a plain NumPy array.
DATAFRAME = pd.read_csv(PATH_SAMPLE)
DATA = DATAFRAME.to_numpy()

In [3]:
# Plain-text dump of the sampled rows read from PATH_SAMPLE, to eyeball the columns.
print(DATAFRAME)

       textID                                               text  \
0  b0658f9706          Not babysitting tonight. I miss that kid.   
1  6416bec5a1                             happy mothers day!  <3   
2  86fb428374                                  _d Zwarte maillot   
3  1efdd67d78      _Girl OOC sorry we keep missing each other...   
4  f2fda0303d  my little sister told me i have a double chin ...   
5  cf240ef11f            ha! you`re right...I know you`re right.   
6  698b6e5aef  I want tuna & salmon sashimi, B.C. rolls and d...   
7  a3527a3ccc  J, sorry about the bad night, hopefully my wor...   
8  f83955956a  i`m watching missing pieces, just coz the them...   
9  b70e61e2cb  whole foods, barton springs, yogurt spot & the...   

                                       selected_text  sentiment  
0                                               miss         -1  
1                                 happy mothers day!          1  
2                                  _d Zwarte maillot 

# Pre-processing

In [4]:
from utils.load_data import load_data
from utils.data_descriptor import convert_labels

# Pre-processing options, all forwarded to load_data / convert_labels below.
WORD_SIZE = 30          # padded token length passed to load_data
SENTENCE_SIZE = 50      # padded sentence length (number of tokens)
FILL_WITH = "$"         # padding character
ALPHANUM_ONLY = False   # presumably restricts tokens to alphanumeric characters -- confirm in load_data
# Whether punctuation ("!?.;,/" etc.) is split off from words
# (False keeps it attached to its word).
SPLIT_PUNCTUATION = False

# Padded string tokens and their numeric descriptors.
X_STRING, X_SCALAR = load_data(
    DATA, WORD_SIZE, SENTENCE_SIZE, FILL_WITH, SPLIT_PUNCTUATION, ALPHANUM_ONLY
)

# Per-token labels derived from the raw rows.
Y = convert_labels(DATA, SENTENCE_SIZE, SPLIT_PUNCTUATION)

print(DATA.shape, X_STRING.shape, Y.shape, X_SCALAR.shape)

# Show the first sample at every stage of the pipeline.
for header, array in (
    ("\nOriginal data :", DATA),
    ("\nFilled sentence :", X_STRING),
    ("\nLabel :", Y),
    ("\nDescriptor :", X_SCALAR),
):
    print(header)
    print(array[0])

(10, 4) (10, 50) (10, 50) (10, 1501)

Original data :
['b0658f9706' 'Not babysitting tonight. I miss that kid.' 'miss' -1]

Filled sentence :
['Not$$$$$$$$$$$$$$$$$$$$$$$$$$$' 'babysitting$$$$$$$$$$$$$$$$$$$'
 'tonight.$$$$$$$$$$$$$$$$$$$$$$' 'I$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
 'miss$$$$$$$$$$$$$$$$$$$$$$$$$$' 'that$$$$$$$$$$$$$$$$$$$$$$$$$$'
 'kid.$$$$$$$$$$$$$$$$$$$$$$$$$$' '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
 '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$' '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
 '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$' '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
 '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$' '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
 '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$' '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
 '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$' '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
 '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$' '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
 '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$' '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
 '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$' '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
 '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$' '$$$$$$$$$$$$$$$$$$$

In [5]:
# Column 1 of every row is the raw tweet text (row layout: textID, text, selected_text, sentiment).
X_ORIGINAL = DATA[:, 1]

In [6]:
# Shape check on the matrix that is later passed to knn.fit as the features.
X_SCALAR.shape

(10, 1501)

# Classification

In [7]:
from sklearn.neighbors import KNeighborsRegressor

In [9]:
# Distance-weighted k-nearest-neighbours regressor, fitted on the numeric
# descriptors (X_SCALAR) against the per-token labels (Y).
nb_neighbors = 1

knn = KNeighborsRegressor(n_neighbors=nb_neighbors, weights="distance")
knn.fit(X_SCALAR, Y)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                    weights='distance')

In [10]:
# NOTE(review): this predicts on the very samples the model was fitted on.
# With n_neighbors=1 each sample is presumably its own nearest neighbour, so any
# score computed from these predictions is a pipeline sanity check, not an
# estimate of generalisation -- evaluate on held-out rows for a real metric.
predictions = knn.predict(X_SCALAR)

In [11]:
from utils.post_processing import pred_to_string

# Turn each per-token prediction back into a text selection, one entry per sample.
results = np.empty(len(predictions), dtype=object)
for idx, prediction in enumerate(predictions):
    results[idx] = pred_to_string(X_ORIGINAL[idx], X_STRING[idx], prediction)

In [12]:
# For every sample print: the tweet, the expected selection, the predicted selection.
for row, predicted_text in zip(DATA, results):
    print(row[1])
    print(row[2])
    print(predicted_text)
    print()

Not babysitting tonight. I miss that kid.
miss
miss 

happy mothers day!  <3
happy mothers day!
happy mothers day! 

_d Zwarte maillot
_d Zwarte maillot
_d Zwarte maillot

_Girl OOC sorry we keep missing each other...
sorry we keep missing each other...
sorry we keep missing each other...

my little sister told me i have a double chin  aww
my little sister told me i have a double chin  aww
my little sister told me i have a double chin aww

 ha! you`re right...I know you`re right.
you`re right.
you`re you`re right.

I want tuna & salmon sashimi, B.C. rolls and dragon eye.
I want tuna & salmon sashimi, B.C. rolls and dragon eye.
I want tuna & salmon sashimi, B.C. rolls and dragon eye.

J, sorry about the bad night, hopefully my workout updates gave you a little humor in your night..
J, sorry about the bad night, hopefully my workout updates gave you a little humor in your night..
J, sorry about the bad night, hopefully my workout updates gave you a little humor in your night..

i`m watch

In [13]:
from utils.loss import jaccard

# Jaccard score of each reconstructed string against the reference selection
# (column 2 of DATA). Every pair is scored exactly once.
scores = [jaccard(results[i], DATA[i, 2]) for i in range(len(results))]

# Report every sample whose prediction is not a perfect match.
for i, score in enumerate(scores):
    if score != 1:
        print("Error on", i)

# Mean score over all samples; the built-in sum accumulates left to right from 0.
avg = sum(scores) / len(scores)

print(avg)

1.0


In [14]:
# Inspect one sample token by token: its expected selection first, then each
# padded token followed by the label assigned to it.
sample_idx = 3
print(DATA[sample_idx][2])

for padded_token, label in zip(X_STRING[sample_idx], Y[sample_idx]):
    print()
    print(padded_token)
    print(label)

sorry we keep missing each other...

_Girl$$$$$$$$$$$$$$$$$$$$$$$$$
0.0

OOC$$$$$$$$$$$$$$$$$$$$$$$$$$$
0.0

sorry$$$$$$$$$$$$$$$$$$$$$$$$$
1.0

we$$$$$$$$$$$$$$$$$$$$$$$$$$$$
1.0

keep$$$$$$$$$$$$$$$$$$$$$$$$$$
1.0

missing$$$$$$$$$$$$$$$$$$$$$$$
1.0

each$$$$$$$$$$$$$$$$$$$$$$$$$$
1.0

other...$$$$$$$$$$$$$$$$$$$$$$
1.0

$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0.0

$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0.0

$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0.0

$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0.0

$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0.0

$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0.0

$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0.0

$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0.0

$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0.0

$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0.0

$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0.0

$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0.0

$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0.0

$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0.0

$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0.0

$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0.0

$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0.0

$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0.0

$$$$$$$$$$$$$$$$$$$$$$$$$$$