In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import WordPunctTokenizer

# Database import

In [2]:
from utils import sample_data

# Number of rows requested from the sampler, and the files involved.
NB_SAMPLES = 10
PATH_CSV = Path("../data/train.csv")
PATH_SAMPLE = Path("../data/samples/sample_{}.csv".format(NB_SAMPLES))

# Built with save=True and path_save=PATH_SAMPLE; the CSV read back below is
# presumably written by this call -- confirm against utils.sample_data.
SMALL_DATA = sample_data.SampleData(
    PATH_CSV,
    nb_samples=NB_SAMPLES,
    save=True,
    path_save=PATH_SAMPLE,
)

# Keep both views of the sample: the DataFrame and its NumPy array equivalent.
DATAFRAME = pd.read_csv(PATH_SAMPLE)
DATA = DATAFRAME.to_numpy()

In [3]:
# Plain-text dump of the sampled frame read from PATH_SAMPLE (the recorded
# output shows columns textID, text, selected_text and sentiment).
print(DATAFRAME)

       textID                                               text  \
0  e6c9c4b498  Im glad that wasnt my real diver theory test! ...   
1  b2423564f7                         AWW thanks hopefully it is   
2  983945662b  the columbus blue jackes may be movieing to an...   
3  46de360d09  Why do you hurt me? Does it bring you joy to s...   
4  d8de266e5b  said final farewells to roommate.  almost fini...   
5  1b9c6db26b  : What a let down! No MRI today, neurosurgeon ...   
6  8708ae64b8            ugly. What programmes do you have open?   
7  0903386a1d   not at my workplace.   but a short-sleeved sh...   
8  20021de78f                         thank you for your comment   
9  64395a383e  i beat aye to the music hall.  babyy, im like ...   

                                       selected_text  sentiment  
0  Im glad that wasnt my real diver theory test! ...          0  
1                                         AWW thanks          1  
2                                         s sad news 

# Pre-processing

In [4]:
from utils.load_data import load_data

# Pre-processing options, forwarded positionally to load_data below.
# Judging by the recorded output, each word is padded with FILL_WITH up to
# WORD_SIZE characters and each sentence holds SENTENCE_SIZE words.
WORD_SIZE = 30
SENTENCE_SIZE = 50
FILL_WITH = "$"
ALPHANUM_ONLY = False
# Set to False to keep punctuation ("!?.;,/" etc.) stuck to the adjacent word.
SPLIT_PUNCTUATION = True

X_STRING, X_SCALAR, Y = load_data(
    DATA,
    WORD_SIZE,
    SENTENCE_SIZE,
    FILL_WITH,
    SPLIT_PUNCTUATION,
    ALPHANUM_ONLY,
)

print(DATA.shape, X_STRING.shape, Y.shape, X_SCALAR.shape)

# Show the first sample in each representation: heading line, then the value.
print("\nOriginal data :", DATA[0], sep="\n")
print("\nFilled sentence :", X_STRING[0], sep="\n")
print("\nLabel :", Y[0], sep="\n")
print("\nDescriptor :", X_SCALAR[0], sep="\n")

(10, 4) (10, 50) (10, 50) (10, 1501)

Original data :
['e6c9c4b498'
 'Im glad that wasnt my real diver theory test! I failed  i got 70% 35/50 questions right but i did have 35mins left lol x'
 'Im glad that wasnt my real diver theory test! I failed  i got 70% 35/50 questions right but i did have 35mins left lol x'
 0]

Filled sentence :
['Im$$$$$$$$$$$$$$$$$$$$$$$$$$$$' 'glad$$$$$$$$$$$$$$$$$$$$$$$$$$'
 'that$$$$$$$$$$$$$$$$$$$$$$$$$$' 'wasnt$$$$$$$$$$$$$$$$$$$$$$$$$'
 'my$$$$$$$$$$$$$$$$$$$$$$$$$$$$' 'real$$$$$$$$$$$$$$$$$$$$$$$$$$'
 'diver$$$$$$$$$$$$$$$$$$$$$$$$$' 'theory$$$$$$$$$$$$$$$$$$$$$$$$'
 'test$$$$$$$$$$$$$$$$$$$$$$$$$$' '!$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
 'I$$$$$$$$$$$$$$$$$$$$$$$$$$$$$' 'failed$$$$$$$$$$$$$$$$$$$$$$$$'
 'i$$$$$$$$$$$$$$$$$$$$$$$$$$$$$' 'got$$$$$$$$$$$$$$$$$$$$$$$$$$$'
 '70$$$$$$$$$$$$$$$$$$$$$$$$$$$$' '%$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
 '35$$$$$$$$$$$$$$$$$$$$$$$$$$$$' '/$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
 '50$$$$$$$$$$$$$$$$$$$$$$$$$$$$' 'questions$$$$$$$$$$$$$$

In [5]:
# Column 1 of DATA is the `text` column of the sampled frame; it is kept
# untouched so predictions can later be mapped back onto the original string.
X_ORIGINAL = DATA[:, 1]

In [6]:
# Shape of the array that is passed to the model as its input features.
X_SCALAR.shape

(10, 1501)

# Classification

In [7]:
from sklearn.neighbors import KNeighborsRegressor

In [8]:
# Number of neighbours consulted for each prediction.
nb_neighbors = 1

# Distance-weighted k-nearest-neighbours regressor, fitted on the descriptors
# (X_SCALAR) against the labels Y. The fit call is the cell's last expression,
# so the fitted estimator is displayed as the cell output.
knn = KNeighborsRegressor(n_neighbors=nb_neighbors, weights="distance")
knn.fit(X_SCALAR, Y)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                    weights='distance')

In [9]:
# NOTE: the model is queried with the same X_SCALAR it was fitted on, so these
# are training-set predictions, not an estimate of generalisation.
predictions = knn.predict(X_SCALAR)

In [10]:
from utils.post_processing import pred_to_string

# One slot per prediction; dtype=object lets each slot hold whatever
# pred_to_string returns (strings, judging by the recorded output).
results = np.zeros(len(predictions), dtype=object)
for i in range(results.size):
    results[i] = pred_to_string(
        X_ORIGINAL[i],
        X_STRING[i],
        predictions[i],
    )

In [11]:
# For every sample print, one per line: the `text` column, the
# `selected_text` column, and the reconstructed result, followed by a blank line.
for i in range(len(results)):
    print(DATA[i, 1], DATA[i, 2], results[i], "", sep="\n")

Im glad that wasnt my real diver theory test! I failed  i got 70% 35/50 questions right but i did have 35mins left lol x
Im glad that wasnt my real diver theory test! I failed  i got 70% 35/50 questions right but i did have 35mins left lol x
Im glad that wasnt my real diver theory test! I failed i got 70% 35/50 questions right but i did have 35mins left lol x

 AWW thanks hopefully it is
AWW thanks
AWW thanks 

the columbus blue jackes may be movieing to anew city to play at  thats sad news
s sad news
sad news

Why do you hurt me? Does it bring you joy to see me cry? You know I love you more then anything and yet u break my heart everyday!
love
love 

said final farewells to roommate.  almost finished packing then it`s dc or bust on the 3rd. california: i divorce you x 3!
said final farewells to roommate.  almost finished packing then it`s dc or bust on the 3rd. california: i divorce you x 3!
said final farewells to roommate. almost finished packing then it`s dc or bust on the 3rd. cal

In [12]:
from utils.loss import jaccard

# Average the jaccard score (utils.loss.jaccard) between each reconstructed
# string and column 2 of DATA (`selected_text`), reporting every sample whose
# score is not exactly 1.
avg = 0
for i in range(len(results)):
    # Score each sample once and reuse the value for both the running total
    # and the mismatch report, instead of calling jaccard twice.
    score = jaccard(results[i], DATA[i, 2])
    avg += score

    if score != 1:
        print("Error on", i)
avg /= len(results)

print(avg)

Error on 2
0.9666666666666666


In [13]:
# Inspect sample 3: first its `selected_text`, then each padded token
# alongside its label, with a blank line before every pair.
print(DATA[3][2])

for i in range(len(X_STRING[3])):
    print("", X_STRING[3][i], Y[3][i], sep="\n")

love

Why$$$$$$$$$$$$$$$$$$$$$$$$$$$
0

do$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0

you$$$$$$$$$$$$$$$$$$$$$$$$$$$
0

hurt$$$$$$$$$$$$$$$$$$$$$$$$$$
0

me$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0

?$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0

Does$$$$$$$$$$$$$$$$$$$$$$$$$$
0

it$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0

bring$$$$$$$$$$$$$$$$$$$$$$$$$
0

you$$$$$$$$$$$$$$$$$$$$$$$$$$$
0

joy$$$$$$$$$$$$$$$$$$$$$$$$$$$
0

to$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0

see$$$$$$$$$$$$$$$$$$$$$$$$$$$
0

me$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0

cry$$$$$$$$$$$$$$$$$$$$$$$$$$$
0

?$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0

You$$$$$$$$$$$$$$$$$$$$$$$$$$$
0

know$$$$$$$$$$$$$$$$$$$$$$$$$$
0

I$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0

love$$$$$$$$$$$$$$$$$$$$$$$$$$
1

you$$$$$$$$$$$$$$$$$$$$$$$$$$$
0

more$$$$$$$$$$$$$$$$$$$$$$$$$$
0

then$$$$$$$$$$$$$$$$$$$$$$$$$$
0

anything$$$$$$$$$$$$$$$$$$$$$$
0

and$$$$$$$$$$$$$$$$$$$$$$$$$$$
0

yet$$$$$$$$$$$$$$$$$$$$$$$$$$$
0

u$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0

break$$$$$$$$$$$$$$$$$$$$$$$$$
0

my$$$$$$$$$$$$$$$$$$$$$$$$$$$$
0

heart$$$