In [1]:
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt

# Database import

In [2]:
from utils import sample_data

# Number of rows requested from the training CSV; the same number is embedded
# in the file name handed to SampleData as `path_save`.
nb_samples = 10
path_csv = Path("../data/train.csv")
path_save = Path("../data/samples") / "sample_{}.csv".format(nb_samples)

# NOTE(review): no seed is passed here. If SampleData draws rows at random the
# sample will differ from run to run -- confirm in utils.sample_data.
# Presumably save=True writes the drawn sample to `path_save` -- confirm.
SMALL_DATA = sample_data.SampleData(
    path_csv,
    nb_samples=nb_samples,
    save=True,
    path_save=path_save,
)

In [3]:
# Show the rows held by SMALL_DATA.sample_data as plain text.
print(SMALL_DATA.sample_data)

           textID                                               text  \
10413  a4bb6136b2   Sorry but there is no parking space. And I ju...   
15225  d63253be9a   WASSUP BEAUTIFUL!!! FOLLOW ME!!  PEEP OUT MY ...   
15331  3de9819b40  In the mood for shrimp scampi but I don`t have...   
22638  add1c95bee          Star Trek.. Did not disappoint!  5 star!!   
20551  87954b6e95  On my way home in the sunshine with a bag full...   
22687  eaba181d46   prolly E71... I can`t think of anything else ...   
15340  2c570c2fb3   that is a good pic  All the guys looked good ...   
4524   7e9df3893e   Looks like you did the full Lincoln Marathon ...   
12510  74d3120b7f   YAY YOU! So proud of you!  and I`m not even b...   
1959   9f61b74d16  Something strange in the air lately. Been sett...   

                                           selected_text  sentiment  
10413                                              Sorry         -1  
15225  WASSUP BEAUTIFUL!!! FOLLOW ME!!  PEEP OUT MY N...          0

# Pre-processing

In [4]:
from utils.data_descriptor import vectorize_string, convert_labels, descriptor


# Forwarded as `alphanumeric_only` to vectorize_string and descriptor.
ALPHANUM_ONLY = False
# Forwarded as `word_size` to vectorize_string; X_train gets
# WORD_SIZE * SENTENCE_SIZE columns per sample.
WORD_SIZE = 12
# Forwarded as `sentence_size` to vectorize_string; X_string_train and Y_train
# get SENTENCE_SIZE columns per sample.
SENTENCE_SIZE = 20
# Forwarded as `fill_with` to vectorize_string (the padding character visible
# in the printed X_string_train rows).
FILL_WITH = "$"

In [5]:
# Build the training arrays from the sampled rows:
#   X_string_train -- output of vectorize_string for each `text`
#   X_train        -- descriptor of those strings, flattened to one row
#   Y_train        -- output of convert_labels for `text` vs `selected_text`

# Pull both text columns out of the sample once instead of calling
# .to_numpy() again on every loop iteration.
texts = SMALL_DATA.sample_data["text"].to_numpy()
selected_texts = SMALL_DATA.sample_data["selected_text"].to_numpy()

# Size the arrays from the rows actually present in the sample rather than
# from nb_samples, so the loop below can never index past the last row and no
# all-zero rows are left behind if the two counts ever differ.
n_rows = len(texts)

# Options shared by both vectorize_string calls, so `text` and `selected_text`
# are always vectorised with identical settings.
vectorize_options = dict(
    alphanumeric_only=ALPHANUM_ONLY,
    sentence_size=SENTENCE_SIZE,
    word_size=WORD_SIZE,
    fill_with=FILL_WITH,
)

X_string_train = np.zeros((n_rows, SENTENCE_SIZE), dtype=object)
X_train = np.zeros((n_rows, WORD_SIZE * SENTENCE_SIZE))
Y_train = np.zeros((n_rows, SENTENCE_SIZE))

for i, (text, selected_text) in enumerate(zip(texts, selected_texts)):
    X_string_train[i] = vectorize_string(text, **vectorize_options)
    X_train[i] = descriptor(X_string_train[i], alphanumeric_only=ALPHANUM_ONLY).reshape(-1)

    Y_train[i] = convert_labels(
        X_string_train[i],
        vectorize_string(selected_text, **vectorize_options),
    )

In [6]:
# Inspect the first sample: its vectorised word strings, its flattened
# descriptor row and its label row.
print(X_string_train[0])
print(X_train[0])
print(Y_train[0])

['hi,$$$$$$$$$' 'the$$$$$$$$$' 'parody$$$$$$' 'for$$$$$$$$$'
 'iPhone$$$$$$' 'is$$$$$$$$$$' 'hysterical,$' 'not$$$$$$$$$'
 'because$$$$$' 'it`s$$$$$$$$' 'funny,$$$$$$' 'but$$$$$$$$$'
 'because$$$$$' 'one$$$$$$$$$' 'cannot$$$$$$' 'play$$$$$$$$'
 'the$$$$$$$$$' 'video$$$$$$$' 'on$$$$$$$$$$' 'iPhone$$$$$$']
[104. 105.  44.  36.  36.  36.  36.  36.  36.  36.  36.  36. 116. 104.
 101.  36.  36.  36.  36.  36.  36.  36.  36.  36. 112.  97. 114. 111.
 100. 121.  36.  36.  36.  36.  36.  36. 102. 111. 114.  36.  36.  36.
  36.  36.  36.  36.  36.  36. 105.  80. 104. 111. 110. 101.  36.  36.
  36.  36.  36.  36. 105. 115.  36.  36.  36.  36.  36.  36.  36.  36.
  36.  36. 104. 121. 115. 116. 101. 114. 105.  99.  97. 108.  44.  36.
 110. 111. 116.  36.  36.  36.  36.  36.  36.  36.  36.  36.  98. 101.
  99.  97. 117. 115. 101.  36.  36.  36.  36.  36. 105. 116.  96. 115.
  36.  36.  36.  36.  36.  36.  36.  36. 102. 117. 110. 110. 121.  44.
  36.  36.  36.  36.  36.  36.  98. 117. 116.  36.  36.

# Classification

In [7]:
from sklearn.neighbors import  KNeighborsRegressor

In [8]:
# Distance-weighted k-nearest-neighbours regressor, fitted on the descriptor
# rows (X_train) against the label rows (Y_train).
nb_neighbors = 2

knn = KNeighborsRegressor(n_neighbors=nb_neighbors, weights="distance")
# Kept as the cell's last expression so the fitted estimator is echoed.
knn.fit(X_train, Y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                    weights='distance')

In [9]:
from utils.post_processing import pred_to_sentence, sentence_to_string, filter_character

# Predictions are made on X_train, the same rows `knn` was fitted on, so any
# score computed from `results` is a training score, not a held-out one.
pred = knn.predict(X_train)
meaning_sentences = pred_to_sentence(X_string_train, pred)

results = []
for sentence in meaning_sentences:
    # Strip the padding with the same FILL_WITH constant that was handed to
    # vectorize_string, instead of a hard-coded "$" that would silently go
    # stale if the padding character were changed.
    filtered_words = [filter_character(word, FILL_WITH) for word in sentence]
    # Empty strings returned by filter_character are skipped. Every kept word
    # is followed by one space, so a non-empty result ends with a trailing
    # space (same as the original accumulation loop).
    results.append("".join(word + " " for word in filtered_words if len(word) != 0))
results = np.array(results, dtype=object)

In [10]:
# Strings rebuilt from the model's predictions, one per sample.
print(results)

['hi, the parody for iPhone is hysterical, not because it`s funny, but because one cannot play the video on iPhone '
 'with ' 'sad ' '_Yavanna me too. she`s everywhere.. ' 'Watching Matilda '
 'it just might work. ' 'dont be ' 'Happy Mother`s Day '
 'wish i was 17 again ' 'bloody ']


In [11]:
# Reference `selected_text` values of the sampled rows, to compare by eye with
# the printed `results`.
print(SMALL_DATA.sample_data["selected_text"].to_numpy())

['hi, the parody for iPhone is hysterical, not because it`s funny, but because one cannot play the video on iPhone'
 'with' 'sad' '_Yavanna me too. she`s everywhere...' 'Watching Matilda'
 'it just might work.' 'dont be gloomy.' 'Happy Mother`s Day'
 'wish i was 17 again' 'bloody']


In [12]:
from utils.loss import jaccard

# Mean jaccard score between each rebuilt string in `results` and the
# `selected_text` value at the same position in the sample.
targets = SMALL_DATA.sample_data["selected_text"].to_numpy()

avg = 0
for idx, predicted in enumerate(results):
    avg += jaccard(predicted, targets[idx])
avg /= len(results)

print(avg)

0.9333333333333332
