In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import WordPunctTokenizer

# Database import

In [2]:
from utils import sample_data
from utils.clean_data import clean_data

# -- Load the raw sample file and clean it in one pass -- #
NB_SAMPLES = 10
PATH_SAMPLE = Path("../data/samples/sample_{}_train.csv".format(NB_SAMPLES))

# Read the CSV, convert it to a NumPy array, then hand it to the project cleaner.
DATA_SAMPLE = clean_data(pd.read_csv(PATH_SAMPLE).to_numpy())

In [3]:
# Dump the cleaned sample array for a visual sanity check.
# NOTE(review): judging by the captured output, each row looks like
# [id, tweet text, selected text, sentiment] -- confirm against clean_data.
print(DATA_SAMPLE)

[['20951ebfde'
  ' While im stuck INSIDE in Elk Grove Village working all day   Someone should enjoy it!'
  'While im stuck INSIDE in Elk Grove Village working all day   Someone should enjoy it'
  0]
 ['e283379cad'
  'hates the net. ayaw bumukas ng twitter.  http://plurk.com/p/wxlxs'
  'hates the net.' -1]
 ['0bc3d2ce9f'
  'Im feeling so nostalgic. Im sad. But happy. I don`t now how to feel. It`s over, but not at the same time. It just feels over.  i love you.'
  'love' 1]
 ['e316d2f54b' '_in_NH night bud' '_in_NH night bud' 0]
 ['57d554e002' 'time for work!' 'time for work!' 0]
 ['35d95a6bfa'
  'http://twitpic.com/4wqfv - dark berry mocha frapp.. heaven.. TRY IT EVERYONE!!  here.. let me pass it to you'
  'dark berry mocha frapp.. heaven.. TRY IT EVERYONE!!  here.. let me pass it to you'
  1]
 ['a2f6c5cca9'
  'Playing with the munchkin today, talking cakes and getting ready for a yard sale tomorrow. Not looking forward to that'
  'Not looking forward to that' -1]
 ['dcacc9ef8f'
  'The

# Pre-processing
#### TWEET_ORIGINAL : The original tweets : Array, shape = (nb_tweets,)
#### TWEET_STRING : The list of words of each tweet : List of lists of strings
#### TWEET_SCALAR : The description of each tweet : Array, shape = (nb_tweets, sentence_size * word_size)
#### IMPORTANT_WORDS : The labels of each tweet : Array, shape = (nb_tweets, sentence_size)

In [4]:
# -- Parameters -- #
# Dimension of the word vectors; it selects which glove.6B.<WORD_SIZE>d.txt file is loaded.
WORD_SIZE = 50  # 50 or 100 or 200 or 300
# Number of word slots per tweet (see the array shapes listed in the Pre-processing notes).
# NOTE(review): how tweets longer than this are handled is not visible here -- confirm.
SENTENCE_SIZE = 50  # Free choice
FILL_WITH = 0  # If a word is not in the dictionary, [0, ..., 0] will describe it.
# Bundle of the three settings above, handed to the descriptor helper (tweet_scalar_glove).
OPTIONS = [WORD_SIZE, SENTENCE_SIZE, FILL_WITH]

# NOTE(review): not referenced by any other cell visible in this notebook -- confirm where it is used.
SENTIMENT_WEIGHT = 2  # Multiply the sentiment by a factor

In [5]:
# -- Get the original tweets -- #
TWEET_ORIGINAL = DATA_SAMPLE[:, 1]
print("First tweet :")
print(TWEET_ORIGINAL[0])
print("Shape of TWEET_ORIGINAL :", TWEET_ORIGINAL.shape)

First tweet :
 While im stuck INSIDE in Elk Grove Village working all day   Someone should enjoy it!
Shape of TWEET_ORIGINAL : (9,)


In [7]:
from descriptors.tweet_string.create_strings import create_strings
from descriptors.tokenizer.tokenizer import Tokenizer

# One tokenizer instance, reused by every tokenization step
TOKENIZER = Tokenizer()

# -- Split every tweet into its list of tokens -- #
TWEET_STRING = create_strings(TWEET_ORIGINAL, TOKENIZER)

# Show the tokens of the first tweet under its caption, then the number of tweets.
print("Decomposition of the first tweet :", TWEET_STRING[0], sep="\n")
print("Length of TWEET_STRING :", len(TWEET_STRING))


Decomposition of the first tweet :
['While', 'im', 'stuck', 'INSIDE', 'in', 'Elk', 'Grove', 'Village', 'working', 'all', 'day', 'Someone', 'should', 'enjoy', 'it', '!']
Length of TWEET_STRING : 9


In [None]:
import csv

from descriptors.descriptor_glove.descriptor_glove import tweet_scalar_glove
from utils.standardize import standardize

# Get the dictionary
PATH_DICTIONARY = Path("../data/glove_descriptor/glove.6B.{}d.txt".format(WORD_SIZE))
# QUOTE_NONE: the GloVe vocabulary contains a bare '"' token. With pandas' default
# quoting that character opens a quoted field, so the following lines are swallowed
# (or the parse fails with "EOF inside string"). Disabling quoting reads every line
# as one word followed by its vector.
DICTIONARY = pd.read_csv(PATH_DICTIONARY, sep=" ", header=None, quoting=csv.QUOTE_NONE)

# Additional dictionary: tokens rewritten to a word expected to be in the GloVe vocabulary
ADDITIONAL_DIC = {"..": "...", "<3": "love"}

# Get the sentiments (last column of the sample)
SENTIMENTS = DATA_SAMPLE[:, -1]

# -- Get the descriptions of each tweet -- #
TWEET_SCALAR = tweet_scalar_glove(TWEET_STRING, SENTIMENTS, DICTIONARY, ADDITIONAL_DIC, OPTIONS)

# Standardize the tweet descriptions
# NOTE(review): the return value is discarded, so this only has an effect if
# standardize works in place -- confirm against utils.standardize.
standardize(TWEET_SCALAR)

print("Description of the first tweet :")
print(TWEET_SCALAR[0])
print("Shape of TWEET_SCALAR :", TWEET_SCALAR.shape)

In [None]:
# Package name fixed: "descriptors", as in the other imports of this notebook.
from descriptors.tweet_label.create_labels import create_labels

# Create the decompositions of the labels.
# Column 2 holds the selected text (the ground truth); column 1 is the full tweet,
# which would make every word of every tweet count as part of the label.
LABEL_ORIGINAL = DATA_SAMPLE[:, 2]
# Tokenize the labels with the same call that produced TWEET_STRING, so both
# sequences passed to create_labels come from one tokenization.
LABEL_STRING = create_strings(LABEL_ORIGINAL, TOKENIZER)

# -- Get the labels -- #
IMPORTANT_WORDS = create_labels(TWEET_STRING, LABEL_STRING)

# Classification

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Number of neighbours consulted for each prediction
nb_neighbors = 1

# Distance-weighted nearest-neighbour regressor, fitted on the tweet descriptions
# (inputs) and the per-word labels (targets).
knn = KNeighborsRegressor(n_neighbors=nb_neighbors, weights="distance")
knn.fit(X=TWEET_SCALAR, y=IMPORTANT_WORDS)

In [None]:
# Predict on the same descriptions the model was fitted on.
PREDICTIONS = knn.predict(TWEET_SCALAR)

print(TWEET_ORIGINAL)
print(TWEET_ORIGINAL.shape)
# dtype=object: tweets have different token counts, and recent NumPy refuses to
# build an array from ragged nested lists without it.
print(np.array(TWEET_STRING, dtype=object))
# TWEET_STRING is a plain Python list (no .shape attribute); report its length instead.
print(len(TWEET_STRING))
print(PREDICTIONS)
print(PREDICTIONS.shape)

In [None]:
from utils.post_processing import pred_to_string

# Convert the predictions into one result per tweet, using the original tweets
# and their token lists.
RESULTS = pred_to_string(
    TWEET_ORIGINAL,
    TWEET_STRING,
    PREDICTIONS,
)

In [None]:
# Show, for every tweet: the tweet text, the expected selected text and the prediction.
# Uses RESULTS / DATA_SAMPLE, the names actually defined in this notebook
# (`results` and `DATA` do not exist and raised NameError).
for i in range(len(RESULTS)):
    print(DATA_SAMPLE[i, 1])
    print(DATA_SAMPLE[i, 2])
    print(RESULTS[i])
    print()

In [None]:
from utils.loss import jaccard

# Average Jaccard score between each prediction and the expected selected text
# (column 2 of DATA_SAMPLE, the array defined in this notebook; `DATA` does not exist).
AVG = 0
for i in range(len(RESULTS)):
    # Compute the score once and reuse it for both the sum and the check.
    score = jaccard(RESULTS[i], DATA_SAMPLE[i, 2])
    AVG += score

    if score != 1:
        print("Error on", i)

# Guard against an empty result list (would otherwise divide by zero).
if len(RESULTS) > 0:
    AVG /= len(RESULTS)

print(AVG)