In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import WordPunctTokenizer

# Database import

In [2]:
from utils import sample_data
from utils.clean_data import clean_data

# -- Get the data -- #
# Number used to build the sample file names below.
NB_SAMPLES = 10

# The file locations get their own names so that TRAIN_SAMPLE / VALID_SAMPLE
# only ever refer to sample arrays, never to a Path.
TRAIN_SAMPLE_PATH = Path("../data/samples/sample_{}_train.csv".format(NB_SAMPLES))
VALID_SAMPLE_PATH = Path("../data/samples/sample_{}_validation.csv".format(NB_SAMPLES))

# Read each CSV file and convert the resulting DataFrame to a NumPy array.
TRAIN_SAMPLE_RAW = pd.read_csv(TRAIN_SAMPLE_PATH).to_numpy()
VALID_SAMPLE_RAW = pd.read_csv(VALID_SAMPLE_PATH).to_numpy()


# -- Clean the data -- #
# The value returned by clean_data is what every later cell uses.
TRAIN_SAMPLE = clean_data(TRAIN_SAMPLE_RAW)
VALID_SAMPLE = clean_data(VALID_SAMPLE_RAW)

In [3]:
# Display the cleaned training sample to inspect its rows.
print(TRAIN_SAMPLE)

[['20951ebfde'
  ' While im stuck INSIDE in Elk Grove Village working all day   Someone should enjoy it!'
  'While im stuck INSIDE in Elk Grove Village working all day   Someone should enjoy it'
  0]
 ['e283379cad'
  'hates the net. ayaw bumukas ng twitter.  http://plurk.com/p/wxlxs'
  'hates the net.' -1]
 ['0bc3d2ce9f'
  'Im feeling so nostalgic. Im sad. But happy. I don`t now how to feel. It`s over, but not at the same time. It just feels over.  i love you.'
  'love' 1]
 ['e316d2f54b' '_in_NH night bud' '_in_NH night bud' 0]
 ['57d554e002' 'time for work!' 'time for work!' 0]
 ['35d95a6bfa'
  'http://twitpic.com/4wqfv - dark berry mocha frapp.. heaven.. TRY IT EVERYONE!!  here.. let me pass it to you'
  'dark berry mocha frapp.. heaven.. TRY IT EVERYONE!!  here.. let me pass it to you'
  1]
 ['a2f6c5cca9'
  'Playing with the munchkin today, talking cakes and getting ready for a yard sale tomorrow. Not looking forward to that'
  'Not looking forward to that' -1]
 ['dcacc9ef8f'
  'The

# Pre-processing
#### TWEET_ORIGINALS : The original tweets : Array, shape = (nb_tweets,)
#### TWEET_STRINGS : The list of the words of each tweet : List of lists of strings
#### TWEET_SCALARS : The description of each tweet : Array, shape = (nb_tweets, sentence_size * word_size)
#### IMPORTANT_WORDS : The label of each tweet : Array, shape = (nb_tweets, sentence_size)

In [4]:
# -- Parameters -- #
WORD_SIZE = 50  # Accepted values: 50, 100, 200 or 300
FILL_WITH = 0  # A word missing from the dictionary is described by [0, ..., 0]
SENTIMENT_WEIGHT = 2  # Factor the sentiment is multiplied by
OPTIONS = [WORD_SIZE, FILL_WITH, SENTIMENT_WEIGHT]

SENTENCE_SIZE = 50  # Arbitrary value


# -- Get the original tweets -- #
# Column 1 of each sample is taken as the tweet text.
TWEET_ORIGINALS_TRAIN, TWEET_ORIGINALS_VALID = (
    sample[:, 1] for sample in (TRAIN_SAMPLE, VALID_SAMPLE)
)

# Quick look at the first training tweet and at the array shape.
print("First tweet :", TWEET_ORIGINALS_TRAIN[0], sep="\n")
print("Shape of TWEET_ORIGINAL :", TWEET_ORIGINALS_TRAIN.shape)

First tweet :
 While im stuck INSIDE in Elk Grove Village working all day   Someone should enjoy it!
Shape of TWEET_ORIGINAL : (9,)


In [5]:
from descriptors.tweet_string.create_strings import create_strings
from descriptors.tokenizer.tokenizer import Tokenizer

# A single tokenizer instance is shared by every create_strings call
TOKENIZER = Tokenizer()

# -- Get the decomposition of the tweets -- #
# Training tweets are processed first, then the validation tweets.
TWEET_STRINGS_TRAIN, TWEET_STRINGS_VALID = (
    create_strings(tweets, TOKENIZER, SENTENCE_SIZE)
    for tweets in (TWEET_ORIGINALS_TRAIN, TWEET_ORIGINALS_VALID)
)

# Quick look at the first decomposed tweet and at the number of tweets.
print("Decomposition of the first tweet :", TWEET_STRINGS_TRAIN[0], sep="\n")
print("Length of TWEET_STRING :", len(TWEET_STRINGS_TRAIN))


Decomposition of the first tweet :
['While', 'im', 'stuck', 'INSIDE', 'in', 'Elk', 'Grove', 'Village', 'working', 'all', 'day', 'Someone', 'should', 'enjoy', 'it', '!']
Length of TWEET_STRING : 9


In [6]:
from descriptors.descriptor_glove.descriptor_glove import tweet_scalar_glove
from utils.standardize import standardize


# Get the dictionary
# Alternative dictionary file, selected by WORD_SIZE (currently disabled):
# PATH_DICTIONARY = Path("../data/glove_descriptor/glove.6B.{}d.txt".format(WORD_SIZE))
PATH_DICTIONARY = Path("../data/glove_descriptor/sample_test.txt")
# The file is space-separated and has no header row.
DICTIONARY = pd.read_csv(PATH_DICTIONARY, header=None, sep=" ")

# Additional dictionary
ADDITIONAL_DIC = {"..": "...", "<3": "love"}

# Get the sentiments (last column of each sample)
SENTIMENTS_TRAIN, SENTIMENTS_VALID = (
    sample[:, -1] for sample in (TRAIN_SAMPLE, VALID_SAMPLE)
)

# -- Get the descriptions of each tweet -- #
TWEET_SCALARS_TRAIN = tweet_scalar_glove(
    TWEET_STRINGS_TRAIN,
    SENTIMENTS_TRAIN,
    DICTIONARY,
    ADDITIONAL_DIC,
    OPTIONS,
)
TWEET_SCALARS_VALID = tweet_scalar_glove(
    TWEET_STRINGS_VALID,
    SENTIMENTS_VALID,
    DICTIONARY,
    ADDITIONAL_DIC,
    OPTIONS,
)

# Standardization of the tweet descriptions (currently disabled):
# standardize(TWEET_SCALARS_TRAIN)
# standardize(TWEET_SCALARS_VALID)

# Quick look at the first tweet, its description and the overall shape.
print(
    "Description of the first tweet :",
    TWEET_STRINGS_TRAIN[0],
    TWEET_SCALARS_TRAIN[0],
    sep="\n",
)
print("Shape of TWEET_SCLALAR :", TWEET_SCALARS_TRAIN.shape)

Description of the first tweet :
['While', 'im', 'stuck', 'INSIDE', 'in', 'Elk', 'Grove', 'Village', 'working', 'all', 'day', 'Someone', 'should', 'enjoy', 'it', '!']
[ 0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.     

In [None]:
from descriptors.tweet_label.create_labels import create_labels

# Create the decompositions of the labels.
# The label text is read from column 2, the same column the evaluation cells use as
# the reference text. Column 1 is the full tweet, already held in TWEET_ORIGINALS_*;
# reading it here would make every label string identical to its own tweet.
LABEL_ORIGINALS_TRAIN = TRAIN_SAMPLE[:, 2]
LABEL_ORIGINALS_VALID = VALID_SAMPLE[:, 2]
# The labels are tokenized exactly like the tweets (same tokenizer, same sentence size).
LABEL_STRINGS_TRAIN = create_strings(LABEL_ORIGINALS_TRAIN, TOKENIZER, SENTENCE_SIZE)
LABEL_STRINGS_VALID = create_strings(LABEL_ORIGINALS_VALID, TOKENIZER, SENTENCE_SIZE)

# -- Get the labels -- #
# create_labels receives the tweet tokens and the label tokens of each tweet.
IMPORTANT_WORDS_TRAIN = create_labels(TWEET_STRINGS_TRAIN, LABEL_STRINGS_TRAIN, SENTENCE_SIZE)
IMPORTANT_WORDS_VALID = create_labels(TWEET_STRINGS_VALID, LABEL_STRINGS_VALID, SENTENCE_SIZE)

# Show one example: the tweet, its label text and the labels built from them.
IDX = 4
print(TWEET_ORIGINALS_TRAIN[IDX])
print(LABEL_ORIGINALS_TRAIN[IDX])
print("Labels :")
print(IMPORTANT_WORDS_TRAIN[IDX])
print("Shape of IMPORTANT_WORDS :", IMPORTANT_WORDS_TRAIN.shape)

# Classification

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Number of neighbours used by the regressor.
NB_NEIGHBORS = 1

# Nearest-neighbours regressor with distance weighting, fitted on the
# training descriptions (inputs) and their labels (targets).
KNN = KNeighborsRegressor(n_neighbors=NB_NEIGHBORS, weights="distance")
KNN.fit(X=TWEET_SCALARS_TRAIN, y=IMPORTANT_WORDS_TRAIN)

In [None]:
# Predict the labels of the validation tweets with the fitted regressor.
PREDICTIONS = KNN.predict(X=TWEET_SCALARS_VALID)

In [None]:
# from utils.post_processing import pred_to_string

# print(type(TWEET_STRINGS_TRAIN[0][0]))
# pred_to_string(TWEET_ORIGINALS_TRAIN[0], TWEET_STRINGS_TRAIN[0], PREDICTIONS[0])

In [None]:
from utils.post_processing import preds_to_strings

# Convert the validation predictions to strings, using the original
# validation tweets and their decompositions.
RESULTS = preds_to_strings(
    TWEET_ORIGINALS_VALID,
    TWEET_STRINGS_VALID,
    PREDICTIONS,
)

In [None]:
# For each result, show columns 1 and 2 of the matching validation row,
# then the result itself, followed by a blank separator line.
for row in range(len(RESULTS)):
    tweet, reference = VALID_SAMPLE[row, 1], VALID_SAMPLE[row, 2]
    print(tweet, reference, RESULTS[row], "", sep="\n")

In [None]:
from utils.loss import mean_jaccard

# Score the results against column 2 of the validation sample.
accuracy = mean_jaccard(
    VALID_SAMPLE[:, 2],
    RESULTS,
)
print(accuracy)