In [1]:
import pickle
import numpy as np
import io
import nltk
import pandas as pd

from collections import Counter
from keras.callbacks import ModelCheckpoint
from kutilities.callbacks import MetricsCallback, WeightsCallback, PlottingCallback
from kutilities.helpers.data_preparation import get_labels_to_categories_map, get_class_weights2, onehot_to_categories
from sklearn.metrics import precision_score, accuracy_score, mean_absolute_error
from sklearn.metrics import recall_score
from dataset.data_loader import SemEvalDataLoader
from models.nn_models import target_RNN
from utilities.data_loader import get_embeddings, Task4Loader, prepare_dataset

Using TensorFlow backend.


In [2]:
np.random.seed(1337)  # for reproducibility
WV_CORPUS = "datastories.twitter"
WV_DIM = 50
FINAL = True
POST_MORTEM = True
text_max_length = 50
target_max_length = 6
TASK = "CE"  # Specify the Subtask. It is needed to correctly load the data

In [3]:
embeddings, word_indices = get_embeddings(corpus=WV_CORPUS, dim=WV_DIM)

Loaded 658125 word vectors.


In [4]:
############################################################################
# PERSISTENCE
############################################################################
# if True save model checkpoints, as well as the corresponding word indices
# you HAVE tp set PERSIST = True, in order to be able to use the trained model later
PERSIST = False
best_model = lambda: "cp_model_task4_sub{}.hdf5".format(TASK)
best_model_word_indices = lambda: "cp_model_task4_sub{}_word_indices.pickle".format(
    TASK)

In [5]:
loader = Task4Loader(word_indices,
                     text_lengths=(target_max_length, text_max_length),
                     subtask=TASK)

A
B
Reading twitter - 1grams ...
Reading twitter - 2grams ...


  self.tok = re.compile(r"({})".format("|".join(pipeline)))
  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


Reading twitter - 1grams ...
C
Loading data...

D
total observations: 30432
-------------------
training set stats
-------------------
Total: 30432
{'-1': '3381 (11.11%)', '-2': '296 (0.97%)', '0': '12885 (42.34%)', '1': '12854 (42.24%)', '2': '1016 (3.34%)'}
-------------------
F
G


In [6]:
classes = ["-2", "-1", "0", "1", "2"]

In [7]:
# training, validation, testing = loader.load_train_val_test()

In [8]:
if FINAL:
    print("\n > running in FINAL mode!\n")
    training, testing = loader.load_final()
else:
    training, validation, testing = loader.load_train_val_test()



 > running in FINAL mode!


Preparing training set...
Total: 27388
{'-1': '3043 (11.11%)', '-2': '267 (0.97%)', '0': '11596 (42.34%)', '1': '11568 (42.24%)', '2': '914 (3.34%)'}

Preparing test set...
Total: 3044
{'-1': '338 (11.10%)', '-2': '29 (0.95%)', '0': '1289 (42.35%)', '1': '1286 (42.25%)', '2': '102 (3.35%)'}


In [9]:
if POST_MORTEM:
    print("\n > running in Post-Mortem mode!\n")
    gold_data = SemEvalDataLoader().get_gold(task=TASK)
    gX = [obs[1] for obs in gold_data]
    gy = [obs[0] for obs in gold_data]
    gold = prepare_dataset(gX, gy, loader.pipeline, loader.y_one_hot)

    validation = testing
    testing = gold
    FINAL = False


 > running in Post-Mortem mode!


Parsing file: SemEval2017-task4-test.subtask-CE.english.txt done!
done!
Total: 12284
{'-1': '3509 (28.57%)', '-2': '177 (1.44%)', '0': '6146 (50.03%)', '1': '2322 (18.90%)', '2': '130 (1.06%)'}


In [10]:
print("Building NN Model...")
nn_model = target_RNN(embeddings,
                      tweet_max_length=text_max_length,
                      aspect_max_length=target_max_length,
                      noise=0.2,
                      activity_l2=0.001,
                      drop_text_rnn_U=0.2,
                      drop_text_input=0.3,
                      drop_text_rnn=0.3,
                      drop_target_rnn=0.2,
                      use_final=True,
                      bi=True,
                      final_size=64,
                      drop_final=0.5,
                      lr=0.001,
                      rnn_cells=64,
                      attention="context",
                      clipnorm=.1,
                      classes=len(classes))

Building NN Model...
Instructions for updating:
Colocations handled automatically by placer.


  W_regularizer=l2(l2_reg))


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


  model = Model(input=[input_aspect, input_tweet], output=probabilities)


In [11]:
print(nn_model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 50, 50)       32906350    input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 6)            0                                            
__________________________________________________________________________________________________
gaussian_noise_1 (GaussianNoise (None, 50, 50)       0           embedding_1[0][0]                
__________________________________________________________________________________________________
embedding_

In [12]:
cat_to_class_mapping = {v: int(k) for k, v in
                        get_labels_to_categories_map(classes).items()}

def macro_mae(y_test, y_pred):
    _y_test = [cat_to_class_mapping[y] for y in y_test]
    _y_pred = [cat_to_class_mapping[y] for y in y_pred]

    c = Counter(_y_pred)
    print(c)

    classes = set(_y_test)
    micro_m = {}
    for c in classes:
        class_sentences = [(t, p) for t, p in zip(_y_test, _y_pred) if
                           t == c]
        yt = [y[0] for y in class_sentences]
        yp = [y[1] for y in class_sentences]
        micro_m[c] = mean_absolute_error(yt, yp)
    # pprint.pprint(sorted(micro_m.items(), key=lambda x: x[1], reverse=True))
    return numpy.mean(list(micro_m.values()))


metrics = {
    "macro_mae": macro_mae,
    "micro_mae": (
        lambda y_test, y_pred: mean_absolute_error(y_test, y_pred)),
}

In [13]:
_datasets = {}
_datasets["1-train"] = (training[0], training[1]),
_datasets["2-val"] = (validation[0], validation[1]) if not FINAL else (
    testing[0], testing[1])
if not FINAL:
    _datasets["3-test"] = (testing[0], testing[1])

metrics_callback = MetricsCallback(datasets=_datasets, metrics=metrics)
weights = WeightsCallback(parameters=["W"], stats=["raster", "mean", "std"])

if TASK == "BD":
    plotting = PlottingCallback(grid_ranges=(0.75, 1), height=4,
                                benchmarks={"ρ": 0.797, "α": 0.87})
    checkpointer = ModelCheckpoint(filepath=best_model(), monitor='val.recall',
                                   mode="max", verbose=1, save_best_only=True)
else:
    plotting = PlottingCallback(grid_ranges=(0.4, 1.), height=4,
                                benchmarks={"MAE_M": 0.719, "MAE_m": 0.58})
    checkpointer = ModelCheckpoint(filepath=best_model(),
                                   monitor='val.macro_mae', mode="min",
                                   verbose=1, save_best_only=True)

_callbacks = []
_callbacks.append(metrics_callback)
_callbacks.append(plotting)
_callbacks.append(weights)

In [14]:
class_weights = get_class_weights2(onehot_to_categories(training[1]),
                                       smooth_factor=0.1)

In [15]:
print("Class weights:",
      {cat_to_class_mapping[c]: w for c, w in class_weights.items()})


Class weights: {2: 6.151427469135801, 0: 1.0, 1: 1.0021999434300262, -1: 3.035168705087327, -2: 8.94125893733352}


In [21]:
training[0]

[('iron maiden',
  '"Dude, this new Iron Maiden album is frikken AWESOME to the 10th degree."'),
 ('apple watch',
  "Starting tomorrow, you'll be able to reserve and buy an Apple Watch in stores http://t.co/Yzk3WxhIa8"),
 ('national hot dog day',
  '"Today, Thursday, is National Hot Dog Day and Vanilla Ice Cream Day! Which would you rather go with? Who wants... http://t.co/Pu2i1TPZxG"'),
 ('george osborne',
  'Robbed by George Osborne...while the royals play decoy | Kevin McKenna http://t.co/aDQnPJxNQi'),
 ('ibm',
  'Making a difference in the world! #20 overall and 1st IT company. IBM on the Change the world via @FortuneMagazine http://t.co/0c1JwfSzNg'),
 ('white sox',
  '"Rios beats tag in 8th, Royals sweep White Sox http://t.co/qoJmA74QLA ...it was #StarWars night at The K."'),
 ('moto g',
  '@TELUSsupport Will the patch be delivered together with the 5.1 update for Moto G 1st Gen or separately?'),
 ('galaxy note',
  'Samsung launches Galaxy Note 5 phablet: Samsung on Monday launche

In [20]:
validation[1]

array([[0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1.]], dtype=float32)

In [23]:
training[0]

[('iron maiden',
  '"Dude, this new Iron Maiden album is frikken AWESOME to the 10th degree."'),
 ('apple watch',
  "Starting tomorrow, you'll be able to reserve and buy an Apple Watch in stores http://t.co/Yzk3WxhIa8"),
 ('national hot dog day',
  '"Today, Thursday, is National Hot Dog Day and Vanilla Ice Cream Day! Which would you rather go with? Who wants... http://t.co/Pu2i1TPZxG"'),
 ('george osborne',
  'Robbed by George Osborne...while the royals play decoy | Kevin McKenna http://t.co/aDQnPJxNQi'),
 ('ibm',
  'Making a difference in the world! #20 overall and 1st IT company. IBM on the Change the world via @FortuneMagazine http://t.co/0c1JwfSzNg'),
 ('white sox',
  '"Rios beats tag in 8th, Royals sweep White Sox http://t.co/qoJmA74QLA ...it was #StarWars night at The K."'),
 ('moto g',
  '@TELUSsupport Will the patch be delivered together with the 5.1 update for Moto G 1st Gen or separately?'),
 ('galaxy note',
  'Samsung launches Galaxy Note 5 phablet: Samsung on Monday launche

In [26]:
Counter([i[0] for i in training[0]])

Counter({'iron maiden': 195,
         'apple watch': 94,
         'national hot dog day': 153,
         'george osborne': 123,
         'ibm': 90,
         'white sox': 207,
         'moto g': 133,
         'galaxy note': 85,
         'john cena': 215,
         'paul mccartney': 181,
         'islam': 248,
         'gucci': 248,
         'brock lesnar': 234,
         'janet jackson': 108,
         'israel': 192,
         'watchman': 109,
         'kane': 248,
         'chris brown': 90,
         'bad blood': 117,
         'disneyland': 94,
         'twilight': 230,
         'kris bryant': 199,
         'rahul gandhi': 164,
         'nicki': 252,
         'justin bieber': 203,
         'paul dunne': 225,
         'david price': 229,
         'lady gaga': 127,
         'milan': 455,
         'charlie hebdo': 173,
         'randy orton': 97,
         'tgif': 88,
         'foo fighters': 250,
         'bobby jindal': 91,
         'frank ocean': 255,
         'snoop dogg': 87,
         'ken

In [28]:
history = nn_model.fit([[i[0] for i in training[0]], [i[1] for i in training[0]]], training[1], 
                       nb_epoch=50, batch_size=64, class_weight=class_weights)

  


ValueError: Error when checking input: expected input_2 to have shape (6,) but got array with shape (1,)