# Libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from IPython.display import Image
import matplotlib.pyplot as plt
import functools

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Conv1D, Embedding,Reshape, Concatenate,BatchNormalization,GlobalMaxPooling1D,GlobalAveragePooling1D
from keras.layers.merge import Concatenate
from keras.preprocessing import sequence
from keras.optimizers import RMSprop, Adam
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils import plot_model
from keras import regularizers
from keras import metrics
from keras import backend as K
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import classification_report


# Training Data

In [2]:
data = pd.read_csv('../splited_full_RASFF_DATA.csv', sep=';', 
                header=0, index_col = 0)
data = data.sample(frac = 1)
df = data
df

Unnamed: 0,DATE_CASE,NOT_COUNTRY,PROD_CAT,TYPE,RISK_DECISION,ACTION_TAKEN,DISTRIBUTION_STAT,HAZARDS_CAT,COUNT_ORIGEN,COUNT_DESTIN,COUNT_CONCERN
58400,2016-10-06,Belgium,compound feeds,feed,serious,withdrawal from the market,distribution to other member countries,environmental pollutants,Belgium,Belgium,INFOSAN
143169,2006-10-06,Spain,herbs and spices,food,undecided,re-dispatch,no distribution,food additives and flavourings,China,,Spain
46841,2017-09-22,Netherlands,crustaceans and products thereof,food,serious,informing recipient(s),distribution to other member countries,pathogenic micro-organisms,Netherlands,France,
47784,2017-09-05,Netherlands,poultry meat and poultry meat products,food,serious,withdrawal from recipient(s),distribution to other member countries,pathogenic micro-organisms,Poland,Greece,Netherlands
62155,2016-06-06,Poland,fruits and vegetables,food,undecided,destruction,product not (yet) placed on the market,pesticide residues,Egypt,,Poland
...,...,...,...,...,...,...,...,...,...,...,...
90519,2013-07-19,Belgium,feed additives,feed,undecided,informing recipient(s),distribution to other member countries,residues of veterinary medicinal products,India,Ukraine,Commission Services
26939,2018-12-14,United Kingdom,pet food,feed,not serious,recall from consumers,distribution to other member countries,poor or insufficient controls,United Kingdom,Ireland,
67841,2015-11-17,Belgium,feed materials,feed,serious,,no distribution from notifying country,composition,Ukraine,Belgium,INFOSAN
112032,2011-03-28,Greece,cereals and bakery products,food,undecided,recall from consumers,information on distribution not (yet) available,foreign bodies,Italy,Luxembourg,


# Basic Pre-processing

In [3]:
display(len(df))
df['HAZARDS_CAT'] = df['HAZARDS_CAT'].astype(str)
df['DATE_CASE'] = df['DATE_CASE'].astype(str)
df['DATE_CASE'] = pd.to_datetime(df.DATE_CASE, errors='coerce')
df['DATE_CASE'] = df.DATE_CASE.dt.month
df = df.dropna(subset=['DATE_CASE'])
display(len(df))

168188

168187

# Features Selection

In [4]:
features = [0,1,6,8]
target = [2]
X = df.iloc[:,features]
y = df.iloc[:,target]

In [5]:
X

Unnamed: 0,DATE_CASE,NOT_COUNTRY,DISTRIBUTION_STAT,COUNT_ORIGEN
58400,10.0,Belgium,distribution to other member countries,Belgium
143169,10.0,Spain,no distribution,China
46841,9.0,Netherlands,distribution to other member countries,Netherlands
47784,9.0,Netherlands,distribution to other member countries,Poland
62155,6.0,Poland,product not (yet) placed on the market,Egypt
...,...,...,...,...
90519,7.0,Belgium,distribution to other member countries,India
26939,12.0,United Kingdom,distribution to other member countries,United Kingdom
67841,11.0,Belgium,no distribution from notifying country,Ukraine
112032,3.0,Greece,information on distribution not (yet) available,Italy


In [6]:
y

Unnamed: 0,PROD_CAT
58400,compound feeds
143169,herbs and spices
46841,crustaceans and products thereof
47784,poultry meat and poultry meat products
62155,fruits and vegetables
...,...
90519,feed additives
26939,pet food
67841,feed materials
112032,cereals and bakery products


In [7]:
ency = OneHotEncoder(handle_unknown='ignore', sparse = False)
ency.fit(y.values)
y_one_hot = ency.transform(y.values)

# Test Data

## Loaded

In [8]:
#Este dataset que carga aquí no se usa nunca. Se sobreescriben sus variables.
'''
data = pd.read_csv('../splited_2019_RASFF_DATA_16092019.csv', sep=';', 
                header=0, index_col = 0)
data = data.sample(frac = 1)
test = data
test
'''

"\ndata = pd.read_csv('../splited_2019_RASFF_DATA_16092019.csv', sep=';', \n                header=0, index_col = 0)\ndata = data.sample(frac = 1)\ntest = data\ntest\n"

## Preprocessing

In [9]:
'''
test['HAZARDS_CAT'] = test['HAZARDS_CAT'].astype(str)
test['DATE_CASE'] = test['DATE_CASE'].astype(str)
test['DATE_CASE'] =pd.to_datetime(test.DATE_CASE)
test['DATE_CASE'] = test.DATE_CASE.dt.month
'''

"\ntest['HAZARDS_CAT'] = test['HAZARDS_CAT'].astype(str)\ntest['DATE_CASE'] = test['DATE_CASE'].astype(str)\ntest['DATE_CASE'] =pd.to_datetime(test.DATE_CASE)\ntest['DATE_CASE'] = test.DATE_CASE.dt.month\n"

## Features

In [10]:
features = [0,1,6,8]
target = [2]
'''
X_val = test.iloc[:,features]
y_val = test.iloc[:,target]
'''

'\nX_val = test.iloc[:,features]\ny_val = test.iloc[:,target]\n'

In [11]:
#y_val_one_hot = ency.transform(y_val.values)

# Split train-test-val

In [12]:
categoical_vars = ['DATE_CASE','NOT_COUNTRY','DISTRIBUTION_STAT','COUNT_ORIGEN']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y_one_hot, test_size=0.2)

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42,shuffle = True)

# Coding and conversion to lists for being able to introduce it into the model

In [15]:
def preproc(X_train, X_test, Xvalidation_data):
    input_list_train = []
    input_list_test = []
    input_list_testval = []
    
    for c in categoical_vars:
        raw_vals = np.unique(X_train[c])
        val_map = {}
        for i in range(len(raw_vals)):
            val_map[raw_vals[i]] = i       
        
        input_list_train.append(X_train[c].map(val_map).values)
        input_list_test.append(X_test[c].map(val_map).fillna(0).values)
        input_list_testval.append(X_val[c].map(val_map).fillna(0).values)

    return input_list_train, input_list_test,input_list_testval

In [16]:
input_list_train,input_list_test,input_list_testval = preproc(X_train,X_test, X_val)

# Metrics Definition

In [17]:
top3_acc = functools.partial(metrics.top_k_categorical_accuracy, k=3)
top3_acc.__name__ = 'top3_acc'

top2_acc = functools.partial(metrics.top_k_categorical_accuracy, k=3)
top2_acc.__name__ = 'top2_acc'

In [18]:
def top_1_categorical_accuracy(y_true, y_pred):
    return metrics.top_k_categorical_accuracy(y_true, y_pred, k=1) 

In [19]:
def top_2_categorical_accuracy(y_true, y_pred):
    return metrics.top_k_categorical_accuracy(y_true, y_pred, k=2) 

In [20]:
def top_3_categorical_accuracy(y_true, y_pred):
    return metrics.top_k_categorical_accuracy(y_true, y_pred, k=3) 

# Embeddings + mlp Models (cases 1 and 3)

In [21]:
input_models=[]
output_embeddings=[]

for categorical_var in categoical_vars:
    cat_emb_name= categorical_var.replace(" ", "")+'_Embedding'
    input_name= 'Input_' + categorical_var.replace(" ", "")
    no_of_unique_cat  = X_train[categorical_var].nunique()
    embedding_size = int(min(np.ceil((no_of_unique_cat)/2), 50 ))
   
    input_model = Input(shape=(1,), name=input_name)
    output_model = Embedding(no_of_unique_cat, embedding_size, name=cat_emb_name)(input_model)
    output_model = Reshape(target_shape=(embedding_size,))(output_model)    
    
    input_models.append(input_model)
    output_embeddings.append(output_model)
  

output = Concatenate()(output_embeddings)
output = Dense(2048,activation="relu")(output)
output= Dropout(0.3)(output)
output = Dense(1024,activation="relu")(output)
output= Dropout(0.2)(output)
output = Dense(512,activation="relu")(output)
output= Dropout(0.2)(output)
output = Dense(38, activation='softmax')(output)
model = Model(inputs=input_models, outputs=output)

In [22]:
model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy', top_1_categorical_accuracy,top_2_categorical_accuracy,top_3_categorical_accuracy])

In [23]:
display(np.array(input_list_train).shape)
display(y_train.shape)
display(np.array(input_list_test).shape)
display(y_test.shape)


(4, 100911)

(100911, 42)

(4, 33638)

(33638, 42)

In [24]:
 hist = model.fit(input_list_train,y_train,validation_data=(input_list_test,y_test) , epochs = 5 , batch_size = 64, verbose=1)

Epoch 1/5


ValueError: in user code:

    C:\Users\Nacho Moll\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    C:\Users\Nacho Moll\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\distribute\distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\Nacho Moll\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\distribute\distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\Nacho Moll\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\distribute\distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\Nacho Moll\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py:533 train_step  **
        y, y_pred, sample_weight, regularization_losses=self.losses)
    C:\Users\Nacho Moll\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\compile_utils.py:205 __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    C:\Users\Nacho Moll\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\losses.py:143 __call__
        losses = self.call(y_true, y_pred)
    C:\Users\Nacho Moll\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\losses.py:246 call
        return self.fn(y_true, y_pred, **self._fn_kwargs)
    C:\Users\Nacho Moll\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\losses.py:1527 categorical_crossentropy
        return K.categorical_crossentropy(y_true, y_pred, from_logits=from_logits)
    C:\Users\Nacho Moll\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\backend.py:4561 categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)
    C:\Users\Nacho Moll\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\framework\tensor_shape.py:1117 assert_is_compatible_with
        raise ValueError("Shapes %s and %s are incompatible" % (self, other))

    ValueError: Shapes (None, 42) and (None, 38) are incompatible


In [None]:
model.summary()

## Mean Efficiencies

In [None]:
historials = []
evaluations = []
for i in range (1,6):
    model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy', top_1_categorical_accuracy,top_2_categorical_accuracy,top_3_categorical_accuracy])
    hist = model.fit(input_list_train,y_train,validation_data=(input_list_test,y_test) , epochs = 25 , batch_size = 64, verbose=0)
    historials.append(hist)
    #evaluation = model.evaluate(x = input_list_testval, y =  y_val_one_hot)
    evaluation = model.evaluate(x = input_list_testval, y =  y_val)
    evaluations. append(evaluation)
    model.save ("model"+str(i)+".h5")

In [None]:
suma = 0
for i in evaluations:
    suma = suma + i[2]
print(suma/5)

In [None]:
model.evaluate(x = input_list_testval, y = y_val)

## Training Charts

In [None]:
model.metrics_names

In [None]:
plt.plot(hist.history['loss'], label = 'loss') 
plt.plot(hist.history['val_loss'], label = 'val_loss') 
plt.legend()
plt.show()


plt.plot(hist.history['acc'], label = 'acc') 
plt.plot(hist.history['val_top_1_categorical_accuracy'], label = 'val_acc') 
plt.legend()
plt.show()

## Embeddings Representation Test

In [None]:
embeddings = model.layers[14].get_weights()[0]
embeddings

In [None]:
embeddings[:,1]

In [None]:
import matplotlib.pyplot as plt
plt.scatter(embeddings[:,0],embeddings[:,1]) 
plt.show()

In [None]:
y = embeddings[:,0]
z = embeddings[:,1]
n = [6, 4, 1, 10,8,5,11,3,12,7,2,9]

fig, ax = plt.subplots()
ax.scatter(z, y)

for i, txt in enumerate(n):
    ax.annotate(txt, (z[i], y[i]))

## Models Figure 

In [None]:
plot_model(model, to_file='model3.png',show_shapes=True,show_layer_names=True)
Image(retina=True, filename='model3.png')

## Predictions and inverse transformations (converting the numerical prediction to the predicted category)

In [None]:
prediction = model.predict(input_list_test)
prediction[0]

In [None]:
pred = np.around(prediction[12],decimals = 2)
pred

In [None]:
np.where(pred == 0.97)

In [None]:
pred[25] = 1

In [None]:
ency.inverse_transform(np.around(pred.reshape(1, -1),decimals = 3)) #Prediction

In [None]:
ency.inverse_transform(np.around(y_test[25].reshape(1, -1),decimals = 1)) #Reality

## Grid search tests

In [None]:
def create_model(activation = 'relu' ):
    input_models=[]
    output_embeddings=[]

    for categorical_var in categoical_vars:
        
        cat_emb_name= categorical_var.replace(" ", "")+'_Embedding'
        input_name= 'Input_' + categorical_var.replace(" ", "")
        no_of_unique_cat  = X_train[categorical_var].nunique()
        embedding_size = int(min(np.ceil((no_of_unique_cat)/2), 50 ))
        input_model = Input(shape=(1,), name=input_name)
        output_model = Embedding(no_of_unique_cat, embedding_size, name=cat_emb_name)(input_model)
        output_model = Reshape(target_shape=(embedding_size,))(output_model)    
        input_models.append(input_model)
        output_embeddings.append(output_model)



    output = Concatenate()(output_embeddings)
    output = Dense(2500,activation=activation)(output)
    output= Dropout(0.5)(output)
    output = Dense(1500,activation=activation)(output)
    output= Dropout(0.4)(output)
    output = Dense(1000,activation=activation)(output)
    output= Dropout(0.3)(output)
    output = Dense(38, activation='softmax')(output)
    model = Model(inputs=input_models, outputs=output)
    model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['categorical_accuracy'])
    return model

In [None]:
model = KerasClassifier(build_fn=create_model, verbose=1,batch_size = 512, epochs = 1000)

In [None]:
activation = ['softmax', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']
param_grid = dict(activation = activation)

In [None]:
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1, cv=2)
grid_result = grid.fit(input_list_train,y_train,validation_data=(input_list_test,y_test))

# Embeddings Model + conv (case 2)

In [None]:
input_models=[]
output_embeddings=[]

for categorical_var in categoical_vars:
    
    cat_emb_name= categorical_var.replace(" ", "")+'_Embedding'
    input_name= 'Input_' + categorical_var.replace(" ", "")
    no_of_unique_cat  = X_train[categorical_var].nunique()
    embedding_size = int(min(np.ceil((no_of_unique_cat)/2), 50 ))
    input_model = Input(shape=(1,), name=input_name)
    output_model = Embedding(no_of_unique_cat, embedding_size, name=cat_emb_name)(input_model)
    output_model = Reshape(target_shape=(embedding_size,))(output_model)    
    input_models.append(input_model)
    output_embeddings.append(output_model)

output = Concatenate()(output_embeddings)

output = Reshape(input_shape=(100,), target_shape=(100, 1))(output)


output = Conv1D(filters=128,kernel_size=4, activation = "relu")(output)
output = Conv1D(filters=128,kernel_size=4, activation = "relu")(output)
output = BatchNormalization()(output)
output = MaxPooling1D(pool_size=2)(output)

output = Conv1D(filters=256,kernel_size=3, activation = "relu")(output)
output = Conv1D(filters=256,kernel_size=3, activation = "relu")(output)
output = BatchNormalization()(output)
output = GlobalMaxPooling1D()(output)


output = Dense(512, activation = "relu")(output)

output = Dense(256, activation = "relu")(output)

output = Dense(35, activation='softmax')(output)

model = Model(inputs=input_models, outputs=output)


model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=[top_1_categorical_accuracy,top_2_categorical_accuracy,top_3_categorical_accuracy])

In [None]:
model.summary()

In [None]:
plot_model(model, to_file='model2.png',show_shapes=True,show_layer_names=True)
Image(retina=True, filename='model2.png')

In [None]:
hist = model.fit(input_list_train,y_train,validation_data=(input_list_test,y_test) , epochs =  25, batch_size = 64, verbose= 1)

In [None]:
model.evaluate(x = input_list_testval, y = y_val)

In [None]:
plt.plot(hist.history['loss'], label = 'loss') 
plt.plot(hist.history['val_loss'], label = 'val_loss') 
plt.legend()
plt.show()


plt.plot(hist.history['categorical_accuracy'], label = 'acc') 
plt.plot(hist.history['val_categorical_accuracy'], label = 'val_acc') 
plt.legend()
plt.show()