In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import keras
from keras.models import Model, Sequential
from keras.utils import np_utils
from keras.layers import Input, Embedding, LSTM, Dense, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.preprocessing import sequence, text
import gensim
import re, string
import tinysegmenter
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the raw product export (UTF-8 encoded CSV) into a DataFrame.
data = pd.read_csv("Japan_Not_Encoded.csv", encoding='utf-8')
# Preview the first rows to check that the file parsed as expected.
data.head()

Unnamed: 0,COMPANY,COUNTRY,DISTRIBUTOR,ARTICLE ID,MODEL NUMBER,ARTICLE NAME,SUBBRAND,SPORTS CATEGORY,PRODUCT DIVISION,PRODUCT GROUP,...,CURRENT PRICE IN EUR,INITIAL PRICE IN SELECTED CURRENCY,CURRENT PRICE IN SELECTED CURRENCY,SELECTED CURRENCY,PRODUCT INTRODUCTION DATE,DISCOUNTED SINCE,PRODUCT EXIT DATE,PRODUCT DESCRIPTION,PRODUCT URL,IMAGE-SERVER URL
0,Nike,Japan,Own eCom,BG0387-001,BG0387,ナイキ コア ハーフ KV ゴルフバッグ,Performance,Golf,Accessories,Bags,...,99.144,116.64,99.144,EUR,2/22/2016,12/26/2016,1/2/2017,整理しやすいゴルフバッグ。快適な持ち運び。 ナイキ コア ハーフ KV ゴルフバッグは、専用...,http://store.nike.com/jp/ja_jp/pd/%25E3%2583%2...,http://usporamap287.am.adsint.biz/zoomimages/1...
1,Nike,Japan,Own eCom,839240-001,839240,ナイキ コルテッツ QS キッズシューズ,Sport Inspired,Lifestyle,Footwear,Sport Inspired Footwear,...,62.82,82.62,62.82,EUR,2/22/2016,4/5/2016,7/26/2016,高級感のあるレトロスタイル ナイキ コルテッツ QS キッズシューズは、上質なレザーのア...,http://store.nike.com/jp/ja_jp/pd/%25E3%2583%2...,http://usporamap287.am.adsint.biz/zoomimages/1...
2,Nike,Japan,Own eCom,GL0783-101,GL0783,ナイキ レジン スピード レッド ゴルフボール,Performance,Golf,Accessories,Sport Equipment,...,31.104,31.104,31.104,EUR,2/15/2016,Not discounted yet,8/29/2016,高初速でより遠くへ ナイキ レジン スピード レッド ゴルフボールは、更にソフトになった新開...,http://store.nike.com/jp/ja_jp/pd/%25E3%2583%2...,http://usporamap287.am.adsint.biz/zoomimages/1...
3,Nike,Japan,Own eCom,GL0781-101,GL0781,ナイキ レジン ツアー ブラック ゴルフボール,Performance,Golf,Accessories,Sport Equipment,...,28.764,58.32,28.764,EUR,2/15/2016,2/27/2017,6/27/2017,低スピンでより遠くへ ナイキ レジン ツアー ブラック ゴルフボールは、更にソフトになったR...,http://store.nike.com/jp/ja_jp/pd/%25E3%2583%2...,http://usporamap287.am.adsint.biz/zoomimages/1...
4,Nike,Japan,Own eCom,AC3877-005,AC3877,ナイキ ATG スピード ジャンプ ロープ,Performance,Training,Accessories,Sport Equipment,...,28.188,28.188,28.188,EUR,2/15/2016,Not discounted yet,7/19/2016,軽く、速く、カスタマイズも可能。 ナイキ ATG スピード ジャンプ ロープは、滑りにくいボ...,http://store.nike.com/jp/ja_jp/pd/%25E3%2583%2...,http://usporamap287.am.adsint.biz/zoomimages/1...


In [3]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi-class version of the logarithmic loss metric.

    :param actual: Array-like with the true targets. Either a 1-D sequence of
        integer class indices, or a 2-D indicator (one-hot) matrix with the
        same shape as ``predicted``.
    :param predicted: Array-like matrix with class predictions, one
        probability per class (rows are samples, columns are classes).
    :param eps: Predictions are clipped to ``[eps, 1 - eps]`` before taking
        the logarithm so that exact 0/1 probabilities do not yield infinities.
    :return: The negated sum of ``actual * log(predicted)`` divided by the
        number of rows in ``actual``.
    """
    # Coerce to arrays so plain Python lists (not only ndarrays) are accepted.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)

    # Convert 'actual' to a binary indicator matrix if it holds class indices.
    if actual.ndim == 1:
        n_samples = actual.shape[0]
        one_hot = np.zeros((n_samples, predicted.shape[1]))
        # Set one cell per row: (row i, column actual[i]).
        one_hot[np.arange(n_samples), actual] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    n_rows = actual.shape[0]
    log_likelihood = np.sum(actual * np.log(clipped))
    return -1.0 / n_rows * log_likelihood

In [4]:
# Keep only the columns used to build the text feature, the company feature
# and the SUBBRAND target.
data_req = data.loc[
    :,
    [
        "COMPANY",
        "COUNTRY",
        "ARTICLE NAME",
        "SUBBRAND",
        "PRODUCT DESCRIPTION",
        "PRODUCT URL",
    ],
]

In [5]:
# Remove rows that are exact duplicates across the selected columns,
# keeping the first occurrence of each.
data_req = data_req.drop_duplicates(keep="first")

In [6]:
# Build one free-text field per product: company, article name and product
# description joined by single spaces.
data_req["description"] = (
    data_req["COMPANY"]
    + " "
    + data_req["ARTICLE NAME"]
    + " "
    + data_req["PRODUCT DESCRIPTION"]
)

In [7]:
# Drop the source columns that are now folded into "description" (or unused),
# leaving COMPANY, the target (renamed to subBrand) and description.
data_req = (
    data_req
    .drop(columns=["COUNTRY", "ARTICLE NAME", "PRODUCT DESCRIPTION", "PRODUCT URL"])
    .rename(columns={"SUBBRAND": "subBrand"})
)

In [8]:
# Replace missing descriptions with a placeholder token.
data_req["description"] = data_req["description"].fillna("unknown")

In [9]:
# Normalise case for both the target labels and the text feature.
data_req["description"] = data_req["description"].str.lower()
data_req["subBrand"] = data_req["subBrand"].str.lower()

In [10]:
# Encode the sub-brand target as integer class ids; missing labels are first
# mapped to the placeholder class "unknown". `lbl_enc` is kept so predictions
# can be mapped back to label strings with inverse_transform.
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(data_req["subBrand"].fillna("unknown"))

In [11]:
# Names of the feature columns passed to the train/validation split.
X = ["description", "COMPANY"]

In [12]:
# Class balance of the target (number of rows per sub-brand label).
data_req["subBrand"].value_counts()

performance       33771
sport inspired    27764
Name: subBrand, dtype: int64

In [13]:
# Hold out 10% of the rows for validation; the split is shuffled, stratified
# on the target and seeded so it is repeatable.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    data_req[X],
    y,
    test_size=0.1,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [14]:
# Characters that are split off into their own space-delimited tokens.
# string.punctuation is passed through re.escape so that every character stays
# literal inside the character class; unescaped, its "\]" pair is read as an
# escaped "]" and the backslash itself is never matched.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿|¡§£₤‘’])')


def tokenize(s):
    """Return ``s`` with a space inserted on each side of every punctuation character."""
    return re_tok.sub(r' \1 ', s)

In [15]:
# Space out punctuation in every description of both splits.
texts_train = list(map(tokenize, xtrain.description))
texts_valid = list(map(tokenize, xvalid.description))

In [16]:
# Segment each text into a list of tokens with TinySegmenter.
segmenter = tinysegmenter.TinySegmenter()
tokenized_text_train = list(map(segmenter.tokenize, texts_train))
tokenized_text_valid = list(map(segmenter.tokenize, texts_valid))

In [17]:
# Re-join the segmenter tokens with single spaces so that whitespace-based
# tokenizers can split the text downstream.
joined_token_train = list(map(' '.join, tokenized_text_train))
joined_token_valid = list(map(' '.join, tokenized_text_valid))

In [18]:
# Keras tokenizer: maps every token to an integer id (no vocabulary cap).
max_len = 300
token = text.Tokenizer(num_words=None)

# NOTE(review): the vocabulary is fitted on training AND validation text.
token.fit_on_texts(joined_token_train + joined_token_valid)
word_index = token.word_index

# Integer-encode each split, then pad the sequences to exactly max_len ids.
xtrain_seq = token.texts_to_sequences(joined_token_train)
xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)

xvalid_seq = token.texts_to_sequences(joined_token_valid)
xvalid_pad = sequence.pad_sequences(xvalid_seq, maxlen=max_len)

In [19]:
# All tokens known to the Keras tokenizer, as a plain list.
token_keras_words = list(token.word_index.keys())

In [20]:
from keras.preprocessing.text import text_to_word_sequence
# Split every joined text (train followed by validation) into a word list
# without lower-casing; these sentences are the Word2Vec training corpus.
keras_tokenised_words = [
    text_to_word_sequence(doc, lower=False)
    for doc in joined_token_train + joined_token_valid
]

In [21]:
# Train Word2Vec vectors on the tokenised descriptions (train + validation).
# size=300 must match the width of the embedding matrix built from this model.
# NOTE(review): iter=1 appears to request a single training pass and no seed is
# set, so the vectors may differ between runs — confirm this is intended.
word_model = gensim.models.Word2Vec(keras_tokenised_words, size=300, min_count=1, window=5, iter=1)

In [22]:
# Row i holds the Word2Vec vector of the token whose tokenizer index is i.
# The matrix has len(word_index) + 1 rows so the largest index is addressable;
# rows that are never assigned stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, word_model.wv.vector_size))
for word, i in tqdm(word_index.items()):
    # Looking up an unknown token in `wv` raises KeyError rather than returning
    # None, so test membership explicitly and leave unknown tokens as zeros.
    if word in word_model.wv:
        embedding_matrix[i] = word_model.wv[word]

100%|████████████████████████████████████████████████████████████████████████| 36954/36954 [00:00<00:00, 147191.20it/s]


In [23]:
import keras
from keras.layers import Input, Embedding, LSTM, Dense, Flatten, Concatenate, Dropout
from keras.models import Model

# Input: receives integer token-id sequences padded to max_len.
# The length is taken from max_len (not a literal) so it cannot drift from the
# padding length or from the Embedding layer's input_length.
main_input = Input(shape=(max_len,), name='main_input')

# This embedding layer encodes the input sequence into a sequence of dense
# vectors. Vocabulary size and vector width come from the pretrained matrix
# itself so they always match the supplied weights; the weights are frozen.
x = Embedding(embedding_matrix.shape[0],
              embedding_matrix.shape[1],
              weights=[embedding_matrix],
              input_length=max_len,
              trainable=False)(main_input)

# Drop whole embedding channels rather than individual elements.
x = SpatialDropout1D(0.3)(x)


# An LSTM transforms the vector sequence into a single vector,
# containing information about the entire sequence
lstm_out = LSTM(100, dropout=0.3, recurrent_dropout=0.3)(x)

In [24]:
# Second input: one scalar per sample, concatenated with the LSTM summary
# vector along the feature axis.
auxiliary_input = Input((1,), name='aux_input')
x = keras.layers.concatenate([lstm_out, auxiliary_input], axis=1)

# Deep densely-connected stack: each (units, dropout rate) pair adds a ReLU
# Dense layer followed by Dropout, in this order.
for n_units, drop_rate in [(1024, 0.5), (1024, 0.5), (512, 0.4), (256, 0.4), (128, 0.3)]:
    x = Dense(n_units, activation='relu')(x)
    x = Dropout(drop_rate)(x)

# Output layer: softmax over the two target classes.
main_output = Dense(2, activation='softmax', name='main_output')(x)

In [25]:
# Two-input model: padded token sequences plus the auxiliary scalar input.
model = Model(inputs=[main_input, auxiliary_input], outputs=main_output)
# NOTE(review): this RMSprop instance is never passed to compile(); the model
# is compiled with the 'adam' optimizer string below. Remove it or use it.
rmsprop = keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)
# Categorical cross-entropy matches the one-hot encoded targets and softmax output.
model.compile(loss='categorical_crossentropy', optimizer= 'adam')

In [26]:
# One-hot encode the integer class ids of both splits for the softmax output.
ytrain_enc, yvalid_enc = map(np_utils.to_categorical, (ytrain, yvalid))

In [27]:
# Fit the company encoder on the training split and turn the company names
# into integer codes for the auxiliary model input.
le = preprocessing.LabelEncoder()
xtrain_company = le.fit_transform(np.array(xtrain["COMPANY"]))

In [32]:
# Encode validation companies with the encoder already fitted on the training
# split. Re-fitting here (fit_transform) could assign different integer codes
# to the same company whenever the two splits contain different company sets,
# silently feeding the model inconsistent inputs. transform() instead raises
# if the validation split contains a company unseen during training.
xvalid_company = np.array(xvalid.COMPANY)
xvalid_company = le.transform(xvalid_company)

In [28]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input (InputLayer)         (None, 300)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 300, 300)     11086500    main_input[0][0]                 
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 300, 300)     0           embedding_1[0][0]                
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 100)          160400      spatial_dropout1d_1[0][0]        
__________________________________________________________________________________________________
aux_input 

In [29]:
# Train on the two inputs (padded token ids, encoded company) for 2 epochs.
model.fit(
    x=[xtrain_pad, xtrain_company],
    y=ytrain_enc,
    epochs=2,
    batch_size=256,
    verbose=1,
)

Epoch 1/2
Epoch 2/2




<keras.callbacks.History at 0x2498b2acc50>

In [30]:
# Sanity check: number of encoded company values in the training split.
xtrain_company.shape

(55381,)

In [33]:
# Softmax outputs of the model for the validation split (one row per sample).
predictions = model.predict([xvalid_pad, xvalid_company])

In [34]:
# Index of the highest-scoring class for every validation row.
predict_label = list(np.argmax(predictions, axis=1))

In [35]:
# Map predicted class ids back to the original sub-brand label strings.
predict_label = lbl_enc.inverse_transform(predict_label)

  if diff:


In [36]:
# Map the true validation class ids back to sub-brand label strings.
yvalid_label = lbl_enc.inverse_transform(yvalid)

  if diff:


In [37]:
# Side-by-side table of validation text, true label and predicted label.
test_df = pd.DataFrame(
    {
        'description': xvalid.description,
        'actuals': yvalid_label,
        'predictions': predict_label,
    }
)

In [39]:
# Macro-averaged F1 on the validation split (unweighted mean over classes).
f1_score(y_true=test_df.actuals, y_pred=test_df.predictions, average="macro")

0.8347948792989917

In [40]:
# Overall accuracy on the validation split.
accuracy_score(y_true=test_df.actuals, y_pred=test_df.predictions)

0.8378290542736432