In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
data = pd.read_csv("Japan_Not_Encoded.csv", encoding='utf-8')
data.head()

Unnamed: 0,COMPANY,COUNTRY,DISTRIBUTOR,ARTICLE ID,MODEL NUMBER,ARTICLE NAME,SUBBRAND,SPORTS CATEGORY,PRODUCT DIVISION,PRODUCT GROUP,...,CURRENT PRICE IN EUR,INITIAL PRICE IN SELECTED CURRENCY,CURRENT PRICE IN SELECTED CURRENCY,SELECTED CURRENCY,PRODUCT INTRODUCTION DATE,DISCOUNTED SINCE,PRODUCT EXIT DATE,PRODUCT DESCRIPTION,PRODUCT URL,IMAGE-SERVER URL
0,Nike,Japan,Own eCom,BG0387-001,BG0387,ナイキ コア ハーフ KV ゴルフバッグ,Performance,Golf,Accessories,Bags,...,99.144,116.64,99.144,EUR,2/22/2016,12/26/2016,1/2/2017,整理しやすいゴルフバッグ。快適な持ち運び。 ナイキ コア ハーフ KV ゴルフバッグは、専用...,http://store.nike.com/jp/ja_jp/pd/%25E3%2583%2...,http://usporamap287.am.adsint.biz/zoomimages/1...
1,Nike,Japan,Own eCom,839240-001,839240,ナイキ コルテッツ QS キッズシューズ,Sport Inspired,Lifestyle,Footwear,Sport Inspired Footwear,...,62.82,82.62,62.82,EUR,2/22/2016,4/5/2016,7/26/2016,高級感のあるレトロスタイル ナイキ コルテッツ QS キッズシューズは、上質なレザーのア...,http://store.nike.com/jp/ja_jp/pd/%25E3%2583%2...,http://usporamap287.am.adsint.biz/zoomimages/1...
2,Nike,Japan,Own eCom,GL0783-101,GL0783,ナイキ レジン スピード レッド ゴルフボール,Performance,Golf,Accessories,Sport Equipment,...,31.104,31.104,31.104,EUR,2/15/2016,Not discounted yet,8/29/2016,高初速でより遠くへ ナイキ レジン スピード レッド ゴルフボールは、更にソフトになった新開...,http://store.nike.com/jp/ja_jp/pd/%25E3%2583%2...,http://usporamap287.am.adsint.biz/zoomimages/1...
3,Nike,Japan,Own eCom,GL0781-101,GL0781,ナイキ レジン ツアー ブラック ゴルフボール,Performance,Golf,Accessories,Sport Equipment,...,28.764,58.32,28.764,EUR,2/15/2016,2/27/2017,6/27/2017,低スピンでより遠くへ ナイキ レジン ツアー ブラック ゴルフボールは、更にソフトになったR...,http://store.nike.com/jp/ja_jp/pd/%25E3%2583%2...,http://usporamap287.am.adsint.biz/zoomimages/1...
4,Nike,Japan,Own eCom,AC3877-005,AC3877,ナイキ ATG スピード ジャンプ ロープ,Performance,Training,Accessories,Sport Equipment,...,28.188,28.188,28.188,EUR,2/15/2016,Not discounted yet,7/19/2016,軽く、速く、カスタマイズも可能。 ナイキ ATG スピード ジャンプ ロープは、滑りにくいボ...,http://store.nike.com/jp/ja_jp/pd/%25E3%2583%2...,http://usporamap287.am.adsint.biz/zoomimages/1...


In [3]:
data.columns

Index(['COMPANY', 'COUNTRY', 'DISTRIBUTOR', 'ARTICLE ID', 'MODEL NUMBER',
       'ARTICLE NAME', 'SUBBRAND', 'SPORTS CATEGORY', 'PRODUCT DIVISION',
       'PRODUCT GROUP', 'PRODUCT TYPE', 'FRANCHISE', 'TECHNOLOGIES',
       'COLOUR GROUP', 'COLOUR', 'GENDER', 'AGE GROUP', 'CONSUMER RATING',
       'INITIAL PRICE IN LOCAL CURRENCY', 'CURRENT PRICE IN LOCAL CURRENCY',
       'LOCAL CURRENCY', 'INITIAL PRICE IN EUR', 'CURRENT PRICE IN EUR',
       'INITIAL PRICE IN SELECTED CURRENCY',
       'CURRENT PRICE IN SELECTED CURRENCY', 'SELECTED CURRENCY',
       'PRODUCT INTRODUCTION DATE', 'DISCOUNTED SINCE', 'PRODUCT EXIT DATE',
       'PRODUCT DESCRIPTION', 'PRODUCT URL', 'IMAGE-SERVER URL'],
      dtype='object')

In [3]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [4]:
data_req = data.loc[:,["COMPANY", "COUNTRY", "ARTICLE NAME","SUBBRAND", "PRODUCT DESCRIPTION", "PRODUCT URL"]]

In [5]:
data_req = data_req.drop_duplicates()

In [6]:
data_req["description"]  = data_req["COMPANY"]  +" " + data_req["ARTICLE NAME"] +" " + data_req["PRODUCT DESCRIPTION"]

In [7]:
data_req.columns

Index(['COMPANY', 'COUNTRY', 'ARTICLE NAME', 'SUBBRAND', 'PRODUCT DESCRIPTION',
       'PRODUCT URL', 'description'],
      dtype='object')

In [8]:
data_req = data_req.drop(["COUNTRY", "ARTICLE NAME", "PRODUCT DESCRIPTION", "PRODUCT URL"], axis = 1)
data_req.columns = ["COMPANY", "subBrand", "description"]

In [9]:
data_req.description = data_req.description.fillna("unknown")

In [10]:
data_req["subBrand"] = data_req["subBrand"].str.lower()
data_req.description = data_req.description.str.lower()

In [11]:
from sklearn import preprocessing
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(data_req["subBrand"].fillna("unknown"))

In [12]:
X = ["description", "COMPANY"]

In [13]:
data_req["subBrand"].value_counts()

performance       33771
sport inspired    27764
Name: subBrand, dtype: int64

In [18]:
from sklearn.model_selection import train_test_split
xtrain, xvalid, ytrain, yvalid = train_test_split(data_req[X], y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)

In [15]:
# load the GloVe vectors in a dictionary:

# embeddings_index = {}
# f = open('D:/GPMT/fasttext/cc.de.300.vec', encoding="utf-8")
# for line in tqdm(f):
#     values = line.split()
#     word = values[0]
#     coefs = np.asarray(values[1:], dtype='float32')
#     embeddings_index[word] = coefs
# f.close()

# print('Found %s word vectors.' % len(embeddings_index))

2000001it [04:19, 7693.77it/s]


Found 2000000 word vectors.


In [19]:
# we need to binarize the labels for the neural net
ytrain_enc = np_utils.to_categorical(ytrain)
yvalid_enc = np_utils.to_categorical(yvalid)

In [20]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
chk = le.fit_transform(data["SPORTS CATEGORY"])

In [100]:
# using keras tokenizer here
token = text.Tokenizer(num_words=None)
max_len = 100

token.fit_on_texts(list(xtrain.description) + list(xvalid.description))
xtrain_seq = token.texts_to_sequences(xtrain.description)
xvalid_seq = token.texts_to_sequences(xvalid.description)

# zero pad the sequences
xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)
xvalid_pad = sequence.pad_sequences(xvalid_seq, maxlen=max_len)

#word_index = token.word_index

In [101]:
xtrain_pad

array([[    0,     0,     0, ...,   493,    18, 49215],
       [    0,     0,     0, ...,     0,     0,    14],
       [    0,     0,     0, ...,    12,    11,    13],
       ...,
       [    0,     0,     0, ..., 45219,     9,    29],
       [    0,     0,     0, ...,  2324, 70057,  2324],
       [    0,     0,     0, ...,     0,     0,    14]])

In [57]:
xtrain_seq

[[16402], [841]]

In [44]:
# create an embedding matrix for the words we have in the dataset
#embeddings_index = {}
# embedding_matrix = np.zeros((len(word_index) + 1, 300))
# for word, i in tqdm(word_index.items()):
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         embedding_matrix[i] = embedding_vector

100%|███████████████████████████████████████████████████████████████████████| 126948/126948 [00:04<00:00, 31223.79it/s]


In [20]:
embedding_matrix.shape

(126949, 300)

In [22]:
import gensim



In [83]:
import tinysegmenter
segmenter = tinysegmenter.TinySegmenter()

In [84]:
#word_chk = segmenter.tokenize(list(xtrain.description[2]))
tokenized_text = [segmenter.tokenize(x) for x in xtrain.description ]

In [119]:
word_model = gensim.models.Word2Vec(tokenized_text, size=300, min_count=1, window=5, iter=1)

In [78]:
# from nltk import word_tokenize
# tokenized_text = [word_tokenize(x) for x in xtrain.description ]

In [81]:
# word_model = gensim.models.Word2Vec(tokenized_text, size=100, min_count=1, window=5, iter=5)

In [120]:
word_model.wv.vocab

{'nike': <gensim.models.keyedvectors.Vocab at 0x27d21e16198>,
 ' ': <gensim.models.keyedvectors.Vocab at 0x27d21e16da0>,
 'ナイキ': <gensim.models.keyedvectors.Vocab at 0x27d21f59d68>,
 'エピック': <gensim.models.keyedvectors.Vocab at 0x27d21f59e10>,
 'ラックス': <gensim.models.keyedvectors.Vocab at 0x27d21f59e80>,
 'ウィメンズ': <gensim.models.keyedvectors.Vocab at 0x27d21f59ef0>,
 'ランニングタンクトップ': <gensim.models.keyedvectors.Vocab at 0x27d21f59f60>,
 'は': <gensim.models.keyedvectors.Vocab at 0x27d21f59f98>,
 '、': <gensim.models.keyedvectors.Vocab at 0x27d21f59fd0>,
 '速乾': <gensim.models.keyedvectors.Vocab at 0x27d21f59cf8>,
 '性': <gensim.models.keyedvectors.Vocab at 0x27d21f59d30>,
 'に': <gensim.models.keyedvectors.Vocab at 0x27d21f59da0>,
 '優れ': <gensim.models.keyedvectors.Vocab at 0x27d21f59dd8>,
 'た': <gensim.models.keyedvectors.Vocab at 0x27d21f592b0>,
 'コンプレッション': <gensim.models.keyedvectors.Vocab at 0x27d2ba36048>,
 '素材': <gensim.models.keyedvectors.Vocab at 0x27d2ba36080>,
 'を': <gensim.models.

In [74]:
# from gensim.test.utils import common_texts
# from gensim.models import Word2Vec
# model = Word2Vec(common_texts[1], size=100, window=5, min_count=1, workers=4)

In [77]:
# type(common_texts)

list

In [122]:
# create an embedding matrix for the words we have in the dataset
#embeddings_index = {}
embedding_matrix = np.zeros((len(word_model.wv.vocab)+1, 300))
for i in range(len(word_model.wv.vocab)):
    embedding_vector = word_model.wv[word_model.wv.index2word[i]]
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [123]:
embedding_matrix

array([[-8.34841073e-01, -1.22127086e-01,  6.68775797e-01, ...,
         2.57656127e-01,  4.58594590e-01,  7.16453910e-01],
       [ 4.67244804e-01, -3.30790371e-01,  5.16118348e-01, ...,
         1.11972041e-01, -4.24341589e-01,  5.47882617e-01],
       [ 1.33656454e+00, -3.56277287e-01,  1.51411057e+00, ...,
        -1.59505308e+00, -9.75202844e-02, -2.73133218e-01],
       ...,
       [ 2.82297289e-04,  8.22886534e-04,  8.22812086e-04, ...,
        -9.80065204e-04,  1.19583026e-04,  4.64186800e-04],
       [ 6.39223435e-04, -9.90046188e-04,  9.84076993e-04, ...,
         1.40132196e-03, -1.04523997e-03, -2.26449061e-04],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]])

In [116]:
#word_model.wv.vocab
max_len

100

In [127]:
from keras import optimizers

# A simple LSTM with glove embeddings and two dense layers
model = Sequential()
model.add(Embedding(len(word_model.wv.vocab) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=True))
model.add(SpatialDropout1D(0.3))
model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(512, activation='relu'))
model.add(Dropout(0.4))

model.add(Dense(256, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(2))
model.add(Activation('softmax'))

#sgd = optimizers.SGD(lr=0.00001, decay=1e-5, momentum=0.8, nesterov=True)
rmsprop = optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)
model.compile(loss='categorical_crossentropy', optimizer= rmsprop)

In [42]:
embed = Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False)
embed

<keras.layers.embeddings.Embedding at 0x268f3ec5550>

In [129]:
from keras.layers import Flatten
from keras.layers import Concatenate

embed = Sequential([Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False), SpatialDropout1D(0.3), LSTM(100, dropout=0.3, recurrent_dropout=0.3)])

#flatten = Flatten()(embed)

#dropout1 = SpatialDropout1D(0.3)(embed)

#lstm = LSTM(100, dropout=0.3, recurrent_dropout=0.3)(flatten)

agei = Input(shape=(len(chk),))

conc = Concatenate()([embed, agei],axis = 0)

dens1 = Dense(1024, activation='relu')(conc)
dropout2 = Dropout(0.8)(dens1)

dens2 = Dense(1024, activation='relu')(dens1)
dropout3 = Dropout(0.8)(dens2)

dens3 = Dense(512, activation='relu')(dropout3)
dropout4 = Dropout(0.4)(dens3)

dens4 = Dense(256, activation='relu')(dropout4)
dropout5 = Dropout(0.3)(dens4)

dens5 = Dense(128, activation='relu')(dropout5)
dropout6 = Dropout(0.3)(dens5)

dens6 = Dense(2)(dropout6)

acti = Activation('softmax')(dens6)

model = model.add([embed, chk], acti)

NameError: name 'word_index' is not defined

In [None]:
from keras.layers import Input, Embedding, LSTM, Dense
from keras.models import Model

# Headline input: meant to receive sequences of 100 integers, between 1 and 10000.
# Note that we can name any layer by passing it a "name" argument.
main_input = Input(shape=(len(chk),), dtype='int32', name='main_input')

# This embedding layer will encode the input sequence
# into a sequence of dense 512-dimensional vectors.
x = Embedding(output_dim=Embedding(len(word_index) + 1, weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False)(main_input)

# A LSTM will transform the vector sequence into a single vector,
# containing information about the entire sequence
lstm_out = LSTM(32)(x)

In [112]:
xvalid_pad

array([[    0,     0,     0, ...,     5,  3983,  2974],
       [    0,     0,     0, ...,  5195,  1098,  6425],
       [    0,     0,     0, ...,    99,     7,   186],
       ...,
       [    0,     0,     0, ...,    74,   482,    77],
       [    0,     0,     0, ...,    21,     9,    52],
       [    0,     0,     0, ...,  1656,   235, 36044]])

In [130]:
model.fit(xtrain_pad, y=ytrain_enc, batch_size=128, epochs=10, verbose=1, validation_data=(xvalid_pad, yvalid_enc))

Train on 55381 samples, validate on 6154 samples
Epoch 1/10


InvalidArgumentError: indices[13,99] = 39422 is not in [0, 36236)
	 [[Node: embedding_8/embedding_lookup = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_FLOAT, _class=["loc:@training_4/RMSprop/Assign_1"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](embedding_8/embeddings/read, embedding_8/Cast, training_4/RMSprop/gradients/embedding_8/embedding_lookup_grad/concat/axis)]]

In [None]:
predictions = model.predict(xvalid_pad)

In [None]:
predict_label = [np.argmax(i) for i in predictions]

In [None]:
predict_label = lbl_enc.inverse_transform(predict_label)

In [None]:
yvalid_label = lbl_enc.inverse_transform(yvalid)

In [None]:
test_df = pd.DataFrame({'description' : xvalid, 'actuals' : yvalid_label, 'predictions' : predict_label})

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

In [None]:
f1_score(test_df.actuals, test_df.predictions,average="macro")

In [None]:
accuracy_score(test_df.actuals, test_df.predictions)