In [1]:
import os
import math
from tqdm import tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split

# Setup

In [2]:
# Load the training data; the supercategory is the first character of the
# image folder name (e.g. "beauty_image/..." -> "b", "fashion_image/..." -> "f").
train_df = pd.read_csv("../input/ndsc-beginner/train.csv")
train_df["Supercategory"] = train_df["image_path"].str[0]

# Shuffle once with a fixed seed so the split is reproducible across kernel restarts.
train_df = train_df.sample(frac=1., random_state=42)

# Hold out the first 400 shuffled rows for validation and REMOVE them from the
# training frame; otherwise the model is validated on rows it was trained on.
val_df = train_df.iloc[:400]
train_df = train_df.iloc[400:]
val_df.head()

Unnamed: 0,itemid,title,Category,image_path,Supercategory
322769,1131017203,gaun maxi wanita dengan model split dan kerah ...,18,fashion_image/67394a11a1ad9071dcf84faa1c4f24a9,f
475539,1303669217,becca basic top atasan katun polos,26,fashion_image/bb04188452add6436f27db04e60677fe,f
56357,1704236768,terbaru clinique derma white clarifying bright...,5,beauty_image/f902bbe87b06c7e0f93b1d4b7435f5ff.jpg,b
276435,1283523346,kylie cosmetics koko kollection matte liquid l...,12,beauty_image/52996c931a0550d56315e7e7047e75be.jpg,b
451707,1741654741,95 cotton embroidery t shirt female short slee...,26,fashion_image/65e18a9c99578f7f95c697ce9da3a173,f


In [3]:
# Embedding setup: load the GloVe vectors into a dictionary (word -> float32 vector)
# for easy lookup.
embeddings_index = {}
# `with` closes the file even if a line fails to parse; the encoding is explicit
# so the result does not depend on the platform's default locale.
with open('../input/glove840b300dtxt/glove.840B.300d.txt', encoding='utf-8') as f:
    for line in tqdm(f):
        # Split on a literal space (not arbitrary whitespace): the first field is
        # the token, every remaining field is one coefficient.
        values = line.split(" ")
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

2196018it [03:25, 10696.93it/s]

Found 2196017 word vectors.





In [4]:
# Convert a title into a fixed-size matrix of word embeddings
def text_to_array(text, max_len=100, emb_dim=300, embeddings=None):
    """Embed a whitespace-tokenized text as a ``(max_len, emb_dim)`` float32 array.

    Parameters
    ----------
    text : str
        Text to embed. Non-string input (e.g. a missing value) is treated as
        an empty text and yields an all-zero matrix.
    max_len : int, default 100
        Number of token slots; longer texts are truncated, shorter ones are
        zero-padded at the end.
    emb_dim : int, default 300
        Length of each embedding vector.
    embeddings : mapping of str -> array-like, optional
        Lookup table; defaults to the module-level ``embeddings_index``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max_len, emb_dim)``; rows for unknown tokens and
        padding slots are all zeros.
    """
    if embeddings is None:
        embeddings = embeddings_index

    # Tokenize the whole text: slicing off the last character first would
    # truncate the final word and make its lookup miss (or hit a wrong word).
    tokens = text.split()[:max_len] if isinstance(text, str) else []

    # Pre-allocating gives a consistent dtype whether or not padding is needed.
    embedded = np.zeros((max_len, emb_dim), dtype='float32')
    for position, token in enumerate(tokens):
        vector = embeddings.get(token)
        if vector is not None:
            embedded[position] = vector
    return embedded

In [5]:
# Embed every validation title and one-hot encode the matching category ids.
val_vects = np.array([text_to_array(title) for title in val_df["title"]])
val_y_labels = np.array(val_df["Category"])
# Row k of the 58x58 identity matrix is the one-hot vector for class k.
val_y = np.eye(58)[val_y_labels]

In [6]:
# Build one batch by hand to see exactly what the model will be fed
batch_size = 128
i = 0
start, stop = i * batch_size, (i + 1) * batch_size

# Column 1 of the frame holds the titles; embed each one.
texts = train_df.iloc[start:stop, 1]
text_arr = np.array(list(map(text_to_array, texts)))

# One-hot encode the matching category ids (58 classes).
batch_labels = np.array(train_df["Category"][start:stop])
batch_targets = np.zeros((batch_size, 58))
batch_targets[np.arange(batch_size), batch_labels] = 1

# Shapes first, then the raw contents.
for arr in (text_arr, batch_targets):
    print(np.shape(arr))
for arr in (text_arr, batch_targets):
    print(arr)

(128, 100, 300)
(128, 58)
[[[ 0.34834999 -0.03355     0.48804    ...  0.83594    -0.066742
   -0.33065   ]
  [ 0.43643999 -0.17101    -0.089925   ...  0.46792001  0.32587999
   -0.44595   ]
  [-0.20595001 -0.02593     0.90377998 ...  0.58156002 -0.38868001
   -0.62717003]
  ...
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]]

 [[ 0.0014461   0.39166    -0.087443   ... -0.0046101  -0.34825999
   -0.026866  ]
  [-0.17470001  0.036289   -0.049236   ... -0.26769     0.13463999
    0.14734   ]
  [-0.085851    0.12848     0.053524   ... -0.31143999  0.32584
    0.042455  ]
  ...
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]]

 [[-0.0047187  

In [7]:
# Generator which endlessly yields shuffled (embedded titles, one-hot labels) batches
batch_size = 128

def batch_gen(train_df, n_classes=58):
    """Yield training batches forever, reshuffling the data on every pass.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain a "title" column (text) and a "Category" column
        (integer class ids in ``[0, n_classes)``).
    n_classes : int, default 58
        Width of the one-hot target vectors.

    Yields
    ------
    (numpy.ndarray, numpy.ndarray)
        ``text_arr`` with one embedded title per row (as produced by
        ``text_to_array``) and ``batch_targets`` of shape
        ``(batch_size, n_classes)``. Rows left over after the last full
        batch of a pass are skipped for that pass.

    Raises
    ------
    ValueError
        If ``train_df`` has fewer than ``batch_size`` rows. Without this
        check the loop below would spin forever without yielding anything.
    """
    n_batches = len(train_df) // batch_size
    if n_batches == 0:
        raise ValueError(
            "batch_gen needs at least batch_size=%d rows, got %d"
            % (batch_size, len(train_df))
        )
    while True:
        shuffled = train_df.sample(frac=1.)  # Shuffle the data.
        for i in range(n_batches):
            # Select by column name so the generator does not depend on column order.
            chunk = shuffled.iloc[i*batch_size:(i+1)*batch_size]
            text_arr = np.array([text_to_array(text) for text in chunk["title"]])
            batch_labels = np.array(chunk["Category"])
            batch_targets = np.zeros((batch_size, n_classes))
            batch_targets[np.arange(batch_size), batch_labels] = 1
            yield text_arr, batch_targets

# Training

In [8]:
from keras.models import Sequential
from keras.layers import CuDNNLSTM, Dense, Bidirectional, Activation

Using TensorFlow backend.


In [9]:
from keras.models import Model
from keras.layers import CuDNNLSTM, Dense, Bidirectional, Activation, Input, Dropout, Permute, Reshape, concatenate, Flatten
from keras.layers import BatchNormalization
from keras import regularizers

INPUT_DIM = 300
TIME_STEPS = 100
# if True, the attention vector is shared across the input_dimensions where the attention is applied.
SINGLE_ATTENTION_VECTOR = False

def attention_3d_block(inputs):
    """Compute per-feature attention weights over time and append them to `inputs`.

    Parameters
    ----------
    inputs : Keras tensor
        Expected shape ``(batch_size, TIME_STEPS, input_dim)``.

    Returns
    -------
    Keras tensor
        ``inputs`` concatenated with the attention weights (same shape as
        ``inputs``) by ``concatenate`` with its default axis.
    """
    # inputs.shape = (batch_size, time_steps, input_dim)
    input_dim = int(inputs.shape[2])
    # Put the time axis last so the Dense softmax below normalizes over time steps.
    a = Permute((2, 1))(inputs)
    a = Reshape((input_dim, TIME_STEPS))(a) # this line is not useful. It's just to know which dimension is what.
    a = Dense(TIME_STEPS, activation='softmax')(a)
    if SINGLE_ATTENTION_VECTOR:
        # These names are not imported anywhere else in the notebook, so bring
        # them in here; otherwise enabling the flag raises NameError.
        from keras import backend as K
        from keras.layers import Lambda, RepeatVector
        # Average the weights over the feature axis, then repeat that single
        # vector for every feature.
        a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a)
        a = RepeatVector(input_dim)(a)
    # Back to (batch_size, time_steps, input_dim) so it lines up with `inputs`.
    a_probs = Permute((2, 1), name='attention_vec')(a)
    # NOTE(review): the variable name suggests an element-wise multiply, but the
    # weights are concatenated to the inputs instead -- confirm this is intended.
    output_attention_mul = concatenate([inputs, a_probs])
    return output_attention_mul

# Model input: one pre-embedded title, TIME_STEPS tokens x INPUT_DIM features.
inputs_embed = Input((TIME_STEPS, INPUT_DIM))

# Two stacked 64-unit LSTMs; both return the full sequence so the attention
# block receives one vector per time step.
lstm_1 = CuDNNLSTM(64, return_sequences=True)(inputs_embed)
lstm_2 = CuDNNLSTM(64, return_sequences=True)(lstm_1)

# Attention over the LSTM outputs, flattened for the dense head.
attention = attention_3d_block(lstm_2)
attention = Flatten()(attention)

# Dense head: 128 ReLU units with L2 weight penalty, batch norm, 20% dropout.
fc_layer = Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01))(attention)
fc_layer = BatchNormalization()(fc_layer)
fc_layer = Dropout(0.2)(fc_layer)
# Softmax over the 58 output classes (matches the 58-wide one-hot targets).
outputs = Dense(58, activation='softmax')(fc_layer)

model = Model(inputs=inputs_embed, outputs=outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [10]:
# Create the training batch generator once; the fit calls below all draw from this same object.
mg = batch_gen(train_df)

In [11]:
# Training round 1: 5 epochs of 1000 generator batches each,
# evaluated on (val_vects, val_y).
model.fit_generator(
    generator=mg,
    steps_per_epoch=1000,
    epochs=5,
    verbose=True,
    validation_data=(val_vects, val_y),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb7a784ee10>

In [12]:
# Training round 2: continue fitting the same model for 5 more epochs
# of 1000 generator batches each.
model.fit_generator(
    generator=mg,
    steps_per_epoch=1000,
    epochs=5,
    verbose=True,
    validation_data=(val_vects, val_y),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb7a784ec50>

In [13]:
# Training round 3: continue fitting the same model for 5 more epochs
# of 1000 generator batches each.
model.fit_generator(
    generator=mg,
    steps_per_epoch=1000,
    epochs=5,
    verbose=True,
    validation_data=(val_vects, val_y),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb79587cd30>

In [14]:
# Training round 4: continue fitting the same model for 5 more epochs
# of 1000 generator batches each.
model.fit_generator(
    generator=mg,
    steps_per_epoch=1000,
    epochs=5,
    verbose=True,
    validation_data=(val_vects, val_y),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb7a784eda0>

In [15]:
# Training round 5: continue fitting the same model for 5 more epochs
# of 1000 generator batches each.
model.fit_generator(
    generator=mg,
    steps_per_epoch=1000,
    epochs=5,
    verbose=True,
    validation_data=(val_vects, val_y),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb79587cda0>

# Inference

In [16]:
# Load the test set and tag every row with the first character of its image path.
test_df = pd.read_csv("../input/ndsc-beginner/test.csv")
test_df = test_df.assign(Supercategory=test_df["image_path"].str[0])
test_df.head()

Unnamed: 0,itemid,title,image_path,Supercategory
0,370855998,flormar 7 white cream bb spf 30 40ml,beauty_image/1588591395c5a254bab84042005f2a9f.jpg,b
1,637234604,maybelline clear smooth all in one bb cream sp...,beauty_image/920985ed9587ea20f58686ea74e20f93.jpg,b
2,690282890,murah innisfree eco natural green tea bb cream...,beauty_image/90b40e5710f54352b243fcfb0f5d1d7f.jpg,b
3,930913462,loreal white perfect day cream spf 17 pa white...,beauty_image/289c668ef3d70e1d929d602d52d5d78a.jpg,b
4,1039280071,hada labo cc cream ultimate anti aging spf 35 ...,beauty_image/d5b3e652c5822d2306f4560488ec30c6.jpg,b


In [17]:
# Make the prediction from the model
batch_size = 128

def test_batch_gen(test_df):
    """Yield embedded title batches covering every row of `test_df` once, in order.

    Named differently from the training generator so that defining it does
    not shadow `batch_gen`; re-running the training cells after this one
    would otherwise pick up the wrong generator.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Must contain a "title" column.

    Yields
    ------
    numpy.ndarray
        One array per batch with one embedded title per row (as produced by
        ``text_to_array``); the final batch may hold fewer than
        ``batch_size`` rows.
    """
    n_batches = math.ceil(len(test_df) / batch_size)
    for i in range(n_batches):
        # Select by column name rather than position.
        texts = test_df["title"].iloc[i*batch_size:(i+1)*batch_size]
        yield np.array([text_to_array(text) for text in texts])

# Collect one row of class scores per test item, preserving test_df order.
all_preds = []
# Passing `total` lets tqdm show a real progress bar instead of a bare counter.
for x in tqdm(test_batch_gen(test_df), total=math.ceil(len(test_df) / batch_size)):
    all_preds.extend(model.predict(x))

1347it [02:14, 10.00it/s]


In [18]:
# Sanity check: shape of the collected predictions.
print(np.shape(all_preds))
# Supercategory letter of every test row, in test_df order.
supercats = test_df["Supercategory"]
# One 58-long 0/1 mask per supercategory letter. The class ids are split into
# three consecutive groups of 17, 14 and 27; each mask is 1 only inside the
# group belonging to its letter ("b", "f", "m" in that order).
supercat_dict = {}
_offset = 0
for _letter, _width in (("b", 17), ("f", 14), ("m", 27)):
    _mask = np.zeros(58, dtype=int)
    _mask[_offset:_offset + _width] = 1
    supercat_dict[_letter] = _mask
    _offset += _width


(172402, 58)


In [19]:
print(np.shape(all_preds))
# Baseline submission: plain argmax over all classes, ignoring the supercategory.
# Iterating over all_preds alone removes the needless dependency on `supercats`.
y_te = [np.argmax(pred) for pred in all_preds]

submit_df = pd.DataFrame({"itemid": test_df["itemid"], "Category": y_te})
submit_df.to_csv("submission_no_cheat.csv", index=False)

(172402, 58)


In [20]:
print(np.shape(all_preds))
# Restrict each prediction to the classes allowed by the row's supercategory mask.
# Disallowed classes are set to -inf rather than multiplied by 0: if every allowed
# class had a score of exactly 0, the multiplied vector would be all zeros and
# argmax would return class 0 even when class 0 is not allowed for that row.
y_te = [
    np.argmax(np.where(supercat_dict[supercat] > 0, pred, -np.inf))
    for pred, supercat in zip(all_preds, supercats)
]

submit_df = pd.DataFrame({"itemid": test_df["itemid"], "Category": y_te})
submit_df.to_csv("submission.csv", index=False)

(172402, 58)


In [21]:
# Show the first rows of the most recently built submission frame.
submit_df.head()

Unnamed: 0,itemid,Category
0,370855998,5
1,637234604,5
2,690282890,5
3,930913462,9
4,1039280071,5


In [22]:
# Show the last rows of the most recently built submission frame.
submit_df.tail()

Unnamed: 0,itemid,Category
172397,1781957365,38
172398,1839851276,31
172399,955369303,35
172400,1638035772,33
172401,1498091427,34
