In [36]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras

from sklearn.utils import shuffle

In [None]:
def Build_Dict(trainset, testset, min_count=10):
    """Build a word -> integer-id vocabulary from the ``CleanedText`` columns.

    Every ``CleanedText`` value of both frames is split on whitespace and the
    resulting words are counted over the two frames together. Words that occur
    strictly more than ``min_count`` times are kept and receive consecutive
    ids starting at 0, in order of first appearance.

    Parameters
    ----------
    trainset, testset : objects indexable by ``'CleanedText'`` whose result
        exposes ``.values`` (e.g. pandas DataFrames).
    min_count : int, default 10
        Frequency cut-off; a word is dropped when its count is <= min_count.

    Returns
    -------
    dict
        Mapping from word (str) to id (int).

    Notes
    -----
    NOTE(review): id 0 is given to a real word, while CleanedTextToTensor also
    uses 0 as its padding value, so the two are indistinguishable downstream.
    """
    count_of_words = {}
    for frame in (trainset, testset):
        for text in frame['CleanedText'].values:
            # Non-string cells (e.g. the NaN pandas produces for an empty CSV
            # field) carry no words; skip them instead of crashing.
            if not isinstance(text, str):
                continue
            for word in text.split():
                count_of_words[word] = count_of_words.get(word, 0) + 1

    dic = {}
    for word, count in count_of_words.items():
        if count > min_count:
            dic[word] = len(dic)

    return dic

In [None]:
def CleanedTextToTensor(raw_X, vocab=None):
    """Encode whitespace-tokenised texts as a zero-padded float32 id tensor.

    Each text is split on whitespace, words missing from the vocabulary are
    dropped, and the remaining words are replaced by their ids. Rows are
    right-padded with 0 up to the longest row *of this call*, so two calls
    may return tensors of different widths.

    Parameters
    ----------
    raw_X : iterable of str
        Texts to encode. Non-string entries (e.g. NaN) are encoded as an
        empty row rather than raising.
    vocab : dict, optional
        Word -> id mapping. Defaults to the notebook-level ``dic``.

    Returns
    -------
    tf.Tensor
        float32 tensor of shape ``(number of texts, longest row)``; an empty
        input gives shape ``(0, 0)``.

    Notes
    -----
    NOTE(review): the padding value 0 coincides with the id Build_Dict gives
    to its first word.
    """
    if vocab is None:
        vocab = dic  # global vocabulary built elsewhere in the notebook

    encoded = []
    for text in raw_X:
        tokens = text.split() if isinstance(text, str) else []
        encoded.append([vocab[t] for t in tokens if t in vocab])

    # default=0 keeps an empty batch from raising in max().
    max_len = max((len(ids) for ids in encoded), default=0)

    X = np.zeros((len(encoded), max_len), dtype=np.float32)
    for row, ids in enumerate(encoded):
        if ids:
            X[row, :len(ids)] = ids

    return tf.convert_to_tensor(X, dtype=tf.float32)

In [None]:
def ScoreToTensor(raw_Y):
    """One-hot encode scores into a float32 tensor with 5 columns.

    Each score is shifted down by one to give a class id, and every class id
    becomes a row holding 1 in the matching column (0..4) and 0 elsewhere.
    A class id outside 0..4 yields an all-zero row.
    """
    num_classes = 5
    class_ids = np.array(raw_Y) - 1  # scores 1..5 -> class ids 0..4

    one_hot_rows = []
    for class_id in class_ids:
        row = [int(candidate == class_id) for candidate in range(num_classes)]
        one_hot_rows.append(row)

    return tf.convert_to_tensor(one_hot_rows, dtype=tf.float32)

In [37]:
# Load the training and test splits from the CSV files under ./data.
trainset = pd.read_csv("./data/local_train_set.csv")
testset = pd.read_csv("./data/local_test_set.csv")

In [38]:
# Preview the first rows of the test split to check the loaded columns.
testset.head()

Unnamed: 0,Product_ID,User_ID,Time_ID,HelpfulnessNumerator,HelpfulnessDenominator,CleanedText,Score
0,71769,249425,115,5,5,make no sens purchas six box 72 four pack avai...,1
1,22871,112306,240,4,6,we look healthi snack three children purchas c...,1
2,7201,44556,116,3,3,i order product hope amazon carri ship newli b...,1
3,22766,31988,162,0,0,i heard wonder thing noodl low cal includ thou...,1
4,27203,127458,2273,2,2,sent relat base posit experi product the recip...,1


In [None]:
# Build the word -> id vocabulary from both splits; CleanedTextToTensor reads
# `dic` as a global.
dic = Build_Dict(trainset, testset)
Vocab_Size = len(dic) # number of kept words; used as the Embedding layer's input dimension

print("Total vocabularies : %d" % Vocab_Size)

In [42]:
# Encode both splits: texts become padded id tensors, scores become one-hot
# tensors. Each split is padded to its own longest text, so train_X and test_X
# can have different widths.
train_X = CleanedTextToTensor(trainset['CleanedText'].values)
print('train_X shape:', train_X.shape)
train_Y = ScoreToTensor(trainset['Score'].values)
print('train_Y shape:', train_Y.shape)

test_X = CleanedTextToTensor(testset['CleanedText'].values)
print('test_X shape:', test_X.shape)  # was mislabelled as train_X
test_Y = ScoreToTensor(testset['Score'].values)
print('test_Y shape:', test_Y.shape)  # was mislabelled as train_Y

train_X shape: (2500, 729)
train_Y shape: (2500, 5)
train_X shape: (500, 453)
train_Y shape: (500, 5)


In [15]:
# Width of the word embeddings and of each LSTM direction.
hidden_size = 128

# Embedding -> bidirectional LSTM -> small dense head with a 5-way softmax,
# one output per one-hot score class.
model = keras.Sequential([
    keras.layers.Embedding(Vocab_Size, hidden_size),
    keras.layers.Bidirectional(keras.layers.LSTM(hidden_size)),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(5, activation='softmax')
])

# categorical_crossentropy matches the one-hot targets built by ScoreToTensor.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 128)         228864    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_2 (Dense)              (None, 32)                8224      
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 165       
Total params: 500,421
Trainable params: 500,421
Non-trainable params: 0
_________________________________________________________________
None


In [43]:
# Train for 3 epochs, shuffling the training samples before each epoch.
# `shuffle` takes a boolean (or the string 'batch'); the previous value
# 'steps_per_epoch' was a stray argument name that only worked because a
# non-empty string is truthy.
# NOTE(review): the test split is also used as validation data here.
history = model.fit(x=train_X, y=train_Y, epochs=3,
                    validation_data=(test_X, test_Y), shuffle=True)

Train on 2500 samples, validate on 500 samples
Epoch 1/3
Epoch 2/3
  96/2500 [>.............................] - ETA: 50s - loss: 0.7416 - accuracy: 0.7656

KeyboardInterrupt: 

In [44]:
# Predicted class id per test sample: index of the largest softmax output.
class_probabilities = model.predict(test_X)
preds = class_probabilities.argmax(axis=1)

# Ground-truth class ids: scores shifted down by one, matching ScoreToTensor.
truths = testset['Score'].values - 1

array([2, 0, 4, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 4, 2, 0, 0,
       0, 0, 3, 1, 0, 4, 1, 0, 0, 4, 0, 0, 3, 2, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 3, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 3, 2, 0, 0, 0, 2, 0, 1,
       0, 3, 0, 4, 0, 0, 4, 3, 1, 0, 0, 0, 0, 0, 1, 2, 1, 4, 0, 0, 1, 4,
       1, 3, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 3, 0, 1, 1, 1, 1, 1,
       0, 3, 0, 0, 0, 1, 3, 3, 0, 4, 1, 0, 3, 4, 0, 0, 4, 0, 4, 0, 4, 0,
       1, 0, 1, 1, 1, 0, 3, 1, 1, 1, 2, 4, 1, 2, 1, 0, 0, 1, 3, 1, 0, 1,
       1, 1, 1, 0, 0, 2, 0, 0, 1, 0, 0, 1, 4, 1, 0, 0, 2, 0, 0, 3, 1, 1,
       1, 0, 1, 4, 4, 0, 0, 1, 0, 2, 1, 0, 1, 0, 0, 0, 0, 2, 3, 3, 1, 3,
       0, 1, 1, 4, 2, 0, 3, 2, 0, 0, 3, 3, 0, 2, 4, 3, 1, 1, 1, 3, 1, 2,
       0, 4, 2, 1, 3, 0, 4, 2, 2, 3, 1, 2, 4, 2, 4, 4, 2, 0, 4, 0, 3, 3,
       1, 4, 1, 1, 4, 2, 1, 3, 0, 4, 0, 1, 1, 2, 2, 3, 3, 0, 1, 0, 3, 4,
       3, 0, 3, 1, 4, 2, 1, 1, 0, 2, 3, 1, 1, 3, 2, 3, 1, 2, 1, 1, 3, 1,
       3, 3, 1, 4, 1, 0, 0, 4, 1, 1, 3, 3, 2, 3, 1,

In [46]:
# Confusion matrix of raw counts: res[t][p] is the number of samples whose
# true class is t and whose predicted class is p.
res = [[0]*5 for i in range(5)]

for pred, truth in zip(preds, truths):
    res[truth][pred] += 1

# Average class size. It equals the real per-class count only when the test
# split is perfectly balanced, so it is no longer used for normalisation;
# retained in case later cells reference it.
tot = truths.size // 5

for i in range(5):
    # Normalise each row by its own sample count so every row sums to 1 even
    # for imbalanced classes; a class with no samples prints zeros.
    row_total = sum(res[i])
    print([x / row_total if row_total else 0.0 for x in res[i]])

[0.61, 0.17, 0.06, 0.07, 0.09]
[0.38, 0.35, 0.06, 0.12, 0.09]
[0.16, 0.27, 0.18, 0.24, 0.15]
[0.14, 0.19, 0.07, 0.28, 0.32]
[0.06, 0.12, 0.08, 0.15, 0.59]
