In [2]:
from collections import Counter

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras

In [3]:
# Read the local train/test splits from CSV, in that order.
trainset, testset = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/local_train_set.csv", "./data/local_test_set.csv")
)

In [12]:
# Preview the first five training rows to check the column layout.
trainset.head(n=5)

Unnamed: 0,Product_ID,User_ID,Time_ID,HelpfulnessNumerator,HelpfulnessDenominator,CleanedText,Score
0,65149,233944,1973,9,14,look small print bottom back packag product ch...,1
1,26218,38234,633,10,10,updat april 2012 i return 6 pouch purebit chic...,1
2,18722,96355,315,0,1,was wonder gum tast differ textur slimi read i...,1
3,61569,225352,828,2,5,well reason i go i need give morn caffein habi...,1
4,6393,41023,170,0,0,i bought product fiber prune juic aid son cons...,1


In [34]:
def Build_Dict(trainset, testset, min_count=10):
    """Build a word -> integer index vocabulary from the train and test texts.

    The ``CleanedText`` columns of both frames are concatenated and split on
    whitespace. Every distinct word becomes a key of the returned dict:

    * words seen ``min_count`` times or fewer all map to 0 (the shared
      rare-word bucket);
    * every other word gets its own index, numbered 1, 2, 3, ... in order of
      first appearance.

    Args:
        trainset: DataFrame with a ``CleanedText`` column of strings.
        testset: DataFrame with a ``CleanedText`` column of strings.
        min_count: Rare-word threshold; words with ``count <= min_count``
            map to 0. Defaults to 10.

    Returns:
        dict mapping each word to its integer index. The largest index is
        always smaller than ``len(result) + 2``.
    """
    texts = list(trainset['CleanedText'].values) + list(testset['CleanedText'].values)
    count_of_words = Counter(' '.join(texts).split())

    dic = {}
    next_index = 1  # 0 is reserved for the rare-word bucket
    for word, count in count_of_words.items():
        if count <= min_count:
            dic[word] = 0
        else:
            # A dedicated counter keeps the indices unique and gap-free;
            # deriving them from len(dic) caused collisions.
            dic[word] = next_index
            next_index += 1

    return dic

# Vocabulary shared by the train and test texts.
dic = Build_Dict(trainset=trainset, testset=testset)
# Two slots are added on top of the number of distinct words (marked "Nan"
# in the original notebook).
Vocab_Size = 2 + len(dic)

print("Total vocabularies : %d" % (Vocab_Size,))

Total vocabularies : 8582


In [35]:
def CleanedTextToTensor(raw_X):
    """Encode whitespace-tokenised texts as a zero-padded tensor of word indices.

    Each text is split on whitespace and every token is looked up in the
    notebook-level ``dic`` vocabulary. Tokens that are not in ``dic`` map to 0
    instead of raising ``KeyError``. Shorter rows are right-padded with 0 up
    to the length of the longest text in ``raw_X``.

    Args:
        raw_X: Iterable of strings (e.g. a ``CleanedText`` column or its
            ``.values``).

    Returns:
        ``tf.Tensor`` of dtype int32 and shape ``(len(raw_X), max_len)``;
        ``(0, 0)`` when ``raw_X`` is empty.
    """
    token_ids = [[dic.get(token, 0) for token in text.split()] for text in raw_X]

    # default=0 keeps an empty batch from raising on max().
    max_len = max((len(ids) for ids in token_ids), default=0)
    padded = [ids + [0] * (max_len - len(ids)) for ids in token_ids]

    # Word indices are integers, so use an integer dtype rather than float32.
    # The explicit reshape keeps the result 2-D when there are no rows.
    X = np.array(padded, dtype=np.int32).reshape(len(padded), max_len)
    return tf.convert_to_tensor(X, dtype=tf.int32)
    
# Encode the training texts as a padded index tensor.
train_X = CleanedTextToTensor(raw_X=trainset['CleanedText'].values)

print('train_X shape:', train_X.shape)

train_X shape: (2500, 837)


In [36]:
def ScoreToTensor(raw_Y, num_classes=5):
    """One-hot encode 1-based scores into a float32 tensor.

    Each score ``s`` is shifted to the 0-based label ``s - 1`` and turned into
    a row of ``num_classes`` values with a 1 at that position. A score outside
    ``1..num_classes`` produces an all-zero row.

    Args:
        raw_Y: 1-D sequence of scores (e.g. the ``Score`` column).
        num_classes: Width of each one-hot row. Defaults to 5.

    Returns:
        ``tf.Tensor`` of dtype float32 with one row per score.
    """
    labels = np.array(raw_Y) - 1  # convert to [0, num_classes - 1]
    one_hot = [[int(k == label) for k in range(num_classes)] for label in labels]

    return tf.convert_to_tensor(one_hot, dtype=tf.float32)

# One-hot encode the training scores.
train_Y = ScoreToTensor(raw_Y=trainset['Score'].values)
print('train_Y shape:', train_Y.shape)

train_Y shape: (2500, 5)


In [37]:
hidden_size = 128

model = keras.Sequential([
    # One hidden_size-dimensional vector per word index in [0, Vocab_Size).
    keras.layers.Embedding(Vocab_Size, hidden_size),
    keras.layers.Bidirectional(keras.layers.LSTM(hidden_size)),
    keras.layers.Dense(32, activation='relu'),
    # The targets are one-hot rows over 5 mutually exclusive classes and the
    # loss is categorical cross-entropy, so the output must be a probability
    # distribution: softmax, not independent sigmoids.
    keras.layers.Dense(5, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so do not wrap it in print().
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 128)         1098496   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_4 (Dense)              (None, 32)                8224      
_________________________________________________________________
dense_5 (Dense)              (None, 5)                 165       
Total params: 1,370,053
Trainable params: 1,370,053
Non-trainable params: 0
_________________________________________________________________
None


In [40]:
# Encode the held-out set the same way as the training data.
test_X = CleanedTextToTensor(testset['CleanedText'])
test_Y = ScoreToTensor(testset['Score'])

# `shuffle` takes a boolean (or 'batch'); the previous value
# 'steps_per_epoch' was not a valid option. Reshuffle the training samples
# before every epoch.
history = model.fit(
    x=train_X,
    y=train_Y,
    epochs=10,
    validation_data=(test_X, test_Y),
    shuffle=True,
)

Train on 2500 samples, validate on 500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
 128/2500 [>.............................] - ETA: 53s - loss: 0.7788 - accuracy: 0.6875

KeyboardInterrupt: 

In [195]:
# NOTE(review): this cell originally predicted on `X`, which is not defined
# anywhere in the notebook and so fails on a fresh kernel. `test_X` is assumed
# to be the intended input — confirm.
class_probabilities = model.predict(test_X)
# Index of the highest-scoring class per sample (0-based, i.e. Score - 1).
output = [row.argmax() for row in class_probabilities]