In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec
from numpy import random
from sklearn.preprocessing import LabelEncoder

In [2]:
tf.__version__

'2.1.0'

In [3]:
wv_model = Word2Vec.load('pad_word2vec_model')

In [4]:
data = pd.read_csv('processed_data.csv')
data = data[['descriptions', 'scores']]
data.head()

Unnamed: 0,descriptions,scores
0,<START> 摇 摇 摇 外婆桥 1930 年代 乡村 少年 唐 水生 来到 上海 投靠 ...,1
1,<START> 向日葵 本片 延续 张扬 洗澡 昨天 影片 呈现 中国式 父子 冲突 和解 ...,1
2,<START> 囧 <UNK> 囧 商业 <UNK> <UNK> 五年 时间 发明 一种 油...,1
3,<START> 少林寺 <UNK> 末年 隋 <UNK> 侄子 王仁则 <UNK> <UNK...,1
4,<START> 斗地主 故事 发生 民国初年 春天 青楼 当红 头牌 无数 男人 <UNK>...,0


In [5]:
def train_test_split(data):
    test_index = random.choice(range(len(data)), data.shape[0] // 4)
    test_data = data[data.index.isin(test_index)]
    train_data = data[~data.index.isin(test_index)]
    return train_data, test_data

train_data, test_data = train_test_split(data)

In [6]:
vocab = {word: index for index, word in enumerate(wv_model.wv.index2word)}
reverse_vocab = {index: word for index, word in enumerate(wv_model.wv.index2word)}

embedding_matrix = wv_model.wv.vectors
embedding_matrix.shape

(30490, 100)

In [7]:
unk_index = vocab['<UNK>']
def transform_data(sentence,vocab):
    words=sentence.split(' ')
    idxs=[vocab[word] if word in vocab else unk_index for word in words]
    return idxs

train_idxs = train_data.apply(lambda x:transform_data(x[0],vocab), axis = 1)
test_idxs = test_data.apply(lambda x:transform_data(x[0],vocab), axis = 1)
all_idxs = data.apply(lambda x:transform_data(x[0],vocab), axis = 1)

In [8]:
train_idxs.head()

0    [30484, 8257, 8257, 8257, 20411, 7047, 159, 81...
1    [30484, 15250, 37, 1456, 2590, 9855, 7048, 16,...
2    [30484, 1607, 30485, 1607, 3043, 30485, 30485,...
3    [30484, 3046, 30485, 1699, 9858, 30485, 3604, ...
5    [30484, 2799, 2800, 20442, 1033, 2273, 186, 30...
dtype: object

In [9]:
train_x = np.array(train_idxs.tolist())
test_x = np.array(test_idxs.tolist())
all_x = np.array(all_idxs.tolist())
train_label = np.array(train_data['scores'].values)
test_label = np.array(test_data['scores'].values)
all_label = np.array(data['scores'].values)
label_list = [train_label, test_label, all_label]

encoder = LabelEncoder()
train_label = encoder.fit_transform(train_label)
encoder = LabelEncoder()
test_label = encoder.fit_transform(test_label)
encoder = LabelEncoder()
all_label = encoder.fit_transform(all_label)

In [10]:
train_x

array([[30484,  8257,  8257, ..., 30487, 30487, 30487],
       [30484, 15250,    37, ..., 30487, 30487, 30487],
       [30484,  1607, 30485, ..., 30487, 30487, 30487],
       ...,
       [30484, 14864, 30485, ..., 30487, 30487, 30487],
       [30484, 30485,  4319, ..., 30487, 30487, 30487],
       [30484, 30485,    37, ..., 30487, 30487, 30487]])

In [11]:
train_label[:20]

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1])

In [12]:
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import LSTM, Input, Dense, Activation, RepeatVector, Bidirectional
from tensorflow.keras.layers import Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import binary_crossentropy

In [13]:
def build_classifier(input_length, vocab_size, embedding_matrix = np.array([])):
    model = Sequential()
    if len(embedding_matrix):
        model.add(Embedding(input_dim = vocab_size, output_dim = 100, weights=[embedding_matrix], trainable=False,
                        input_length=input_length))
    else:
        model.add(Embedding(input_dim = vocab_size, output_dim = 100))
    model.add(Bidirectional(LSTM(128, dropout = 0.2, recurrent_dropout=0.2)))
    model.add(Dense(2))
    model.add(Activation('softmax'))
#     optimizer = Adam(learning_rate = 1e-5)
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(loss = loss_object, optimizer = 'adam', metrics = ['accuracy'])
    
    model.summary()
    return model

In [14]:
input_length = train_x.shape[1]
vocab_size = len(vocab)
embedding_matrix = wv_model.wv.vectors
embedding_matrix.shape

(30490, 100)

In [15]:
input_length

353

In [16]:
model = build_classifier(input_length, vocab_size)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 100)         3049000   
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               234496    
_________________________________________________________________
dense (Dense)                (None, 2)                 514       
_________________________________________________________________
activation (Activation)      (None, 2)                 0         
Total params: 3,284,010
Trainable params: 3,284,010
Non-trainable params: 0
_________________________________________________________________


In [17]:
num_epochs = 10
history = model.fit(train_x, train_label, 
                    epochs = num_epochs, 
                    validation_data=(test_x, test_label), 
                    verbose = 2)

Train on 2223 samples, validate on 612 samples
Epoch 1/10
2223/2223 - 57s - loss: 0.6888 - accuracy: 0.7049 - val_loss: 0.6853 - val_accuracy: 0.7239
Epoch 2/10
2223/2223 - 54s - loss: 0.6803 - accuracy: 0.7395 - val_loss: 0.6770 - val_accuracy: 0.7239
Epoch 3/10
2223/2223 - 53s - loss: 0.6705 - accuracy: 0.7395 - val_loss: 0.6670 - val_accuracy: 0.7239
Epoch 4/10
2223/2223 - 53s - loss: 0.6579 - accuracy: 0.7395 - val_loss: 0.6528 - val_accuracy: 0.7239
Epoch 5/10
2223/2223 - 55s - loss: 0.6377 - accuracy: 0.7395 - val_loss: 0.6285 - val_accuracy: 0.7239
Epoch 6/10
2223/2223 - 55s - loss: 0.6029 - accuracy: 0.7395 - val_loss: 0.5941 - val_accuracy: 0.7239
Epoch 7/10
2223/2223 - 55s - loss: 0.5831 - accuracy: 0.7395 - val_loss: 0.5907 - val_accuracy: 0.7239
Epoch 8/10
2223/2223 - 54s - loss: 0.5797 - accuracy: 0.7395 - val_loss: 0.5901 - val_accuracy: 0.7239
Epoch 9/10
2223/2223 - 56s - loss: 0.5767 - accuracy: 0.7395 - val_loss: 0.5894 - val_accuracy: 0.7239
Epoch 10/10
2223/2223 - 55

In [18]:
import matplotlib.pyplot as plt
def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.show()
    return

plot_graphs(history, "loss")

<Figure size 640x480 with 1 Axes>

In [19]:
results = model.evaluate(test_x, test_label)
print('test loss, test acc:', results)
predictions = model.predict(test_x)

test loss, test acc: [0.5892490127117805, 0.7238562]


In [20]:
sum(train_label) / len(train_label)

0.7395411605937922

In [25]:
predictions

array([[0.01612051, 0.9838795 ],
       [0.01521815, 0.98478186],
       [0.0152934 , 0.9847066 ],
       ...,
       [0.01663435, 0.9833657 ],
       [0.01654886, 0.9834511 ],
       [0.01623786, 0.9837622 ]], dtype=float32)

345