In [1]:
# gensim provides Word2Vec; the cells below train vectors on the blurbs themselves
# (no pretrained Google vectors are loaded in the visible code).
import gensim

In [41]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [188]:
# this should be true for GPU use
# NOTE(review): newer TensorFlow releases deprecate tf.test.is_gpu_available in
# favour of tf.config.list_physical_devices('GPU') - confirm against the installed
# version before switching.
import tensorflow as tf
tf.test.is_gpu_available(
    cuda_only=False,
    min_cuda_compute_capability=None
)

False

In [189]:
# Reports whether the installed TensorFlow binary was compiled with CUDA support.
tf.test.is_built_with_cuda()

False

In [6]:
# Load the blurb dataset; the CSV's 'Unnamed: 0' column becomes the frame index.
ds = pd.read_csv('df_text_eng.csv', index_col='Unnamed: 0')

In [172]:
# Peek at the first rows (columns: blurb text and state label).
ds.head()

Unnamed: 0,blurb,state
1,"Using their own character, users go on educati...",0
2,"MicroFly is a quadcopter packed with WiFi, 6 s...",1
3,"A small indie press, run as a collective for a...",0
4,Zylor is a new baby cosplayer! Back this kicks...,0
5,Hatoful Boyfriend meet Skeletons! A comedy Dat...,0


In [171]:
# Binary target: 'failed' -> 0, every other state -> 1.
# Guarded on dtype so that re-running this cell on the already-encoded column is a
# no-op. Without the guard a second run maps every label (0 and 1 alike) to 1,
# because neither integer equals the string 'failed'.
if not pd.api.types.is_numeric_dtype(ds['state']):
    ds['state'] = ds['state'].apply(lambda x: 0 if x == 'failed' else 1)

In [10]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [15]:
# Fetch the 'punkt' tokenizer data; word_tokenize (imported above) is applied to
# every blurb further down.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\heeyoungmy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [25]:
# Fetch the stop-word corpus read by stopwords.words('english') during cleaning.
# The doctest-style '>>>' prompts were removed: they only run because IPython strips
# them, and they are a syntax error once the notebook is exported to a plain script.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\heeyoungmy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [133]:
# Raw blurb values pulled out of the frame as a plain Python list, plus the
# (initially empty) list that will receive one token list per blurb.
lines = ds['blurb'].values.tolist()
blurbs = []

In [134]:
# Turn NaN values into strings: every entry whose type is exactly float is replaced
# by '' (in place), so the tokenizer only ever receives text.
lines[:] = ['' if type(entry) == float else entry for entry in lines]

In [135]:
# Normalise every blurb into a list of lowercase, alphabetic, non-stop-word tokens.

# Neither the punctuation table nor the stop-word set depends on the blurb, so both
# are built once here instead of once per loop iteration.
table = str.maketrans('', '', string.punctuation)
stop_words = set(stopwords.words('english'))

# Start from an empty list so that re-running this cell does not append a second
# copy of every blurb (which would leave blurbs longer than ds['state']).
blurbs = []
for line in tqdm(lines):
    # tokenize, lower-case, then strip punctuation characters from each token
    stripped = [w.lower().translate(table) for w in word_tokenize(line)]
    # keep purely alphabetic tokens that are not stop words
    words = [w for w in stripped if w.isalpha() and w not in stop_words]
    blurbs.append(words)

100%|████████████████████████████████████████████████████████████████████████| 215513/215513 [01:36<00:00, 2230.38it/s]


In [139]:
# Train a Word2Vec model on the cleaned blurbs: 300-dimensional vectors, context
# window of 5, 4 worker threads, and min_count=1 so no token is dropped for rarity.
# NOTE(review): `size` is the keyword accepted by the gensim release this notebook
# ran on; gensim 4.x is understood to call it `vector_size` - confirm before upgrading.
model = gensim.models.Word2Vec(
    sentences=blurbs,
    size=300,
    window=5,
    workers=4,
    min_count=1,
)

In [140]:
# Export the learned word vectors in word2vec text format (binary=False) to
# 'blurb_word2vec.txt', which a later cell reads back line by line.
model.wv.save_word2vec_format('blurb_word2vec.txt', binary=False)

In [141]:
import os

# Load the exported vectors back as a {word: float32 vector} dictionary.
embeddings_index = {}
# `with` guarantees the handle is closed even if parsing raises part-way through.
with open(os.path.join('', 'blurb_word2vec.txt'), encoding='utf-8') as f:
    # The word2vec text format begins with a "<vocab_size> <dimensions>" header
    # line. Consume it here so it is not stored as a bogus one-element "word".
    vocab_size, vector_dim = (int(field) for field in f.readline().split())
    for line in tqdm(f, total=vocab_size):
        values = line.split()
        word = values[0]
        # Parse to numbers now rather than keeping an array of strings.
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

117425it [00:08, 13863.95it/s]


In [142]:
# Import from the public tensorflow.keras namespace (the same one the model cell
# uses) instead of the private tensorflow.python.keras package, which is not a
# supported API and should not be mixed with tensorflow.keras objects.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [166]:
# Set the max length: the token count of the longest cleaned blurb, used as the
# padded sequence length below.
length = [len(blurb_tokens) for blurb_tokens in blurbs]
max_length = max(length)

In [173]:
VALIDATION_SPLIT = 0.2
SPLIT_SEED = 42  # fixed so the train/validation split is identical on every run

# vectorize the text samples into a 2D integer tensor
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(blurbs)
sequences = tokenizer_obj.texts_to_sequences(blurbs)

word_index = tokenizer_obj.word_index
print('Found %s unique tokens.' % len(word_index))

# pad sequences to a common length
blurb_pad = pad_sequences(sequences, maxlen=max_length)
state = ds['state'].values
print('Shape of blurb tensor:', blurb_pad.shape)
print('Shape of state tensor:', state.shape)

# Features and labels are paired by position; fail loudly if they have drifted
# apart (for example because an upstream cell was run twice).
assert blurb_pad.shape[0] == state.shape[0], 'blurbs and labels are misaligned'

# split the data into a training set and a validation set
# A private, seeded generator keeps the shuffle reproducible without touching
# NumPy's global random state.
indices = np.random.RandomState(SPLIT_SEED).permutation(blurb_pad.shape[0])
blurb_pad = blurb_pad[indices]
state = state[indices]
num_validation_samples = int(VALIDATION_SPLIT * blurb_pad.shape[0])

# Split on an explicit index: negative slices ([:-n] / [-n:]) would put every
# sample in the validation set and none in training when n == 0.
split_at = blurb_pad.shape[0] - num_validation_samples
X_train_pad = blurb_pad[:split_at]
y_train = state[:split_at]
X_test_pad = blurb_pad[split_at:]
y_test = state[split_at:]

Found 117424 unique tokens.
Shape of blurb tensor: (215513, 26)
Shape of state tensor: (215513,)


In [174]:
# Report the shape of each array produced by the train/validation split.
for split_name, split_array in (
    ('X_train_pad', X_train_pad),
    ('y_train', y_train),
    ('X_test_pad', X_test_pad),
    ('y_test', y_test),
):
    print('Shape of %s tensor:' % split_name, split_array.shape)

Shape of X_train_pad tensor: (172411, 26)
Shape of y_train tensor: (172411,)
Shape of X_test_pad tensor: (43102, 26)
Shape of y_test tensor: (43102,)


In [175]:
EMBEDDING_DIM = 300
# One row per tokenizer index plus one extra row (index 0).
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

# Copy each known word's vector into the row given by its tokenizer index.
for word, i in tqdm(word_index.items()):
    # Valid rows are 0 .. num_words - 1, so the guard must be >=; with a plain >
    # an index equal to num_words would fall through and raise IndexError below.
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    # words not found in embedding index will be all-zeros.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

100%|████████████████████████████████████████████████████████████████████████| 117424/117424 [00:18<00:00, 6474.33it/s]


In [180]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, GRU
from tensorflow.keras.initializers import Constant

# define model
# NOTE: this rebinds `model`, which up to this point held the gensim Word2Vec
# model; the Word2Vec export cell cannot be re-run after this one without
# retraining first.
model = Sequential()
# Frozen embedding layer initialised from the Word2Vec matrix built above.
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=max_length,
                            trainable=False)
model.add(embedding_layer)
model.add(GRU(units=32,  dropout=0.2, recurrent_dropout=0.2))
# Single sigmoid unit: binary classification of the state label.
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print('Summary of the built model...')
# summary() prints the table itself and returns None; wrapping it in print()
# emitted a stray "None" line after the table.
model.summary()

Summary of the built model...
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 26, 300)           35227500  
_________________________________________________________________
gru (GRU)                    (None, 32)                32064     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 35,259,597
Trainable params: 32,097
Non-trainable params: 35,227,500
_________________________________________________________________
None


In [183]:
# Same GPU visibility check as near the top of the notebook, repeated before training.
import tensorflow as tf
tf.test.is_gpu_available(
    cuda_only=False,
    min_cuda_compute_capability=None
)

False

In [181]:
print('Train...')

# Fit for 25 epochs in mini-batches of 128; the held-out split is passed as
# validation_data so it is scored alongside training.
model.fit(
    X_train_pad,
    y_train,
    batch_size=128,
    epochs=25,
    validation_data=(X_test_pad, y_test),
    verbose=2,
)

Train...
Train on 172411 samples, validate on 43102 samples
Epoch 1/25


KeyboardInterrupt: 

In [None]:
print('Testing...')
# The model was compiled with metrics=['accuracy'], so evaluate() yields the loss
# followed by the accuracy.
# NOTE(review): X_test_pad / y_test is the same split that fit() received as
# validation_data, so this is not an independent test-set estimate.
score, acc = model.evaluate(
    X_test_pad,
    y_test,
    batch_size=128,
)

print('Test score:', score)
print('Test accuracy:', acc)

# Same accuracy again, formatted as a percentage with two decimals.
print("Accuracy: {0:.2%}".format(acc))