Read embeddings matrix.

In [2]:
import os
import numpy as np
from gensim.models import KeyedVectors


# Create embeddings_index

kv = KeyedVectors.load_word2vec_format(
        os.path.join('data', 'GoogleNews-vectors-negative300.bin'), 
        binary = True
      )

embeddings_index = {}
for word, vector in zip(list(kv.index_to_key), kv.vectors):
    coefs = np.asarray(vector, dtype='float32')
    embeddings_index[word] = coefs

Read pure data and split it.

In [11]:
import tensorflow as tf
import string
import random

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import one_hot

PADDING_LENGTH = 89
tf.random.set_seed(1337)
random.seed(1337)

def preprocess(text):
    # Split the text by space
    
    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    text = text.translate(translator)
    
    text = text.split()
    return text

def to_embedding(tokens, embeddings_index):
    # Transform the tokens into embeddings
    
    embeddings = []
    for i in range(0, len(tokens)):
        try:
            embeddings.append(embeddings_index[tokens[i]])
        except:
            continue
    return embeddings

def add_padding(embeddings, padding_width = None):
    #
    
    emb_padded = pad_sequences(embeddings, maxlen=padding_width, padding='pre', dtype='float32')
    return emb_padded

def process_text(sentences, embeddings_index, padding = None):
    # Combine the processes
    
    result = [ preprocess(sentence) for sentence in sentences ]
    result = [ to_embedding(sentence, embeddings_index) for sentence in result ]
    result = add_padding(result, padding)
    return result

# Read labeled data

with open(os.path.join('data', 'party.labeled.txt'), 'r', encoding="utf-8") as f:
    data = f.read().strip().split('\n')


pure_data = [[text, label] for sent_id, label, text in [line.split('\t', 2) for line in data]]

# Split the pure_data

middle = 50

pure_data_train = pure_data[:middle]
pure_data_test = pure_data[middle:]

Data augmentation.

Method 1: Random words replacing with similar word.

In [3]:
# Replace some words in the original sentence

def replace_random_word(pure_data_train):
    NUMBER_REPLACED = 4

    pdt_r = []

    for d in pure_data_train:

        text = d[0]
        label = d[1]

        # Clean the punctuation
        translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
        text = text.translate(translator)
        t = text.split()

        pdt_r.append([' '.join(t), label])

        # Generate the indices of the words that are about to being replaced
        replaced_index = []

        while(len(replaced_index) < NUMBER_REPLACED):
            r = random.randrange(len(t))
            if(t[r] == 'party' or r in replaced_index):
                continue
            replaced_index.append(r)

        for idx in replaced_index:

            temp_t = t.copy()
            # Replace the word with the most similar word
            try:
                temp_t[idx] = kv.most_similar(positive=[t[idx]], topn=1)[0][0]
                pdt_r.append([' '.join(temp_t), label])
            except:
                pass
            
    return pdt_r

pdt_r = replace_random_word(pure_data_train)

Method 2: Back Translation

In [6]:
from google.cloud import translate

credential_path = os.path.join(os.path.expanduser('~'), 'translator.json')
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path

def translate_text(
    text='Hello, world!', 
    project_id='directed-bongo-336812', 
    source_language_code='en-US', 
    target_language_code='zh-TW'):

    client = translate.TranslationServiceClient()
    location = "global"
    parent = f"projects/{project_id}/locations/{location}"

    response = client.translate_text(
        request={
            "parent": parent,
            "contents": [text],
            "mime_type": "text/plain",
            "source_language_code": source_language_code,
            "target_language_code": target_language_code,
        }
    )

    return response.translations[0].translated_text

def back_translation(pure_data_train):
    
    
    project_id = 'directed-bongo-336812'
    
    pdt_b = []
    
    for d in pure_data_train:
        
        pdt_b.append(d)
        
        text = d[0]
        label = d[1]
        
        tw_text = translate_text(
            text=text, 
            project_id=project_id, 
            source_language_code='en-US', 
            target_language_code='zh-TW')
        
        back_tw_text = translate_text(
            text=tw_text, 
            project_id=project_id, 
            source_language_code='zh-TW', 
            target_language_code='en-US')
        
        pdt_b.append([back_tw_text, label])
        
        pl_text = translate_text(
            text=text, 
            project_id=project_id, 
            source_language_code='en-US', 
            target_language_code='pl')
        
        back_pl_text = translate_text(
            text=pl_text, 
            project_id=project_id, 
            source_language_code='pl', 
            target_language_code='en-US')
        
        pdt_b.append([back_pl_text, label])
        
        th_text = translate_text(
            text=text, 
            project_id=project_id, 
            source_language_code='en-US', 
            target_language_code='th')
        
        back_th_text = translate_text(
            text=th_text, 
            project_id=project_id, 
            source_language_code='th', 
            target_language_code='en-US')
        
        pdt_b.append([back_th_text, label])
        
    return pdt_b
    
pdt_b = back_translation(pure_data_train)

FileNotFoundError: [Errno 2] No such file or directory: 'c:\\users\\pride\\anaconda3\\lib\\site-packages\\grpcio-1.42.0.dist-info\\METADATA'

Method 3: Delete random word

In [7]:
def delete_random_word(pure_data_train):
    pdt_d = []
    
    for d in pure_data_train:
        
        pdt_d.append(d)
        
        text = d[0]
        label = d[1]
        
        # Clean the punctuation
        translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
        text = text.translate(translator)
        t = text.split()
        
        r0 = random.randrange(len(t))
        r1 = random.randrange(len(t))
        while(t[r0] == 'party' or t[r1] == 'party' or r0 == r1):
            r0 = random.randrange(len(t))
            r1 = random.randrange(len(t))
        
        temp_t = t.copy()
        del temp_t[r0]
        pdt_d.append([' '.join(temp_t), label])
        
        temp_t = t.copy()
        del temp_t[r1]
        pdt_d.append([' '.join(temp_t), label])
        
        temp_t = t.copy()
        del temp_t[r0]
        del temp_t[r1-1]
        pdt_d.append([' '.join(temp_t), label])
        
    return pdt_d
        
pdt_d = delete_random_word(pure_data_train)

Method 4: Swap random two word

In [6]:
def swap_random_words(pure_data_train):
    pdt_s = []
    
    for d in pure_data_train:
        
        pdt_s.append(d)
        
        text = d[0]
        label = d[1]
        
        # Clean the punctuation
        translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
        text = text.translate(translator)
        t = text.split()
        
        r0 = random.randrange(len(t))
        r1 = random.randrange(len(t))
        while(t[r0] == 'party' or t[r1] == 'party' or r0 == r1):
            r0 = random.randrange(len(t))
            r1 = random.randrange(len(t))
        
        t = t.copy()
        
        temp = t[r0]
        t[r0] = t[r1]
        t[r1] = temp

        pdt_s.append([' '.join(t), label])
        
        
        r0 = random.randrange(len(t))
        r1 = random.randrange(len(t))
        while(t[r0] == 'party' or t[r1] == 'party' or r0 == r1):
            r0 = random.randrange(len(t))
            r1 = random.randrange(len(t))
            
        temp = t[r0]
        t[r0] = t[r1]
        t[r1] = temp
        
        pdt_s.append([' '.join(t), label])
        
        r0 = random.randrange(len(t))
        r1 = random.randrange(len(t))
        while(t[r0] == 'party' or t[r1] == 'party' or r0 == r1):
            r0 = random.randrange(len(t))
            r1 = random.randrange(len(t))
            
        temp = t[r0]
        t[r0] = t[r1]
        t[r1] = temp
        
        pdt_s.append([' '.join(t), label])
        
    return pdt_s
        
pdt_s = swap_random_words(pure_data_train)

Combine the augmentation process.

In [7]:
final_data_train = pure_data_train.copy()

final_data_train = swap_random_words(final_data_train)
final_data_train = delete_random_word(final_data_train)

Train the model.

In [12]:
# Specify the data source
data_train = pdt_d

# Process input
X_train = process_text([d[0] for d in data_train], embeddings_index, PADDING_LENGTH)
X_test = process_text([d[0] for d in pure_data_test], embeddings_index, PADDING_LENGTH)

Y_train = [int(d[1])-1 for d in data_train]
Y_train = tf.one_hot(Y_train, 3, axis=1, dtype=tf.float32)
Y_test = [int(d[1])-1 for d in pure_data_test]
Y_test = tf.one_hot(Y_test, 3, axis=1, dtype=tf.float32)

print(f"[Train size: Test size]: [{X_train.shape}: {X_test.shape}]")

# Training

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

_, PADDING_WIDTH, EMBEDDING_DIM = X_train.shape
BATCH_SIZE = 2
EPOCHS = 3
OUTPUT_CATEGORY = len(SENSE)

model = Sequential()
model.add(tf.keras.layers.LSTM(4))
model.add(tf.keras.layers.Dense(64))
model.add(tf.keras.layers.Dense(OUTPUT_CATEGORY, activation='sigmoid'))

model.compile(optimizer='Adam',
                loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
                metrics=['accuracy'])

history = model.fit(
    X_train, Y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS
)


# Evaluate

results = model.evaluate(X_test, Y_test, batch_size=BATCH_SIZE)
print(f"Test loss: {results[0]}")
print(f"Test accuracy: {results[1]}")

[Train size: Test size]: [(200, 89, 300): (657, 89, 300)]
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test loss: 0.7746986150741577
Test accuracy: 0.7747336626052856


In [13]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 4)                 4880      
                                                                 
 dense_4 (Dense)             (None, 64)                320       
                                                                 
 dense_5 (Dense)             (None, 3)                 195       
                                                                 
Total params: 5,395
Trainable params: 5,395
Non-trainable params: 0
_________________________________________________________________
