## Exercise - DL Tutorial 08

### Student names:  Franz Schulze, Benedikt Bauer, David Heim

Submit you solution by 30 June 23:59 to manuel.milling@informatik.uni-augsburg.de or maurice.gerczuk@informatik.uni-augsburg.de

In [7]:
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras.layers import Dense, Bidirectional, LSTM, Embedding, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

w2v_embedding_file = "./res/data/embeddings/word2vec-40k-wiki-news-300d.vec"#"./res/data/embeddings/word2vec-40k-wiki-news-300d.vec"
ewe_embedding_file = "./res/data/embeddings/ewe-40k-300d.vec"

train_tsv = "./res/data/isear/train.tsv "
val_tsv = "./res/data/isear/val.tsv"
test_tsv = "./res/data/isear/test.tsv"

oov_id=1
pad_id=0
seq_length=128
batch_size=32
epochs=5

lr=0.005

2. Load word2vec embedding matrix and create word-index-dictionary.

In [8]:
def read_embedding_matrix(embedding_file, init_random: bool = False):
    read_content: list[str] = []
    
    # This seems a rather lenghty approach
    with open(embedding_file, 'rb') as vecfile:
        for line in vecfile.readlines():
            read_content.append(str(line))

    content: list[list[str]]
    content = list(map(lambda line: line.split(' '), read_content))
    
    labels_as_list = [row.pop(0) for row in content]
    labels_as_list = [label[2:] for label in labels_as_list]

    label_dict: dict[str, int] = {}
    temp_label_dict = [{value: index + 2} for index, value in enumerate(labels_as_list)]
    for key_value in temp_label_dict:
        label_dict.update(key_value)

    embedding_matrix: np.ndarray = None

    if init_random:
        embedding_matrix = np.random.randn(len(content), len(content[0]))
    else:
        # for the embedding matrix this is not really necessary
        content = [[string.replace('\\n', '').replace('\'', '').replace('\"', '') for string in row] for row in content]
        embedding_matrix = np.array(content, dtype=np.float32)

    matrix_padding = np.zeros(embedding_matrix.shape[1])
    oov_padding = np.mean(embedding_matrix, axis=0)
    embedding_matrix = np.insert(embedding_matrix, 0, matrix_padding, 0)
    embedding_matrix = np.insert(embedding_matrix, 1, oov_padding, 0)

    return embedding_matrix, label_dict
    

w2v_emb_matrix, w2v_word2idx = read_embedding_matrix(w2v_embedding_file)
print(f"w2v_emb_matrix:\t\t{w2v_emb_matrix.shape}")
print(f"w2v_word2idx shape:\t{len(w2v_word2idx)}")

w2v_emb_matrix:		(40002, 300)
w2v_word2idx shape:	40000


3. Prepare data:
- Load the sentences from the tsv files.
- Unify sentences (lower case, remove punctuation, etc.).
- Split sentences into words.
- Cut and zero pad sentences.
- Map words to indices.
- Map string labels to integers.

In [9]:
def read_tsv(tsv, word2idx, oov_id=1, pad_id=0, seq_length=128):
    read_content: list[str] = []
    with open(tsv, 'rb') as vecfile:
        for line in vecfile.readlines():
            read_content.append(str(line))
    content = list(map(lambda line: line.split('\\t'), read_content))
    label_content = [row.pop(1) for row in content]
    
    content = [[re.sub(r'^b|[^\w\s]|\\r|\\t', '', re.sub(r'\\n|\\r|\\t', '', string.lower().strip())) for string in row] for row in content]
    label_content = [re.sub(r'^b|[^\w\s]|\\r|\\t', '', re.sub(r'\\n|\\r|\\t', '', string.lower().strip())) for string in label_content]
    
    content = [[string.split(' ') for string in row] for row in content]
    content = [item for sublist in content for item in sublist]

    content = [[word2idx.get(string, oov_id) for string in row] for row in content]
    content = [equal_length(sequence, seq_length, pad_id) for sequence in content]
    data_x = np.array(content, dtype=np.int32)

    unique_labels = set(label_content)
    num_labels = len(unique_labels)
    temp_label_dict = [{value: index} for index, value in enumerate(unique_labels)]
    label_dict: dict[str, int] = {}
    for key_value in temp_label_dict:
        label_dict.update(key_value)
    
    label_content = [label_dict[string] for string in label_content]

    return data_x, to_categorical(label_content, num_classes=num_labels)

def equal_length(sequence: list[int], desired_length: int, padding_id: int):
    current_length = len(sequence)
    if current_length < desired_length:
        sequence.extend(padding_id for i in range(desired_length - current_length))
    elif current_length > desired_length:
        sequence = sequence[:desired_length]
    return sequence


train_X, train_y = read_tsv(train_tsv, w2v_word2idx, oov_id=oov_id, pad_id=pad_id, seq_length=seq_length)
val_X, val_y = read_tsv(val_tsv, w2v_word2idx, oov_id=oov_id, pad_id=pad_id, seq_length=seq_length)
test_X, test_y = read_tsv(test_tsv, w2v_word2idx, oov_id=oov_id, pad_id=pad_id, seq_length=seq_length)

# convert labels from string to int

print(f"x_train shape:\t\t{train_X.shape}")
print(f"x_validation shape:\t{val_X.shape}")
print(f"x_test shape:\t\t{test_X.shape}")
print(f"y_train shape:\t\t{train_y.shape}")
print(f"y_validation shape:\t{val_y.shape}")
print(f"y_test shape:\t\t{test_y.shape}")

x_train shape:		(5976, 128)
x_validation shape:	(752, 128)
x_test shape:		(736, 128)
y_train shape:		(5976, 7)
y_validation shape:	(752, 7)
y_test shape:		(736, 7)


4. Initialise, train  and evaluate model.

In [10]:
model = keras.Sequential([
    Embedding(input_dim=len(w2v_word2idx.keys()) + 2, output_dim=300, mask_zero=True, 
              embeddings_initializer=keras.initializers.Constant(w2v_emb_matrix), trainable=False),
    Bidirectional(LSTM(64, return_sequences=True, dropout=0.5)),
    Bidirectional(LSTM(64, dropout=0.5)),
    Dense(7, activation='softmax')]
)
model.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(learning_rate=lr), metrics='accuracy')
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 300)         12000600  
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 128)         186880    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 903       
Total params: 12,287,199
Trainable params: 286,599
Non-trainable params: 12,000,600
_________________________________________________________________


In [11]:
model.fit(x=train_X, y=train_y, epochs=5, batch_size=32, validation_data=(val_X, val_y))

Epoch 1/5


InvalidArgumentError:  logits and labels must have the same first dimension, got logits shape [32,7] and labels shape [224]
	 [[node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits (defined at <ipython-input-11-5ef8bfcfd7af>:1) ]] [Op:__inference_train_function_55474]

Function call stack:
train_function


In [None]:
model.evaluate(x=test_X, y=test_y)

5. EWE embeddings.

In [None]:
ewe_emb_matrix, ewe_word2idx = read_embedding_matrix(ewe_embedding_file)
print(f"ewe_emb_matrix:\t\t{ewe_emb_matrix.shape}")
print(f"ewe_word2idx shape:\t{len(ewe_word2idx)}")

FileNotFoundError: [Errno 2] No such file or directory: './res/data/embeddings/ewe-40k-wiki-news-300d.vec'

In [None]:
model_ewe = Sequential()
model_ewe.add(Embedding(ewe_emb_matrix.shape[0], ewe_emb_matrix.shape[1], weights=[ewe_emb_matrix], mask_zero=True, trainable=False, input_length=128))
model_ewe.add(Bidirectional(LSTM(64, dropout=0.5, return_sequences=True)))
model_ewe.add(Bidirectional(LSTM(64, dropout=0.5)))
model_ewe.add(Dense(7, activation='softmax'))
model_ewe.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=lr), metrics=['accuracy'])
model_ewe.summary()

model_ewe.fit(x=train_X, y=train_y, validation_data=(val_X, val_y), batch_size=32, epochs=5)

model_ewe.evaluate(test_X, test_y)

6. Custom word embeddings.

In [None]:
rand_emb_matrix, rand_word2idx = read_embedding_matrix(w2v_embedding_file, True)
print(f"rand_emb_matrix:\t\t{rand_emb_matrix.shape}")
print(f"rand_word2idx shape:\t\t{len(rand_word2idx)}")

In [None]:
model_rand = Sequential()
model_rand.add(Embedding(rand_emb_matrix.shape[0], rand_emb_matrix.shape[1], weights=[rand_emb_matrix], mask_zero=True, input_length=128))
model_rand.add(Bidirectional(LSTM(64, dropout=0.5, return_sequences=True)))
model_rand.add(Bidirectional(LSTM(64, dropout=0.5)))
model_rand.add(Dense(7, activation='softmax'))
model_rand.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=lr), metrics=['accuracy'])
model_rand.summary()

model_rand.fit(x=train_X, y=train_y, validation_data=(val_X, val_y), batch_size=32, epochs=5)

model_rand.evaluate(test_X, test_y)
