In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn import preprocessing
from keras.utils import to_categorical
from keras.layers import *
from keras.models import Model
from keras.models import Sequential
from keras.optimizers import Adam
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from keras.callbacks import Callback
import keras.backend as K
from functools import partial
from itertools import product

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the training and validation splits; both are read with pandas' JSON
# reader using its default settings.
data_train, data_val = map(pd.read_json, ('train.json', 'val.json'))

# Every sequence fed to the model is padded to the width of the training table.
MAX_SEQUENCE_LENGTH = len(data_train.columns)

In [3]:
# Pre-trained embedding table; EMBEDDING_DIM is taken from its second axis.
vectors = np.load("GloVe_codeswitch_5k.npy")

# The vocabulary is a Python dict stored inside a 0-d object array, so loading
# it requires unpickling.  NumPy >= 1.16.3 refuses to do that unless
# allow_pickle=True is passed explicitly.
# NOTE(security): unpickling can execute arbitrary code -- only load a
# vocabulary file that comes from a trusted source.
words = np.load('5k_vocab_dict.npy', allow_pickle=True).item()

EMBEDDING_DIM = vectors.shape[1]

In [4]:
# Tag names used for the three-way "language of the next word" task.
english, spanish, other = "eng", "span", "other"

# Tag names used for the binary "does the language change next" task.
switch, noswitch = "switch", "noswitch"

In [5]:
def createExamplesLabels(data):
    """Build token strings and next-word language tags from a review table.

    ``data`` is read as ``data[w][r]``: column label ``w`` is the word
    position and row label ``r`` is the review, both expected to cover
    ``0 .. n-1``.  Each cell holds a ``(word, language)`` pair; an empty cell
    marks the end of the review.

    For every review this produces:

    * a string in which each word is preceded by a single space and words
      missing from the module-level ``words`` vocabulary are replaced by
      ``<UNK>``;
    * one tag per word describing the language of the *following* word:
      ``english`` for ``'eng'``, ``spanish`` for ``'spa'`` and ``other`` for
      anything else, including "there is no following word".

    Returns:
        ``(examples, labels)`` -- two lists with one entry per review.
    """

    def _missing(cell):
        """True for an empty table cell: None, float NaN or an empty pair."""
        if cell is None:
            return True
        if isinstance(cell, float):
            return cell != cell  # NaN is the only float unequal to itself
        return len(cell) == 0

    # Every language tag that is not listed here (mixed tags such as
    # 'eng&spa', 'eng+spa', 'spa+eng', and any other tag) maps to `other`.
    lang_to_label = {'eng': english, 'spa': spanish}

    examples = []
    labels = []
    num_reviews, review_length = data.shape

    for r in range(num_reviews):
        tokens = []
        label_vec = []

        for w in range(review_length):
            currWordStruct = data[w][r]
            if _missing(currWordStruct):
                break

            currWord = currWordStruct[0]
            tokens.append(currWord if currWord in words else "<UNK>")

            # A word in the last column has no successor.  It still needs a
            # tag, otherwise full-width reviews end up with one tag fewer
            # than tokens and the two drift apart once both are padded.
            nextWordStruct = data[w + 1][r] if w < review_length - 1 else None
            if _missing(nextWordStruct):
                label_vec.append(other)
            else:
                label_vec.append(lang_to_label.get(nextWordStruct[1], other))

        labels.append(label_vec)
        examples.append("".join(" " + token for token in tokens))

    return examples, labels
    

In [6]:
def createExamplesBinary(data):
    """Build token strings and per-word switch/noswitch tags from a review table.

    ``data`` is read as ``data[w][r]``: column label ``w`` is the word
    position and row label ``r`` is the review, both expected to cover
    ``0 .. n-1``.  Each cell holds a ``(word, language)`` pair; an empty cell
    marks the end of the review.

    For every review this produces:

    * a string in which each word is preceded by a single space and words
      missing from the module-level ``words`` vocabulary are replaced by
      ``<UNK>``;
    * one tag per word: ``switch`` when the following word carries a
      different language tag than the current one, ``noswitch`` otherwise
      (including when there is no following word).

    Returns:
        ``(examples, labels)`` -- two lists with one entry per review.
    """

    def _missing(cell):
        """True for an empty table cell: None, float NaN or an empty pair."""
        if cell is None:
            return True
        if isinstance(cell, float):
            return cell != cell  # NaN is the only float unequal to itself
        return len(cell) == 0

    examples = []
    labels = []
    num_reviews, review_length = data.shape

    for r in range(num_reviews):
        tokens = []
        label_vec = []

        for w in range(review_length):
            currWordStruct = data[w][r]
            if _missing(currWordStruct):
                break

            currWord = currWordStruct[0]
            currLang = currWordStruct[1]
            tokens.append(currWord if currWord in words else "<UNK>")

            # A word in the last column has no successor.  It still needs a
            # tag, otherwise full-width reviews end up with one tag fewer
            # than tokens and the two drift apart once both are padded.
            nextWordStruct = data[w + 1][r] if w < review_length - 1 else None
            if _missing(nextWordStruct) or currLang == nextWordStruct[1]:
                label_vec.append(noswitch)
            else:
                label_vec.append(switch)

        labels.append(label_vec)
        examples.append("".join(" " + token for token in tokens))

    return examples, labels
    

In [7]:
# Binary (switch / noswitch) examples and tags for both splits.
(examples_train, labels_train), (examples_val, labels_val) = (
    createExamplesBinary(data_train),
    createExamplesBinary(data_val),
)

In [8]:
# The tokenizer is fitted on the training text only.  Filtering and
# lower-casing are switched off so tokens such as "<UNK>" are kept verbatim.
tokenizer = Tokenizer(num_words=len(vectors), filters="", lower=False)
tokenizer.fit_on_texts(examples_train)
word_index = tokenizer.word_index

# Text -> integer id sequences, using the training vocabulary for both splits.
sequences_train, sequences_val = (
    tokenizer.texts_to_sequences(texts)
    for texts in (examples_train, examples_val)
)

# Bring every sequence to the same fixed length.
train_data, val_data = (
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for sequences in (sequences_train, sequences_val)
)

In [9]:
# word -> pre-trained vector, looked up through the vocabulary's row indices.
embedding_dict = {word: vectors[row] for word, row in words.items()}

# Row i holds the vector of the tokenizer's word with id i.  Row 0 is never
# assigned by the loop (word_index has len(word_index) entries for
# len(word_index) + 1 rows) and stays all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embedding_dict.get(word)
    # Tokens without a pre-trained vector keep their all-zero row.  Assigning
    # the None returned by .get() would instead fill the row with NaN, which
    # then propagates through the network.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [10]:
# Integer-encode the tag names.  For the three-way language task, fit on
# [other, english, spanish] instead.
le = preprocessing.LabelEncoder()
le.fit([switch, noswitch])
NUM_CLASSES = len(le.classes_)


def encodeLabelSequences(label_seqs):
    """One-hot encode tag sequences into (len(label_seqs), MAX_SEQUENCE_LENGTH, NUM_CLASSES).

    Each sequence is mapped to class ids with ``le``, brought to
    ``MAX_SEQUENCE_LENGTH`` with ``pad_sequences`` (same call and defaults as
    used for the token ids) and then one-hot encoded.  Padding positions carry
    id 0, so they are encoded as class 0 -- presumably 'noswitch', since
    LabelEncoder is documented to sort its classes; confirm via ``le.classes_``.
    """
    encoded = np.zeros((len(label_seqs), MAX_SEQUENCE_LENGTH, NUM_CLASSES))
    for i, vec in enumerate(label_seqs):
        padded_ids = pad_sequences([le.transform(vec)], maxlen=MAX_SEQUENCE_LENGTH)
        encoded[i, :, :] = to_categorical(padded_ids, num_classes=NUM_CLASSES)[0]
    return encoded


label_transform_train = encodeLabelSequences(labels_train)
label_transform_val = encodeLabelSequences(labels_val)

In [11]:
# Number of validation positions whose one-hot target selects class index 1.
print(label_transform_val.reshape(-1, 2).argmax(axis=1).sum())

1472


In [12]:
class Metrics(Callback):
    """Report binary F1 / precision / recall on the validation split after each epoch.

    Predictions and one-hot targets are flattened to one row per sequence
    position and reduced with argmax, so every position (padding included)
    counts towards the scores.  Class index 1 is the positive class
    (scikit-learn's default ``pos_label`` for ``average='binary'``).

    Args:
        val_inputs: model inputs to score on.  When omitted, the notebook's
            module-level ``val_data`` is used.
        val_targets: matching one-hot targets.  When omitted, the notebook's
            module-level ``label_transform_val`` is used.

    Attributes:
        val_f1s, val_recalls, val_precisions: per-epoch score histories,
            reset at the start of every training run.
    """

    def __init__(self, val_inputs=None, val_targets=None):
        super(Metrics, self).__init__()
        # Stored under private names so they cannot collide with attributes
        # the framework may set on callbacks.
        self._val_inputs = val_inputs
        self._val_targets = val_targets

    def on_train_begin(self, logs=None):
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs=None):
        # Fall back to the notebook globals so a bare Metrics() keeps working.
        inputs = val_data if self._val_inputs is None else self._val_inputs
        targets = label_transform_val if self._val_targets is None else self._val_targets

        val_predict = np.argmax(np.asarray(self.model.predict(inputs)).reshape(-1, 2), axis=1)
        val_targ = np.argmax(targets.reshape(-1, 2), axis=1)
        # Number of positions predicted as the positive class this epoch.
        print(np.sum(val_predict))

        _val_f1 = f1_score(val_targ, val_predict, average='binary')
        _val_recall = recall_score(val_targ, val_predict, average='binary')
        _val_precision = precision_score(val_targ, val_predict, average='binary')
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print("— val_f1: %f — val_precision: %f — val_recall %f" % (_val_f1, _val_precision, _val_recall))


metrics = Metrics()

In [14]:
# Frozen embedding layer initialised from the pre-built embedding matrix.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

def loss(y_true, y_pred, weights):
    """Class-weighted categorical cross-entropy, reduced over the last axis.

    Args:
        y_true: one-hot targets.
        y_pred: predicted class scores; renormalised here so that they sum
            to 1 along the last axis.
        weights: per-class multipliers applied to each class's log-likelihood
            term.

    Returns:
        The negated, weighted log-likelihood summed over the last axis.
    """
    # Renormalise so the class probabilities of each sample sum to 1.
    normalised = y_pred / K.sum(y_pred, axis=-1, keepdims=True)
    # Keep probabilities away from 0 and 1 so the log cannot yield NaN/Inf.
    bounded = K.clip(normalised, K.epsilon(), 1 - K.epsilon())
    weighted_log_likelihood = y_true * K.log(bounded) * weights
    return -K.sum(weighted_log_likelihood, -1)

# Per-class multipliers for the weighted cross-entropy: class index 0 gets 1,
# class index 1 gets 7 (presumably index 1 is 'switch', the rarer tag --
# confirm against the label encoder's class order).
CLASS_WEIGHTS = [1, 7]
our_loss = partial(loss, weights=K.variable(CLASS_WEIGHTS))
# functools.partial objects do not get a __name__ automatically; give the loss
# one so code that looks up the name of a loss function does not fail.
our_loss.__name__ = "weighted_categorical_crossentropy"

# Frozen embeddings -> LSTM over the whole sequence -> per-position softmax
# over the two classes.
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(1000, return_sequences=True, name="LSTM"))
model.add(TimeDistributed(Dense(2, activation='softmax')))
model.compile(loss=our_loss, optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would add a stray "None" line).
model.summary()

results = model.fit(train_data, label_transform_train, epochs=20, validation_data = (val_data, label_transform_val), batch_size=100, callbacks=[metrics])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 52, 300)           841800    
_________________________________________________________________
LSTM (LSTM)                  (None, 52, 1000)          5204000   
_________________________________________________________________
time_distributed_2 (TimeDist (None, 52, 2)             2002      
Total params: 6,047,802
Trainable params: 5,206,002
Non-trainable params: 841,800
_________________________________________________________________
None
Train on 3544 samples, validate on 1000 samples
Epoch 1/20
4244
— val_f1: 0.312456 — val_precision: 0.210415 — val_recall 0.606658
Epoch 2/20
3908
— val_f1: 0.370260 — val_precision: 0.254862 — val_recall 0.676630
Epoch 3/20
4432
— val_f1: 0.371951 — val_precision: 0.247744 — val_recall 0.745924
Epoch 4/20
 300/3544 [=>............................] - ETA: 1:27 - loss: 0.2107 - acc: 

KeyboardInterrupt: 

In [None]:
# Per-epoch values recorded by model.fit.  `results` is only bound once the
# fit call has returned, so this cell fails if training was interrupted.
print(results.history)