In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn import preprocessing
from keras.utils import to_categorical
from keras.layers import *
from keras.models import Model
from keras.models import Sequential
from keras.optimizers import Adam
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from keras.callbacks import Callback

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [13]:
# Read the training and validation splits from their JSON dumps (train first, then val).
data_train, data_val = (pd.read_json(path) for path in ('train.json', 'val.json'))

# Every sequence is later padded to the column count of the training frame.
MAX_SEQUENCE_LENGTH = len(data_train.columns)

In [14]:
# Pre-trained embedding vectors; they are indexed below by the integer ids
# stored as values in `words`.
vectors = np.load("GloVe_codeswitch_5k.npy")

# The vocabulary is a Python dict saved inside a 0-d object array, so loading it
# goes through pickle. NumPy >= 1.16.3 refuses that unless allow_pickle=True is
# passed explicitly.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a trusted source.
words = np.load('5k_vocab_dict.npy', allow_pickle=True).item()

# Width of one embedding vector (second axis of the vector table).
EMBEDDING_DIM = vectors.shape[1]

In [47]:
# Label strings for the per-token language task.
english, spanish, other = "eng", "span", "other"

# Label strings for the binary code-switch task.
switch, noswitch = "switch", "noswitch"

In [16]:
def createExamplesLabels(data):
    """Build token strings and next-token language labels for every row of `data`.

    `data` is indexed as ``data[column][row]``; each cell is either ``None``
    (end of the row's sequence) or an indexable whose element 0 is the token
    and element 1 is its language tag.

    For each row:
      * the example is the concatenation of ``" " + token`` for every token,
        with tokens missing from the module-level ``words`` vocabulary
        replaced by ``<UNK>``;
      * the label at position ``w`` describes the language of the token at
        position ``w + 1``: ``english`` for ``'eng'``, ``spanish`` for
        ``'spa'`` and ``other`` for anything else, including "no next token".

    Every token receives exactly one label, so each label list has the same
    length as the number of tokens in its example.

    Returns:
        (examples, labels): a list of strings and a parallel list of label lists.
    """
    examples = []
    labels = []
    num_reviews, review_length = data.shape

    for r in range(num_reviews):
        review_string = ""
        label_vec = []

        for w in range(review_length):
            currWordStruct = data[w][r]

            # A None cell marks the end of this row's tokens.
            if currWordStruct is None:
                break

            currWord = currWordStruct[0]
            review_string += (" " + currWord) if currWord in words else " <UNK>"

            # The last column has no successor; treat it like a row that ended,
            # so a full-width row does not end up one label short.
            nextWordStruct = data[w + 1][r] if w < review_length - 1 else None

            if not nextWordStruct:
                label_vec.append(other)
                continue

            nextLang = nextWordStruct[1]
            if nextLang == 'eng':
                label_vec.append(english)
            elif nextLang == 'spa':
                label_vec.append(spanish)
            else:
                # The previous test here, `nextLang == 'eng&spa' or 'eng+spa' or
                # 'spa+eng'`, was always true (non-empty string literals are
                # truthy), so it already behaved as a catch-all. It is now
                # explicit: mixed tags and any other tag map to `other`.
                label_vec.append(other)

        labels.append(label_vec)
        examples.append(review_string)

    return examples, labels
    

In [62]:
def createExamplesBinary(data, switch_carry=5):
    """Build token strings and binary code-switch labels for every row of `data`.

    `data` is indexed as ``data[column][row]``; each cell is either ``None``
    (end of the row's sequence) or an indexable whose element 0 is the token
    and element 1 is its language tag.

    For each row:
      * the example is the concatenation of ``" " + token`` for every token,
        with tokens missing from the module-level ``words`` vocabulary
        replaced by ``<UNK>``;
      * the label at position ``w`` is ``switch`` when the language tag of
        token ``w`` differs from that of token ``w + 1``. After such a switch,
        the next ``switch_carry`` positions whose successor has the same
        language are also labelled ``switch``; a new switch restarts that
        window. All other positions, including those with no next token, are
        labelled ``noswitch``.

    Every token receives exactly one label, so each label list has the same
    length as the number of tokens in its example.

    Args:
        data: 2-D table with a ``shape`` attribute, indexed ``data[col][row]``.
        switch_carry: how many following same-language positions keep the
            ``switch`` label after a switch. The default of 5 matches the
            previously hard-coded counter.

    Returns:
        (examples, labels): a list of strings and a parallel list of label lists.
    """
    examples = []
    labels = []
    num_reviews, review_length = data.shape

    for r in range(num_reviews):
        review_string = ""
        label_vec = []
        carry_left = 0  # remaining positions that inherit the `switch` label

        for w in range(review_length):
            currWordStruct = data[w][r]

            # A None cell marks the end of this row's tokens.
            if currWordStruct is None:
                break

            currWord = currWordStruct[0]
            currLang = currWordStruct[1]
            review_string += (" " + currWord) if currWord in words else " <UNK>"

            # The last column has no successor; treat it like a row that ended,
            # so a full-width row does not end up one label short.
            nextWordStruct = data[w + 1][r] if w < review_length - 1 else None

            if not nextWordStruct:
                label_vec.append(noswitch)
            elif currLang != nextWordStruct[1]:
                label_vec.append(switch)
                carry_left = switch_carry
            elif carry_left > 0:
                carry_left -= 1
                label_vec.append(switch)
            else:
                label_vec.append(noswitch)

        labels.append(label_vec)
        examples.append(review_string)

    return examples, labels
    

In [63]:
# Build (token strings, label lists) for the binary switch/noswitch task,
# training split first, then validation.
(examples_train, labels_train), (examples_val, labels_val) = (
    createExamplesBinary(split) for split in (data_train, data_val)
)

In [64]:
# Tokenise on whitespace only: no character filtering and no lower-casing, so
# tokens (including the "<UNK>" placeholder) are kept exactly as written.
# The index is learned from the training examples alone.
tokenizer = Tokenizer(num_words=len(vectors), filters="", lower=False)
tokenizer.fit_on_texts(examples_train)
word_index = tokenizer.word_index

# Text -> integer id sequences for both splits.
sequences_train, sequences_val = (
    tokenizer.texts_to_sequences(texts) for texts in (examples_train, examples_val)
)

# Pad (or truncate) every sequence to the common length with pad_sequences' defaults.
train_data, val_data = (
    pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)
    for seqs in (sequences_train, sequences_val)
)

In [65]:
# Map each vocabulary word to its pre-trained vector (`words` maps word -> row in `vectors`).
embedding_dict = {vocab_word: vectors[row] for vocab_word, row in words.items()}

# Row i of the matrix holds the vector for tokenizer id i. One extra row is
# allocated because word_index ids are used directly as row indices.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embedding_dict.get(word)
    # Tokens without a pre-trained vector keep their all-zero row. Assigning
    # the `None` returned by .get() would silently fill the row with NaN.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [66]:
# Integer-encode the string labels of the binary task.
# (For the three-way language task, fit on [other, english, spanish] instead.)
le = preprocessing.LabelEncoder()
le.fit([switch, noswitch])


def _one_hot_label_sequences(label_seqs):
    """Return a (len(label_seqs), MAX_SEQUENCE_LENGTH, 3) one-hot label tensor.

    Each label list is integer-encoded with `le`, padded/truncated to
    MAX_SEQUENCE_LENGTH with pad_sequences' defaults, then one-hot encoded.

    NOTE(review): the one-hot width is 3 although only two labels are fitted
    above, and the padding value (0) is presumably also a real class id --
    confirm this is intended.
    """
    encoded = np.zeros((len(label_seqs), MAX_SEQUENCE_LENGTH, 3))
    for row, label_vec in enumerate(label_seqs):
        padded_ids = pad_sequences([le.transform(label_vec)], maxlen=MAX_SEQUENCE_LENGTH)
        encoded[row, :, :] = to_categorical(padded_ids, num_classes=3)[0]
    return encoded


label_transform_train = _one_hot_label_sequences(labels_train)
label_transform_val = _one_hot_label_sequences(labels_val)

# The label -> id mapping can be recovered with
# dict(zip(le.classes_, le.transform(le.classes_))) if needed.

In [67]:
# Sanity check: collapse the one-hot validation labels to class ids and print their sum.
val_class_ids = np.argmax(label_transform_val.reshape(-1, 3), axis=1)
print(np.sum(val_class_ids))

3526


In [68]:
class Metrics(Callback):
    """Keras callback reporting validation F1, precision and recall after every epoch.

    Reads the module-level `val_data` (model inputs) and `label_transform_val`
    (one-hot targets). Scores are computed over all flattened time steps with
    scikit-learn's `average='binary'`, and are accumulated per epoch in
    `val_f1s`, `val_recalls` and `val_precisions`.
    """

    def on_train_begin(self, logs=None):
        """Reset the per-epoch score histories at the start of training."""
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation set, then print and record the scores."""
        # Take the arg-max over the class axis and flatten, instead of
        # reshaping with a hard-coded class count.
        val_predict = np.argmax(np.asarray(self.model.predict(val_data)), axis=-1).ravel()
        val_targ = np.argmax(label_transform_val, axis=-1).ravel()

        # Sum of the predicted class ids, printed as a quick progress indicator.
        print(np.sum(val_predict))

        _val_f1 = f1_score(val_targ, val_predict, average='binary')
        _val_recall = recall_score(val_targ, val_predict, average='binary')
        _val_precision = precision_score(val_targ, val_predict, average='binary')

        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print("— val_f1: %f — val_precision: %f — val_recall %f" % (_val_f1, _val_precision, _val_recall))


metrics = Metrics()

In [69]:
# Frozen embedding layer initialised from the pre-trained vectors.
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

# Sequence tagger: embeddings -> LSTM emitting one state per time step ->
# per-time-step softmax over the label classes.
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(200, return_sequences=True, name="LSTM"))
# Take the output width from the one-hot label tensor so the model and the
# targets cannot drift apart.
model.add(TimeDistributed(Dense(label_transform_train.shape[-1], activation='softmax')))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

results = model.fit(train_data,
                    label_transform_train,
                    epochs=6,
                    validation_data=(val_data, label_transform_val),
                    batch_size=100,
                    callbacks=[metrics])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_17 (Embedding)     (None, 61, 300)           1500300   
_________________________________________________________________
LSTM (LSTM)                  (None, 61, 200)           400800    
_________________________________________________________________
time_distributed_17 (TimeDis (None, 61, 3)             603       
Total params: 1,901,703
Trainable params: 401,403
Non-trainable params: 1,500,300
_________________________________________________________________
None
Train on 31967 samples, validate on 1000 samples
Epoch 1/6
11
— val_f1: 0.004524 — val_precision: 0.727273 — val_recall 0.002269
Epoch 2/6
726
— val_f1: 0.279398 — val_precision: 0.818182 — val_recall 0.168463
Epoch 3/6
717
— val_f1: 0.281405 — val_precision: 0.832636 — val_recall 0.169314
Epoch 4/6
 1600/31967 [>.............................] - ETA: 1:30 - loss: 0.0196 - acc: 0.993

KeyboardInterrupt: 

In [50]:
# Show the per-epoch metrics recorded by `model.fit` (the `history` dict of the returned object).
# NOTE(review): `results` is only bound when `model.fit` runs to completion; an
# interrupted fit leaves whatever value an earlier run assigned.
print(results.history)

{'val_loss': [0.12704526036977767, 0.11638797223567962, 0.10066358149051666, 0.1044807754456997, 0.10011781677603722, 0.09831734970211983], 'val_acc': [0.9491803169250488, 0.950803279876709, 0.9582131266593933, 0.9543934464454651, 0.9622622966766358, 0.9629508256912231], 'loss': [0.12747760998388658, 0.0682584139622943, 0.05653371496036498, 0.04541250440091707, 0.03997279140552525, 0.03578736492392362], 'acc': [0.9720187910311581, 0.9740167519624618, 0.9770362580386013, 0.9816419272653699, 0.9838609187973748, 0.9855701578742261]}
