In [16]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn import preprocessing
from keras.utils import to_categorical
from keras.layers import *
from keras.models import Model
from keras.models import Sequential
from keras.optimizers import Adam

In [2]:
# Load the annotated corpus. Downstream cells index it as data[column][row],
# treating each row as one review and each column as one word position.
data = pd.read_json('data.json')

# Rows -> number of reviews, columns -> longest review (in words).
num_reviews = data.shape[0]
review_length = data.shape[1]

# Every sequence fed to the model is padded to the longest review.
MAX_SEQUENCE_LENGTH = review_length

In [3]:
# Pretrained embedding vectors; rows are looked up by the indices stored in `words`.
vectors = np.load("GloVe_codeswitch_5k.npy")

# The vocabulary is a Python dict saved inside a 0-d object array, so it is
# stored as a pickle. NumPy >= 1.16.3 refuses to unpickle unless
# allow_pickle=True is passed explicitly.
# SECURITY: unpickling can execute arbitrary code -- only load this file from
# a trusted source.
words = np.load('5k_vocab_dict.npy', allow_pickle=True).item()

# Width of a single embedding vector.
EMBEDDING_DIM = vectors.shape[1]

In [5]:
# Build one training example per review:
#   examples[r] -- space-separated tokens (out-of-vocabulary words -> "<UNK>")
#   labels[r]   -- for every token, the language label of the *next* token
examples = []
labels = []

english = "eng"
spanish = "span"
other = "other"

# Raw language tags found in the data -> training label. Every tag not listed
# here (e.g. 'eng&spa', 'eng+spa', 'spa+eng', or anything else) becomes `other`.
# The original `elif x == 'eng&spa' or 'eng+spa' or 'spa+eng'` was always
# truthy, so this catch-all is the behaviour the code already had.
lang_to_label = {'eng': english, 'spa': spanish}

for r in range(num_reviews):
    tokens = []
    label_vec = []

    for w in range(review_length):

        # Each cell is indexed as [0] = word, [1] = language tag;
        # None marks the end of a review shorter than review_length.
        currWordStruct = data[w][r]
        if currWordStruct is None:
            break

        currWord = currWordStruct[0]
        tokens.append(currWord if currWord in words else "<UNK>")

        # Look one position ahead. There is no "next" word in the last column,
        # so treat it like the end of a shorter review.
        nextWordStruct = data[w + 1][r] if w < (review_length - 1) else None

        if nextWordStruct:
            label_vec.append(lang_to_label.get(nextWordStruct[1], other))
        else:
            # Last token of the review. Previously a full-length review got no
            # label for its final token, leaving label_vec one shorter than the
            # token sequence; always emitting a label keeps them the same length.
            label_vec.append(other)

    # Same format as before: every token is preceded by a single space.
    review_string = "".join(" " + token for token in tokens)

    labels.append(label_vec)
    examples.append(review_string)
    

In [6]:
# Tokenizer with no character filtering and no lower-casing, fitted on the
# prepared review strings.
# NOTE(review): num_words caps which word indices texts_to_sequences emits; if
# len(vectors) is not larger than the fitted vocabulary, some tokens may be
# dropped and sequences would no longer line up with `labels` -- confirm.
tokenizer = Tokenizer(lower=False, filters="", num_words=len(vectors))
tokenizer.fit_on_texts(examples)
word_index = tokenizer.word_index

# Integer-encode every review, then pad to a common length.
# NOTE(review): this rebinds `data` (previously the raw DataFrame), so the
# label-building cell cannot be re-run after this one without reloading.
sequences = tokenizer.texts_to_sequences(examples)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [7]:
# word -> pretrained vector, using the row indices stored in `words`.
embedding_dict = {word: vectors[row] for word, row in words.items()}

# Row i holds the vector of the token whose tokenizer index is i.
# Row 0 is never assigned here (the matrix has len(word_index) + 1 rows and
# only the indices from word_index are filled).
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embedding_dict.get(word)
    # Tokens with no pretrained vector (.get returns None) keep their
    # all-zero row. Assigning None directly would either raise or poison the
    # row with NaN, depending on the NumPy version.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [9]:
# Encode the three string labels as integers, then one-hot them per timestep.
le = preprocessing.LabelEncoder()
le.fit([other, english, spanish])

# One (MAX_SEQUENCE_LENGTH, 3) one-hot block per review.
label_transform = np.zeros((len(labels), MAX_SEQUENCE_LENGTH, 3))
for i, vec in enumerate(labels):
    encoded = le.transform(vec)
    # NOTE(review): pad_sequences fills with 0 by default, which presumably
    # coincides with one of the encoded class ids -- padded timesteps would
    # then be trained as a real class. Confirm this is intended.
    padded = pad_sequences([encoded], maxlen=MAX_SEQUENCE_LENGTH)
    curr = to_categorical(padded, num_classes=3)[0]
    label_transform[i] = curr

# Label name -> integer id lookup.
keys = list(le.classes_)
vals = le.transform(keys)
labels_index = {name: idx for name, idx in zip(keys, vals)}

In [None]:
# Training hyperparameters, gathered here so they are easy to find and tune.
LSTM_UNITS = 200
NUM_CLASSES = 3          # must match the last axis of label_transform
EPOCHS = 6
BATCH_SIZE = 100
VALIDATION_SPLIT = 0.15

# Embedding layer initialised from the pretrained matrix and frozen
# (trainable=False), so only the LSTM and the output layer are trained.
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

# Per-timestep classifier: the LSTM returns its full output sequence and a
# softmax over the classes is applied to every timestep.
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(LSTM_UNITS, return_sequences=True, name="LSTM"))
model.add(TimeDistributed(Dense(NUM_CLASSES, activation='softmax')))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

results = model.fit(data, label_transform,
                    epochs=EPOCHS,
                    validation_split=VALIDATION_SPLIT,
                    batch_size=BATCH_SIZE)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 61, 300)           1500300   
_________________________________________________________________
LSTM (LSTM)                  (None, 61, 200)           400800    
_________________________________________________________________
time_distributed_6 (TimeDist (None, 61, 3)             603       
Total params: 1,901,703
Trainable params: 401,403
Non-trainable params: 1,500,300
_________________________________________________________________
None
Train on 28871 samples, validate on 5096 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6

In [14]:
# Per-epoch metrics recorded by model.fit (loss / accuracy, train and validation).
history = results.history
print(history)

{'val_loss': [0.08338849254981782, 0.07181232466451601, 0.06613644710164429, 0.05767091639263897, 0.049922280551593756, 0.050932187525616894], 'val_acc': [0.9662190822641741, 0.9684065900848274, 0.9704139498376771, 0.9717071547613039, 0.977976941426096, 0.9758923737276667], 'loss': [0.13421004519972604, 0.07175802746289925, 0.06448034259272702, 0.056941839478953066, 0.047357822854551465, 0.04275989503309413], 'acc': [0.9714740146713962, 0.9730462992444424, 0.9743801012755489, 0.9767461938351978, 0.9810491101696619, 0.9828774785397599]}
