In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn import preprocessing
from keras.utils import to_categorical
from keras.layers import *
from keras.models import Model
from keras.models import Sequential
from keras.optimizers import SGD

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
data = pd.read_json('data.json')
num_reviews, review_length = data.shape
MAX_SEQUENCE_LENGTH = review_length

In [3]:
vectors = np.load("GloVe_codeswitch_5k.npy")
words = np.load('5k_vocab_dict.npy').item()
EMBEDDING_DIM = len(vectors[0])

In [4]:
examples = []
labels = []

english = "eng"
spanish = "span"
both = "both"
other = "other"

for r in range(num_reviews):
    review_string = ""
    label_vec = []
    
    for w in range(review_length):
        
        currWordStruct = data[w][r]
        
        if currWordStruct == None:
            break
        
        currWord = currWordStruct[0]
        
        if currWord in words:
            review_string += (" " + currWord)
        else:
            review_string += (" <UNK>")
            
        if w == 0:
            label_vec.append(other)
            
        elif w < (review_length - 1):
            nextWordStruct = data[w + 1][r]
            if nextWordStruct:
                
                nextWord = nextWordStruct[0]
                nextLang = nextWordStruct[2]
                
                if nextWord == '<end>':
                    label_vec.append(other)
                
                elif nextLang == 'eng':
                    label_vec.append(english)
                
                elif nextLang == 'spa':
                    label_vec.append(spanish)
                    
                elif nextLang == 'eng&spa':
                    label_vec.append(both)
                
            else:
                label_vec.append(other)
        
    labels.append(label_vec)
    examples.append(review_string)
    

In [5]:
tokenizer = Tokenizer(num_words=len(vectors))
tokenizer.fit_on_texts(examples)
sequences = tokenizer.texts_to_sequences(examples)

word_index = tokenizer.word_index
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [26]:
embedding_dict = {}
for k,v in words.items():
    embedding_dict[k] = vectors[v]

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embedding_dict.get(word)
    embedding_matrix[i] = embedding_vector

np.nan_to_num(embedding_matrix, copy=False)

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.25178444,  0.37080526,  0.08665863, ..., -0.66401535,
         0.37150335,  0.1546663 ],
       [ 0.14598137,  0.41574574,  0.2100179 , ..., -0.56283247,
         0.0695602 , -0.02747385],
       ...,
       [ 1.0565089 ,  0.91580164, -0.05337093, ...,  0.13330264,
        -0.32603508,  0.19502085],
       [ 0.85473436,  0.00469674,  0.06053996, ...,  0.02327155,
         0.03136352, -0.51760447],
       [ 0.26791692, -0.15173423,  0.22812903, ..., -0.84134376,
        -0.47550845,  0.11521012]])

In [7]:
le = preprocessing.LabelEncoder()
le.fit([other, english, spanish, both])
label_transform = np.zeros((len(labels), MAX_SEQUENCE_LENGTH, 4))
for i, vec in enumerate(labels):

    curr = to_categorical(pad_sequences([le.transform(vec)], maxlen=MAX_SEQUENCE_LENGTH), num_classes = 4)[0]
    label_transform[i,:,:] = curr
    
keys = list(le.classes_)
vals = le.transform(keys)
labels_index = dict(zip(keys,vals))

In [8]:
print(data[0,:])
print(label_transform[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    1  106   37   33  420
   16  100   61  127   71 4632    2]
[[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1.

In [27]:
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

model = Sequential()
model.add(embedding_layer)
model.add(LSTM(10, return_sequences=True, name="LSTM"))
model.add(TimeDistributed(Dense(4, activation='softmax')))
model.compile(loss='categorical_crossentropy', optimizer=SGD(lr=1e-10), metrics=['accuracy'])
print(model.summary())
#model.train_on_batch(data[4:8,:], label_transform[4:8,:,:])
#model.train_on_batch(data[8:12,:], label_transform[8:12,:,:])
#model.predict_on_batch(data[10:20,:])
#results = model.fit(data, label_transform, epochs=6, batch_size=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 63, 300)           1487400   
_________________________________________________________________
LSTM (LSTM)                  (None, 63, 10)            12440     
_________________________________________________________________
time_distributed_12 (TimeDis (None, 63, 4)             44        
Total params: 1,499,884
Trainable params: 12,484
Non-trainable params: 1,487,400
_________________________________________________________________
None
[array([[-0.12714738,  0.08726095,  0.07321967, ..., -0.0651332 ,
        -0.03373115,  0.02420714],
       [-0.08602007, -0.08793683, -0.09980378, ...,  0.06006789,
        -0.08801012,  0.06769709],
       [ 0.09752731,  0.05640312, -0.049805  , ...,  0.11906888,
         0.12866016, -0.05632498],
       ...,
       [ 0.01748377, -0.08777739,  0.09093902, ...,  0.11374256,
        

In [25]:
print(data[4])
for val in data[4]:
    print(embedding_matrix)

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
  10 107   3  69 604 339  69 370   2]
[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.25178444  0.37080526  0.08665863 ... -0.66401535  0.37150335
   0.1546663 ]
 [ 0.14598137  0.41574574  0.2100179  ... -0.56283247  0.0695602
  -0.02747385]
 ...
 [ 1.0565089   0.91580164 -0.05337093 ...  0.13330264 -0.32603508
   0.19502085]
 [ 0.85473436  0.00469674  0.06053996 ...  0.02327155  0.03136352
  -0.51760447]
 [ 0.26791692 -0.15173423  0.22812903 ... -0.84134376 -0.47550845
   0.11521012]]
[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.25178444  0.37080526  0.08665863 ... -0.66401535  0.37150335
   0.1546663 ]
 [ 0.14598137  0.41574574  0.2100179  ... -0.56283247  0.0695602
  -0.02747385]
 ...
 [ 1.0565089   0.9158