In [1]:
import pandas as pd
import numpy as np
from keras.layers import Dense,Dropout,Conv1D,LSTM,MaxPooling1D,GlobalAveragePooling1D
from keras.models import Sequential

Using TensorFlow backend.


### Loading the GloVe Embedding

In [2]:
# Open the pre-trained GloVe vector file (50-dimensional, judging by the file name) as UTF-8 text.
# The handle is kept open across cells; it is closed once the parsing loop that reads it has finished.
f = open('../glove.6B.50d.txt',encoding='utf-8')

### Creating a dictionary with the word as the key and its 50-dimensional embedding as the value

In [3]:
# Build the word -> vector lookup from the GloVe text file handle `f`.
# Every non-empty line is parsed as "<word> <v1> <v2> ... <vN>".
embedding_index = {}

# `with f:` guarantees the handle is closed even if a malformed line raises
# part-way through the loop (a trailing f.close() would be skipped in that case).
with f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float')
        embedding_index[word] = coefs

### Loading the data

In [4]:
# Load the training documents and their class labels.
# allow_pickle=True is required by NumPy >= 1.16.3 whenever an .npy file stores Python
# objects; train.npy appears to hold variable-length sequences (each row is measured
# with len() further down) — TODO confirm. Unpickling can execute arbitrary code, so
# only load files that come from a trusted source.
data = np.load('train.npy', allow_pickle=True)
labels = np.load('labels.npy')

In [5]:
from keras.utils import to_categorical
# One-hot encode the class ids into two columns.
# NOTE: this rebinds `labels`, so the cell is not idempotent — re-running it without
# reloading labels.npy would encode the already-encoded array a second time.
labels = to_categorical(labels,num_classes=2)

In [6]:
# Sanity check: expect (n_samples, 2) after one-hot encoding.
labels.shape

(20800, 2)

### Splitting the data into training data and testing data

In [7]:
# Hold out part of the data for the final evaluation.
from sklearn.model_selection import train_test_split
# No test_size is passed, so scikit-learn's default split applies (the shapes printed in the
# next cell show 15600 train / 5200 test rows); random_state=2 makes the shuffle repeatable.
x_train,x_test,y_train,y_test = train_test_split(data,labels,random_state=2,shuffle=True)

In [8]:
# Confirm that features and labels were split into matching row counts.
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((15600,), (5200,), (15600, 2), (5200, 2))

In [9]:
# Length of every training document, in ascending order. Despite its name, `mean`
# holds the sorted lengths, not an average. The last 50 entries are the longest documents.
mean = sorted(map(len, x_train))
mean[-50:]

[3257,
 3287,
 3400,
 3489,
 3570,
 3570,
 3637,
 3644,
 3682,
 3715,
 3725,
 3760,
 3796,
 3924,
 3933,
 3936,
 3955,
 4076,
 4154,
 4257,
 4274,
 4283,
 4291,
 4291,
 4304,
 4350,
 4435,
 4498,
 4513,
 4515,
 4737,
 4760,
 4815,
 4821,
 4885,
 5114,
 5162,
 5190,
 5546,
 5593,
 5799,
 6717,
 6742,
 7686,
 7849,
 8854,
 9196,
 10422,
 10696,
 12185]

We observe that most documents are shorter than 3,000 words. Therefore, when training the model, we cap the number of words embedded per document at 3,000 to reduce computation time.

### Converting the words in our corpus to embeddings obtained from the GloVe vectors

In [63]:
def embedding_output(X, max_len=3000, emb_dim=50, embeddings=None):
    """Turn tokenised documents into a zero-padded tensor of word vectors.

    Each document is truncated to its first ``max_len`` tokens; shorter documents
    are right-padded with all-zero rows. Tokens that are missing from the lookup
    table are left as all-zero rows as well.

    Parameters
    ----------
    X : sequence of token sequences
        One entry per document; each entry must support slicing and yield tokens.
    max_len : int, default 3000
        Number of token positions kept per document.
    emb_dim : int, default 50
        Length of each word vector; must match the vectors in ``embeddings``.
    embeddings : mapping, optional
        Token -> 1-D vector lookup. When omitted, the notebook-level
        ``embedding_index`` dictionary is used.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(X), max_len, emb_dim)`` and dtype ``float32``.

    Raises
    ------
    ValueError
        If a looked-up vector does not have length ``emb_dim``.
    """
    if embeddings is None:
        # Resolved at call time so the default still follows the notebook's global table.
        embeddings = embedding_index

    # float32 needs half the memory of NumPy's default float64. With the default
    # sizes the array holds len(X) * 3000 * 50 values, so the dtype decides whether
    # the allocation fits in RAM at all. Lower max_len to shrink it further.
    output = np.zeros((len(X), max_len, emb_dim), dtype=np.float32)

    for i, doc in enumerate(X):
        for j, token in enumerate(doc[:max_len]):
            vector = embeddings.get(token)
            # Unknown tokens keep the zero row the array was initialised with.
            if vector is not None:
                output[i, j] = vector
    return output

In [65]:
# Embed both splits. Each call allocates one dense array of len(X) x 3000 x 50 values
# (about 2.34e9 values for the 15,600 training rows and 7.8e8 for the 5,200 test rows),
# which is the likely cause of the MemoryError recorded below this cell.
embedded_train = embedding_output(x_train)
embedded_test = embedding_output(x_test)

MemoryError: 

# Model

## Things that can be done

Because each text example contains a large number of words, training a plain LSTM on the full sequences would be slow.
We can do the following to reduce computation time:

- use CuDNN LSTM
- use LSTM + 1D CNN. Here the text is first passed through a CNN layer, which shortens the sequence according to the stride chosen (e.g. with a stride of 3 and 3k features, the output of this layer has about 1k features, so the data is reduced by a factor of 3). The output is then passed through the LSTM layer. I tried this type of model for this problem, but I was getting pretty low accuracy.
- Simply use a 1D CNN. As our task is just binary classification, it wouldn't matter much if
we lose the dependencies between the data. Moreover, using a CNN will also avoid the problem of vanishing gradients (if present).

### 1D CNN model

In [90]:
# 1D CNN classifier over (3000 tokens, 50-d vectors) inputs: three Conv1D + max-pooling
# stages, then global average pooling and a dense head ending in a 2-way softmax.
model = Sequential([
    Conv1D(64, 3, input_shape=(3000, 50), activation='relu'),
    MaxPooling1D(pool_size=3),
    Dropout(0.2),
    Conv1D(64, 3, activation='relu'),
    MaxPooling1D(pool_size=3),
    Conv1D(64, 3, activation='relu'),
    MaxPooling1D(pool_size=3),
    Dropout(0.2),
    # Optional recurrent stage, left disabled: LSTM(128) followed by Dropout(0.5)
    GlobalAveragePooling1D(),
    Dense(128, activation='relu'),
    Dense(2, activation='softmax'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_21 (Conv1D)           (None, 2998, 64)          9664      
_________________________________________________________________
max_pooling1d_21 (MaxPooling (None, 999, 64)           0         
_________________________________________________________________
dropout_19 (Dropout)         (None, 999, 64)           0         
_________________________________________________________________
conv1d_22 (Conv1D)           (None, 997, 64)           12352     
_________________________________________________________________
max_pooling1d_22 (MaxPooling (None, 332, 64)           0         
_________________________________________________________________
conv1d_23 (Conv1D)           (None, 330, 64)           12352     
_________________________________________________________________
max_pooling1d_23 (MaxPooling (None, 110, 64)           0         
__________

In [93]:
from keras.callbacks import ModelCheckpoint
# Checkpointing is left disabled here; uncomment and pass callbacks=[checkpoint] to fit() to use it.
#checkpoint = ModelCheckpoint("model.h5", monitor='val_loss', verbose=1, save_best_only=True, period=1)

# The network ends in a 2-unit softmax and is trained on one-hot labels, so the matching
# loss is categorical cross-entropy. 'binary_crossentropy' is meant for a single sigmoid
# output and also influences how Keras resolves the 'accuracy' metric. For exactly two
# classes the reported numbers should be unchanged.
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
# 20% of the training rows are held out for validation during fitting.
hist = model.fit(embedded_train,y_train,validation_split=0.2,epochs=2)

Train on 12480 samples, validate on 3120 samples
Epoch 1/2
Epoch 2/2


In [94]:
# Score the held-out test split; the result is [loss, accuracy], following the loss and
# metrics given to model.compile().
model.evaluate(embedded_test,y_test)



[0.2011418330210906, 0.9207692307692308]

### Testing accuracy: 0.92
This can be improved by training our model for more epochs and tuning the hyperparameters.