In [2]:
import keras
from keras.datasets import mnist
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Dropout, Flatten, Embedding
from keras.layers import Conv1D, MaxPooling1D
from keras.layers.merge import Concatenate
from keras import backend as K
from w2v import train_word2vec 
import numpy as np
import difflib
import matplotlib.pyplot as plt

# ------------------------------- Data Preprocessing -----------------------------------

### Phrase -> index

In [3]:
phr_to_ind = dict()

with open('../../Datasets/SST1_dataset/dictionary.txt') as f:
    for line in f:
        entry = line.split('|')
        phr_to_ind[entry[0]] = int(entry[1])

keys = phr_to_ind.keys();

print(len(phr_to_ind), phr_to_ind['Good'])

239232 14058


### Getting Index corresponding to sentences

In [4]:
sentence_list = []
sentiment = []

with open('../../Datasets/SST1_dataset/datasetSentences.txt') as f:
    f.readline()
    for line in f:
        entry = line.split('\t')
        sent = entry[1][:-1]
        sent = sent.replace('-LRB-', '(')
        sent = sent.replace('-RRB-', ')')
    
        if sent in phr_to_ind.keys():
            sentiment.append(phr_to_ind[sent])
        else:
            print('.', end="")
            keys_subset = [k for k in keys if (k[0] == sent[0])]
            key = difflib.get_close_matches(sent, keys_subset, n=1);
            sent = key[0]
            sentiment.append(phr_to_ind[sent])
            
        sentence_list.append(sent)
        
print(len(sentence_list))

# Written the output in a file
f = open('../../Datasets/SST1_dataset/SentenceWithCorrection.txt', 'w')
for sent in sentence_list:
    f.write(sent + '\n')
f.close()

...........................................................................................................11855


### Phrase Index -> Sentiment

In [5]:
ind_to_senti = dict()

with open('../../Datasets/SST1_dataset/sentiment_labels.txt') as f:
    f.readline()
    for line in f:
        entry = line.split('|')
        ind_to_senti[int(entry[0])] = float(entry[1])

print(len(ind_to_senti))

239232


### Loading train, test and valid split info

In [7]:
split_ind = []
with open('../../Datasets/SST1_dataset/datasetSplit.txt') as f:
    f.readline()
    for line in f:
        entry = line.split(',')
        split_ind.append(int(entry[1]))

print(len(split_ind))

N_train = split_ind.count(1)
N_test = split_ind.count(2)
N_valid = split_ind.count(3)
print (N_train, N_test, N_valid)

11855
8544 2210 1101


### Assigning label to sentences

In [35]:
N_sent = len(sentence_list);
N_category = 5

y_label = []

for ind in sentiment:
    val = ind_to_senti[ind]
    if val >= 0.0 and val <= 0.2:
        y_label.append(0);
    elif val > 0.2 and val <= 0.4:
        y_label.append(1)
    elif val > 0.4 and val <= 0.6:
        y_label.append(2)
    elif val > 0.6 and val <= 0.8:
        y_label.append(3)
    else:
        y_label.append(4)

print(y_label.count(0), y_label.count(1), y_label.count(2), y_label.count(3))

# Labels in one-hot encoding
y_train = np.zeros((N_train, N_category), np.uint8)
y_test  = np.zeros((N_test , N_category), np.uint8)
y_valid = np.zeros((N_valid, N_category), np.uint8)

c1,c2,c3 = 0,0,0
for i in range(len(y_label)):
    label = y_label[i]
    if split_ind[i] == 1:
        y_train[c1, label] = 1;  c1 += 1
    elif split_ind[i] == 2:
        y_test [c2, label] = 1;  c2 += 1
    else:
        y_valid[c3, label] = 1;  c3 += 1

1510 3140 2242 3111


### Reducing the size of vocabulary

In [8]:
x_all = []
max_sent_len = -1;
max_wrd_len = -1
wrd_to_ind = dict()

for sent in sentence_list:
    wrds = sent.split()
    vec = []
    for wrd in wrds:
        ind = phr_to_ind[wrd]
        wrd_to_ind[wrd] = ind
        vec.append(ind)
            
    max_sent_len = max(len(vec), max_sent_len)
    x_all.append(vec)

# Get inverse dictionary
ind_to_wrd = dict((v, k) for k, v in wrd_to_ind.items())
ind_to_wrd[99999] = "<PAD/>"

print(len(phr_to_ind), len(wrd_to_ind))

239232 21699


### Create input features

In [9]:
x_train = np.zeros((N_train, max_sent_len), np.uint32)
x_test  = np.zeros((N_test,  max_sent_len), np.uint32)
x_valid = np.zeros((N_valid, max_sent_len), np.uint32)

c1, c2, c3 = 0,0,0
for i in range(len(x_all)):
    vec = x_all[i]
    if split_ind[i] == 1:
        x_train[c1,0:len(vec)] = np.uint32(vec).T; 
        c1 += 1
    elif split_ind[i] == 2:
        x_test [c2,0:len(vec)] = np.uint32(vec).T; 
        c2 += 1
    else:
        x_valid[c3,0:len(vec)] = np.uint32(vec).T; 
        c3 += 1

print(c1, c2, c3)

8544 2210 1101


# -------------------------------- Training model  -----------------------------------

### Model Paremeters

In [25]:
model_type    = 'CNN-rand'  # CNN-rand|CNN-non-static|CNN-static
embedding_dim = 300         # word2vec dim
vocab_size    = len(ind_to_wrd)

### Generate word2vec 

In [13]:
if model_type in ['CNN-non-static', 'CNN-static']:
    embedding_wts = train_word2vec( np.vstack((x_train, x_test)), 
                                    ind_to_wrd, num_features = embedding_dim)
    if model_type == 'CNN-static':
        x_train = embedding_wts[0][x_train]
        x_test  = embedding_wts[0][x_test]
        
elif model_type == 'CNN-rand':
    embedding_wts = None
    
else:
    raise ValueError("Unknown model type")

In [38]:
batch_size = 50
filter_sizes = [3,4,5]
num_filters = 100
dropout_prob = (0.5, 0.8)
hidden_dims = 50

l2_reg = 0.3
embedding_dim = 300

model_input = Input(shape= (max_sent_len,))

z = Embedding(vocab_size+1, embedding_dim, input_length=max_sent_len, name="embedding")(model_input)
z = Dropout(dropout_prob[0])(z)

# Convolution layers
z1 = Conv1D( filters=100, kernel_size=3, padding="valid", activation="relu", strides=1)(z)
z1 = MaxPooling1D(pool_size=2)(z1)
z1 = Flatten()(z1)

z2 = Conv1D( filters=100, kernel_size=4, padding="valid", activation="relu", strides=1)(z)
z2 = MaxPooling1D(pool_size=2)(z2)
z2 = Flatten()(z2)

z3 = Conv1D( filters=100, kernel_size=5, padding="valid", activation="relu", strides=1)(z)
z3 = MaxPooling1D(pool_size=2)(z3)
z3 = Flatten()(z3)

# Concatenate the output of all convolution layers
z = Concatenate()([z1, z2, z3])
z = Dropout(dropout_prob[1])(z)

z = Dense(hidden_dims, activation="relu")(z)
model_output = Dense(N_category, activation="softmax")(z)
    
model = Model(model_input, model_output)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_6 (InputLayer)             (None, 56)            0                                            
____________________________________________________________________________________________________
embedding (Embedding)            (None, 56, 300)       6510300                                      
____________________________________________________________________________________________________
dropout_9 (Dropout)              (None, 56, 300)       0                                            
____________________________________________________________________________________________________
conv1d_13 (Conv1D)               (None, 54, 100)       90100                                        
___________________________________________________________________________________________

In [39]:
if model_type == "CNN-non-static":
    embedding_layer = model.get_layer("embedding")
    embedding_layer.set_weights(embedding_wts)

model.fit(x_train, y_train, 
          batch_size = batch_size, 
          epochs=10,
          validation_data=(x_test, y_test), verbose=2)

Train on 8544 samples, validate on 2210 samples
Epoch 1/10
25s - loss: 0.4907 - acc: 0.8000 - val_loss: 0.4893 - val_acc: 0.8000
Epoch 2/10
23s - loss: 0.4679 - acc: 0.8001 - val_loss: 0.4531 - val_acc: 0.8022
Epoch 3/10
25s - loss: 0.4234 - acc: 0.8105 - val_loss: 0.4515 - val_acc: 0.8003
Epoch 4/10
26s - loss: 0.3844 - acc: 0.8295 - val_loss: 0.4507 - val_acc: 0.7957
Epoch 5/10
24s - loss: 0.3508 - acc: 0.8455 - val_loss: 0.4681 - val_acc: 0.7932
Epoch 6/10
24s - loss: 0.3158 - acc: 0.8627 - val_loss: 0.4853 - val_acc: 0.7924
Epoch 7/10
24s - loss: 0.2896 - acc: 0.8761 - val_loss: 0.5103 - val_acc: 0.7862
Epoch 8/10
24s - loss: 0.2629 - acc: 0.8889 - val_loss: 0.5396 - val_acc: 0.7782
Epoch 9/10
23s - loss: 0.2429 - acc: 0.8981 - val_loss: 0.5747 - val_acc: 0.7757
Epoch 10/10
24s - loss: 0.2225 - acc: 0.9083 - val_loss: 0.6030 - val_acc: 0.7704


<keras.callbacks.History at 0x7f9ea40398d0>