DNN with TFLearn

Adapted from: https://github.com/felixcom/sentiment-analysis/blob/master/Sentiment%20Analysis%20with%20TFLearn%20-%20Solution.ipynb

In [None]:
import tflearn
import tensorflow as tf
from tflearn.data_utils import to_categorical, pad_sequences
import numpy as np
import pandas as pd
import helpers as h

In [None]:
# Relative paths to the input text files used by this notebook.
path_train_pos = "data/train_pos.txt"
# NOTE(review): the *_full paths are not referenced by the cells below;
# presumably they point at larger variants of the training sets — confirm.
path_train_pos_full = "data/train_pos_full.txt"
path_train_neg = "data/train_neg.txt"
path_train_neg_full = "data/train_neg_full.txt"
path_test = "data/test_data.txt"

In [None]:
# Load the positive and negative training examples through the project helper module.
# NOTE(review): the results are concatenated with `+` and each element is later
# split on spaces, so txt_to_list_train presumably returns a list of strings —
# confirm in helpers.
list_train_pos = h.txt_to_list_train(path_train_pos)
list_train_neg = h.txt_to_list_train(path_train_neg)

In [None]:
# X holds every training text (positives first, then negatives).
# Y holds the matching labels: 1 for each positive example, 0 for each negative one.
X = np.array(list_train_pos + list_train_neg)
Y = np.array([1]*len(list_train_pos) + [0]*len(list_train_neg))

In [None]:
# Sanity check: negatives are appended last, so with at least one negative
# example the final label displayed here is 0.
Y[-1]

In [None]:
from collections import Counter

# Tally how often each space-separated token occurs across all training texts.
total_counts = Counter(word for line in X for word in line.split(' '))
print("Total words in data set:", len(total_counts))

In [None]:
# Keep the 10000 most frequent words, ordered by descending count
# (words with equal counts stay in first-seen order).
vocab = [word for word, _ in total_counts.most_common(10000)]
print(vocab[-5:])

In [None]:
# Show the last vocabulary entry — the least frequent word that was kept — and its count.
print(vocab[-1], ': ', total_counts[vocab[-1]])

In [None]:
# Map each vocabulary word to its position, i.e. its column in the count vectors.
word2idx = dict(zip(vocab, range(len(vocab))))
print(list(word2idx)[:10])

In [None]:
def text_to_vector(text):
    """Return the bag-of-words count vector for *text*.

    The text is split on single spaces. Every token present in ``word2idx``
    increments the entry at that word's index; tokens outside the vocabulary
    are ignored. The result is a 1-D integer array of length ``len(vocab)``.
    """
    counts = np.zeros(len(vocab), dtype=np.int_)
    positions = (word2idx.get(token) for token in text.split(' '))
    for position in positions:
        # None means the token is not part of the vocabulary.
        if position is not None:
            counts[position] += 1
    return np.array(counts)

In [None]:
# Quick check: display the first 65 entries of the count vector for a sample sentence.
text_to_vector('The tea is for a party to celebrate '
               'the movie so she has no time for a cake')[:65]

In [None]:
def vectorize(X):
    """Convert a sequence of texts into a bag-of-words count matrix.

    Returns an integer array of shape ``(len(X), len(vocab))`` whose i-th row
    is ``text_to_vector(X[i])``.
    """
    matrix = np.zeros((len(X), len(vocab)), dtype=np.int_)
    # Each `row` is a view into `matrix`, so assigning into it fills the matrix in place.
    for row, text in zip(matrix, X):
        row[:] = text_to_vector(text)
    return matrix

In [None]:
# Build the full bag-of-words matrix: one row per training text.
word_vectors = vectorize(X)

In [None]:
# Peek at the last five rows of the count matrix.
word_vectors[-5:]

In [None]:
# Randomly split the examples into a training part and a held-out test part.
twits = len(Y)

shuffle = np.arange(twits)
np.random.shuffle(shuffle)
# NOTE(review): despite its name, this is the share of examples used for
# TRAINING; the remainder forms the test set.
test_fraction = 0.9

# Cut the shuffled index array once at the train/test boundary.
train_split, test_split = np.split(shuffle, [int(twits*test_fraction)])
trainX = word_vectors[train_split, :]
trainY = to_categorical(Y[train_split], 2)
testX = word_vectors[test_split, :]
testY = to_categorical(Y[test_split], 2)

In [None]:
# First 20 integer labels in shuffled training order.
Y[train_split][:20]

In [None]:
# The first 20 training labels after to_categorical encoding.
trainY[:20]

In [None]:
# Network building
def build_model(n_features=None):
    """Build the feed-forward sentiment classifier.

    Architecture: input -> 200 ReLU units -> 25 ReLU units -> 2-way softmax,
    trained with SGD (learning rate 0.1) on categorical cross-entropy.

    Args:
        n_features: Width of the input layer. Defaults to ``len(vocab)`` so the
            network always matches the vectors produced by ``vectorize``, even
            when the corpus has fewer than 10000 distinct tokens.

    Returns:
        A ``tflearn.DNN`` wrapping the freshly built network.
    """
    if n_features is None:
        n_features = len(vocab)

    # This resets all parameters and variables, leave this here
    tf.reset_default_graph()

    # Inputs
    net = tflearn.input_data([None, n_features])

    # Hidden layer(s)
    net = tflearn.fully_connected(net, 200, activation='ReLU')
    net = tflearn.fully_connected(net, 25, activation='ReLU')

    # Output layer
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='sgd',
                             learning_rate=0.1,
                             loss='categorical_crossentropy')

    model = tflearn.DNN(net)
    return model

In [None]:
# Create a fresh, untrained network.
model = build_model()

In [None]:
# Training
# Fit for 10 epochs in mini-batches of 128; validation_set=0.1 is passed so that
# tflearn evaluates on a 10% share of the training data (see tflearn DNN.fit docs).
model.fit(trainX, trainY, validation_set=0.1, show_metric=True, batch_size=128, n_epoch=10)

In [None]:
# Persist the trained model under models/naive.
# NOTE(review): presumably the models/ directory must already exist — confirm.
model.save('models/naive')

In [None]:
# Evaluate on the held-out split. Column 0 is used on BOTH sides: the predicted
# probability in column 0 is thresholded at 0.5 and compared with column 0 of
# the one-hot labels, so the comparison is consistent.
# NOTE(review): with labels encoded by to_categorical, column 0 corresponds to
# class 0 (negative), i.e. `predictions` is 1 where the model predicts negative.
predictions = (np.array(model.predict(testX))[:,0] >= 0.5).astype(np.int_)
test_accuracy = np.mean(predictions == testY[:,0], axis=0)
print("Test accuracy: ", test_accuracy)

In [None]:
# Load the unlabeled test set as a 2-D array.
# NOTE(review): later cells use column 0 as the id and column 1 as the text, so
# txt_to_list_test presumably returns [id, text] pairs — confirm in helpers.
list_test = np.array(h.txt_to_list_test(path_test))

In [None]:
# Display column 1 of the test array (the column that is vectorized for prediction).
list_test[:, 1]

In [None]:
# Predict the sentiment of the test texts.
# to_categorical one-hot encodes label k at index k, so column 1 of the softmax
# output is the probability of class 1 (positive). Thresholding column 1 gives
# 1 for positive and 0 for negative texts; the zeros are remapped to -1 below.
# (Thresholding column 0, as before, labelled negative texts as 1 and thereby
# inverted the submission.)
real_predictions = (np.array(model.predict(vectorize(list_test[:, 1])))[:, 1] >= 0.5).astype(np.int_)

In [None]:
# Replace every 0 with -1 in place, so the predictions use a {-1, 1} encoding.
real_predictions[real_predictions == 0] = -1

In [None]:
# Peek at the first ten final predictions.
real_predictions[:10]

In [None]:
# Pair each test id (column 0 of list_test) with its prediction and write the
# two-column submission file without the DataFrame index.
submission = np.array([list_test[:, 0], real_predictions]).T
subdf = pd.DataFrame(submission, columns=['Id', 'Prediction'])
subdf.to_csv('submissions/naive.csv', index=False)