# Multi-Layer Perceptron Sentiment Analysis Model

This model Performs classification using a multi-layer perceptron with 1 hidden layer of Neural Nets

In [1]:
from keras.models import Sequential, load_model
from keras.layers import Dense
import sys
import utils
import random
import numpy as np

Using TensorFlow backend.


In [2]:
FREQ_DIST_FILE = 'Train-freqdist.pkl'
BI_FREQ_DIST_FILE = 'Train-freqdist-bi.pkl'
TRAIN_PROCESSED_FILE = 'Train-processed.csv'
TEST_PROCESSED_FILE = 'Test-processed.csv'
TRAIN= True
UNIGRAM_SIZE = 15000
VOCAB_SIZE = UNIGRAM_SIZE

USE_BIGRAMS = False

if USE_BIGRAMS:
    BIGRAM_SIZE = 10000
    VOCAB_SIZE = UNIGRAM_SIZE + BIGRAM_SIZE
FEAT_TYPE = 'frequency'

In [3]:
def get_feature_vector(tweet):
    uni_feature_vector = []
    bi_feature_vector = []
    words = tweet.split()
    for i in range(len(words) - 1):
        word = words[i]
        next_word = words[i + 1]
        if unigrams.get(word):
            uni_feature_vector.append(word)
        if USE_BIGRAMS:
            if bigrams.get((word, next_word)):
                bi_feature_vector.append((word, next_word))
    if len(words) >= 1:
        if unigrams.get(words[-1]):
            uni_feature_vector.append(words[-1])
    return uni_feature_vector, bi_feature_vector

In [4]:
def extract_features(tweets, batch_size=500, test_file=True, feat_type='presence'):
    num_batches = int(np.ceil(len(tweets) / float(batch_size)))
    for i in range(num_batches):
        batch = tweets[i * batch_size: (i + 1) * batch_size]
        features = np.zeros((batch_size, VOCAB_SIZE))
        labels = np.zeros(batch_size)
        for j, tweet in enumerate(batch):
            if test_file:
                tweet_words = tweet[1][0]
                tweet_bigrams = tweet[1][1]
            else:
                tweet_words = tweet[2][0]
                tweet_bigrams = tweet[2][1]
                labels[j] = tweet[1]
            if feat_type == 'presence':
                tweet_words = set(tweet_words)
                tweet_bigrams = set(tweet_bigrams)
            for word in tweet_words:
                idx = unigrams.get(word)
                if idx:
                    features[j, idx] += 1
            if USE_BIGRAMS:
                for bigram in tweet_bigrams:
                    idx = bigrams.get(bigram)
                    if idx:
                        features[j, UNIGRAM_SIZE + idx] += 1
        yield features, labels

In [5]:
def process_tweets(csv_file, test_file=True):
    """Returns a list of tuples of type (tweet_id, feature_vector)
            or (tweet_id, sentiment, feature_vector)
    Args:
        csv_file (str): Name of processed csv file generated by preprocess.ipynb
        test_file (bool, optional): If processing test file
    Returns:
        list: Of tuples
    """
    tweets = []
    print ('Generated feature vectors')
    with open(csv_file, 'r', encoding='utf-8') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if test_file:
                tweet_id, tweet = line.split(',')
            else:
                tweet_id, sentiment, tweet = line.split(',')
            feature_vector = get_feature_vector(tweet)
            if test_file:
                tweets.append((tweet_id, feature_vector))
            else:
                tweets.append((tweet_id, int(sentiment), feature_vector))
    
    return tweets

In [6]:
def build_model():
    model = Sequential()
    model.add(Dense(500, input_dim=VOCAB_SIZE, activation='sigmoid'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam', metrics=['accuracy'])
    return model

In [7]:
def evaluate_model(model, val_tweets):
    correct, total = 0, len(val_tweets)
    for val_set_X, val_set_y in extract_features(val_tweets, feat_type=FEAT_TYPE, test_file=False):
        prediction = model.predict_on_batch(val_set_X)
        prediction = np.round(prediction)
        correct += np.sum(prediction == val_set_y[:, None])
    return float(correct) / total

In [8]:
def top_n_words(pkl_file_name, N, shift=0):
    """
    Returns a dictionary of form {word:rank} of top N words from a pickle
    file which has a nltk FreqDist object generated by stats.py
    Args:
        pkl_file_name (str): Name of pickle file
        N (int): The number of words to get
        shift: amount to shift the rank from 0.
    Returns:
        dict: Of form {word:rank}
    """
    import pickle
    with open(pkl_file_name, 'rb') as pkl_file:
        freq_dist = pickle.load(pkl_file)
    most_common = freq_dist.most_common(N)
    words = {p[0]: i + shift for i, p in enumerate(most_common)}
    return words

In [9]:
def top_n_bigrams(pkl_file_name, N, shift=0):
    """
    Returns a dictionary of form {bigram:rank} of top N bigrams from a pickle
    file which has a Counter object generated by stats.py
    Args:
        pkl_file_name (str): Name of pickle file
        N (int): The number of bigrams to get
        shift: amount to shift the rank from 0.
    Returns:
        dict: Of form {bigram:rank}
    """
    import pickle
    with open(pkl_file_name, 'rb') as pkl_file:
        freq_dist = pickle.load(pkl_file)
    most_common = freq_dist.most_common(N)
    bigrams = {p[0]: i for i, p in enumerate(most_common)}
    return bigrams

In [10]:
def split_data(tweets, validation_split=0.1):
    """Split the data into training and validation sets
    Args:
        tweets (list): list of tuples
        validation_split (float, optional): validation split %
    Returns:
        (list, list): training-set, validation-set
    """
    index = int((1 - validation_split) * len(tweets))
    random.shuffle(tweets)
    return tweets[:index], tweets[index:]

In [11]:
unigrams = top_n_words(FREQ_DIST_FILE, UNIGRAM_SIZE)
if USE_BIGRAMS:
    bigrams = top_n_bigrams(BI_FREQ_DIST_FILE, BIGRAM_SIZE)
tweets = process_tweets(TRAIN_PROCESSED_FILE, test_file=False)
if TRAIN:
    train_tweets, val_tweets = split_data(tweets)
else:
    random.shuffle(tweets)
    train_tweets = tweets
del tweets
print ('Extracted features & training batches')
nb_epochs = 5
batch_size = 500
model = build_model()
n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size)))
best_val_acc = 0.0

for j in range(nb_epochs):
        i = 1
        for training_set_X, training_set_y in extract_features(train_tweets, feat_type=FEAT_TYPE, batch_size=batch_size, test_file=False):
            o = model.train_on_batch(training_set_X, training_set_y)
            sys.stdout.write('\rIteration %d/%d, loss:%.4f, acc:%.4f' %
                             (i, n_train_batches, o[0], o[1]))
            sys.stdout.flush()
            i += 1
        val_acc = evaluate_model(model, val_tweets)
        print ('\nEpoch: %d, val_acc:%.4f' % (j + 1, val_acc))
        random.shuffle(train_tweets)
        if val_acc > best_val_acc:
            print ('Accuracy improved from %.4f to %.4f, saving model' % (best_val_acc, val_acc))
            best_val_acc = val_acc
            model.save('best_model.h5')

Generated feature vectors
Extracted features & training batches
Iteration 58/58, loss:0.1858, acc:0.9520
Epoch: 1, val_acc:1.0194
Accuracy improved from 0.0000 to 1.0194, saving model
Iteration 58/58, loss:0.1213, acc:0.9760
Epoch: 2, val_acc:1.0278
Accuracy improved from 1.0194 to 1.0278, saving model
Iteration 58/58, loss:0.1066, acc:0.9780
Epoch: 3, val_acc:1.0332
Accuracy improved from 1.0278 to 1.0332, saving model
Iteration 58/58, loss:0.1077, acc:0.9740
Epoch: 4, val_acc:1.0369
Accuracy improved from 1.0332 to 1.0369, saving model
Iteration 58/58, loss:0.0953, acc:0.9780
Epoch: 5, val_acc:1.0416
Accuracy improved from 1.0369 to 1.0416, saving model


In [12]:
def save_results_to_csv(results, csv_file):
    ''' Save list of type [(tweet_id, positive)] to csv in Kaggle format '''
    with open(csv_file, 'w',encoding = 'utf-8') as csv:
        csv.write('id,prediction\n')
        for tweet_id, pred in results:
            csv.write(tweet_id)
            csv.write(',')
            csv.write(str(pred))
            csv.write('\n')

In [13]:
#Testing

print ('Testing')
del train_tweets
del model
model = load_model('best_model.h5')
test_tweets = process_tweets(TEST_PROCESSED_FILE, test_file=True)
n_test_batches = int(np.ceil(len(test_tweets) / float(batch_size)))
predictions = np.array([])
i = 1
for test_set_X, _ in extract_features(test_tweets, feat_type=FEAT_TYPE, batch_size=batch_size, test_file=True):
    prediction = np.round(model.predict_on_batch(test_set_X).flatten())
    predictions = np.concatenate((predictions, prediction))
    i += 1
predictions = [(str(j), int(predictions[j]))
                for j in range(len(test_tweets))]
save_results_to_csv(predictions, '1layerneuralnet.csv')
print ('\nSaved to Multi-layerneuralnet.csv')

Testing
Generated feature vectors

Saved to Multi-layerneuralnet.csv
