# Neural networks

## Sentiment analysis with Scikit-learn

Predict album review sentiment from the [Kaggle Pitchfork dataset](https://www.kaggle.com/nolanbconaway/pitchfork-data/data).

This notebook needs three additional libraries, which can be installed with conda:

```
conda install keras tensorflow python-slugify
```

In [1]:
# Imports and setup
import gensim
import multiprocessing as mp
import nltk
import numpy as np
import os
import pickle
import sqlite3
import unicodedata

from gensim.matutils import sparse2full, full2sparse, full2sparse_clipped, scipy2scipy_clipped
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from itertools import groupby
from keras.layers import Dense, Dropout, Activation, LSTM
from keras.layers.embeddings import Embedding
from keras.models import load_model, Sequential
from keras.preprocessing import sequence
from keras.wrappers.scikit_learn import KerasClassifier
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.api import CorpusReader
from nltk.chunk import tree2conlltags
from nltk.probability import FreqDist
from nltk.chunk.regexp import RegexpParser
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer
from slugify import slugify
from unicodedata import category as unicat

# Local paths. These are relative paths, so they resolve against the current
# working directory of the notebook.
# Root directory of the Pitchfork data.
pitchfork_dir = os.path.join('..', 'data', 'pitchfork')
# SQLite database file inside the data directory.
sql_file = os.path.join(pitchfork_dir, 'database.sqlite')
# Directory of pickled reviews; used below as the corpus root (`cpath`).
pickle_dir = os.path.join(pitchfork_dir, 'pickled')
# Path for a pickled classifier (not referenced by the cells shown here).
pitchfork_model = os.path.join(pitchfork_dir, 'pitchfork_clf.pkl')

Using TensorFlow backend.


### Readers and utilities

In [2]:
# Regular expression selecting the corpus files: names ending in ".pickle"
# that do not start with a dot.
PKL_PATTERN = r'(?!\.)[\w\s\d\-]+\.pickle'


class PickledReviewsReader(CorpusReader):
    """
    Corpus reader over a directory of pickled reviews.

    Every file matched by ``fileids`` is unpickled on demand. ``reviews`` and
    ``scores`` unpack each unpickled object into two items, so each file must
    hold a ``(text, score)`` pair. The remaining accessors walk progressively
    deeper into the nested ``text`` (presumably paragraphs -> sentences ->
    (token, tag) tuples; confirm against the code that wrote the pickles).
    """

    def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Initialise the underlying NLTK corpus reader.

        :param root: corpus root directory
        :param fileids: file ids, or a regular expression selecting them
        :param kwargs: forwarded unchanged to ``CorpusReader``
        """
        super().__init__(root, fileids, **kwargs)

    def texts_scores(self, fileids=None):
        """
        Lazily yield the unpickled content of each corpus file, one file at
        a time, so only a single document is held in memory.
        """
        for path, _encoding, _fileid in self.abspaths(fileids, True, True):
            with open(path, 'rb') as handle:
                # NOTE: unpickling can execute arbitrary code; only point
                # this reader at pickle files you created yourself.
                yield pickle.load(handle)

    def reviews(self, fileids=None):
        """
        Yield the text half of every (text, score) pair.
        """
        yield from (text for text, _score in self.texts_scores(fileids))

    def scores(self, fileids=None):
        """
        Yield the score half of every (text, score) pair.
        """
        yield from (score for _text, score in self.texts_scores(fileids))

    def paras(self, fileids=None):
        """
        Yield each element of each review (one level of nesting removed).
        """
        for review in self.reviews(fileids):
            yield from review

    def sents(self, fileids=None):
        """
        Yield each element of each paragraph.
        """
        for paragraph in self.paras(fileids):
            yield from paragraph

    def tagged(self, fileids=None):
        """
        Yield each element of each sentence.
        """
        for sentence in self.sents(fileids):
            yield from sentence

    def words(self, fileids=None):
        """
        Yield the first item of every element produced by ``tagged``.
        """
        yield from (pair[0] for pair in self.tagged(fileids))

### Transformers

In [3]:
class TextNormalizer(BaseEstimator, TransformerMixin):
    """
    Scikit-learn transformer that turns tagged documents into plain strings.

    Each document is iterated as sentences of ``(token, tag)`` pairs.
    Punctuation-only tokens and stopwords are dropped, the remaining tokens
    are lemmatized with WordNet and lowercased, and the result is joined
    with single spaces.
    """

    def __init__(self, language='english'):
        """
        :param language: name of the NLTK stopword list to load
        """
        # Keep the constructor argument as an attribute of the same name:
        # scikit-learn's get_params()/clone() read it back by name when the
        # estimator is copied (e.g. inside cross_val_score).
        self.language = language
        self.stopwords = set(nltk.corpus.stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer()

    def is_punct(self, token):
        """
        Return True when every character of ``token`` is in a Unicode
        punctuation category (vacuously True for an empty token).
        """
        return all(
            unicodedata.category(char).startswith('P') for char in token
        )

    def is_stopword(self, token):
        """
        Return True when the lowercased token is in the stopword set.
        """
        return token.lower() in self.stopwords

    def normalize(self, document):
        """
        Return the list of lowercased lemmas of one document, skipping
        punctuation and stopwords.
        """
        return [
            self.lemmatize(token, tag).lower()
            for sentence in document
            for (token, tag) in sentence
            if not self.is_punct(token)
               and not self.is_stopword(token)
        ]

    def lemmatize(self, token, pos_tag):
        """
        Lemmatize ``token`` using the WordNet part of speech selected by the
        first letter of ``pos_tag``; anything unrecognised is treated as a
        noun.
        """
        # Slice rather than index so an empty tag falls back to NOUN instead
        # of raising IndexError.
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(pos_tag[:1], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)

    def fit(self, documents, y=None):
        """
        Nothing is learned from the data; returns ``self``.
        """
        return self

    def transform(self, documents):
        """
        Return one space-joined string of normalized tokens per document.
        """
        return [
            ' '.join(self.normalize(doc)) for doc in documents
        ]


class GensimDoc2Vectorizer(BaseEstimator, TransformerMixin):
    """
    Transformer that represents each document by a gensim Doc2Vec vector.

    Nothing is learned in ``fit``; a new Doc2Vec model is built from the
    documents handed to every ``transform`` call.
    """

    def __init__(self, size=5, min_count=3):
        """
        :param size: forwarded to ``Doc2Vec`` as ``size``
        :param min_count: forwarded to ``Doc2Vec`` as ``min_count``
        """
        self.min_count = min_count
        self.size = size

    def fit(self, documents, labels=None):
        """
        No-op; returns ``self``.
        """
        return self

    def transform(self, documents):
        """
        Tag every document as ``d<index>``, build a Doc2Vec model on the
        tagged documents and return its document vectors as a numpy array.
        """
        tagged = []
        for position, tokens in enumerate(documents):
            label = "d{}".format(position)
            tagged.append(TaggedDocument(tokens, [label]))

        doc2vec = Doc2Vec(tagged, size=self.size, min_count=self.min_count)
        vectors = list(doc2vec.docvecs)
        return np.array(vectors)

class GensimTfidfVectorizer(BaseEstimator, TransformerMixin):
    """
    TF-IDF vectorizer backed by a gensim ``Dictionary`` and ``TfidfModel``.

    The fitted lexicon and model are persisted to ``lexigram.dict`` and
    ``tfidf.model`` (relative paths), and are reloaded from there on
    construction when those files exist.
    """

    def __init__(self, nfeatures=100, tofull=False):
        """
        :param nfeatures: maximum lexicon size (used as ``prune_at`` and as
            ``keep_n`` when fitting)
        :param tofull: set to True if the next step is a Scikit-Learn
            estimator (dense vectors); keep False if the next step is a
            gensim model (sparse ``(id, weight)`` lists)
        """
        self._lexicon_path = "lexigram.dict"
        self._tfidf_path = "tfidf.model"
        self.nfeatures = nfeatures
        self.lexicon = None
        self.tfidf = None
        self.tofull = tofull

        # Pick up a previously saved lexicon/model, if present.
        self.load()

    def load(self):
        """
        Load the lexicon and the TF-IDF model from disk when their files
        exist; attributes whose file is missing are left untouched.
        """
        if os.path.exists(self._lexicon_path):
            self.lexicon = gensim.corpora.Dictionary.load(self._lexicon_path)

        if os.path.exists(self._tfidf_path):
            # load() is a classmethod, so call it on the class rather than
            # constructing a throwaway empty model first.
            self.tfidf = gensim.models.TfidfModel.load(self._tfidf_path)

    def save(self):
        """
        Write the current lexicon and TF-IDF model to their files.
        """
        self.lexicon.save(self._lexicon_path)
        self.tfidf.save(self._tfidf_path)

    def fit(self, documents, labels=None):
        """
        Build the lexicon and the TF-IDF model from ``documents`` and save
        both to disk.

        NOTE: ``documents`` is iterated twice (once for the dictionary, once
        for the bag-of-words corpus), so it must be re-iterable, not a
        one-shot generator.

        :return: ``self``
        """
        self.lexicon = gensim.corpora.Dictionary(documents, prune_at=self.nfeatures)
        self.lexicon.filter_extremes(keep_n=self.nfeatures)
        self.lexicon.compactify()
        self.tfidf = gensim.models.TfidfModel(
            [self.lexicon.doc2bow(doc) for doc in documents],
            id2word=self.lexicon
        )
        self.save()
        return self

    def transform(self, documents):
        """
        Return the TF-IDF representation of every document as a numpy array.

        With ``tofull=True`` every row is a dense vector of lexicon length.
        NOTE(review): with ``tofull=False`` the rows are sparse lists of
        differing length, so ``np.array`` cannot build a regular matrix from
        them; confirm that downstream consumers expect that.
        """
        vectors = []
        for document in documents:
            vec = self.tfidf[self.lexicon.doc2bow(document)]
            if self.tofull:
                vec = sparse2full(vec, len(self.lexicon))
            vectors.append(vec)
        return np.array(vectors)


class KeyphraseExtractor(BaseEstimator, TransformerMixin):
    """
    Extract adverbial and adjective phrases, and transform
    documents into lists of these keyphrases, with a total
    keyphrase lexicon limited by the nfeatures parameter
    and a document length limited/padded to doclen.

    The keyphrase lexicon (phrase -> integer id) is learned in ``fit`` and
    stored as ``lexicon_``, so the same ids are used for every later
    ``transform`` call. If ``transform`` is called on an unfitted instance
    the lexicon is built from the documents being transformed.
    """
    def __init__(self, nfeatures=100000, doclen=60):
        """
        :param nfeatures: maximum number of keyphrases kept in the lexicon
        :param doclen: length every output sequence is padded/truncated to
        """
        self.grammar = r'KT: {(<RB.> <JJ.*>|<VB.*>|<RB.*>)|(<JJ> <NN.*>)}'
        # Alternative grammars:
        # self.grammar = r'KT: {(<RB.*> <VB.>|<RB.>|<JJ.> <NN.*>)}'
        # self.grammar = r'KT: {<RB.>|<JJ.>}'
        self.chunker = RegexpParser(self.grammar)
        self.nfeatures = nfeatures
        self.doclen = doclen

    def normalize(self, sent):
        """
        Removes punctuation from a tokenized/tagged sentence and
        lowercases words.
        """
        is_punct = lambda word: all(unicat(c).startswith('P') for c in word)
        return [
            (item[0].lower(), item[1])
            for item in sent
            if not is_punct(item[0])
        ]

    def extract_candidate_phrases(self, sents):
        """
        For a document, parse sentences using our chunker created by
        our grammar, converting the parse tree into a tagged sequence.
        Extract phrases, rejoin with a space, and yield the keyphrases
        of the document one at a time.
        """
        for sent in sents:
            sent = self.normalize(sent)
            if not sent:
                # Nothing left after stripping punctuation.
                continue
            chunks = tree2conlltags(self.chunker.parse(sent))
            # Consecutive tokens whose chunk tag is not 'O' form one phrase.
            phrases = [
                " ".join(word for word, pos, chunk in group).lower()
                for key, group in groupby(
                    chunks, lambda term: term[-1] != 'O'
                ) if key
            ]
            for phrase in phrases:
                yield phrase

    def _keydocs(self, documents):
        """
        Return, for every document, the list of its candidate keyphrases.
        """
        return [list(self.extract_candidate_phrases(doc)) for doc in documents]

    def _encode(self, keydocs, lexicon):
        """
        Map keyphrase lists to id lists and pad/truncate them to ``doclen``.
        """
        clipped = [self.clip(doc, lexicon) for doc in keydocs]
        return sequence.pad_sequences(clipped, maxlen=self.doclen)

    def fit(self, documents, y=None):
        """
        Learn the keyphrase lexicon from ``documents``.

        :return: ``self``
        """
        self.lexicon_ = self.get_lexicon(self._keydocs(documents))
        return self

    def fit_transform(self, documents, y=None, **fit_params):
        """
        Learn the lexicon and encode ``documents`` in one pass, so the
        keyphrases are extracted only once.
        """
        keydocs = self._keydocs(documents)
        self.lexicon_ = self.get_lexicon(keydocs)
        return self._encode(keydocs, self.lexicon_)

    def get_lexicon(self, keydocs):
        """
        Build a lexicon of size nfeatures: the most frequent keyphrases,
        numbered from 1 in order of decreasing frequency.
        """
        keyphrases = [keyphrase for doc in keydocs for keyphrase in doc]
        fdist = FreqDist(keyphrases)
        counts = fdist.most_common(self.nfeatures)
        lexicon = [phrase for phrase, count in counts]
        return {phrase: idx+1 for idx, phrase in enumerate(lexicon)}

    def clip(self, keydoc, lexicon):
        """
        Replace keyphrases by their lexicon ids, dropping keyphrases
        that aren't in the lexicon.
        """
        return [lexicon[keyphrase] for keyphrase in keydoc
                if keyphrase in lexicon]

    def transform(self, documents):
        """
        Encode ``documents`` as padded sequences of keyphrase ids.

        Uses the lexicon learned by ``fit`` when there is one; otherwise
        builds the lexicon from ``documents`` themselves.
        """
        keydocs = self._keydocs(documents)
        lexicon = getattr(self, 'lexicon_', None)
        if lexicon is None:
            lexicon = self.get_lexicon(keydocs)
        return self._encode(keydocs, lexicon)

### Cross-validate review sentiment

In [4]:
# Scoring function and utilities
def documents(corpus):
    """
    Collect everything yielded by ``corpus.reviews()`` into a list.
    """
    return [review for review in corpus.reviews()]

def continuous(corpus):
    """
    Collect everything yielded by ``corpus.scores()`` into a list.
    """
    return [score for score in corpus.scores()]

def make_categorical(corpus):
    """
    Bin the corpus scores into ordinal classes with ``np.digitize``.

    ``np.digitize`` is called with its default ``right=False``, so every
    bin includes its lower edge and excludes its upper edge:

    1 (terrible) : 0.0 <= y < 3.0
    2 (okay)     : 3.0 <= y < 5.0
    3 (great)    : 5.0 <= y < 7.0
    4 (amazing)  : 7.0 <= y < 10.1

    :param corpus: corpus whose scores are read through ``continuous``
    :return: numpy array holding one bin index per score
    """
    bin_edges = [0.0, 3.0, 5.0, 7.0, 10.1]
    scores = continuous(corpus)
    return np.digitize(scores, bin_edges)

def train_model(path, model, continuous=True, saveto=None, cv=12):
    """
    Trains model from corpus at specified path; constructing cross-validation
    scores using the cv parameter, then fitting the model on the full data and
    writing it to disk at the saveto path if specified. Returns the scores.

    :param path: root directory handed to ``PickledReviewsReader``
    :param model: estimator to cross-validate and fit
    :param continuous: if True, regress on the raw scores (scored with
        ``r2``); otherwise classify the binned scores (scored with
        ``f1_weighted``)
    :param saveto: optional path the fitted model is dumped to with joblib
    :param cv: forwarded to ``cross_val_score``
    :return: the cross-validation scores
    """
    # Load the corpus data and labels
    corpus = PickledReviewsReader(path)
    X = documents(corpus)
    if continuous:
        # The boolean `continuous` parameter shadows the module-level
        # continuous() helper inside this function, so calling it here would
        # call a bool. Read the raw scores straight from the corpus instead.
        y = list(corpus.scores())
        scoring = 'r2'
    else:
        y = make_categorical(corpus)
        scoring = 'f1_weighted'

    # Compute cross validation scores
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)

    # Fit the model on entire data set
    model.fit(X, y)

    # Write to disk if specified
    if saveto:
        joblib.dump(model, saveto)

    return scores

## Deep learning with Keras and TensorFlow

In [5]:
# Functions and settings
# Feature count: the input width of build_nn(), the TfidfVectorizer
# max_features in the pipeline below, and (plus one) the Embedding
# vocabulary size in build_lstm().
N_FEATURES = 10000
# Sequence length given to the Embedding layer in build_lstm().
DOC_LEN = 500
# Width of the output layer of both networks.
N_CLASSES = 4

def build_nn():
    """
    Build and compile the feed-forward classifier.

    Two ReLU hidden layers (500 and 150 units) over an N_FEATURES-wide
    input, followed by a softmax output of N_CLASSES units.

    :return: compiled Keras neural network model
    """
    layers = [
        Dense(500, activation='relu', input_shape=(N_FEATURES,)),
        Dense(150, activation='relu'),
        Dense(N_CLASSES, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

def build_lstm():
    """
    Build and compile the LSTM classifier.

    An embedding over N_FEATURES+1 ids and sequences of DOC_LEN steps feeds
    a 200-unit LSTM (with dropout before and after), followed by an output
    layer of N_CLASSES units.

    :return: compiled Keras neural network model
    """
    lstm = Sequential()
    lstm.add(Embedding(N_FEATURES+1, 128, input_length=DOC_LEN))
    lstm.add(Dropout(0.4))
    lstm.add(LSTM(units=200, recurrent_dropout=0.2, dropout=0.2))
    lstm.add(Dropout(0.2))
    # The classes are mutually exclusive, so the output must be a probability
    # distribution: softmax (as in build_nn), not independent sigmoids.
    lstm.add(Dense(N_CLASSES, activation='softmax'))
    lstm.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )
    return lstm

def train_nn_model(path, model, reader, saveto=None, cv=12, **kwargs):
    """
    Trains model from corpus at specified path; computing cross-validation
    accuracy, then fitting the model on the full data and writing it to
    disk if saveto is specified. Returns the scores.

    :param path: corpus root handed to ``reader``
    :param model: pipeline whose last step wraps a Keras model (exposed
        as its ``model`` attribute after fitting)
    :param reader: callable that builds the corpus from ``path``
    :param saveto: optional dict with the keys ``'keras_model'`` and
        ``'sklearn_pipe'`` giving the two output paths
    :param cv: forwarded to ``cross_val_score``
    :param kwargs: accepted for compatibility; currently unused
    :return: the cross-validation scores
    """
    # Load the corpus data and labels for classification
    corpus = reader(path)
    X = documents(corpus)
    y = make_categorical(corpus) # for Pitchfork

    # Compute cross validation scores
    # mp note: http://scikit-learn.org/stable/faq.html#why-do-i-sometime-get-a-crash-freeze-with-n-jobs-1-under-osx-or-linux
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')

    # Fit the model on entire data set
    model.fit(X, y)

    # Write to disk if specified
    if saveto:
        # have to save the keras part using keras' save method
        keras_step = model.steps.pop(-1)
        try:
            keras_step[1].model.save(saveto['keras_model'])
            # ... and use joblib to save the rest of the pipeline
            joblib.dump(model, saveto['sklearn_pipe'])
        finally:
            # Put the estimator back so the caller's pipeline stays usable
            # after saving.
            model.steps.append(keras_step)

    return scores

In [8]:
%%time
# Build a Keras Sequential model for the Pitchfork reviews
# Corpus root: the directory of pickled reviews.
cpath = pickle_dir
# Output paths: the Keras network and the remaining scikit-learn steps are
# saved separately by train_nn_model.
mpath = {
    'keras_model'  : os.path.join(pitchfork_dir, 'keras_nn.h5'),
    'sklearn_pipe' : os.path.join(pitchfork_dir, 'nn_pipeline.pkl')
}

# Normalize the tagged reviews to strings, vectorize them with TF-IDF and
# classify with the network returned by build_nn.
# NOTE(review): build_nn fixes the input width at N_FEATURES; presumably the
# corpus vocabulary is at least that large, otherwise the TF-IDF matrix
# would be narrower - confirm.
pipeline = Pipeline([
    ('norm', TextNormalizer()),
    ('vect', TfidfVectorizer(max_features=N_FEATURES)), # need to control feature count
    ('nn', KerasClassifier(build_fn=build_nn, # pass but don't call the function!
                           epochs=10,
                           batch_size=128))
])

# cv=4 cross-validation fits followed by one fit on the full data.
scores = train_nn_model(cpath, pipeline, PickledReviewsReader, saveto=mpath, cv=4)
for idx, score in enumerate(scores):
    print('Accuracy on slice #{}: {}.'.format((idx+1), score))
# NOTE(review): list(mpath) is the list of dict keys, so this prints the key
# names rather than the file paths; list(mpath.values()) would print the paths.
print('Model saved to {}.'.format(list(mpath)))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy on slice #1: 0.6922435362802335.
Accuracy on slice #2: 0.6894634417570198.
Accuracy on slice #3: 0.699666295884316.
Accuracy on slice #4: 0.696329254727475.
Model saved to ['keras_model', 'sklearn_pipe'].
CPU times: user 50min 10s, sys: 1min 9s, total: 51min 20s
Wall time: 21min 40s
