In [1]:
import os
import re
import numpy as np
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
import six.moves.cPickle
from keras.utils.np_utils import to_categorical
from nltk.tokenize import TweetTokenizer
from nltk import Tree
from nltk.draw.util import CanvasFrame
from nltk.draw import TreeWidget

# Shared NLTK TweetTokenizer instance; preprocess() uses it to split text into tokens.
tweet_tokenizer = TweetTokenizer()


# Maps each negative contraction to a placeholder spelling in which the
# apostrophe is replaced by "_`_"; preprocess() substitutes keys with values.
# NOTE(review): presumably this matches the spelling used when the pickled
# tokenizer was fitted -- confirm against the training notebook.
negatives = {
    contraction: contraction.replace("'", "_`_")
    for contraction in (
        "didn't",
        "couldn't",
        "don't",
        "wouldn't",
        "doesn't",
        "wasn't",
        "weren't",
        "shouldn't",
        "isn't",
        "aren't",
    )
}

# Saved artefacts; both names are opened relative to the working directory.
MODEL_NAME = "model_movie_2017-03-10-09-440.001.hdf5"
TOKENIZER = "movie_2017-03-10-09-44_tokenizer"
# predict() passes this to pad_sequences as maxlen.
MAX_SEQUENCE_LENGTH = 110

# NOTE(review): unpickling can execute arbitrary code -- only load tokenizer
# files that come from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open(TOKENIZER, "rb") as tokenizer_file:
    tokenizer = six.moves.cPickle.load(tokenizer_file)
model = load_model(MODEL_NAME)

Using TensorFlow backend.


In [2]:
def preprocess(text):
    """Normalise raw review text before it is handed to the tokenizer.

    Steps, in order: lowercase the text, replace every ``<br />`` with a
    space, split it with ``tweet_tokenizer`` and re-join the tokens with
    single spaces, then replace each key of ``negatives`` with its value.

    Args:
        text: Raw input string.

    Returns:
        The normalised string.
    """
    lowered = text.lower().replace('<br />', ' ')
    normalised = ' '.join(tweet_tokenizer.tokenize(lowered))
    for contraction, placeholder in negatives.items():
        normalised = normalised.replace(contraction, placeholder)
    return normalised

In [3]:
def predict(text):
    """Score ``text`` with the loaded model.

    The text is run through ``preprocess``, converted to an index sequence by
    ``tokenizer``, passed through ``pad_sequences`` with
    ``maxlen=MAX_SEQUENCE_LENGTH`` and fed to ``model.predict_proba``.

    Args:
        text: Raw input string.

    Returns:
        Element ``[0][1]`` of the model output -- presumably the probability
        of the positive class (TODO confirm against the training label order).
    """
    cleaned = preprocess(text)
    padded = pad_sequences(
        tokenizer.texts_to_sequences([cleaned]),
        maxlen=MAX_SEQUENCE_LENGTH,
    )
    probabilities = model.predict_proba(padded, verbose=0)
    return probabilities[0][1]

In [4]:
# Sanity check: score one sample review with predict().
text = 'This show is a must have if you enjoy shows with family Michael J fox does a spectacular job playing Alex Keaton and the series finale is great'
predict(text)

0.99685258

In [5]:
# Bracketed constituency parse of "Good film, but I will not recommend it".
# Stanford online parser (presumably the source of this parse -- confirm):
#   http://nlp.stanford.edu:8080/parser/index.jsp
# Stanford's own sentiment-tree implementation (RNTN demo), for comparison:
#   http://nlp.stanford.edu:8080/sentiment/rntnDemo.html

tree = '''(ROOT
  (S
    (NP
      (NP (JJ Good) (NN film))
      (, ,)
      (PP (CC but)
        (NP (PRP I))))
    (VP (MD will) (RB not)
      (VP (VB recommend)
        (NP (PRP it))))))
'''

In [6]:
def generate_tree(x, name):
    """Score every node of a bracketed parse tree and save a drawing of it.

    ``x`` is parsed with ``parse_nested``; ``update_keys`` then overwrites
    each node label with the ``predict`` score of the text under that node.
    The relabelled tree is serialised with ``nested_list_to_tree``, drawn with
    NLTK's ``TreeWidget`` and written to ``<name>.ps``.

    Args:
        x: Parse tree in bracketed notation, e.g. ``"(S (NP (PRP it)))"``.
        name: Output path without extension; ``".ps"`` is appended.

    Returns:
        None. The only result is the PostScript file.
    """
    # Parse the argument rather than the notebook-level ``tree`` global, so
    # the function renders whatever tree the caller passes in.
    new_tree = parse_nested(x)
    update_keys(new_tree)  # relabels the nested lists in place
    bracketed = nested_list_to_tree(new_tree, '')
    # Build the Tree before any canvas exists, so a malformed string cannot
    # leave a frame behind.
    scored_tree = Tree.fromstring(bracketed)
    cf = CanvasFrame()
    try:
        tc = TreeWidget(cf.canvas(), scored_tree)
        cf.add_widget(tc, 10, 10)  # (10, 10) offsets
        cf.print_to_file('{}.ps'.format(name))
    finally:
        # Always destroy the frame, even when drawing or export fails.
        cf.destroy()

def parse_nested(text, left=r'[(]', right=r'[)]', sep=r' '):
    """Parse a bracketed string into nested lists.

    ``text`` is split on the ``left``, ``right`` and ``sep`` regular
    expressions. Every opening bracket starts a new list inside the list that
    is currently open, every closing bracket ends it, and every remaining
    whitespace-separated word is appended to the currently open list.

    Example:
        ``"(S (NP (PRP it)))"`` -> ``['S', ['NP', ['PRP', 'it']]]``

    Args:
        text: Bracketed string to parse.
        left: Regex matching an opening bracket.
        right: Regex matching a closing bracket.
        sep: Regex matching the separator between words.

    Returns:
        The first top-level element found in ``text`` (a nested list when the
        input starts with an opening bracket).

    Raises:
        ValueError: If the brackets are unbalanced or ``text`` has no content.
    """
    pat = r'({}|{}|{})'.format(left, right, sep)
    # stack[0] is a synthetic root; stack[-1] is always the list being filled.
    stack = [[]]
    # The capturing group makes re.split return the delimiters as tokens too.
    for token in re.split(pat, text):
        if not token or re.match(sep, token):
            continue
        if re.match(left, token):
            child = []
            stack[-1].append(child)
            stack.append(child)
        elif re.match(right, token):
            stack.pop()
            if not stack:
                raise ValueError('error: opening bracket is missing')
        else:
            # A token may still contain whitespace that ``sep`` does not match
            # (newlines, tabs). split() discards it and yields every word, so
            # each word is kept exactly once whatever the word count is.
            stack[-1].extend(token.split())
    if len(stack) > 1:
        raise ValueError('error: closing bracket is missing')
    root = stack.pop()
    if not root:
        # Empty or whitespace-only input: report it like the other malformed
        # inputs instead of letting an IndexError escape.
        raise ValueError('error: input contains no tree')
    return root[0]

def update_keys(node):
    """Replace every node label with the ``predict`` score of the node's text.

    Works bottom-up: the children are processed first, their texts are joined
    with single spaces, the joined text is passed to ``predict`` and the
    result overwrites ``node[0]``.

    Args:
        node: Either a leaf string or a list ``[label, child, ...]``; lists
            are modified in place.

    Returns:
        The leaf string itself, or the space-joined text of everything under
        a list node.
    """
    if not isinstance(node, str):
        span_text = ' '.join([update_keys(child) for child in node[1:]])
        node[0] = predict(span_text)
        return span_text
    return node  # leaf: the word itself

def nested_list_to_tree(nested_list, tree):
    """Serialise a nested list back into bracketed notation.

    A list ``[label, child, ...]`` becomes ``"(label " + children + ")"``,
    with the children written one after another without a separator; a leaf
    string is written as-is.

    Args:
        nested_list: Leaf string or nested list to serialise.
        tree: Prefix that the serialised text is appended to.

    Returns:
        ``tree`` followed by the bracketed form of ``nested_list``.
    """
    if not isinstance(nested_list, str):
        label, children = nested_list[0], nested_list[1:]
        pieces = [tree, "(", str(label), ' ']
        pieces.extend(nested_list_to_tree(child, '') for child in children)
        pieces.append(')')
        return ''.join(pieces)
    return tree + str(nested_list)  # leaf



In [7]:
# Draw the scored version of the parse tree defined above and write it to "new.ps".
generate_tree(tree, 'new')


In [35]:
# NOTE(review): stale cell. In the visible cells `a` is only assigned inside
# generate_tree(), so this raises NameError after Restart & Run All, and the
# saved output belongs to a different sentence than `tree`. Consider returning
# the bracketed string from generate_tree() or deleting this cell.
a

'(0.219898 (0.219898 (0.482649 (0.42032 This)(0.558865 movie))(0.271697 (0.451553 cares)(0.146194 (0.507136 about)(0.344677 (0.559251 (0.559251 cleverness))(0.492164 ,)(0.965898 (0.965898 wit))(0.303927 or)(0.694185 (0.396816 (0.389445 any)(0.485143 other)(0.489174 kind))(0.998065 (0.47998 of)(0.997173 (0.857307 intelligent)(0.845813 humor)))))))(0.492164 .)))'