In [1]:
def _load_word_set(path):
    """Read a one-entry-per-line lexicon file into a set of stripped words.

    Blank lines and lines starting with ';' are skipped so that they are not
    treated as sentiment words.
    NOTE(review): the ';' rule assumes the lexicon files carry a ';'-prefixed
    comment header (as the commonly distributed opinion-lexicon files do);
    it is harmless if they do not -- confirm against the data files.
    """
    with open(path, "r", encoding="ISO-8859-1") as lexicon:
        stripped_lines = (line.strip() for line in lexicon)
        return {
            word
            for word in stripped_lines
            if word and not word.startswith(";")
        }


positive_words = _load_word_set("data/positive-words.txt")
negative_words = _load_word_set("data/negative-words.txt")

In [2]:
from itertools import islice

import numpy as np
import pandas as pd
import json
import nltk
from nltk.stem import WordNetLemmatizer

In [3]:
def negate_sequence(words):
    """Flip GOOD/BAD placeholders that fall inside the scope of a negation.

    Each token is stripped of leading/trailing punctuation from ``?.,!:;``.
    A negation cue toggles the negation state and is itself dropped from the
    output; while the state is on, "GOOD" becomes "BAD" and vice versa.
    Any token containing one of the punctuation characters ends the scope.

    Args:
        words: iterable of string tokens.

    Returns:
        List of stripped tokens with negation cues and empty strings removed.
    """
    delims = "?.,!:;"
    # Cues are matched against the WHOLE stripped token. Substring matching
    # would treat ordinary words such as "know", "noodles" or "enough" as
    # negations (they all contain "no") and silently drop them.
    negation_words = {"not", "no", "never", "cannot"}
    negation = False
    result = []
    for word in words:
        stripped = word.strip(delims)
        negated = stripped
        if negation:
            if stripped == "GOOD":
                negated = "BAD"
            elif stripped == "BAD":
                negated = "GOOD"

        cue = stripped.lower()
        # The "n't" suffix covers both the standalone contraction token
        # ("n't") and unsplit forms such as "don't".
        if cue in negation_words or cue.endswith("n't"):
            negation = not negation
        else:
            result.append(negated)

        # Punctuation closes the current negation scope.
        if any(c in word for c in delims):
            negation = False

    # Pure-punctuation tokens strip down to "" and are discarded here.
    return [x for x in result if len(x) > 0]

In [4]:
def lemmatize_stem(text):
    """Return the WordNet lemma of a single token, treated as a verb.

    Despite the name, no stemming is applied: only lemmatization is done.

    Args:
        text: the token to lemmatize.

    Returns:
        The result of ``WordNetLemmatizer().lemmatize(text, pos='v')``.
    """
    # NOTE(review): callers pass noun-filtered tokens, yet pos='v' requests
    # verb lemmatization -- confirm whether pos='n' was intended.
    return WordNetLemmatizer().lemmatize(text, pos='v')

# Noun part-of-speech tags that replace_sentiments keeps after tagging.
allowed_pos = {"NN", "NNP", "NNS", "NNPS"}
# Sentiment placeholders that are kept regardless of their POS tag.
allowed_words = {"GOOD", "BAD"}

def replace_sentiments(text):
    """Reduce a text fragment to its nouns plus GOOD/BAD sentiment markers.

    Steps:
      1. Tokenize and lowercase; tokens found in ``positive_words`` become
         "GOOD", tokens found in ``negative_words`` become "BAD".
      2. Apply ``negate_sequence`` to flip markers under negation.
      3. POS-tag the result and keep tokens whose tag is in ``allowed_pos``
         or which are one of the ``allowed_words`` markers.
      4. Lemmatize each kept token with ``lemmatize_stem``.

    Args:
        text: the raw text fragment.

    Returns:
        The kept tokens joined by single spaces (possibly an empty string).
    """
    def _label(token):
        # The positive lexicon is checked first, then the negative one.
        lowered = token.lower()
        if lowered in positive_words:
            return "GOOD"
        if lowered in negative_words:
            return "BAD"
        return lowered

    labelled = negate_sequence([_label(tok) for tok in nltk.word_tokenize(text)])

    kept = [
        word
        for word, pos in nltk.pos_tag(labelled)
        if pos in allowed_pos or word in allowed_words
    ]

    return " ".join([lemmatize_stem(word) for word in kept])

In [5]:
def filter_sentiment(text):
    """Tell whether ``text`` is longer than one element and has a marker.

    Returns:
        True when ``len(text) > 1`` and "GOOD" or "BAD" occurs in ``text``;
        False otherwise.
    """
    return len(text) > 1 and ("GOOD" in text or "BAD" in text)

def map_sentiments(data):
    """Parse one JSON review and rate each period-delimited fragment.

    Args:
        data: JSON string of an object with a "text" field.

    Returns:
        The parsed dict, where "text" is replaced by a list of
        ``(fragment, rating)`` tuples and "tags" holds the list of ratings.
        Fragments come from splitting on "." and are all kept, including
        empty ones.
    """
    record = json.loads(data)
    fragments = record["text"].split(".")
    ratings = [getTag(replace_sentiments(fragment)) for fragment in fragments]
    record["text"] = list(zip(fragments, ratings))
    record["tags"] = ratings
    return record

In [6]:
from collections import Counter
def getTag(text):
    """Rate a space-separated token string by its GOOD/BAD marker balance.

    Args:
        text: string of tokens separated by single spaces.

    Returns:
        1 when "BAD" tokens outnumber "GOOD" tokens, 2 when the counts are
        equal (including when neither occurs), 3 when "GOOD" outnumbers "BAD".
    """
    counts = Counter(text.split(" "))
    good, bad = counts["GOOD"], counts["BAD"]
    if good < bad:
        return 1
    return 2 if good == bad else 3

In [8]:
file = 'restaurant_review.json'
MAX_LINES_TO_READ = 1000   # how many raw lines to load from the input file
MAX_DOCS_TO_TAG = 300      # how many of the loaded lines are actually tagged

# Each input line is later handed to json.loads by map_sentiments.
with open(file, encoding="utf-8") as f:
    # islice stops cleanly at end-of-file; calling next(f) a fixed number of
    # times raises StopIteration when the file has fewer lines than requested.
    data = [line.strip() for line in islice(f, MAX_LINES_TO_READ)]

processed_docs = list(map(map_sentiments, data[:MAX_DOCS_TO_TAG]))

# NOTE(review): append mode means re-running this cell duplicates every
# record in tags_auto.txt -- confirm whether "w" was intended.
with open("tags_auto.txt", "a", encoding="utf-8") as o:
    for x in processed_docs:
        o.write(json.dumps({"review_id": x["review_id"], "tags": x["tags"]}) + "\n")