In [39]:
import nltk
import pandas as pd
import numpy as np
import sys
import os

from data_loader import DataLoader

In [None]:
# Build a loader for the first data path exposed on DataLoader and read its table.
loader = DataLoader(DataLoader.data_path1)
table = loader.load_table()
# Preview the loaded table (presumably a pandas DataFrame — TODO confirm against DataLoader).
table.head()

In [4]:
# Sentence segmentation: pick a single review text and split it into sentences.
# NOTE(review): `[3]` is a label lookup if `table['reviewText']` is a pandas
# Series; it matches the 4th row only with a default integer index — confirm.
chunk = table['reviewText'][3]

sentences = nltk.sent_tokenize(chunk)
# Show the first three sentences as a sanity check.
sentences[:3]

['Cat Stevens at his peak with sparse sounds of acoustic guitar and his original band from Tea For The Tillerman.',
 'Good stuff.',
 'Several hits &#34;Moonshadow&#34;, &#34;Peace Train&#34;, &#34;How Can I Tell You&#34;, &#34;If I Laugh&#34;.']

In [5]:
# Word tokenization: split the first sentence of the review into tokens.
sentence = sentences[0]

tokens = nltk.word_tokenize(sentence)
# Show the first ten tokens.
tokens[:10]

['Cat',
 'Stevens',
 'at',
 'his',
 'peak',
 'with',
 'sparse',
 'sounds',
 'of',
 'acoustic']

In [6]:
# Part-of-speech tagging: pair every token with its POS tag.
tagged = nltk.pos_tag(tokens)
# Show the first five (token, tag) pairs.
tagged[:5]

[('Cat', 'NNP'),
 ('Stevens', 'NNP'),
 ('at', 'IN'),
 ('his', 'PRP$'),
 ('peak', 'NN')]

In [78]:

def is_word(str):
    """Return True when every character of ``str`` is alphabetic.

    An empty string yields True, because it contains no character that
    could fail the alphabetic check.
    """
    # NOTE: the parameter name shadows the built-in ``str``; it is kept as-is
    # so existing keyword callers keep working.
    return all(ch.isalpha() for ch in str)

def calculate_words(dataset):
    """Count alphabetic tokens across an iterable of text snippets.

    Every element of ``dataset`` is tokenized with ``nltk.word_tokenize``.
    Tokens rejected by ``is_word`` are skipped. Counting is case-sensitive
    (tokens are used as dictionary keys exactly as produced).

    Returns:
        tuple: ``(total_words, word_table)`` where ``total_words`` is the
        number of accepted tokens and ``word_table`` maps each accepted
        token to its number of occurrences.
    """
    counts = {}
    n_accepted = 0
    for text in dataset:
        for token in nltk.word_tokenize(text):
            if not is_word(token):
                continue
            n_accepted += 1
            # Missing tokens start at zero, so first sight yields a count of 1.
            counts[token] = counts.get(token, 0) + 1

    return n_accepted, counts

def get_prob_list(sum, word_table):
    """Convert raw counts into relative frequencies.

    Args:
        sum: Denominator applied to every count (the callers pass the total
            number of counted words). The name shadows the built-in ``sum``
            and is kept for interface compatibility.
        word_table: Mapping of word -> count.

    Returns:
        dict: A new mapping of word -> ``count / sum``, in the same key
        order as ``word_table``.
    """
    return {word: count / sum for word, count in word_table.items()}

def calculate_relative_entropy(prob_table1, prob_table2):
    """Score the words shared by two probability tables.

    For every word present in both tables, with ``p = prob_table1[word]``
    and ``q = prob_table2[word]``, two terms are computed:

    * ``p * ln(p / q)`` -- collected in the first returned list
    * ``q * ln(q / p)`` -- collected in the second returned list

    Words that appear in only one of the tables are ignored.

    Returns:
        tuple: ``(RE_list1, RE_list2)``, each a list of ``(word, score)``
        tuples sorted by score in descending order.
    """
    def by_score(pair):
        # Sort key: the numeric score of a (word, score) tuple.
        return pair[1]

    scores_1_vs_2 = []
    scores_2_vs_1 = []
    for word, p in prob_table1.items():
        if word not in prob_table2:
            continue
        q = prob_table2[word]
        scores_1_vs_2.append((word, p * np.log(p / q)))
        scores_2_vs_1.append((word, q * np.log(q / p)))

    ranked_1 = sorted(scores_1_vs_2, key=by_score, reverse=True)
    ranked_2 = sorted(scores_2_vs_1, key=by_score, reverse=True)
    return ranked_1, ranked_2

In [None]:
# Measure indicative words: load the review texts of the two datasets to compare.
# `loader` is rebound for the second dataset; only dataset1/dataset2 are kept.
loader = DataLoader(DataLoader.data_path1)
dataset1 = loader.load_review_text()

loader = DataLoader(DataLoader.data_path2)
dataset2 = loader.load_review_text()

In [79]:
# Count alphabetic tokens per dataset, then normalise the counts by each
# dataset's own total to get per-word relative frequencies.
sum1, word_table1 = calculate_words(dataset1)
sum2, word_table2 = calculate_words(dataset2)
prob_list1 = get_prob_list(sum1, word_table1)
prob_list2 = get_prob_list(sum2, word_table2)

# Rank the words shared by both datasets; ind_list1 scores dataset1 against
# dataset2 and ind_list2 the reverse, both sorted by score descending.
ind_list1, ind_list2 = calculate_relative_entropy(prob_list1, prob_list2)

In [84]:
# Show the first 20 (word, score) entries of ind_list1.
ind_list1[:20]

[('book', 0.04466063946225474),
 ('story', 0.02118670124060069),
 ('read', 0.01931629403493282),
 ('I', 0.014790792266521843),
 ('her', 0.011932724944625953),
 ('to', 0.01184530725887092),
 ('author', 0.010299360464959396),
 ('series', 0.009248887366531578),
 ('books', 0.006240926884376884),
 ('character', 0.005800458780082896),
 ('for', 0.005626339939518006),
 ('was', 0.005066617600304119),
 ('she', 0.004969438179202577),
 ('in', 0.004654059640483568),
 ('love', 0.004474370920164867),
 ('and', 0.004173016292200291),
 ('he', 0.003080925092165508),
 ('so', 0.003031492537429425),
 ('enjoyed', 0.0029086398786697548),
 ('plot', 0.002886732524653327)]