In [1]:
from methods import *
from collections import Counter
import math
import pandas as pd
import nltk
nltk.download('punkt_tab')
from nltk import word_tokenize

[nltk_data] Downloading package tagsets_json to
[nltk_data]     /Users/tscreven/nltk_data...
[nltk_data]   Package tagsets_json is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/tscreven/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# Notebook-wide configuration.
# Dataset identifiers returned by basenames() (presumably provided by the
# `methods` star import); each entry is later passed to get_decimated_name()
# and evaluated with run().
BASE_NAMES = basenames()
# Passed as the second argument to get_data(); presumably the vote-count
# cutoff used when loading/labelling reviews — TODO confirm against `methods`.
VOTE_THRESHOLD = 1
# Passed as the third argument to split_data(); presumably the fraction of
# reviews assigned to the training split — TODO confirm against `methods`.
SPLIT = 0.8

In [3]:
def get_parameters(filename):
    """Estimate the Naive Bayes parameters for one review file.

    The reviews are loaded with ``get_data(filename, VOTE_THRESHOLD)`` and
    divided by ``split_data(..., SPLIT)``. Class priors and per-class token
    counts are computed from the training portion only.

    Returns:
        An 8-tuple ``(helpful_fd, unhelpful_fd, prob_helpful, prob_unhelpful,
        test_text, test_labels, n_train, n_test)`` where the two ``*_fd``
        values are ``nltk.FreqDist`` token counts for each class and
        ``n_train`` / ``n_test`` are the number of reviews in each split.
    """
    all_text, all_labels = get_data(filename, VOTE_THRESHOLD)
    train_text, train_labels, test_text, test_labels = split_data(all_text, all_labels, SPLIT)

    # The positive (helpful) class is labelled 1, so the label mean is its prior.
    prob_helpful = sum(train_labels) / len(train_labels)
    prob_unhelpful = 1 - prob_helpful

    # Accumulate token counts separately for each class.
    helpful_fd, unhelpful_fd = nltk.FreqDist(), nltk.FreqDist()
    for review, is_helpful in zip(train_text, train_labels):
        review_tokens = word_tokenize(review)
        class_fd = helpful_fd if is_helpful else unhelpful_fd
        class_fd.update(review_tokens)

    n_train, n_test = len(train_text), len(test_text)
    return (
        helpful_fd,
        unhelpful_fd,
        prob_helpful,
        prob_unhelpful,
        test_text,
        test_labels,
        n_train,
        n_test,
    )

In [4]:
def class_log_probability(prior_prob, fd, class_num_words, review_tokens, total_vocab_size):
    """Return the unnormalized log-probability of one class for a review.

    The score is ``log(prior_prob)`` plus, for every token in
    ``review_tokens``, the log of its add-one smoothed likelihood
    ``(fd[token] + 1) / (class_num_words + total_vocab_size)``.

    Args:
        prior_prob: Prior probability of the class (must be > 0 for ``log``).
        fd: Mapping from token to its count in the class; looked up as ``fd[token]``.
        class_num_words: Total token count for the class.
        review_tokens: Iterable of tokens from the review being scored.
        total_vocab_size: Vocabulary size used in the smoothing denominator.
    """
    smoothing_denominator = class_num_words + total_vocab_size
    log_terms = [math.log(prior_prob)]
    log_terms.extend(
        math.log((fd[token] + 1) / smoothing_denominator)
        for token in review_tokens
    )
    # All terms are collected first and added with a single sum() call
    # instead of being accumulated one at a time.
    return sum(log_terms)

In [5]:
def normalize_log_probabilities(helpful_log_prob, unhelpful_log_prob):
    """Turn two unnormalized class log-probabilities into P(helpful).

    With ``r = exp(helpful_log_prob - unhelpful_log_prob)`` the normalized
    probability of the helpful class is ``r / (r + 1)``. Only the difference
    of the two log-probabilities is exponentiated.

    Args:
        helpful_log_prob: Unnormalized log-probability of the helpful class.
        unhelpful_log_prob: Unnormalized log-probability of the unhelpful class.

    Returns:
        The normalized probability of the helpful class as a float. When the
        helpful class dominates so strongly that ``exp`` overflows, 1.0.

    Raises:
        TypeError: If the inputs are not real numbers; only the expected
            overflow is handled here, other errors propagate to the caller.
    """
    log_odds = helpful_log_prob - unhelpful_log_prob
    try:
        odds = math.exp(log_odds)
    except OverflowError:
        # The odds are too large for a float: the helpful class has
        # effectively all of the probability mass.
        return 1.0
    if math.isinf(odds):
        # exp(inf) returns inf without raising; inf / (inf + 1) would be nan.
        return 1.0
    return odds / (odds + 1)

In [6]:
def run(filename, top_fraction=0.05):
    """Train and evaluate the Naive Bayes helpfulness classifier on one file.

    Parameters are estimated by ``get_parameters(filename)``. Every test
    review is scored with its normalized probability of being helpful, the
    reviews are ranked by that score, and precision is measured over the
    highest-scored ``top_fraction`` of them.

    Args:
        filename: Review file handed to ``get_parameters``.
        top_fraction: Fraction of the ranked test reviews over which precision
            is computed. Defaults to 0.05 (the top 5%).

    Returns:
        ``(precision_top, prob_helpful, train_length, test_length)`` where
        ``precision_top`` is the share of label-1 reviews among the top-ranked
        slice and ``prob_helpful`` is the training prior of the helpful class.

    Raises:
        ValueError: If the test split contains no reviews, since precision is
            undefined in that case.
    """
    (helpful_fd, unhelpful_fd, prob_helpful, prob_unhelpful,
     test_text, test_labels, train_length, test_length) = get_parameters(filename)

    # Smoothing vocabulary: every distinct token seen in either class.
    total_vocab_size = len(set(helpful_fd) | set(unhelpful_fd))
    helpful_class_size = sum(helpful_fd.values())
    unhelpful_class_size = sum(unhelpful_fd.values())

    scored = []  # (P(helpful | review), true label) for each test review
    for rev_text, label in zip(test_text, test_labels):
        tokens = word_tokenize(rev_text)
        helpful_log_prob = class_log_probability(
            prob_helpful, helpful_fd, helpful_class_size, tokens, total_vocab_size)
        unhelpful_log_prob = class_log_probability(
            prob_unhelpful, unhelpful_fd, unhelpful_class_size, tokens, total_vocab_size)
        helpful_prob = normalize_log_probabilities(helpful_log_prob, unhelpful_log_prob)
        scored.append((helpful_prob, label))

    if not scored:
        raise ValueError(f"no test reviews available for {filename!r}; cannot compute precision")

    # Stable sort, highest score first, so tied reviews keep their original order.
    scored.sort(key=lambda pair: pair[0], reverse=True)

    # Always evaluate at least one review (small test sets would otherwise
    # yield an empty slice and a division by zero) and never more than exist.
    top_size = min(len(scored), max(1, int(len(scored) * top_fraction)))
    true_positives = sum(1 for _, label in scored[:top_size] if label == 1)
    precision_top = true_positives / top_size
    return precision_top, prob_helpful, train_length, test_length

In [None]:
# Evaluate every dataset and collect the per-dataset metrics, in BASE_NAMES order.
precisions, prior_helpful_probs, train_sizes, test_sizes = [], [], [], []

for base in BASE_NAMES:
    decimated_name = get_decimated_name(base)
    prec, prob_helpful, train_len, test_len = run(decimated_name)
    print(prec)
    precisions.append(prec)
    prior_helpful_probs.append(prob_helpful)
    train_sizes.append(train_len)
    test_sizes.append(test_len)

# Improvement of the top-slice precision over the helpful-class prior,
# expressed as an absolute difference and as a ratio.
delta = [precision - prior for precision, prior in zip(precisions, prior_helpful_probs)]
ratio = [precision / prior for precision, prior in zip(precisions, prior_helpful_probs)]


0.9666666666666667
0.6904761904761905
0.6187563710499491
0.4834503510531595
0.6586586586586587
0.8176352705410822
0.4164164164164164
0.3863863863863864
0.3569868995633188
0.6176176176176176
0.20689655172413793
0.5256281407035176
0.5205205205205206
0.6090534979423868
0.35035035035035034
0.5181518151815182
0.8260869565217391
0.6766766766766766
