In [3]:
# imports
from collections import defaultdict
from nltk import RegexpTokenizer

In [4]:
# init bigram model for positive & negative comments
# init with 0 counts
# dictionaries to hold the bigram frequencies
model_p = defaultdict(lambda:defaultdict(lambda: 0))
model_n = defaultdict(lambda:defaultdict(lambda: 0))

# dictionaries to hold the probabilities for each bigram
prob_model_p = defaultdict(lambda:defaultdict(lambda: 0.))
prob_model_n = defaultdict(lambda:defaultdict(lambda: 0.))

# tokenizer to tokenize the comments & remove stop words
tokenizer = RegexpTokenizer(r"\w+")

# utility functions

# Returns the list of sample lines from csv
def get_dataset(path):
    lines = []
    with open(path) as file:
        lines = file.read().splitlines()
    return lines

# Creates bigrams using the text provided
def make_bigrams(comment):
    tokens = tokenizer.tokenize(comment.lower()) # convert to all lowercase
    return [(tokens[x], tokens[x+1]) for x in range(len(tokens) - 1)]

# utility functions end

In [6]:
# core functions

# calculates the probabilities for each token in model dictionaries
def calculate_probs():
    # positive prob
    for first_token in model_p:
        total_freq = float(sum(model_p[first_token].values()))
        for second_token in model_p[first_token]:
            # assign prob values to the model keys
            prob_model_p[first_token][second_token] = model_p[first_token][second_token] / total_freq  
    # negative prob
    for first_token in model_n:
        total_freq = float(sum(model_n[first_token].values()))
        for second_token in model_n[first_token]:
            # assign prob values to the model keys
            prob_model_n[first_token][second_token] = model_n[first_token][second_token] / total_freq

# makes the positive & negative models by the given csv
def make_models(path):
    #get line seperated samples
    data = get_dataset(path)
    
    for sample in data:
        #split the samples into comment & polarity
        # delimiter - '::'
        [comment, polarity] = sample.split('::')
    
        #1 = positive comments
        if (polarity == '1'):
            bigrams = make_bigrams(comment) # get the list of bigrams from comment
            # count the token frequencies for each bigram
            for bi in bigrams:
                model_p[bi[0]][bi[1]] += 1
                    
        #0 = negative comments
        if (polarity == '0'):
            bigrams = make_bigrams(comment)
            for bi in bigrams:
                model_n[bi[0]][bi[1]] += 1
                
    # finally create the probability models
    calculate_probs()
    

# calculates & displays the polarity of a given comment
def check_sentiment(comment):
    positive_score = 0.0
    negative_score = 0.0
    
    bigrams = make_bigrams(comment)
    for [first_token, second_token] in bigrams:
        positive_score = prob_model_p[first_token][second_token]
        negative_score = prob_model_n[first_token][second_token]
    
    print('Positive: ', positive_score)
    print('Negative: ', negative_score, '\n')
    
    if (positive_score > negative_score):
        print('Sentiment: Positive')
    elif (positive_score < negative_score):
        print('Sentiment: Negative')
    else:
        print('Sentiment: Neutral')

# calculates the perplexity of the model using the goven comment
def perplexity():
    
    # calculate the vocabulary of the corpus
    vocabulary_count = len(model_p) + len(model_n)
    
    for first in model_p:
        vocabulary_count += len(model_p[first])
    for first in model_n:
        vocabulary_count += len(model_n[first])
    
    print('Vocabulary: ', vocabulary_count)
    
    bigram_prob = 1.
    for first in prob_model_p:
        prob_vals = prob_model_p[first].values()
        for prob in prob_vals:
            if prob != 0:
                bigram_prob *= prob
                
    for first in prob_model_n:
        prob_vals = prob_model_n[first].values()
        for prob in prob_vals:
            if prob != 0:
                bigram_prob *= prob
        
    print('Bigram probability: ', bigram_prob, '\n')
    
    # calculate the perplexity
    pp = (1.0 / bigram_prob) ** (1. / vocabulary_count)
    
    print('perplexity: ', pp)

# core functions end

In [11]:
# execution

make_models('political.txt')

# exection end

ValueError: not enough values to unpack (expected 2, got 1)

In [175]:
# test

comment = 'me rata'
check_sentiment(comment)

# test end

Positive:  0.0
Negative:  0.07142857142857142 

Sentiment: Negative


In [176]:
# perplexity

perplexity()

# perplexity end

Vocalbulary:  1925 

1.3508135619079657e-216
perplexity:  1.2946129603986019
