## BLEU (version 1)

In [2]:
import collections

import math


def _get_ngrams(segment, max_order):

    ngram_counts = collections.Counter()

    for order in range(1, max_order + 1):

        for i in range(0, len(segment) - order + 1):
            ngram = tuple(segment[i:i+order])
            ngram_counts[ngram] += 1

    return ngram_counts





def compute_bleu(reference_corpus, translation_corpus, max_order=2, smooth=False):
    matches_by_order = [0] * max_order
    possible_matches_by_order = [0] * max_order
    reference_length = 0
    translation_length = 0

    for (references, translation) in zip(reference_corpus, translation_corpus):
        reference_length += min(len(r) for r in references)
        translation_length += len(translation)

        merged_ref_ngram_counts = collections.Counter()
        
        for reference in references:
            merged_ref_ngram_counts |= _get_ngrams(reference, max_order)

        #print("ref: ", references,"\n hypho: ", translation, "\n")
        translation_ngram_counts = _get_ngrams(translation, max_order)
        overlap = translation_ngram_counts & merged_ref_ngram_counts
       
        for ngram in overlap:
            matches_by_order[len(ngram)-1] += overlap[ngram]

        for order in range(1, max_order+1):
            possible_matches = len(translation) - order + 1

            if possible_matches > 0:
                possible_matches_by_order[order-1] += possible_matches



    precisions = [0] * max_order

    for i in range(0, max_order):

        if smooth:
            precisions[i] = ((matches_by_order[i] + 1.) /(possible_matches_by_order[i] + 1.))

        else:

            if possible_matches_by_order[i] > 0:
                 precisions[i] = (float(matches_by_order[i]) / possible_matches_by_order[i])

            else:
                precisions[i] = 0.0

    if min(precisions) > 0:
        p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
        geo_mean = math.exp(p_log_sum)

    else:
        geo_mean = 0

    ratio = float(translation_length) / reference_length

    if ratio > 1.0:
        bp = 1.
        
    else:
        bp = math.exp(1 - 1. / ratio)


    bleu = geo_mean * bp

    return (bleu, precisions, bp, ratio, translation_length, reference_length)

### Read the results

In [107]:
with open('a_side.txt') as f:
    lines = f.read()

In [22]:
with open('a_228.txt', encoding="utf8") as f:
    lines = f.readlines()

ff = open("a_228_1.txt", "w", encoding="utf8")

for line in lines:
    if line[:5] != "Movie":
        ff.write(line)
        
ff.close()

In [108]:
from nltk.tokenize import word_tokenize


chats = lines.split("Speaker 1:")
hypothesis = []
references = []

for chat in chats[1:]:
    sentences = chat.split("\n")
    speaker1 = sentences[0]
    model = sentences[1][6:]
    
    
    refs = []
    for ref in sentences[2:]:
        tokens = word_tokenize(ref[10:])
        
        if tokens != [] and len(tokens) > 1:
            refs.append(word_tokenize(ref[10:]))
    
    if refs != []:
        hypothesis.append(word_tokenize(model))
        references.append(refs)

In [109]:
len(references)

4456

In [110]:
len(hypothesis)

4456

### compute_bleu

In [112]:

bleu, precisions, bp, ratio, translation_length, reference_length = compute_bleu(references1, hypothesis1)

In [113]:
print(bleu, precisions, bp, ratio, translation_length, reference_length)

0.1568931348795922 [0.26513035381750466, 0.09284284284284285] 1.0 1.2816229116945108 4296 3352


## BLEU (version 2)

In [20]:
from nltk.translate.bleu_score import sentence_bleu

def blue_score_text(y_actual,y_predicated):
        #check length equal
        assert len(y_actual) ==  len(y_predicated)
        #list of healine .. each headline has words
        no_of_news = len(y_actual)
        blue_score = 0.0
        for i in range(no_of_news-1):
            reference = y_actual[i][0]
            hypothesis = y_predicated[i]
            
            #Avoid ZeroDivisionError in blue score
            #default weights
            weights=(0.25, 0.25, 0.25, 0.25)
            min_len_present = min(len(reference),len(hypothesis))
            if min_len_present==0:
                continue
            if min_len_present<4:
                weights=[1.0/min_len_present,]*min_len_present
   
            blue_score = blue_score + sentence_bleu([reference],hypothesis,weights=weights)
        
        return blue_score/float(no_of_news-1)

In [21]:
print(blue_score_text(references,hypothesis))

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


0.0861981428323952


In [16]:
bleu, precisions, bp, ratio, translation_length, reference_length = compute_bleu(references, hypothesis)

In [17]:
print(bleu, precisions, bp, ratio, translation_length, reference_length)

0.7453559924999299 [0.9444444444444444, 0.5882352941176471] 1.0 1.125 18 16


In [115]:
#ROUGE SENTENCE with multi references dividing ourselves 
from rouge import Rouge 

# hypothesis = "the #### transcript is a written version of each day 's cnn student news program use this transcript to he    lp students with reading comprehension and vocabulary use the weekly newsquiz to test your knowledge of storie s you     saw on cnn student news"

# reference = "this page includes the show transcript use the transcript to help students with reading comprehension and     vocabulary at the bottom of the page , comment for a chance to be mentioned on cnn student news . you must be a teac    her or a student age # # or older to request a mention on the cnn student news roll call . the weekly newsquiz tests     students ' knowledge of even ts in the news"

rouge = Rouge()

# [sentence] [[ref1 ][ref2]]

score_average = 0
score_p_1_avg = 0
score_r_1_avg = 0
score_p_2_avg = 0
score_r_2_avg = 0
score_p_L_avg = 0
score_r_L_avg = 0

score_f_1_avg = 0
score_f_2_avg = 0
score_f_L_avg = 0

score=0
for i in range(0, len(hypothesis)):
    hypo = hypothesis[i]
    ref_list = references[i]
    score_p_1 = 0
    score_r_1 = 0
    score_p_2 = 0
    score_r_2 = 0
    score_p_L = 0
    score_r_L = 0

    for ref in ref_list:


        ref_sntc = " ".join(ref)
        hypo_sntc = " ".join(hypo)

        res = rouge.get_scores(hypo_sntc, ref_sntc)
        
        score_p_1 = max(score_p_1, res[0]['rouge-1']['p'])
        score_r_1 = max(score_r_1, res[0]['rouge-1']['r'])
        score_p_2 = max(score_p_2, res[0]['rouge-2']['p'])
        score_r_2 = max(score_r_1, res[0]['rouge-2']['r'])
        score_p_L = max(score_p_L, res[0]['rouge-l']['p'])
        score_r_L = max(score_r_L, res[0]['rouge-l']['r'])
        
        score_f_1 = max(score_p_1, res[0]['rouge-1']['f'])
        score_f_2 = max(score_p_2, res[0]['rouge-2']['f'])
        score_f_L = max(score_p_L, res[0]['rouge-l']['f'])
        
        
    score_p_1_avg += score_p_1
    score_r_1_avg += score_r_1
    score_p_2_avg += score_p_2
    score_r_2_avg += score_r_2
    score_p_L_avg += score_p_L
    score_r_L_avg += score_p_L
    
    score_f_1_avg += score_f_1
    score_f_2_avg += score_f_2
    score_f_L_avg += score_f_L
    
    score = score+score_p_1
    
score_p_1_avg = score_p_1_avg/(len(hypothesis)-1)
score_r_1_avg = score_r_1_avg/(len(hypothesis)-1)
score_p_2_avg = score_p_2_avg/(len(hypothesis)-1)
score_r_2_avg = score_r_2_avg/(len(hypothesis)-1)
score_p_L_avg = score_p_L_avg/(len(hypothesis)-1)
score_r_L_avg = score_r_L_avg/(len(hypothesis)-1)

score_f_1_avg = score_f_1_avg/(len(hypothesis)-1)
score_f_2_avg = score_f_2_avg/(len(hypothesis)-1)
score_f_L_avg = score_f_L_avg/(len(hypothesis)-1)


print(score_p_1_avg)
print(score_r_1_avg)
print(score_p_2_avg)
print(score_r_2_avg)
print(score_p_L_avg)
print(score_r_L_avg)

print(score_f_1_avg)
print(score_f_2_avg)
print(score_f_L_avg)

0.24434280162267477
0.22114006156039112
0.09784722247358153
0.22114006156039112
0.22895111517545935
0.22895111517545935
0.2515569464643777
0.10366025971348962
0.23247373706329502


In [44]:
score

9065.169803528886