In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as ss
import nltk
import re
from nltk.util import ngrams
#from word import Word
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
#import spacy
#from spacy import displacy
from collections import Counter
#import en_core_web_sm
import numpy as np
import pickle
import string
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import scipy

#nlp = en_core_web_sm.load()

# Utility Functions

In [None]:
def tokenize_and_remove_stopwords(text):
    
    text = text.translate(str.maketrans('', '', string.punctuation))
    stop_words = set(stopwords.words('english')) 
    word_tokens = word_tokenize(text) 

    filtered_sentence = [w for w in word_tokens if not w in stop_words] 

    filtered_sentence = [] 

    for w in word_tokens: 
        if w not in stop_words: 
            filtered_sentence.append(w)
    
    return filtered_sentence
            
def buildMRC(fileName):
    words = {}
    with open(fileName) as f:
        lines = f.read().splitlines()
    for i in range(0, len(lines)):
        line = lines[i]
        numScores = 0
        nlet = int(line[0:2])
        nphon = int(line[2:4])
        nsyl = int(line[4])
        kffreq = int(line[5:10])
        kfcats = int(line[10:12])
        kfsamps = int(line[12:15])
        tlfreq = int(line[15:21])
        bfreq = int(line[21:25])
        fam = int(line[25:28])
        conc = int(line[28:31])
        imag = int(line[31:34])
        meanc = int(line[34:37])
        meanp = int(line[37:40])
        aoa = int(line[40:43])
        scores = [nlet, nphon, nsyl, kffreq, kfcats, kfsamps, tlfreq, bfreq, fam,
            conc, imag, meanc, meanp, aoa]
        #Count the number of non-zero scores for this entry
        for score in scores:
            if score != 0:
                numScores += 1
        word = extractWord(lines[i])
        newWord = Word(nlet, nphon, nsyl, kffreq, kfcats, kfsamps,
            tlfreq, bfreq, fam, conc, imag, meanc, meanp, aoa, numScores)

        #Get the current entry in our DB for this word. If the word already
        #exists in our DB, then compare the number of scores on record to our newWord
        #If the newWord has more entries, it is considered to be more "complete"
        #and therefore will replace the current entry
        currentWord = words.get(word)
        if(currentWord is not None):
            if(currentWord.numScores < newWord.numScores):
                words[word] = newWord
        else:
            words[word] = newWord
    return words

def extractWord(line):
    index = 51
    while line[index] != '|':
        index += 1
    return line[51:index]

In [None]:
def get_tfidf_score(corpus,summaries,thresh = 0.4):
    '''
    checks ratio of words with (idf in corpus) > thresh present in the summaries
    '''
    scores=np.zeros(len(summaries))
    vec=TfidfVectorizer(stop_words='english')
    X=vec.fit_transform(corpus)
    X=X>thresh
    imp_words=ss.find(X)[1]
    X2=vec.transform(summaries)
    for row in X2:
        found_words=np.intersect1d(imp_words,ss.find(X2[row])[1])
        scores[row]=float(found_words.shape[0])/imp_words.shape[0]
    return scores

In [26]:
def get_average_idf_values(corpus,summaries):
    corpus_tokenized = tokenize_and_remove_stopwords(corpus.lower())
    idf_values = {}
    
    N = len(corpus_tokenized)
    for word in corpus_tokenized:
        if word not in idf_values:
            idf_values[word] = 1
        else:
            idf_values[word] += 1
    
    summaries_tokenized = []
    
    for summary in summaries:
        summary_tokenized = tokenize_and_remove_stopwords(summary.lower())
        summaries_tokenized.append(summary_tokenized)
                
    scores = np.zeros(len(summaries)) 
    for i,summary_tokenized in enumerate(summaries_tokenized):
        num_words = len(summary_tokenized)
        score = 0
        for word in summary_tokenized:
            if word in idf_values:
                score = score + 1.0/idf_values[word]
        if num_words == 0:
            scores[i] = 0
        else:
            scores[i] = score/num_words
    
    return scores

In [None]:
db = buildMRC('./mrc-psycholinguistics/mrc2.dct')
def get_concr_score(summaries):
    scores = np.zeros(len(summaries))
    for i,summary in enumerate(summaries):
        number_of_words = 0
        concreteness_score = 0
        for word in summary:
            word_scores = db.get(word.upper())
            if word_scores is not None:
                number_of_words+=1
                concreteness_score+=word_scores.get_concr_score()
        if number_of_words != 0:  
            scores[i] = (concreteness_score*1.0)/number_of_words
        else:
            scores[i] = 0
        
    return scores

In [None]:
def n_gram_sim(corpus,summaries,n=2):
    corpus = corpus.split()
    scores=np.zeros(len(summaries))
    bigram_corpus=set(ngrams(corpus,n))
    for i,summ in enumerate(summaries):
        summ = summ.split()
        bigram_summ=set(ngrams(summ,n))
        score=float(len(bigram_corpus.intersection(bigram_summ)))
        score/=len(bigram_corpus)
        scores[i] = score
    return scores
## TODO: make it memory efficient

In [20]:

def get_SUMBASIC_score(corpus,summaries):
    
    corpus_tokenized = tokenize_and_remove_stopwords(corpus.lower())
    wordprob={}
    idf_values = {}
    
    N = len(corpus_tokenized)
    for word in corpus_tokenized:
        if word not in wordprob:
            wordprob[word] = 1 / float(N)
        else:
            wordprob[word] += 1 / float(N)
        
        if word not in idf_values:
            idf_values[word] = 1
        else:
            idf_values[word] += 1
    
    summaries_tokenized = []
    
    
    for summary in summaries:
        summary_tokenized = tokenize_and_remove_stopwords(summary.lower())
        summaries_tokenized.append(summary_tokenized)

            
                
    scores = np.zeros(len(summaries)) 
    for i,summary_tokenized in enumerate(summaries_tokenized):
        num_sen = len(re.split(r'[.!?]+', summaries[i]))
        score = 0
        for word in summary_tokenized:
            if word in wordprob:
                score = score + wordprob[word]*(1/idf_values[word])
        scores[i] = score/num_sen
    
    return scores
    

In [None]:
def get_NE_Frequency_Score(corpus,summaries):
    
    doc = nlp(corpus)
    count_corpus = {}
    for X in doc.ents:
        if X.text not in count_corpus:
            count_corpus[X.text] = 1
        else:
            count_corpus[X.text] +=1
    
    scores = []
    for summary in summaries:
        summary_named_entity = nlp(summary)
        count_summary = {}
        score = 0
        for X in summary_named_entity.ents:
            if X.text not in count_summary:
                count_summary[X.text] = 1
            else:
                count_summary[X.text] += 1
        for key in count_summary:
            if key in count_corpus:
                score += count_summary[key] * count_corpus[key]
        scores.append(score)
    return scores
    

In [22]:
def get_lsa_scores(summaries):
    scores=np.zeros(len(summaries))
    for i,summary in enumerate(summaries):
        summary.replace('?','.')
        summary.replace('!','.')
        scores[i] = lsa(summary.split('.'))

    return scores

In [None]:
def get_perplexity_scores(summaries):
    
    scores=np.zeros(len(summaries))
    for i,summary in enumerate(summaries):
        scores[i] = lm.perplexity(summary)

    return scores

In [24]:
def lsa(summary):
    n_components = 1 # assuming 1 'abstract components'
    # find coherence between sentences of the summary
    # send a list of SENTENCES of the summary
    vec = CountVectorizer(stop_words='english')
    lsa = TruncatedSVD(n_components,algorithm='arpack') 
    try:
        X = vec.fit_transform(summary)
    except:
        return 0
    X = np.array(X.toarray(),dtype='f')
    try:
        lsa.fit(X)
    except:
        return 0

    transforms = []
    for sent in X:
        transforms.append(lsa.transform(sent.reshape((1,-1))))

    metric = 0.0
    for i in range(len(transforms)-1):
        running_sum = 0.
        for j in range(n_components):
            running_sum += cosine_similarity(np.array(transforms[i][j]).reshape((1,-1)),np.array(transforms[i+1][j]).reshape((1,-1)))[0][0]
        metric += running_sum/n_components

    metric/=len(summary)
    return metric

In [27]:
output_baseline = pickle.load(open("./Summaries/output_baseline_temp.pkl","rb"),encoding='latin1')
output_graph = pickle.load(open("./Summaries/output_graph_temp.pkl","rb"),encoding='latin1')
output_pointer_gen = pickle.load(open("./Summaries/output_pointer_gen.pkl","rb"))
output_rein_learn = pickle.load(open("./Summaries/output_rein_learn.pkl","rb"))
output_openNMT = pickle.load(open("./Summaries/output_openNMT.pkl","rb"))
output_struct_infused = pickle.load(open("./Summaries/output_struct_infused.pkl","rb"))

concrete_scores = []
n_gram_sim_scores = []
sum_basic_scores = []
NE_frequency_scores = []
pennalized_scores = []
average_idf_values_scores = []
lsa_scores=[]
for key in output_baseline.keys():
    summaries = []
    summaries.append(output_baseline[key][1])
    if key in output_graph:
        if type(output_graph[key][1]) != type([]):
            summaries.append(output_graph[key][1])
        else:
            if len(output_graph[key][1]) == 0:
                summaries.append("")
            else:
                summaries.append(output_graph[key][1][0])
    else:
        summaries.append("")
    if key in output_pointer_gen:
        summaries.append(output_pointer_gen[key][1])
    else:
        summaries.append("")
    if key in output_rein_learn:
        summaries.append(output_rein_learn[key][1])
    else:
        summaries.append("")
    if key in output_openNMT:
        summaries.append(output_openNMT[key][1])
    else:
        summaries.append("")
    if key in output_struct_infused:
        summaries.append(output_struct_infused[key][1])
    else:
        summaries.append("")
    
#     print([type(item) for item in summaries])
#     print("Total number of summaries generated for ",key," is ",len(summaries))
    corpus = [s for s in output_baseline[key][0].splitlines() if s]
    lsa_scores.append(get_lsa_scores(summaries))
print(np.array(lsa_scores).mean(axis=0))
#     print([s for s in output_baseline[key][0].splitlines() if s])
#     print(get_tfidf_score(corpus,summaries))
#     concrete_scores.append(get_concr_score(summaries))
#     n_gram_sim_scores.append(n_gram_sim(output_baseline[key][0],summaries))
#     sum_basic_scores.append(get_SUMBASIC_score(output_baseline[key][0],summaries))
#     NE_frequency_scores.append(get_NE_Frequency_Score(output_baseline[key][0],summaries))
#     pennalized_scores.append(n_gram_sim(output_baseline[key][0],summaries,4))
#     average_idf_values_scores.append(get_average_idf_values(output_baseline[key][0],summaries))

#     break

# concrete_scores = np.array(concrete_scores)
# n_gram_sim_scores = np.array(n_gram_sim_scores)
# sum_basic_scores = np.array(sum_basic_scores)
# NE_frequency_scores = np.array(NE_frequency_scores)
# pennalized_scores = np.array(pennalized_scores)
# average_idf_values_scores = np.array(average_idf_values_scores)

# print(np.mean(concrete_scores,axis=0))
# print(np.mean(n_gram_sim_scores,axis=0))
# print(np.mean(sum_basic_scores,axis=0))
# print(np.mean(NE_frequency_scores,axis=0))
# print(np.mean(pennalized_scores,axis = 0))
# print(np.mean(average_idf_values_scores,axis=0))

  self.explained_variance_ratio_ = exp_var / full_var


[0.55950404 0.41310487 0.38740018 0.35764567 0.27269299 0.19949213]
