In [1]:
import pandas as pd
import os
from nltk import sent_tokenize, word_tokenize
from nltk.cluster.util import cosine_distance
import re
import numpy as np
from operator import itemgetter
from sklearn.metrics.pairwise import cosine_similarity
    
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import re
import string
import networkx as nx
import pickle
from collections import Counter

In [2]:
df = pd.read_csv('article_summaries_noNaN_prep_20000_2.csv', encoding = 'utf-8', escapechar='\\', index_col=0)
df

Unnamed: 0,article,summary_string,summary_list
0,Its official: US President Barack Obama wants ...,Syrian official: Obama climbed to the top of t...,['Syrian official: Obama climbed to the top of...
1,(CNN) -- Usain Bolt rounded off the world cham...,Usain Bolt wins third gold of world championsh...,['Usain Bolt wins third gold of world champion...
2,"Kansas City, Missouri (CNN) -- The General Ser...",The employee in agencys Kansas City office is ...,['The employee in agencys Kansas City office i...
3,Los Angeles (CNN) -- A medical doctor in Vanco...,NEW: A Canadian doctor says she was part of a ...,['NEW: A Canadian doctor says she was part of ...
4,(CNN) -- Police arrested another teen Thursday...,Another arrest made in gang rape outside Calif...,['Another arrest made in gang rape outside Cal...
...,...,...,...
19995,(CNN) -- A new Lebanese government was announc...,"A new government is announced, bringing to an ...","['A new government is announced, bringing to a..."
19996,(CNN) -- Jaycee Dugard filed a complaint again...,"NEW: Abuse is due to ""the U.S. parole commissi...","['NEW: Abuse is due to ""the U.S. parole commis..."
19997,(CNN)Sky watchers in western North America are...,The total eclipse will only last 4 minutes and...,['The total eclipse will only last 4 minutes a...
19998,"Tripoli, Libya (CNN) -- A political coalition ...",Libyans voted July 7 for a General National Co...,['Libyans voted July 7 for a General National ...


In [3]:
MULTIPLE_WHITESPACE_PATTERN = re.compile(r"\s+", re.UNICODE)
def normalize_whitespace(text):
    """
    Translates multiple whitespace into single space character..
    """
    return MULTIPLE_WHITESPACE_PATTERN.sub(_replace_whitespace, text)


def _replace_whitespace(match):
    text = match.group()

    if "\n" in text or "\r" in text:
        return "\n"

    else:
        return " "

In [4]:
def remove_symbols(comment):

    comment = re.sub(r'\n', ' ', comment)
    comment = re.sub(r'\r', ' ', comment)
    comment = re.sub(r'\$', ' ', comment)
    comment = re.sub(r'\€', ' ', comment)
    comment = comment.replace('(', ' ')
    comment = comment.replace(')', ' ')
    comment = comment.replace('[', ' ')
    comment = comment.replace(']', ' ')
    comment = comment.replace('<', ' ')
    comment = comment.replace('>', ' ')
    comment = comment.replace('-', ' ')
    comment = comment.encode('ascii', 'ignore').decode('ascii') #remove non-ascii char
    return comment

In [5]:
def simple_preprocess(text):

    text = normalize_whitespace(text)
    text = remove_symbols(text)
    comment_tokens = word_tokenize(text)
    comment_tokens = [token.strip().replace('``', ' ' ) for token in comment_tokens]
    comment= ' '.join(map(str, comment_tokens))
    comment_nowhite = normalize_whitespace(comment)

    return comment_nowhite


In [6]:
stop = stopwords.words('english')
stop = set(stop)
stop.add('"')
stop.add("'")
stop = list(stop)
lemmatizer = WordNetLemmatizer()

def preprocess_string(text):
    text = text.replace('US', 'USA')
    text = normalize_whitespace(text)
    text = remove_symbols(text)
    text = re.sub(r'\d', '', text) #remove numbers
    comment_tokens = word_tokenize(text) #tokenize
    comment_tokens = [token.strip().replace('``', ' ' ) for token in comment_tokens]
    comment_tokens = [token.lower() for token in comment_tokens] #transform in lowercase
    comment_tokens_no_stopwords = [item for item in comment_tokens if item not in stop] #remove stopwords
    comment_lemma = [lemmatizer.lemmatize(item) for item in comment_tokens_no_stopwords] #comment_tokens_no_stopwords
    comment_lemma = [item for item in comment_lemma if item not in string.punctuation] #remove punctuation
    comment_lemma = [normalize_whitespace(item).strip() for item in comment_lemma]
    comment_lemma = ' '.join(map(str, comment_lemma))

    return comment_lemma

In [None]:
texts_processed = df['article'].apply(lambda x: preprocess_string(x))
with open('texts_processed20000.pickle', 'wb') as handle:
    pickle.dump(texts_processed, handle)

In [None]:
df['text_processed'] = texts_processed

In [None]:
tfidf_vectorizer = TfidfVectorizer(max_df= 0.5 ,smooth_idf = True)
tfidf_vectorizer_fit = tfidf_vectorizer.fit(df['text_processed']) #fit tfidf vectorizer on processed texts
with open('tfidf_vect_fit_20000.pickle', 'wb') as handle:
    pickle.dump(tfidf_vectorizer_fit, handle)

In [7]:
#with open('tfidf_vect_fit_20000.pickle', 'rb') as handle:
#    tfidf_vectorizer_fit = pickle.load(handle)

In [8]:
def build_similarity_matrix_cosine_tfidf(sentences):

    # Generate the tf-idf vectors for the corpus
    tfidf_matrix = tfidf_vectorizer_fit.transform(sentences) 

    # compute the cosine similarity matrix
    cosine_sim_matrix = cosine_similarity(tfidf_matrix)

    for i in range(len(cosine_sim_matrix)):
        for j in range(len(cosine_sim_matrix)):
            if i == j:
                cosine_sim_matrix[i,j] = 0 # let similarity between the same sentence be 0
    
    return cosine_sim_matrix

In [9]:
def get_top_sentences(text, number=10):
    
    text_ = simple_preprocess(text) 
    sentences = sent_tokenize(text_)

    sentences_preprocess = [preprocess_string(item) for item in sentences] #complete preprocess on every sentence

    sm = build_similarity_matrix_cosine_tfidf(sentences_preprocess)

    nx_graph = nx.from_numpy_array(sm) #build the graph
    pr_vector = nx.pagerank(nx_graph) #apply PageRank
    
    top_sentences_idx = {}
    for i in range(len(sentences)):
        top_sentences_idx[sentences[i]]=i #store original senteces and their order in a dictionary

    top_sentences = []
    if pr_vector is not None:

        sorted_pr = sorted(((pr_vector[i],s) for i,s in enumerate(sentences)), reverse=True) #sort in descending order

        if len(sentences) <= number: #if number of sentences in text is less than 10 (or number set by the user)
            number = len(sentences)

        for epoch in range(number):
            sent = sorted_pr[epoch][1]
            top_sentences.append(sent)

    return top_sentences, top_sentences_idx

In [10]:
def check_get_top_sentences(text, top_sentence_prev, count_dict, number = 10):

    """
    Check if top sentences are almost equal between them  
        (es. ['Phone: xxxx', 'Phone: xxxx', 'Phone: xxxx']) 
    and in that case remove that sentences from text.
    """
    text = simple_preprocess(text)
    freq = list(count_dict.values())
    for el in freq:
        for key in count_dict:
            if count_dict[key] == el:
                if el > sum(freq)/2:
                    text_new = text.replace (key, ' ')
    top_sentences, top_sentences_idx = get_top_sentences(text_new,  number = number)

    return top_sentences, top_sentences_idx


In [11]:
def sentence_similarity_cosine_tfidf (sentence1, sentence2,):

    s1 = tfidf_vectorizer_fit.transform([sentence1])
    s2 = tfidf_vectorizer_fit.transform([sentence2])

    # compute and print the cosine similarity matrix
    cosine_sim = cosine_similarity(s1, s2)
    return cosine_sim

In [12]:
def from_query_to_syn (query):
    query = query.lower()
    query = lemmatizer.lemmatize(query)

    if len(query) > 1: #if query is composed by more than 1 word
        s1_list = query.split(" ")
    else:
        s1_list = [query]

    for w in range(len(s1_list)):  
        for syn in wordnet.synsets(s1_list[w]): # add synonims
            for l in syn.lemmas():
                if l.name() not in s1_list:
                    s1_list.append(l.name())

    s1_to_string = ' '.join(s1_list)

    return s1_to_string

In [13]:
def MMRScore(candidate, summary, lambta=0.5, query = None):
	
	candidate = preprocess_string(candidate)

	if query != None : 
		ref = from_query_to_syn(query) #if there is a query, the reference sentence will be that one
	else:
		ref = preprocess_string(summary[0]) #if there is NOT a query, the reference sentence will be the one with highest importance

	Sim1 = sentence_similarity_cosine_tfidf(candidate, ref) #compute similarity between candidate sentence and the reference one
	l_expr = lambta * Sim1
	value = [float("-inf")]

	for sent in summary:
		sent = preprocess_string(sent)
		Sim2 = sentence_similarity_cosine_tfidf(candidate, sent) #compute similarity between candidate sentence and the one that are already in summary
		value.append(Sim2)

	r_expr = (1-lambta) * max(value)
	MMR_SCORE = l_expr - r_expr	

	return MMR_SCORE

In [14]:
def summarize (text, max_words = 100, query = None):

    top_sentences, top_sentences_idx = get_top_sentences(text)

    #build dictionary for 'check_get_top_sentences'
    top_sentences_no_num = [re.sub(r'\d', '', sentence) for sentence in top_sentences]
    count_dict = pd.DataFrame(top_sentences_no_num, columns=["x"]).groupby('x').size().to_dict()
    flag = any(l > sum(list(count_dict.values()))/2 for l in list(count_dict.values()))
    if flag == True:
        top_sentences, top_sentences_idx = check_get_top_sentences(text, top_sentences, count_dict)
  

    summary =[]
    list_idx = []

    # if there is a query, first sentence will be the one with highest similarity with the query
    if query != None:
        max_sim = -999
        i=0
        for candidate_first in top_sentences:
            candidate_prep = preprocess_string(candidate_first)
            Sim1 = sentence_similarity_cosine_tfidf(candidate_prep, from_query_to_syn(query))
            if Sim1 > max_sim:
                max_sim = Sim1
                first_sentence = candidate_first
            i+=1
    # if there is NOT a query, first sentence will be the one with highest score importance
    else:
        first_sentence = top_sentences[0]
    
    summary.append(first_sentence)
    pair = (first_sentence, top_sentences_idx[first_sentence])
    list_idx.append(pair)


    sum_words = len(re.findall(r'\w+', first_sentence)) #count number of words in the summary
    

    while sum_words < max_words:
        max_mmr = -9999
        for candidate in top_sentences: #for every sentence, compute MMRScore and insert in the summary the one with highest mmrscore
            if candidate not in summary:
                candidate_prep = preprocess_string(candidate)
                mmr = MMRScore(candidate_prep, summary, query=query)
                if mmr > max_mmr:
                    candidate_sentence = candidate
                    max_mmr = mmr

        if candidate_sentence in summary:
            break
        
        pair = (candidate_sentence, top_sentences_idx[candidate_sentence])
        list_idx.append(pair)

        summary.append(candidate_sentence)

        num_words = len(re.findall(r'\w+', candidate_sentence))
        sum_words += num_words


    sorted_list_idx = sorted(list_idx, key=itemgetter(1))
    summary_sorted = [pair[0] for pair in sorted_list_idx] #sort summary

    return summary, summary_sorted

In [None]:
df_prova_summ = pd.DataFrame()
summary_list = []
text_list = []
summ_gt_list = []
summ_gt_list_list = []

problem_list = []

for i in range(0,12000):
    
    print(i)
    try:
        text = df.iloc[i,0] #article
        summ_gt = df.iloc[i,1] #summary_proposed
        summ_gt_lista = df.iloc[i,2]
        _,sorted_summary = summarize(text, 75)

        sorted_summary = ' '.join(map(str, sorted_summary))

        text_list.append(text)
        summ_gt_list.append(summ_gt)
        summary_list.append(sorted_summary)
        summ_gt_list_list.append(summ_gt_lista)
    except:
        problem_list.append(i)
        text_list.append(text)
        summ_gt_list.append(summ_gt)
        summary_list.append('---')
        summ_gt_list_list.append(summ_gt_lista)

    if i % 1500 == 0: #save every 1500 iterations
        with open('summary_list_12000_final2_nx_{}.pickle'.format(i), 'wb') as handle:
            pickle.dump(summary_list, handle)
    

df_prova_summ['article'] = text_list
df_prova_summ['summary_gt'] = summ_gt_list
df_prova_summ['summary_gt_list'] = summ_gt_list_list
df_prova_summ['summary'] = summary_list

with open('df_summary_12000__final2_nx.pickle', 'wb') as handle:
    pickle.dump(df_prova_summ, handle)
with open('problem_list__final2_nx.pickle', 'wb') as handle:
    pickle.dump(problem_list, handle)
df_prova_summ.to_csv('df_summary_12000__final2_nx.csv')

## See how it works!

In [17]:
df.iloc[3,0]

'Los Angeles (CNN) -- A medical doctor in Vancouver, British Columbia, said Thursday that California arson suspect Harry Burkhart suffered from severe mental illness in 2010, when she examined him as part of a team of doctors.   Dr.  Blaga Stancheva, a family physician and specialist in obstetrics, said both Burkhart and his mother, Dorothee, were her patients in Vancouver while both were applying for refugee status in Canada.   "I was asked to diagnose and treat Harry to support a claim explaining why he was unable to show up in a small-claims court case," Stancheva told CNN in a phone interview.  She declined to cite the case or Burkharts role in it.   Stancheva said she and other doctors including a psychiatrist diagnosed Burkhart with "autism, severe anxiety, post-traumatic stress disorder and depression. " The diagnosis was spelled out in a letter she wrote for the small-claims court case, Stancheva said.   Stancheva, citing doctor-patient confidentiality, would not elaborate furt

In [16]:
_, sorted_summary = summarize(df.iloc[3,0])
sorted_summary

['Burkhart , a 24 year old German national , has been charged with 37 counts of arson following a string of 52 fires in Los Angeles .',
 'Stancheva said the refugee applications by Burkhart and his mother were denied by the Canadian government , and she has not seen Burkhart since early March of 2010 .',
 'It was revealed that Burkhart is also under investigation for arson and fraud in relation to a fire in Neukirchen , near Frankfurt , Germany .',
 'The worst arson sprees in the citys history began last Friday morning with a car fire in Hollywood that spread to apartments above a garage , but no new fires have happened since Burkhart was arrested Monday , Los Angeles District Attorney Steve Cooley said .']

In [18]:
_, sorted_summary_arson = summarize(df.iloc[3,0], query = 'arson')
sorted_summary_arson

['Los Angeles CNN A medical doctor in Vancouver , British Columbia , said Thursday that California arson suspect Harry Burkhart suffered from severe mental illness in 2010 , when she examined him as part of a team of doctors .',
 'Burkhart , a 24 year old German national , has been charged with 37 counts of arson following a string of 52 fires in Los Angeles .',
 'It was revealed that Burkhart is also under investigation for arson and fraud in relation to a fire in Neukirchen , near Frankfurt , Germany .',
 'The worst arson sprees in the citys history began last Friday morning with a car fire in Hollywood that spread to apartments above a garage , but no new fires have happened since Burkhart was arrested Monday , Los Angeles District Attorney Steve Cooley said .']

In [20]:
_, sorted_summary_mentaldisorder = summarize(df.iloc[3,0], query = 'mental disorder')
sorted_summary_mentaldisorder

['Los Angeles CNN A medical doctor in Vancouver , British Columbia , said Thursday that California arson suspect Harry Burkhart suffered from severe mental illness in 2010 , when she examined him as part of a team of doctors .',
 'Stancheva said she and other doctors including a psychiatrist diagnosed Burkhart with autism , severe anxiety , post traumatic stress disorder and depression.',
 "Cooley called it almost attempted murder , '' because people were sleeping in apartments above where Burkhart allegedly set cars on fire with incendiary devices placed under their engines .",
 'Dorothee Burkhart was arrested a day before on an international arrest warrant issued by a district court in Frankfurt , Germany , said federal court spokesman Gunther Meilinger .']