In [4]:
"""
import modules
"""
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize
import networkx as nx
from networkx.exception import PowerIterationFailedConvergence
from rouge import Rouge

In [5]:
"""
to download english/hindi glove embeddings
"""
# if want to do for other language then its vocab word embeddings will be needed. here is for english
# ! wget http://nlp.stanford.edu/data/glove.6B.zip
# unzip archive file
# ! unzip glove.6B.zip
# ! wget https://storage.googleapis.com/ai4bharat-public-indic-nlp-corpora/indiccorp/hi.tar.xz
# ! tar -xvf hi.tar.xz

'\nto download english/hindi glove embeddings\n'

In [6]:
"""
FIXED VARIABLES DECLARED
"""
EMBEDDING_SIZE=100 # OPTIONS: 50,100,200,300
SENT_SUMMARY_COUNT=5
SENT_SUMMARY_THERSHOLD=0.35
HI_DATA_PATH='../data/hi-data/test.csv' # BOTH ARTICLES AND SUMMARIES
EN_DATA_PATH_SRC='../data/preprocessed_truncated/test.txt.src.tokenized.fixed.cleaned.final.truncated.txt' # ONLY ARTICLES
EN_DATA_PATH_TGT='../data/preprocessed_truncated/test.txt.tgt.tokenized.fixed.cleaned.final.truncated.txt' # ONLY SUMMARIES
HI_EMBEDDINGS_PATH='../data/glove/hi/hi-d100-glove.txt'
EN_EMBEDDINGS_PATH='../data/glove/en/glove.6B.100d.txt'

In [7]:
def sent_score(a,b):
    """
    implementation of sent score mentioned in the paper
    """
    words_a=a.split(' ')
    words_b=b.split(' ')
    # overlapping unigrams found in both sentences
    count=0
    for i in words_a:
        if i in words_b:
            count+=1

    score=count/(len(words_a)*len(words_b))

    return score

In [8]:
"""
HELPER FUNCTIONS FOR MAIN SUMMARY GENERATION FUNC
"""
def cosine_similarity(a,b):
    return np.dot(a,b)/(np.linalg.norm(a)*np.linalg.norm(b))

def get_sent_vector(sentence):
    global glove_embeddings

    if len(sentence)!=0:
        sentence_vect=sum([glove_embeddings.get(word,np.zeros((EMBEDDING_SIZE,))) for word in sentence.split()])/(len(sentence.split())) # +0.001) # 0.001 to make things stable
    else:
        sentence_vect=np.zeros((EMBEDDING_SIZE,))
    return sentence_vect

def load_glove_model(file_path):
    """
    function to load glove embeddings from txt file into a dict
    """
    glove_model = {}
    with open(file_path,'r',encoding="utf-8",errors='ignore') as f:
        try:
            for i,line in enumerate(f):
                # if i%10000==0:
                #     print(i)
                split_line = line.split()
                word = split_line[0]
                embedding = np.array(split_line[-EMBEDDING_SIZE:], dtype=np.float64)
                glove_model[word] = embedding
        except Exception as e:
            print(e)
            print(line,i)
    print(f"total {len(glove_model)} words loaded from glove model, each of size {EMBEDDING_SIZE}.")
    return glove_model

# load glove embeddings for both hindi and english
glove_embeddings=load_glove_model(file_path=EN_EMBEDDINGS_PATH)
hi_glove_embeddings=load_glove_model(file_path=HI_EMBEDDINGS_PATH)
# save all glove embeddings in a single dict
# this takes care of both languages that are present in hindi texts; although it makes overall process a bit slow
glove_embeddings.update(hi_glove_embeddings)

total 400000 words loaded from glove model, each of size 100.
total 1732951 words loaded from glove model, each of size 100.


tuning params
- embedding size
- rather than fixing number of sents to consider in summary, should decide by a thershold value >> possibly different for each article

In [9]:
def generate_summary(article_text):
    """
    function to get summary of article text
    note: article text needs to be pre-processed; and that step only includes: string.lower()
    """
    # pre-process
    article_text=article_text.lower()
    # tokenize into sentences
    sent_tokens=sent_tokenize(article_text)
    # get sent vectores
    sent_vects=[]
    for sent in sent_tokens:
        sent_vects.append(get_sent_vector(sent))
    
    total_sents=len(sent_tokens)
    similarity_mat=np.zeros((total_sents,total_sents))
    for i in range(total_sents):
        for j in range(total_sents):
            # this way diag entries will be zeros
            if i!=j:
                similarity_mat[i][j]=cosine_similarity(sent_vects[i],sent_vects[j])
    
    # create graph from similarity matrix
    network_graph=nx.from_numpy_array(similarity_mat)
    """
    apply pagerank algo to get scores
    """
    # PowerIterationFailedConvergence: (PowerIterationFailedConvergence(...), 'power iteration failed to converge within 100 iterations')
    # to solve convergence iteration problem can change max_iter=100(default)
    # isConverged=False
    # iter_count=100
    # while(True):
    #     print(iter_count)
    #     if not isConverged:
    #         try:
    #             scores=nx.pagerank_numpy(network_graph, alpha=0.85, max_iter=iter_count)
    #             isConverged=True
    #         except PowerIterationFailedConvergence:
    #             iter_count*=10
    #     else:
    #         break
    scores=nx.pagerank(network_graph, alpha=0.85, max_iter=1000)
    sorted_scores = sorted(((scores[i],sent_token) for i,sent_token in enumerate(sent_tokens)), reverse=True)

    # TODO: implement thershold version
    # in case some article does not have enough sentences
    if total_sents > SENT_SUMMARY_COUNT:
        summary_text=' '.join([sorted_scores[i][1] for i in range(SENT_SUMMARY_COUNT)])
    else:
        summary_text=' '.join([sorted_scores[i][1] for i in range(total_sents)])

    return summary_text


In [10]:
"""
test case for generate_summary func
"""
test_text="national archives     yes , it ’ s that time again , folks . it ’ s the first friday of the month , when for one ever-so-brief moment the interests of wall street , washington and main street are all aligned on one thing : jobs .     a fresh update on the u.s. employment situation for january hits the wires at 8 : 30 a.m. new york time offering one of the most important snapshots on how the economy fared during the previous month . expectations are for 203,000 new jobs to be created , according to economists polled by dow jones newswires , compared to 227,000 jobs added in february . the unemployment rate is expected to hold steady at 8.3 % .     here at marketbeat hq , we ’ ll be offering color commentary before and after the data crosses the wires . feel free to weigh-in yourself , via the comments section . and while you ’ re here , why don ’ t you sign up to  .     enjoy the show . story_separator_special_tag employers pulled back sharply on hiring last month , a reminder that the u.s. economy may not be growing fast enough to sustain robust job growth . the unemployment rate dipped , but mostly because more americans stopped looking for work .     the labor department says the economy added 120,000 jobs in march , down from more than 200,000 in each of the previous three months .     the unemployment rate fell to 8.2 percent , the lowest since january 2009. the rate dropped because fewer people searched for jobs . the official unemployment tally only includes those seeking work .     the economy has added 858,000 jobs since december _ the best four months of hiring in two years . but federal reserve chairman ben bernanke has cautioned that the current hiring pace is unlikely to continue without more consumer spending ."
generate_summary(test_text)

  return pagerank_scipy(


'a fresh update on the u.s. employment situation for january hits the wires at 8 : 30 a.m. new york time offering one of the most important snapshots on how the economy fared during the previous month . it ’ s the first friday of the month , when for one ever-so-brief moment the interests of wall street , washington and main street are all aligned on one thing : jobs . the unemployment rate dipped , but mostly because more americans stopped looking for work . the labor department says the economy added 120,000 jobs in march , down from more than 200,000 in each of the previous three months . the economy has added 858,000 jobs since december _ the best four months of hiring in two years .'

In [11]:
hi_data=pd.read_csv('../data/hi-data/test.csv',lineterminator='\n')
hi_data.drop('headline',axis=1,inplace=True)
hi_data.dropna(inplace=True)
hi_data.reset_index(drop=True,inplace=True)

In [12]:
"""
calculate summaries for hindi test dataset
"""
article_summary_list_hi=[]
bad_articles_hi=[]
for i,line in enumerate(list(hi_data['article'])):
    if i % 100 == 0:
        print(i)
        print("------")
    try:
        article_summary_list_hi.append(generate_summary(line.strip()))
    except PowerIterationFailedConvergence:
        bad_articles_hi.append(i)
        continue

with open(f'../data/hi_article_summary_{EMBEDDING_SIZE}.txt','w') as file:
    file.write('\n'.join(article_summary_list_hi))

0
------


  return pagerank_scipy(
  return np.dot(a,b)/(np.linalg.norm(a)*np.linalg.norm(b))


100
------


  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  err = np.absolute(x - xlast).sum()
  err = np.absolute(x - xlast).sum()


200
------
300
------
400
------
500
------
600
------
700
------
800
------
900
------
1000
------
1100
------
1200
------
1300
------
1400
------
1500
------
1600
------
1700
------
1800
------
1900
------
2000
------
2100
------
2200
------
2300
------
2400
------
2500
------
2600
------
2700
------
2800
------
2900
------
3000
------
3100
------
3200
------
3300
------
3400
------
3500
------
3600
------
3700
------
3800
------
3900
------
4000
------
4100
------
4200
------
4300
------
4400
------
4500
------
4600
------
4700
------
4800
------
4900
------
5000
------
5100
------
5200
------
5300
------
5400
------
5500
------
5600
------
5700
------
5800
------
5900
------
6000
------
6100
------
6200
------
6300
------
6400
------
6500
------
6600
------
6700
------
6800
------
6900
------
7000
------
7100
------
7200
------
7300
------
7400
------
7500
------
7600
------
7700
------
7800
------
7900
------
8000
------
8100
------
8200
------
8300
------
8400
------
8500
------


In [13]:
"""
remove bad articles that does not have sufficient text data 
"""
foo_refs=list(hi_data['summary'])
hi_refs=[]
for i,article in enumerate(foo_refs):
    if i not in bad_articles_hi:
        hi_refs.append(article)

In [14]:
"""
calculate rouge score for hindi test dataset
"""
rouge_=Rouge()
rouge_scores=[]
bad_rouges=[]
for i,_ in enumerate(hi_refs):
    
    if i % 100 == 0:
        print(i)
        print("------")
    if i not in bad_articles_hi:
        try:
            rouge_scores.append(rouge_.get_scores(hyps=article_summary_list_hi[i],refs=hi_refs[i]))
        except RecursionError:
            bad_rouges.append(i)
            continue

0
------
100
------
200
------
300
------
400
------
500
------
600
------
700
------
800
------
900
------
1000
------
1100
------
1200
------
1300
------
1400
------
1500
------
1600
------
1700
------
1800
------
1900
------
2000
------
2100
------
2200
------
2300
------
2400
------
2500
------
2600
------
2700
------
2800
------
2900
------
3000
------
3100
------
3200
------
3300
------
3400
------
3500
------
3600
------
3700
------
3800
------
3900
------
4000
------
4100
------
4200
------
4300
------
4400
------
4500
------
4600
------
4700
------
4800
------
4900
------
5000
------
5100
------
5200
------
5300
------
5400
------
5500
------
5600
------
5700
------
5800
------
5900
------
6000
------
6100
------
6200
------
6300
------
6400
------
6500
------
6600
------
6700
------
6800
------
6900
------
7000
------
7100
------
7200
------
7300
------
7400
------
7500
------
7600
------
7700
------
7800
------
7900
------
8000
------
8100
------
8200
------
8300
------
8400

In [24]:
rouge_1=[]
for score in rouge_scores:
    rouge_1.append(score[0]['rouge-1']['f'])
print('avg f1 score for rouge-1 measure is :',sum(rouge_1)/len(rouge_1))

avg f1 score for rouge-1 measure is : 0.2926229192009566


In [15]:
"""
calculate summaries for english test dataset
"""
article_summary_list=[]
bad_articles=[]
with open(EN_DATA_PATH_SRC,'r') as file:
    for i,line in enumerate(file):
        if i % 100 == 0:
            print(i)
            print("------")
        try:
            article_summary_list.append(generate_summary(line.strip()))
        except PowerIterationFailedConvergence:
            bad_articles.append(i)
            continue

with open(f'../data/en_article_summary_{EMBEDDING_SIZE}.txt','w') as file:
    file.write('\n'.join(article_summary_list))

0
------


  return np.dot(a,b)/(np.linalg.norm(a)*np.linalg.norm(b))


100
------
200
------
300
------
400
------
500
------
600
------
700
------
800
------
900
------
1000
------
1100
------
1200
------
1300
------
1400
------
1500
------
1600
------
1700
------
1800
------
1900
------
2000
------
2100
------
2200
------
2300
------
2400
------
2500
------
2600
------
2700
------
2800
------
2900
------
3000
------
3100
------
3200
------
3300
------
3400
------
3500
------
3600
------
3700
------
3800
------
3900
------
4000
------
4100
------
4200
------
4300
------
4400
------
4500
------
4600
------
4700
------
4800
------
4900
------
5000
------
5100
------
5200
------
5300
------
5400
------
5500
------
5600
------


In [16]:
ref_summary=[]
with open(EN_DATA_PATH_TGT,'r') as file:
    for i,line in enumerate(file):
        if i not in bad_articles:
            ref_summary.append(line[2:].strip())

In [17]:
# takes about 3 mins
rouge_scores_en=rouge_.get_scores(hyps=article_summary_list,refs=ref_summary,avg=True)

In [18]:
rouge_scores_en

{'rouge-1': {'r': 0.348989091353505,
  'p': 0.4146895458675804,
  'f': 0.3700004523905176},
 'rouge-2': {'r': 0.12301443492634224,
  'p': 0.14943295556465142,
  'f': 0.13036603766465277},
 'rouge-l': {'r': 0.2998443484438995,
  'p': 0.3569296387214579,
  'f': 0.3181192414432147}}